ocrd 3.5.1__py3-none-any.whl → 3.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ocrd/cli/__init__.py +6 -2
- ocrd/cli/bashlib.py +7 -2
- ocrd/cli/log.py +7 -2
- ocrd/cli/network.py +0 -2
- ocrd/cli/ocrd_tool.py +26 -4
- ocrd/cli/process.py +1 -0
- ocrd/cli/resmgr.py +0 -1
- ocrd/cli/validate.py +32 -13
- ocrd/cli/workspace.py +125 -52
- ocrd/cli/zip.py +13 -4
- ocrd/decorators/__init__.py +28 -52
- ocrd/decorators/loglevel_option.py +4 -0
- ocrd/decorators/mets_find_options.py +2 -1
- ocrd/decorators/ocrd_cli_options.py +3 -7
- ocrd/decorators/parameter_option.py +12 -11
- ocrd/lib.bash +6 -13
- ocrd/mets_server.py +6 -10
- ocrd/processor/base.py +88 -71
- ocrd/processor/builtin/dummy_processor.py +7 -4
- ocrd/processor/builtin/filter_processor.py +3 -2
- ocrd/processor/helpers.py +5 -6
- ocrd/processor/ocrd_page_result.py +7 -5
- ocrd/resolver.py +42 -32
- ocrd/task_sequence.py +11 -4
- ocrd/workspace.py +64 -54
- ocrd/workspace_backup.py +3 -0
- ocrd/workspace_bagger.py +15 -8
- {ocrd-3.5.1.dist-info → ocrd-3.6.0.dist-info}/METADATA +1 -1
- ocrd-3.6.0.dist-info/RECORD +125 -0
- ocrd_modelfactory/__init__.py +4 -2
- ocrd_models/constants.py +18 -1
- ocrd_models/ocrd_agent.py +1 -1
- ocrd_models/ocrd_exif.py +7 -3
- ocrd_models/ocrd_file.py +24 -19
- ocrd_models/ocrd_mets.py +90 -67
- ocrd_models/ocrd_page.py +17 -13
- ocrd_models/ocrd_xml_base.py +1 -0
- ocrd_models/report.py +2 -1
- ocrd_models/utils.py +4 -3
- ocrd_models/xpath_functions.py +3 -1
- ocrd_network/__init__.py +1 -2
- ocrd_network/cli/__init__.py +0 -2
- ocrd_network/cli/client.py +122 -50
- ocrd_network/cli/processing_server.py +1 -2
- ocrd_network/client.py +2 -2
- ocrd_network/client_utils.py +30 -13
- ocrd_network/constants.py +1 -6
- ocrd_network/database.py +3 -3
- ocrd_network/logging_utils.py +2 -7
- ocrd_network/models/__init__.py +0 -2
- ocrd_network/models/job.py +2 -5
- ocrd_network/models/workspace.py +1 -1
- ocrd_network/process_helpers.py +54 -17
- ocrd_network/processing_server.py +63 -114
- ocrd_network/processing_worker.py +6 -5
- ocrd_network/rabbitmq_utils/__init__.py +2 -0
- ocrd_network/rabbitmq_utils/helpers.py +24 -7
- ocrd_network/runtime_data/__init__.py +1 -2
- ocrd_network/runtime_data/deployer.py +12 -85
- ocrd_network/runtime_data/hosts.py +61 -130
- ocrd_network/runtime_data/network_agents.py +7 -31
- ocrd_network/runtime_data/network_services.py +1 -1
- ocrd_network/server_cache.py +1 -1
- ocrd_network/server_utils.py +13 -52
- ocrd_network/utils.py +1 -0
- ocrd_utils/__init__.py +4 -4
- ocrd_utils/config.py +86 -76
- ocrd_utils/deprecate.py +3 -0
- ocrd_utils/image.py +51 -23
- ocrd_utils/introspect.py +8 -3
- ocrd_utils/logging.py +12 -7
- ocrd_utils/os.py +16 -3
- ocrd_utils/str.py +32 -16
- ocrd_validators/json_validator.py +4 -1
- ocrd_validators/ocrd_tool_validator.py +2 -1
- ocrd_validators/ocrd_zip_validator.py +5 -4
- ocrd_validators/page_validator.py +21 -9
- ocrd_validators/parameter_validator.py +3 -2
- ocrd_validators/processing_server_config.schema.yml +1 -33
- ocrd_validators/resource_list_validator.py +3 -1
- ocrd_validators/workspace_validator.py +30 -20
- ocrd_validators/xsd_mets_validator.py +2 -1
- ocrd_validators/xsd_page_validator.py +2 -1
- ocrd_validators/xsd_validator.py +4 -2
- ocrd-3.5.1.dist-info/RECORD +0 -128
- ocrd_network/cli/processor_server.py +0 -31
- ocrd_network/models/ocrd_tool.py +0 -12
- ocrd_network/processor_server.py +0 -255
- {ocrd-3.5.1.dist-info → ocrd-3.6.0.dist-info}/LICENSE +0 -0
- {ocrd-3.5.1.dist-info → ocrd-3.6.0.dist-info}/WHEEL +0 -0
- {ocrd-3.5.1.dist-info → ocrd-3.6.0.dist-info}/entry_points.txt +0 -0
- {ocrd-3.5.1.dist-info → ocrd-3.6.0.dist-info}/top_level.txt +0 -0
ocrd_models/ocrd_mets.py
CHANGED
|
@@ -3,7 +3,6 @@ API to METS
|
|
|
3
3
|
"""
|
|
4
4
|
from datetime import datetime
|
|
5
5
|
import re
|
|
6
|
-
from lxml import etree as ET
|
|
7
6
|
from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Union
|
|
8
7
|
|
|
9
8
|
from ocrd_utils import (
|
|
@@ -37,44 +36,45 @@ from .constants import (
|
|
|
37
36
|
METS_DIV_ATTRIBUTE_REGEX_PATTERN,
|
|
38
37
|
)
|
|
39
38
|
|
|
40
|
-
from .ocrd_xml_base import OcrdXmlDocument, ET
|
|
39
|
+
from .ocrd_xml_base import OcrdXmlDocument, ET # type: ignore
|
|
41
40
|
from .ocrd_file import OcrdFile
|
|
42
41
|
from .ocrd_agent import OcrdAgent
|
|
43
42
|
|
|
44
43
|
REGEX_PREFIX_LEN = len(REGEX_PREFIX)
|
|
45
44
|
|
|
45
|
+
|
|
46
46
|
class OcrdMets(OcrdXmlDocument):
|
|
47
47
|
"""
|
|
48
48
|
API to a single METS file
|
|
49
49
|
"""
|
|
50
|
-
_cache_flag
|
|
50
|
+
_cache_flag: bool
|
|
51
51
|
# Cache for the physical pages (mets:div) - two nested dictionaries
|
|
52
52
|
# The outer dictionary's key: attribute type
|
|
53
53
|
# The outer dictionary's value: inner dictionary
|
|
54
54
|
# The inner dictionary's key: attribute value (str)
|
|
55
55
|
# The inner dictionary's value: a 'div' object at some memory location
|
|
56
|
-
_page_cache
|
|
56
|
+
_page_cache: Dict[METS_PAGE_DIV_ATTRIBUTE, Dict[str, ET._Element]]
|
|
57
57
|
# Cache for the files (mets:file) - two nested dictionaries
|
|
58
58
|
# The outer dictionary's Key: 'fileGrp.USE'
|
|
59
59
|
# The outer dictionary's Value: Inner dictionary
|
|
60
60
|
# The inner dictionary's Key: 'file.ID'
|
|
61
61
|
# The inner dictionary's Value: a 'file' object at some memory location
|
|
62
|
-
_file_cache
|
|
62
|
+
_file_cache: Dict[str, Dict[str, ET._Element]]
|
|
63
63
|
# Cache for the file pointers (mets:fptr) - two nested dictionaries
|
|
64
64
|
# The outer dictionary's Key: 'div.ID'
|
|
65
65
|
# The outer dictionary's Value: Inner dictionary
|
|
66
66
|
# The inner dictionary's Key: 'fptr.FILEID'
|
|
67
67
|
# The inner dictionary's Value: a 'fptr' object at some memory location
|
|
68
|
-
_fptr_cache
|
|
68
|
+
_fptr_cache: Dict[str, Dict[str, ET._Element]]
|
|
69
69
|
# Cache for the logical structural divs (mets:div) - two nested dictionaries
|
|
70
70
|
# The outer dictionary's key: attribute type
|
|
71
71
|
# The outer dictionary's value: inner dictionary
|
|
72
72
|
# The inner dictionary's key: attribute value (str)
|
|
73
73
|
# The inner dictionary's value: a list of corresponding physical div.ID
|
|
74
|
-
_struct_cache
|
|
74
|
+
_struct_cache: Dict[METS_STRUCT_DIV_ATTRIBUTE, Dict[str, List[str]]]
|
|
75
75
|
|
|
76
76
|
@staticmethod
|
|
77
|
-
def empty_mets(now
|
|
77
|
+
def empty_mets(now: Optional[str] = None, cache_flag: bool = False):
|
|
78
78
|
"""
|
|
79
79
|
Create an empty METS file from bundled template.
|
|
80
80
|
"""
|
|
@@ -94,11 +94,11 @@ class OcrdMets(OcrdXmlDocument):
|
|
|
94
94
|
# then enable caching, if "false", disable caching, overriding the
|
|
95
95
|
# kwarg to the constructor
|
|
96
96
|
if config.is_set('OCRD_METS_CACHING'):
|
|
97
|
-
getLogger('ocrd.models.ocrd_mets').debug(
|
|
98
|
-
|
|
97
|
+
getLogger('ocrd.models.ocrd_mets').debug(
|
|
98
|
+
'METS Caching %s because OCRD_METS_CACHING is %s',
|
|
99
|
+
'enabled' if config.OCRD_METS_CACHING else 'disabled', config.raw_value('OCRD_METS_CACHING'))
|
|
99
100
|
self._cache_flag = config.OCRD_METS_CACHING
|
|
100
101
|
|
|
101
|
-
|
|
102
102
|
# If cache is enabled
|
|
103
103
|
if self._cache_flag:
|
|
104
104
|
self._initialize_caches()
|
|
@@ -109,7 +109,7 @@ class OcrdMets(OcrdXmlDocument):
|
|
|
109
109
|
String representation
|
|
110
110
|
"""
|
|
111
111
|
return 'OcrdMets[cached=%s,fileGrps=%s,files=%s]' % (
|
|
112
|
-
|
|
112
|
+
self._cache_flag, self.file_groups, list(self.find_files()))
|
|
113
113
|
|
|
114
114
|
def _fill_caches(self) -> None:
|
|
115
115
|
"""
|
|
@@ -181,9 +181,9 @@ class OcrdMets(OcrdXmlDocument):
|
|
|
181
181
|
def _initialize_caches(self) -> None:
|
|
182
182
|
self._file_cache = {}
|
|
183
183
|
# NOTE we can only guarantee uniqueness for @ID and @ORDER
|
|
184
|
-
self._page_cache = {k
|
|
184
|
+
self._page_cache = {k: {} for k in METS_PAGE_DIV_ATTRIBUTE}
|
|
185
185
|
self._fptr_cache = {}
|
|
186
|
-
self._struct_cache = {k
|
|
186
|
+
self._struct_cache = {k: {} for k in METS_STRUCT_DIV_ATTRIBUTE}
|
|
187
187
|
|
|
188
188
|
def _refresh_caches(self) -> None:
|
|
189
189
|
if self._cache_flag:
|
|
@@ -205,7 +205,7 @@ class OcrdMets(OcrdXmlDocument):
|
|
|
205
205
|
return found.text
|
|
206
206
|
|
|
207
207
|
@unique_identifier.setter
|
|
208
|
-
def unique_identifier(self, purl
|
|
208
|
+
def unique_identifier(self, purl: str) -> None:
|
|
209
209
|
"""
|
|
210
210
|
Set the unique identifier by looking through ``mods:identifier``
|
|
211
211
|
See `specs <https://ocr-d.de/en/spec/mets#unique-id-for-the-document-processed>`_ for details.
|
|
@@ -268,15 +268,15 @@ class OcrdMets(OcrdXmlDocument):
|
|
|
268
268
|
# pylint: disable=multiple-statements
|
|
269
269
|
def find_files(
|
|
270
270
|
self,
|
|
271
|
-
ID
|
|
272
|
-
fileGrp
|
|
273
|
-
pageId
|
|
274
|
-
mimetype
|
|
275
|
-
url
|
|
276
|
-
local_filename
|
|
277
|
-
local_only
|
|
278
|
-
include_fileGrp
|
|
279
|
-
exclude_fileGrp
|
|
271
|
+
ID: Optional[str] = None,
|
|
272
|
+
fileGrp: Optional[str] = None,
|
|
273
|
+
pageId: Optional[str] = None,
|
|
274
|
+
mimetype: Optional[str] = None,
|
|
275
|
+
url: Optional[str] = None,
|
|
276
|
+
local_filename: Optional[str] = None,
|
|
277
|
+
local_only: bool = False,
|
|
278
|
+
include_fileGrp: Optional[List[str]] = None,
|
|
279
|
+
exclude_fileGrp: Optional[List[str]] = None,
|
|
280
280
|
) -> Iterator[OcrdFile]:
|
|
281
281
|
"""
|
|
282
282
|
Search ``mets:file`` entries in this METS document and yield results.
|
|
@@ -346,24 +346,30 @@ class OcrdMets(OcrdXmlDocument):
|
|
|
346
346
|
for cand in candidates:
|
|
347
347
|
if ID:
|
|
348
348
|
if isinstance(ID, str):
|
|
349
|
-
if not ID == cand.get('ID'):
|
|
349
|
+
if not ID == cand.get('ID'):
|
|
350
|
+
continue
|
|
350
351
|
else:
|
|
351
|
-
if not ID.fullmatch(cand.get('ID')):
|
|
352
|
+
if not ID.fullmatch(cand.get('ID')):
|
|
353
|
+
continue
|
|
352
354
|
|
|
353
355
|
if pageId is not None and cand.get('ID') not in pageId_list:
|
|
354
356
|
continue
|
|
355
357
|
|
|
356
358
|
if not self._cache_flag and fileGrp:
|
|
357
359
|
if isinstance(fileGrp, str):
|
|
358
|
-
if cand.getparent().get('USE') != fileGrp:
|
|
360
|
+
if cand.getparent().get('USE') != fileGrp:
|
|
361
|
+
continue
|
|
359
362
|
else:
|
|
360
|
-
if not fileGrp.fullmatch(cand.getparent().get('USE')):
|
|
363
|
+
if not fileGrp.fullmatch(cand.getparent().get('USE')):
|
|
364
|
+
continue
|
|
361
365
|
|
|
362
366
|
if mimetype:
|
|
363
367
|
if isinstance(mimetype, str):
|
|
364
|
-
if cand.get('MIMETYPE') != mimetype:
|
|
368
|
+
if cand.get('MIMETYPE') != mimetype:
|
|
369
|
+
continue
|
|
365
370
|
else:
|
|
366
|
-
if not mimetype.fullmatch(cand.get('MIMETYPE') or ''):
|
|
371
|
+
if not mimetype.fullmatch(cand.get('MIMETYPE') or ''):
|
|
372
|
+
continue
|
|
367
373
|
|
|
368
374
|
if url:
|
|
369
375
|
cand_locat = cand.find('mets:FLocat[@LOCTYPE="URL"]', namespaces=NS)
|
|
@@ -371,9 +377,11 @@ class OcrdMets(OcrdXmlDocument):
|
|
|
371
377
|
continue
|
|
372
378
|
cand_url = cand_locat.get('{%s}href' % NS['xlink'])
|
|
373
379
|
if isinstance(url, str):
|
|
374
|
-
if cand_url != url:
|
|
380
|
+
if cand_url != url:
|
|
381
|
+
continue
|
|
375
382
|
else:
|
|
376
|
-
if not url.fullmatch(cand_url):
|
|
383
|
+
if not url.fullmatch(cand_url):
|
|
384
|
+
continue
|
|
377
385
|
|
|
378
386
|
if local_filename:
|
|
379
387
|
cand_locat = cand.find('mets:FLocat[@LOCTYPE="OTHER"][@OTHERLOCTYPE="FILE"]', namespaces=NS)
|
|
@@ -381,9 +389,11 @@ class OcrdMets(OcrdXmlDocument):
|
|
|
381
389
|
continue
|
|
382
390
|
cand_local_filename = cand_locat.get('{%s}href' % NS['xlink'])
|
|
383
391
|
if isinstance(local_filename, str):
|
|
384
|
-
if cand_local_filename != local_filename:
|
|
392
|
+
if cand_local_filename != local_filename:
|
|
393
|
+
continue
|
|
385
394
|
else:
|
|
386
|
-
if not local_filename.fullmatch(cand_local_filename):
|
|
395
|
+
if not local_filename.fullmatch(cand_local_filename):
|
|
396
|
+
continue
|
|
387
397
|
|
|
388
398
|
if local_only:
|
|
389
399
|
# deprecation_warning("'local_only' is deprecated, use 'local_filename=\"//.+\"' instead")
|
|
@@ -435,7 +445,7 @@ class OcrdMets(OcrdXmlDocument):
|
|
|
435
445
|
if self._cache_flag:
|
|
436
446
|
self._file_cache[new] = self._file_cache.pop(old)
|
|
437
447
|
|
|
438
|
-
def remove_file_group(self, USE: str, recursive
|
|
448
|
+
def remove_file_group(self, USE: str, recursive: bool = False, force: bool = False) -> None:
|
|
439
449
|
"""
|
|
440
450
|
Remove a ``mets:fileGrp`` (single fixed ``@USE`` or multiple regex ``@USE``)
|
|
441
451
|
Arguments:
|
|
@@ -479,16 +489,16 @@ class OcrdMets(OcrdXmlDocument):
|
|
|
479
489
|
|
|
480
490
|
if self._cache_flag:
|
|
481
491
|
# Note: Since the files inside the group are removed
|
|
482
|
-
# with the 'remove_one_file' method above,
|
|
492
|
+
# with the 'remove_one_file' method above,
|
|
483
493
|
# we should not take care of that again.
|
|
484
494
|
# We just remove the fileGrp.
|
|
485
495
|
del self._file_cache[el_fileGrp.get('USE')]
|
|
486
496
|
|
|
487
497
|
el_fileGrp.getparent().remove(el_fileGrp)
|
|
488
498
|
|
|
489
|
-
def add_file(self, fileGrp
|
|
490
|
-
ID
|
|
491
|
-
local_filename
|
|
499
|
+
def add_file(self, fileGrp: str, mimetype: Optional[str] = None, url: Optional[str] = None,
|
|
500
|
+
ID: Optional[str] = None, pageId: Optional[str] = None, force: bool = False,
|
|
501
|
+
local_filename: Optional[str] = None, ignore: bool = False, **kwargs) -> OcrdFile:
|
|
492
502
|
"""
|
|
493
503
|
Instantiate and add a new :py:class:`ocrd_models.ocrd_file.OcrdFile`.
|
|
494
504
|
Arguments:
|
|
@@ -499,7 +509,8 @@ class OcrdMets(OcrdXmlDocument):
|
|
|
499
509
|
ID (string): ``@ID`` of the ``mets:file`` to use
|
|
500
510
|
pageId (string): ``@ID`` in the physical ``mets:structMap`` to link to
|
|
501
511
|
force (boolean): Whether to add the file even if a ``mets:file`` with the same ``@ID`` already exists.
|
|
502
|
-
ignore (boolean): Do not look for existing files at all.
|
|
512
|
+
ignore (boolean): Do not look for existing files at all.
|
|
513
|
+
(Shifts responsibility for preventing errors from duplicate ID to the user.)
|
|
503
514
|
local_filename (string):
|
|
504
515
|
"""
|
|
505
516
|
if not ID:
|
|
@@ -541,7 +552,7 @@ class OcrdMets(OcrdXmlDocument):
|
|
|
541
552
|
|
|
542
553
|
return mets_file
|
|
543
554
|
|
|
544
|
-
def remove_file(self, *args, **kwargs) -> Union[List[OcrdFile],OcrdFile]:
|
|
555
|
+
def remove_file(self, *args, **kwargs) -> Union[List[OcrdFile], OcrdFile]:
|
|
545
556
|
"""
|
|
546
557
|
Delete each ``ocrd:file`` matching the query. Same arguments as :py:meth:`find_files`
|
|
547
558
|
"""
|
|
@@ -559,12 +570,14 @@ class OcrdMets(OcrdXmlDocument):
|
|
|
559
570
|
return []
|
|
560
571
|
raise FileNotFoundError("File not found: %s %s" % (args, kwargs))
|
|
561
572
|
|
|
562
|
-
def remove_one_file(self, ID
|
|
573
|
+
def remove_one_file(self, ID: Union[str, OcrdFile], fileGrp: str = None) -> OcrdFile:
|
|
563
574
|
"""
|
|
564
575
|
Delete an existing :py:class:`ocrd_models.ocrd_file.OcrdFile`.
|
|
565
576
|
Arguments:
|
|
566
|
-
ID (string|OcrdFile): ``@ID`` of the ``mets:file`` to delete
|
|
567
|
-
|
|
577
|
+
ID (string|OcrdFile): ``@ID`` of the ``mets:file`` to delete.
|
|
578
|
+
(Can also be an :py:class:`ocrd_models.ocrd_file.OcrdFile` to avoid search via ``ID``.)
|
|
579
|
+
fileGrp (string): ``@USE`` of the ``mets:fileGrp`` containing the ``mets:file``.
|
|
580
|
+
(Used only for optimization.)
|
|
568
581
|
Returns:
|
|
569
582
|
The old :py:class:`ocrd_models.ocrd_file.OcrdFile` reference.
|
|
570
583
|
"""
|
|
@@ -629,8 +642,8 @@ class OcrdMets(OcrdXmlDocument):
|
|
|
629
642
|
'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/@ID',
|
|
630
643
|
namespaces=NS)]
|
|
631
644
|
|
|
632
|
-
def get_physical_pages(self, for_fileIds
|
|
633
|
-
return_divs
|
|
645
|
+
def get_physical_pages(self, for_fileIds: Optional[List[str]] = None, for_pageIds: Optional[str] = None,
|
|
646
|
+
return_divs: bool = False) -> List[Union[str, ET._Element]]:
|
|
634
647
|
"""
|
|
635
648
|
List all page IDs (the ``@ID`` of each physical ``mets:structMap`` ``mets:div``),
|
|
636
649
|
optionally for a subset of ``mets:file`` ``@ID`` :py:attr:`for_fileIds`,
|
|
@@ -718,7 +731,7 @@ class OcrdMets(OcrdXmlDocument):
|
|
|
718
731
|
|
|
719
732
|
if for_fileIds == []:
|
|
720
733
|
return []
|
|
721
|
-
assert for_fileIds
|
|
734
|
+
assert for_fileIds # at this point we know for_fileIds is set, assert to convince pyright
|
|
722
735
|
ret = [None] * len(for_fileIds)
|
|
723
736
|
if self._cache_flag:
|
|
724
737
|
for pageId, fptrdict in self._fptr_cache.items():
|
|
@@ -793,7 +806,6 @@ class OcrdMets(OcrdXmlDocument):
|
|
|
793
806
|
val = struct_cache[attr].setdefault(str(el_div.get(attr.name)), list())
|
|
794
807
|
val.extend(smlink_map.get(el_div.get('ID'), []))
|
|
795
808
|
log.debug("found %d smLink entries for %d logical divs", len(el_smlink_list), len(el_struct_list))
|
|
796
|
-
page_attr_patterns_matched = []
|
|
797
809
|
for page in self._tree.getroot().xpath(
|
|
798
810
|
'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]',
|
|
799
811
|
namespaces=NS):
|
|
@@ -811,7 +823,7 @@ class OcrdMets(OcrdXmlDocument):
|
|
|
811
823
|
METS_STRUCT_DIV_ATTRIBUTE.LABEL]):
|
|
812
824
|
continue
|
|
813
825
|
if cache_keys := [v for v in cache if pat.matches(v)]:
|
|
814
|
-
pat.attr = [attr]
|
|
826
|
+
pat.attr = [attr] # disambiguate next
|
|
815
827
|
if isinstance(attr, METS_PAGE_DIV_ATTRIBUTE):
|
|
816
828
|
ret.append(page)
|
|
817
829
|
log.debug('physical match for %s on page %s', pat, page.get('ID'))
|
|
@@ -835,7 +847,7 @@ class OcrdMets(OcrdXmlDocument):
|
|
|
835
847
|
pat.expr.remove(cache_key)
|
|
836
848
|
if not pat.expr:
|
|
837
849
|
patterns_exhausted.append(pat)
|
|
838
|
-
break
|
|
850
|
+
break # no more attributes for this pattern
|
|
839
851
|
# keep matching in order to exhaust and consume pattern list
|
|
840
852
|
#if page in ret:
|
|
841
853
|
# break # no more patterns for this page
|
|
@@ -847,7 +859,7 @@ class OcrdMets(OcrdXmlDocument):
|
|
|
847
859
|
raise ValueError(f"Patterns {unmatched} match none of the pages")
|
|
848
860
|
|
|
849
861
|
ranges_without_start_match = []
|
|
850
|
-
ranges_without_stop_match = []
|
|
862
|
+
# ranges_without_stop_match = []
|
|
851
863
|
for pat in page_attr_patterns_copy:
|
|
852
864
|
if isinstance(pat, METS_DIV_ATTRIBUTE_RANGE_PATTERN):
|
|
853
865
|
# range expression, expanded to pattern list
|
|
@@ -865,8 +877,8 @@ class OcrdMets(OcrdXmlDocument):
|
|
|
865
877
|
# raise ValueError(f"End of range patterns {ranges_without_stop_match} not matched - invalid range")
|
|
866
878
|
return ret
|
|
867
879
|
|
|
868
|
-
def set_physical_page_for_file(self, pageId
|
|
869
|
-
order
|
|
880
|
+
def set_physical_page_for_file(self, pageId: str, ocrd_file: OcrdFile,
|
|
881
|
+
order: Optional[str] = None, orderlabel: Optional[str] = None) -> None:
|
|
870
882
|
"""
|
|
871
883
|
Set the physical page ID (``@ID`` of the physical ``mets:structMap`` ``mets:div`` entry)
|
|
872
884
|
corresponding to the ``mets:file`` :py:attr:`ocrd_file`, creating all structures if necessary.
|
|
@@ -887,7 +899,10 @@ class OcrdMets(OcrdXmlDocument):
|
|
|
887
899
|
fptrs.append(fptrdict[ocrd_file.ID])
|
|
888
900
|
else:
|
|
889
901
|
fptrs = self._tree.getroot().findall(
|
|
890
|
-
'mets:structMap[@TYPE="PHYSICAL"]/
|
|
902
|
+
'mets:structMap[@TYPE="PHYSICAL"]/'
|
|
903
|
+
'mets:div[@TYPE="physSequence"]/'
|
|
904
|
+
'mets:div[@TYPE="page"]/'
|
|
905
|
+
'mets:fptr[@FILEID="%s"]' %
|
|
891
906
|
ocrd_file.ID, namespaces=NS)
|
|
892
907
|
|
|
893
908
|
for el_fptr in fptrs:
|
|
@@ -923,7 +938,7 @@ class OcrdMets(OcrdXmlDocument):
|
|
|
923
938
|
if self._cache_flag:
|
|
924
939
|
# Create a new entry in the page cache
|
|
925
940
|
self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID][pageId] = el_pagediv
|
|
926
|
-
# Create a new entry in the fptr cache and
|
|
941
|
+
# Create a new entry in the fptr cache and
|
|
927
942
|
# assign an empty dictionary to hold the fileids
|
|
928
943
|
self._fptr_cache.setdefault(pageId, {})
|
|
929
944
|
|
|
@@ -934,7 +949,7 @@ class OcrdMets(OcrdXmlDocument):
|
|
|
934
949
|
# Assign the ocrd fileID to the pageId in the cache
|
|
935
950
|
self._fptr_cache[pageId].update({ocrd_file.ID: el_fptr})
|
|
936
951
|
|
|
937
|
-
def update_physical_page_attributes(self, page_id
|
|
952
|
+
def update_physical_page_attributes(self, page_id: str, **kwargs) -> None:
|
|
938
953
|
invalid_keys = list(k for k in kwargs if k not in METS_PAGE_DIV_ATTRIBUTE.names())
|
|
939
954
|
if invalid_keys:
|
|
940
955
|
raise ValueError(f"Invalid attribute {invalid_keys}. Allowed values: {METS_PAGE_DIV_ATTRIBUTE.names()}")
|
|
@@ -950,7 +965,7 @@ class OcrdMets(OcrdXmlDocument):
|
|
|
950
965
|
else:
|
|
951
966
|
page_div.attrib[k] = v
|
|
952
967
|
|
|
953
|
-
def get_physical_page_for_file(self, ocrd_file
|
|
968
|
+
def get_physical_page_for_file(self, ocrd_file: OcrdFile) -> Optional[str]:
|
|
954
969
|
"""
|
|
955
970
|
Get the physical page ID (``@ID`` of the physical ``mets:structMap`` ``mets:div`` entry)
|
|
956
971
|
corresponding to the ``mets:file`` :py:attr:`ocrd_file`.
|
|
@@ -961,12 +976,15 @@ class OcrdMets(OcrdXmlDocument):
|
|
|
961
976
|
return pageId
|
|
962
977
|
else:
|
|
963
978
|
ret = self._tree.getroot().find(
|
|
964
|
-
'mets:structMap[@TYPE="PHYSICAL"]/
|
|
979
|
+
'mets:structMap[@TYPE="PHYSICAL"]/'
|
|
980
|
+
'mets:div[@TYPE="physSequence"]/'
|
|
981
|
+
'mets:div[@TYPE="page"]/'
|
|
982
|
+
'mets:fptr[@FILEID="%s"]' %
|
|
965
983
|
ocrd_file.ID, namespaces=NS)
|
|
966
984
|
if ret is not None:
|
|
967
985
|
return ret.getparent().get('ID')
|
|
968
986
|
|
|
969
|
-
def remove_physical_page(self, ID
|
|
987
|
+
def remove_physical_page(self, ID: str) -> None:
|
|
970
988
|
"""
|
|
971
989
|
Delete page (physical ``mets:structMap`` ``mets:div`` entry ``@ID``) :py:attr:`ID`.
|
|
972
990
|
"""
|
|
@@ -987,9 +1005,11 @@ class OcrdMets(OcrdXmlDocument):
|
|
|
987
1005
|
del self._page_cache[attr][mets_div_attrib[attr.name]]
|
|
988
1006
|
del self._fptr_cache[ID]
|
|
989
1007
|
|
|
990
|
-
def remove_physical_page_fptr(self, fileId
|
|
1008
|
+
def remove_physical_page_fptr(self, fileId: str) -> List[str]:
|
|
991
1009
|
"""
|
|
992
|
-
Delete all ``mets:fptr[@FILEID = fileId]`` to ``mets:file[@ID == fileId]``
|
|
1010
|
+
Delete all ``mets:fptr[@FILEID = fileId]`` to ``mets:file[@ID == fileId]``
|
|
1011
|
+
for :py:attr:`fileId` from all ``mets:div`` entries in the physical ``mets:structMap``.
|
|
1012
|
+
|
|
993
1013
|
Returns:
|
|
994
1014
|
List of pageIds that mets:fptrs were deleted from
|
|
995
1015
|
"""
|
|
@@ -1006,7 +1026,10 @@ class OcrdMets(OcrdXmlDocument):
|
|
|
1006
1026
|
mets_fptrs.append(fptrdict[fileId])
|
|
1007
1027
|
else:
|
|
1008
1028
|
mets_fptrs = self._tree.getroot().xpath(
|
|
1009
|
-
'mets:structMap[@TYPE="PHYSICAL"]/
|
|
1029
|
+
'mets:structMap[@TYPE="PHYSICAL"]/'
|
|
1030
|
+
'mets:div[@TYPE="physSequence"]/'
|
|
1031
|
+
'mets:div[@TYPE="page"]/'
|
|
1032
|
+
'mets:fptr[@FILEID="%s"]' % fileId,
|
|
1010
1033
|
namespaces=NS)
|
|
1011
1034
|
ret = []
|
|
1012
1035
|
for mets_fptr in mets_fptrs:
|
|
@@ -1029,11 +1052,11 @@ class OcrdMets(OcrdXmlDocument):
|
|
|
1029
1052
|
return {div.get('ID'): (div.get('ORDER', None), div.get('ORDERLABEL', None), div.get('LABEL', None))
|
|
1030
1053
|
for div in divs}
|
|
1031
1054
|
|
|
1032
|
-
def merge(self, other_mets, force
|
|
1033
|
-
fileGrp_mapping
|
|
1034
|
-
fileId_mapping
|
|
1035
|
-
pageId_mapping
|
|
1036
|
-
after_add_cb
|
|
1055
|
+
def merge(self, other_mets, force: bool = False,
|
|
1056
|
+
fileGrp_mapping: Optional[Dict[str, str]] = None,
|
|
1057
|
+
fileId_mapping: Optional[Dict[str, str]] = None,
|
|
1058
|
+
pageId_mapping: Optional[Dict[str, str]] = None,
|
|
1059
|
+
after_add_cb: Optional[Callable[[OcrdFile], Any]] = None, **kwargs) -> None:
|
|
1037
1060
|
"""
|
|
1038
1061
|
Add all files from other_mets.
|
|
1039
1062
|
Accepts the same kwargs as :py:func:`find_files`
|
ocrd_models/ocrd_page.py
CHANGED
|
@@ -179,6 +179,7 @@ parseString.__doc__ = (
|
|
|
179
179
|
"""
|
|
180
180
|
)
|
|
181
181
|
|
|
182
|
+
|
|
182
183
|
class OcrdPage():
|
|
183
184
|
"""
|
|
184
185
|
Proxy object for :py:class:`ocrd_models.PcGtsType` (i.e. PRImA PAGE-XML
|
|
@@ -188,10 +189,10 @@ class OcrdPage():
|
|
|
188
189
|
"""
|
|
189
190
|
def __init__(
|
|
190
191
|
self,
|
|
191
|
-
pcgts
|
|
192
|
-
etree
|
|
193
|
-
mapping
|
|
194
|
-
revmap
|
|
192
|
+
pcgts: PcGtsType,
|
|
193
|
+
etree: ET._Element,
|
|
194
|
+
mapping: Dict[str, ET._Element],
|
|
195
|
+
revmap: Dict[ET._Element, Any],
|
|
195
196
|
):
|
|
196
197
|
self._pcgts = pcgts
|
|
197
198
|
self.etree = etree
|
|
@@ -214,8 +215,10 @@ class OcrdPage():
|
|
|
214
215
|
def __getattr__(self, name):
|
|
215
216
|
return getattr(self._pcgts, name)
|
|
216
217
|
|
|
218
|
+
|
|
217
219
|
OcrdPageType = Union[OcrdPage, PcGtsType]
|
|
218
220
|
|
|
221
|
+
|
|
219
222
|
def to_xml(el, skip_declaration=False) -> str:
|
|
220
223
|
"""
|
|
221
224
|
Serialize ``pc:PcGts`` document as string.
|
|
@@ -229,15 +232,16 @@ def to_xml(el, skip_declaration=False) -> str:
|
|
|
229
232
|
name = 'PcGts'
|
|
230
233
|
sio = StringIO()
|
|
231
234
|
el.export(
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
235
|
+
outfile=sio,
|
|
236
|
+
level=0,
|
|
237
|
+
name_=name,
|
|
238
|
+
namespaceprefix_='pc:',
|
|
239
|
+
namespacedef_='xmlns:pc="%s" ' % NAMESPACES['page'] +
|
|
240
|
+
'xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" ' +
|
|
241
|
+
'xsi:schemaLocation="%s %s/pagecontent.xsd"' % (
|
|
242
|
+
NAMESPACES['page'],
|
|
243
|
+
NAMESPACES['page']
|
|
244
|
+
))
|
|
241
245
|
ret = sio.getvalue()
|
|
242
246
|
if not skip_declaration:
|
|
243
247
|
ret = '<?xml version="1.0" encoding="UTF-8"?>\n' + ret
|
ocrd_models/ocrd_xml_base.py
CHANGED
ocrd_models/report.py
CHANGED
ocrd_models/utils.py
CHANGED
|
@@ -13,6 +13,7 @@ __all__ = [
|
|
|
13
13
|
'extract_mets_from_oai_content'
|
|
14
14
|
]
|
|
15
15
|
|
|
16
|
+
|
|
16
17
|
def xmllint_format(xml):
|
|
17
18
|
"""
|
|
18
19
|
Pretty-print XML like ``xmllint`` does.
|
|
@@ -25,6 +26,7 @@ def xmllint_format(xml):
|
|
|
25
26
|
return ('%s\n%s' % ('<?xml version="1.0" encoding="UTF-8"?>',
|
|
26
27
|
ET.tostring(document, pretty_print=True, encoding='UTF-8').decode('utf-8'))).encode('utf-8')
|
|
27
28
|
|
|
29
|
+
|
|
28
30
|
def handle_oai_response(response):
|
|
29
31
|
"""
|
|
30
32
|
In case of a valid OAI-Response, extract first METS-Entry-Data
|
|
@@ -62,9 +64,8 @@ def extract_mets_from_oai_content(data, preamble='<?xml version="1.0" encoding="
|
|
|
62
64
|
mets_root_el = xml_root.find('.//{%s}mets' % NS['mets'])
|
|
63
65
|
if mets_root_el is not None:
|
|
64
66
|
new_tree = ET.ElementTree(mets_root_el)
|
|
65
|
-
xml_formatted = ET.tostring(
|
|
66
|
-
|
|
67
|
-
encoding='UTF-8').decode('UTF-8')
|
|
67
|
+
xml_formatted = ET.tostring(
|
|
68
|
+
new_tree, pretty_print=True, encoding='UTF-8').decode('UTF-8')
|
|
68
69
|
formatted_content = '{}\n{}'.format(preamble, xml_formatted)
|
|
69
70
|
return formatted_content.encode('UTF-8').replace(b'\n', b'\r\n')
|
|
70
71
|
|
ocrd_models/xpath_functions.py
CHANGED
|
@@ -2,10 +2,12 @@ from ocrd_utils import xywh_from_points
|
|
|
2
2
|
|
|
3
3
|
pc_functions = []
|
|
4
4
|
|
|
5
|
+
|
|
5
6
|
def _export(func):
|
|
6
7
|
pc_functions.append(func)
|
|
7
8
|
return func
|
|
8
9
|
|
|
10
|
+
|
|
9
11
|
@_export
|
|
10
12
|
def pc_pixelarea(nodes):
|
|
11
13
|
"""
|
|
@@ -24,6 +26,7 @@ def pc_pixelarea(nodes):
|
|
|
24
26
|
area += xywh['w'] * xywh['h']
|
|
25
27
|
return area
|
|
26
28
|
|
|
29
|
+
|
|
27
30
|
@_export
|
|
28
31
|
def pc_textequiv(nodes):
|
|
29
32
|
"""
|
|
@@ -48,4 +51,3 @@ def pc_textequiv(nodes):
|
|
|
48
51
|
continue
|
|
49
52
|
text += str(string.text)
|
|
50
53
|
return text
|
|
51
|
-
|
ocrd_network/__init__.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
from .client import Client
|
|
2
|
-
from .constants import
|
|
2
|
+
from .constants import JobState
|
|
3
3
|
from .processing_server import ProcessingServer
|
|
4
4
|
from .processing_worker import ProcessingWorker
|
|
5
|
-
from .processor_server import ProcessorServer
|
|
6
5
|
from .param_validators import DatabaseParamType, ServerAddressParamType, QueueServerParamType
|
|
7
6
|
from .server_cache import CacheLockedPages, CacheProcessingRequests
|
ocrd_network/cli/__init__.py
CHANGED
|
@@ -1,11 +1,9 @@
|
|
|
1
1
|
from .client import client_cli
|
|
2
2
|
from .processing_server import processing_server_cli
|
|
3
3
|
from .processing_worker import processing_worker_cli
|
|
4
|
-
from .processor_server import processor_server_cli
|
|
5
4
|
|
|
6
5
|
__all__ = [
|
|
7
6
|
'client_cli',
|
|
8
7
|
'processing_server_cli',
|
|
9
8
|
'processing_worker_cli',
|
|
10
|
-
'processor_server_cli'
|
|
11
9
|
]
|