ocrd 3.4.0__py3-none-any.whl → 3.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ocrd/cli/__init__.py +6 -0
- ocrd/decorators/ocrd_cli_options.py +1 -1
- ocrd/processor/base.py +21 -13
- ocrd/workspace.py +15 -19
- {ocrd-3.4.0.dist-info → ocrd-3.5.0.dist-info}/METADATA +2 -2
- {ocrd-3.4.0.dist-info → ocrd-3.5.0.dist-info}/RECORD +14 -14
- ocrd_models/constants.py +105 -0
- ocrd_models/ocrd_mets.py +231 -97
- ocrd_network/constants.py +9 -5
- ocrd_utils/os.py +1 -1
- {ocrd-3.4.0.dist-info → ocrd-3.5.0.dist-info}/LICENSE +0 -0
- {ocrd-3.4.0.dist-info → ocrd-3.5.0.dist-info}/WHEEL +0 -0
- {ocrd-3.4.0.dist-info → ocrd-3.5.0.dist-info}/entry_points.txt +0 -0
- {ocrd-3.4.0.dist-info → ocrd-3.5.0.dist-info}/top_level.txt +0 -0
ocrd/cli/__init__.py
CHANGED
|
@@ -67,6 +67,12 @@ Variables:
|
|
|
67
67
|
\b
|
|
68
68
|
{config.describe('OCRD_EXISTING_OUTPUT', wrap_text=False)}
|
|
69
69
|
\b
|
|
70
|
+
{config.describe('OCRD_MAX_MISSING_OUTPUTS')}
|
|
71
|
+
\b
|
|
72
|
+
{config.describe('OCRD_MAX_PARALLEL_PAGES')}
|
|
73
|
+
\b
|
|
74
|
+
{config.describe('OCRD_PROCESSING_PAGE_TIMEOUT')}
|
|
75
|
+
\b
|
|
70
76
|
{config.describe('OCRD_METS_CACHING')}
|
|
71
77
|
\b
|
|
72
78
|
{config.describe('OCRD_MAX_PROCESSOR_CACHE')}
|
|
@@ -56,7 +56,7 @@ def ocrd_cli_options(f):
|
|
|
56
56
|
# subcommands. So we have to work around that by creating a
|
|
57
57
|
# pseudo-subcommand handled in ocrd_cli_wrap_processor
|
|
58
58
|
argument('subcommand', nargs=1, required=False,
|
|
59
|
-
type=click.Choice(
|
|
59
|
+
type=click.Choice(list(map(str, AgentType)))),
|
|
60
60
|
]
|
|
61
61
|
for param in params:
|
|
62
62
|
param(f)
|
ocrd/processor/base.py
CHANGED
|
@@ -29,8 +29,7 @@ from frozendict import frozendict
|
|
|
29
29
|
# this is where the fixes came from:
|
|
30
30
|
from loky import Future, ProcessPoolExecutor
|
|
31
31
|
import multiprocessing as mp
|
|
32
|
-
from
|
|
33
|
-
from _thread import interrupt_main
|
|
32
|
+
from multiprocessing.pool import ThreadPool
|
|
34
33
|
|
|
35
34
|
from click import wrap_text
|
|
36
35
|
from deprecated import deprecated
|
|
@@ -783,11 +782,16 @@ class Processor():
|
|
|
783
782
|
page_id = input_files[input_pos].pageId
|
|
784
783
|
self._base_logger.info("processing page %s", page_id)
|
|
785
784
|
for i, input_file in enumerate(input_files):
|
|
785
|
+
grp = self.input_file_grp.split(',')[i]
|
|
786
786
|
if input_file is None:
|
|
787
|
-
grp = self.input_file_grp.split(',')[i]
|
|
788
787
|
self._base_logger.debug(f"ignoring missing file for input fileGrp {grp} for page {page_id}")
|
|
789
788
|
continue
|
|
790
789
|
assert isinstance(input_file, get_args(OcrdFileType))
|
|
790
|
+
if not input_file.local_filename:
|
|
791
|
+
self._base_logger.error(f'No local file exists for page {page_id} in file group {grp}')
|
|
792
|
+
if config.OCRD_MISSING_INPUT == 'ABORT':
|
|
793
|
+
raise MissingInputFile(grp, page_id, input_file.mimetype)
|
|
794
|
+
continue
|
|
791
795
|
self._base_logger.debug(f"parsing file {input_file.ID} for page {page_id}")
|
|
792
796
|
try:
|
|
793
797
|
page_ = page_from_file(input_file)
|
|
@@ -796,6 +800,9 @@ class Processor():
|
|
|
796
800
|
except ValueError as err:
|
|
797
801
|
# not PAGE and not an image to generate PAGE for
|
|
798
802
|
self._base_logger.error(f"non-PAGE input for page {page_id}: {err}")
|
|
803
|
+
if not any(input_pcgts):
|
|
804
|
+
self._base_logger.warning(f'skipping page {page_id}')
|
|
805
|
+
return
|
|
799
806
|
output_file_id = make_file_id(input_files[input_pos], self.output_file_grp)
|
|
800
807
|
if input_files[input_pos].fileGrp == self.output_file_grp:
|
|
801
808
|
# input=output fileGrp: re-use ID exactly
|
|
@@ -1107,7 +1114,11 @@ class Processor():
|
|
|
1107
1114
|
self._base_logger.critical(f"Could not find any files for selected pageId {self.page_id}.\n"
|
|
1108
1115
|
f"compare '{self.page_id}' with the output of 'orcd workspace list-page'.")
|
|
1109
1116
|
ifts = []
|
|
1110
|
-
|
|
1117
|
+
# use physical page order
|
|
1118
|
+
for page in self.workspace.mets.physical_pages:
|
|
1119
|
+
if page not in pages:
|
|
1120
|
+
continue
|
|
1121
|
+
ifiles = pages[page]
|
|
1111
1122
|
for i, ifg in enumerate(ifgs):
|
|
1112
1123
|
if not ifiles[i]:
|
|
1113
1124
|
# could be from non-unique with on_error=skip or from true gap
|
|
@@ -1150,18 +1161,15 @@ def _page_worker(timeout, *input_files):
|
|
|
1150
1161
|
"""
|
|
1151
1162
|
page_id = next((file.pageId for file in input_files
|
|
1152
1163
|
if hasattr(file, 'pageId')), "")
|
|
1153
|
-
|
|
1154
|
-
timer = Timer(timeout, interrupt_main)
|
|
1155
|
-
timer.start()
|
|
1164
|
+
pool = ThreadPool(processes=1)
|
|
1156
1165
|
try:
|
|
1157
|
-
_page_worker_processor.process_page_file(*input_files)
|
|
1166
|
+
#_page_worker_processor.process_page_file(*input_files)
|
|
1167
|
+
async_result = pool.apply_async(_page_worker_processor.process_page_file, input_files)
|
|
1168
|
+
async_result.get(timeout or None)
|
|
1158
1169
|
_page_worker_processor.logger.debug("page worker completed for page %s", page_id)
|
|
1159
|
-
except
|
|
1170
|
+
except mp.TimeoutError:
|
|
1160
1171
|
_page_worker_processor.logger.debug("page worker timed out for page %s", page_id)
|
|
1161
|
-
raise
|
|
1162
|
-
finally:
|
|
1163
|
-
if timeout > 0:
|
|
1164
|
-
timer.cancel()
|
|
1172
|
+
raise
|
|
1165
1173
|
|
|
1166
1174
|
def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None):
|
|
1167
1175
|
"""Generate a string describing the full CLI of this processor including params.
|
ocrd/workspace.py
CHANGED
|
@@ -777,16 +777,14 @@ class Workspace():
|
|
|
777
777
|
raise Exception('Found no AlternativeImage that satisfies all requirements ' +
|
|
778
778
|
'filename="%s" in page "%s"' % (
|
|
779
779
|
filename, page_id))
|
|
780
|
-
if not all(feature in page_coords['features']
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
'filter="%s" in page "%s"' % (
|
|
789
|
-
feature_filter, page_id))
|
|
780
|
+
if (not all(feature in page_coords['features']
|
|
781
|
+
for feature in feature_selector.split(',') if feature) or
|
|
782
|
+
any(feature in page_coords['features']
|
|
783
|
+
for feature in feature_filter.split(',') if feature)):
|
|
784
|
+
raise Exception('Found no AlternativeImage that satisfies all requirements' +
|
|
785
|
+
' selector="%s"' % feature_selector +
|
|
786
|
+
' filter="%s"' % feature_filter +
|
|
787
|
+
' in page "%s"' % page_id)
|
|
790
788
|
# ensure DPI will be set in image meta-data again
|
|
791
789
|
if 'DPI' in page_coords:
|
|
792
790
|
dpi = page_coords['DPI']
|
|
@@ -1038,16 +1036,14 @@ class Workspace():
|
|
|
1038
1036
|
raise Exception('Found no AlternativeImage that satisfies all requirements ' +
|
|
1039
1037
|
'filename="%s" in segment "%s"' % (
|
|
1040
1038
|
filename, segment.id))
|
|
1041
|
-
if not all(feature in segment_coords['features']
|
|
1042
|
-
|
|
1039
|
+
if (not all(feature in segment_coords['features']
|
|
1040
|
+
for feature in feature_selector.split(',') if feature) or
|
|
1041
|
+
any(feature in segment_coords['features']
|
|
1042
|
+
for feature in feature_filter.split(',') if feature)):
|
|
1043
1043
|
raise Exception('Found no AlternativeImage that satisfies all requirements' +
|
|
1044
|
-
'selector="%s"
|
|
1045
|
-
|
|
1046
|
-
|
|
1047
|
-
for feature in feature_filter.split(',') if feature):
|
|
1048
|
-
raise Exception('Found no AlternativeImage that satisfies all requirements ' +
|
|
1049
|
-
'filter="%s" in segment "%s"' % (
|
|
1050
|
-
feature_filter, segment.id))
|
|
1044
|
+
' selector="%s"' % feature_selector +
|
|
1045
|
+
' filter="%s"' % feature_filter +
|
|
1046
|
+
' in segment "%s"' % segment.id)
|
|
1051
1047
|
# ensure DPI will be set in image meta-data again
|
|
1052
1048
|
if 'DPI' in segment_coords:
|
|
1053
1049
|
dpi = segment_coords['DPI']
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: ocrd
|
|
3
|
-
Version: 3.
|
|
3
|
+
Version: 3.5.0
|
|
4
4
|
Summary: OCR-D framework
|
|
5
5
|
Author-email: Konstantin Baierer <unixprog@gmail.com>
|
|
6
6
|
License: Apache License 2.0
|
|
@@ -21,7 +21,7 @@ Requires-Dist: elementpath
|
|
|
21
21
|
Requires-Dist: fastapi>=0.78.0
|
|
22
22
|
Requires-Dist: filetype
|
|
23
23
|
Requires-Dist: Flask
|
|
24
|
-
Requires-Dist: frozendict>=2.
|
|
24
|
+
Requires-Dist: frozendict>=2.4.0
|
|
25
25
|
Requires-Dist: gdown
|
|
26
26
|
Requires-Dist: httpx>=0.22.0
|
|
27
27
|
Requires-Dist: jsonschema>=4
|
|
@@ -7,10 +7,10 @@ ocrd/resolver.py,sha256=A7BrZlUGrfJye-etaEuT-fdJFgvQcCxWovjufT-WmRY,15119
|
|
|
7
7
|
ocrd/resource_list.yml,sha256=82-PiqkZnka1kTj3MQqNn4wXWKHHtoFchsQuetWuqFs,2633
|
|
8
8
|
ocrd/resource_manager.py,sha256=kIWDoKxWH4IJE1gcoTcCRQjYjieCqiQclyuyF6Y9b8A,16813
|
|
9
9
|
ocrd/task_sequence.py,sha256=spiaUQaMM7M8WdBDoQGmLuTPm7tOugYXD6rcJ2UXzxw,6991
|
|
10
|
-
ocrd/workspace.py,sha256=
|
|
10
|
+
ocrd/workspace.py,sha256=eLuGSJtOh3y2miKgcF8219YH1RkAaEi-qwXHarz8O8k,64916
|
|
11
11
|
ocrd/workspace_backup.py,sha256=iab_JjZ_mMP-G8NIUk4PZmfpNlQuGRoqc3NbTSSew1w,3621
|
|
12
12
|
ocrd/workspace_bagger.py,sha256=yU8H3xR5WmQKvgQewac71ie-DUWcfLnMS01D55zsEHQ,11971
|
|
13
|
-
ocrd/cli/__init__.py,sha256
|
|
13
|
+
ocrd/cli/__init__.py,sha256=LpQb8ne1nzAq2j52lGWDTZlBCmrLwUsz17PTwJkWNcU,2884
|
|
14
14
|
ocrd/cli/bashlib.py,sha256=ypFBM3-IULz_IEBx0Y04eGt9VbQWwEWm4ujm9g_hPWY,6009
|
|
15
15
|
ocrd/cli/log.py,sha256=6_FrVmTKIIVNUaNLkuOJx8pvPhensHMuayJ0PA7T-XA,1562
|
|
16
16
|
ocrd/cli/network.py,sha256=oWBHFEURxfUdb_t-F4svP_ri7o5mqBoNQnLZLbsZLTA,602
|
|
@@ -23,10 +23,10 @@ ocrd/cli/zip.py,sha256=MMJLw3OXWiJVfVtrdJcBkbB8vA1IzSautluazZRuCQ0,5910
|
|
|
23
23
|
ocrd/decorators/__init__.py,sha256=n2Lb1WLXGlvPrhNTSGZYRqugpa__MZSWV546EmQnTtc,7678
|
|
24
24
|
ocrd/decorators/loglevel_option.py,sha256=tgipROEu3t4hkwWvFssd80k2SbTBwBIC4WNE6Gc-XAg,798
|
|
25
25
|
ocrd/decorators/mets_find_options.py,sha256=d4oATKMP6bFQHNqOK6nLqgUiWF2FYdkPvzkTVRMYpKo,635
|
|
26
|
-
ocrd/decorators/ocrd_cli_options.py,sha256=
|
|
26
|
+
ocrd/decorators/ocrd_cli_options.py,sha256=psS7u42mXTOWIXQd9kcrgW7kDnFURHbmZ0946aqBz3A,2659
|
|
27
27
|
ocrd/decorators/parameter_option.py,sha256=n8hYw7XVTd3i3tvpK8F1Jx_CqRp6EGF9qJVH95yj92Q,1076
|
|
28
28
|
ocrd/processor/__init__.py,sha256=39ymNwYRdc-b_OJzzKmWCvo2ga3KdsGSYDHE1Hzkn_w,274
|
|
29
|
-
ocrd/processor/base.py,sha256=
|
|
29
|
+
ocrd/processor/base.py,sha256=_h0V5FevEPLb1q0zGtShuKXRj_tOWhD0M7_ufn34MPc,60476
|
|
30
30
|
ocrd/processor/helpers.py,sha256=WFdC5zeB8F7T0FkpJwfTqWsSPNRtBCBUmFLgixw-rYs,10999
|
|
31
31
|
ocrd/processor/ocrd_page_result.py,sha256=eDkpyVHcpaBzTHXiGrcNk9PP9Xr-XZru2w_uoX_ZeNA,510
|
|
32
32
|
ocrd/processor/builtin/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -36,12 +36,12 @@ ocrd/processor/builtin/dummy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
|
|
|
36
36
|
ocrd/processor/builtin/dummy/ocrd-tool.json,sha256=NgMAXN1AQpGk4Ss73ThDY4QyFPKhj54qcrdeCGwTb10,2339
|
|
37
37
|
ocrd_modelfactory/__init__.py,sha256=NyJT1uSvmeEwibRFOkh0AEoVnYfP0mzxU--pP23B-TQ,4404
|
|
38
38
|
ocrd_models/__init__.py,sha256=A0aj0mOraNb-xfiUueACdoaqISnp0qH-F49nTJg2vCs,380
|
|
39
|
-
ocrd_models/constants.py,sha256=
|
|
39
|
+
ocrd_models/constants.py,sha256=z5XAFMgz3pttMJOHVzTWNZr3ZqMjonVIDmXk3GQTJ30,6954
|
|
40
40
|
ocrd_models/mets-empty.xml,sha256=dFixfbxSXrgjZx9BfdIKWHX-khNmp7dNYaFe2qQSwCY,1203
|
|
41
41
|
ocrd_models/ocrd_agent.py,sha256=E9OtDhz9UfKb6ou2qvsuCL9NlO1V6zMb0s8nVq8dVos,5609
|
|
42
42
|
ocrd_models/ocrd_exif.py,sha256=wRSprHxCy9LCXw41Fi9kp-CbFc5NFX9ZFIFNszB41qk,4585
|
|
43
43
|
ocrd_models/ocrd_file.py,sha256=7lyHezuNnl2FEYV1lV35-QTCrgYAL-3wO2ulFUNq2Ak,9717
|
|
44
|
-
ocrd_models/ocrd_mets.py,sha256=
|
|
44
|
+
ocrd_models/ocrd_mets.py,sha256=FHZnztf1cfWim_sAtTVFXt2ZuQx2HVDTQ1xIobIVIeQ,50540
|
|
45
45
|
ocrd_models/ocrd_page.py,sha256=TTCnvpKGyZx1dqH8LnDiVVVPjU6emWGVLO_4o9rQHtw,6233
|
|
46
46
|
ocrd_models/ocrd_page_generateds.py,sha256=IWoN3V-v3C4JgyPaFh9OQC87ob__wUP1Q6ELBxhLA1w,841794
|
|
47
47
|
ocrd_models/ocrd_xml_base.py,sha256=OW57mXLlwm1nH8CNefvXmwLRws9KL9zSrb-3vH--mX8,1641
|
|
@@ -51,7 +51,7 @@ ocrd_models/xpath_functions.py,sha256=AwR8tHf56-mmIksnw_GeOQ760sxNHqK92T7z9OfsEE
|
|
|
51
51
|
ocrd_network/__init__.py,sha256=gMejC614J5PPGgXDKBiQS0jt-Jx8qOrLbWH7zt8x8Gs,374
|
|
52
52
|
ocrd_network/client.py,sha256=rzqtamZ8krRRy-QTO-AeWH8Lr3HhRiQe2R1-Lovd40g,3020
|
|
53
53
|
ocrd_network/client_utils.py,sha256=VVZMNBgGznh41exZ78S48X3DDwHcWTuOq-LNdxjRvak,5002
|
|
54
|
-
ocrd_network/constants.py,sha256=
|
|
54
|
+
ocrd_network/constants.py,sha256=AAcE6zZQNcNp2oqPD6oIgoVLSs4IHTkg8AS92WCQ6Xo,1968
|
|
55
55
|
ocrd_network/database.py,sha256=fcft7vdRDoR7vmPL1xNYTIeOg5DwRPcggwYDYxLy5ik,10706
|
|
56
56
|
ocrd_network/logging_utils.py,sha256=ijWpM8B943Jx6F0NeK3ggni0198UYjM5NCkYpARLk_E,2472
|
|
57
57
|
ocrd_network/param_validators.py,sha256=Jl1VwiPPKJ50k-xEHLdvW-1QDOkJHCiMz4k9Ipqm-Uc,1489
|
|
@@ -96,7 +96,7 @@ ocrd_utils/image.py,sha256=zNNX1cnRy6yvrxx8mnYQiqWraAh5-i4a1AOfCCg4SmI,24781
|
|
|
96
96
|
ocrd_utils/introspect.py,sha256=gfBlmeEFuRmRUSgdSK0jOxRpYqDRXl2IAE6gv2MZ6as,1977
|
|
97
97
|
ocrd_utils/logging.py,sha256=XYTL7DxUvdX4V56jhAYH6PkhjMFOmaa0kf_XkhSTTe0,7816
|
|
98
98
|
ocrd_utils/ocrd_logging.conf,sha256=JlWmA_5vg6HnjPGjTC4mA5vFHqmnEinwllSTiOw5CCo,3473
|
|
99
|
-
ocrd_utils/os.py,sha256=
|
|
99
|
+
ocrd_utils/os.py,sha256=tMjikpVXJ8sCgYBOrgjgT3vlR2Pok39nSKysYc6mUQ4,9863
|
|
100
100
|
ocrd_utils/str.py,sha256=cRgqYILDGOAqWr0qrCrV52I3y4wvpwDVtnBGEUjXNS4,10116
|
|
101
101
|
ocrd_validators/__init__.py,sha256=ZFc-UqRVBk9o1YesZFmr9lOepttNJ_NKx1Zdb7g_YsU,972
|
|
102
102
|
ocrd_validators/bagit-profile.yml,sha256=sdQJlSi7TOn1E9WYMOZ1shewJ-i_nPaKmsAFkh28TGY,1011
|
|
@@ -120,9 +120,9 @@ ocrd_validators/xlink.xsd,sha256=8fW7YAMWXN2PbB_MMvj9H5ZeFoEBDzuYBtlGC8_6ijw,318
|
|
|
120
120
|
ocrd_validators/xsd_mets_validator.py,sha256=4GWfLyqkmca0x7osDuXuExYuM0HWVrKoqn0S35sFhHU,467
|
|
121
121
|
ocrd_validators/xsd_page_validator.py,sha256=BNz_9u-Ek4UCeyZu3KxSQoolfW9lvuaSR9nIu1XXxeE,467
|
|
122
122
|
ocrd_validators/xsd_validator.py,sha256=6HrVAf6SzCvfUIuQdIzz9bOq4V-zhyii9yrUPoK2Uvo,2094
|
|
123
|
-
ocrd-3.
|
|
124
|
-
ocrd-3.
|
|
125
|
-
ocrd-3.
|
|
126
|
-
ocrd-3.
|
|
127
|
-
ocrd-3.
|
|
128
|
-
ocrd-3.
|
|
123
|
+
ocrd-3.5.0.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
124
|
+
ocrd-3.5.0.dist-info/METADATA,sha256=t6X-RzqcEpiZDXRaRR0kSfU6hgu4fAa5qLpGVD4Lhjs,10442
|
|
125
|
+
ocrd-3.5.0.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
|
|
126
|
+
ocrd-3.5.0.dist-info/entry_points.txt,sha256=4hcJ2LkK_OlIabHnKgFit35Ap7b5Lz1Gb4hzkxV0Kiw,152
|
|
127
|
+
ocrd-3.5.0.dist-info/top_level.txt,sha256=pUgiN42t4KXC5rvpi6V8atza31XP4SCznXpXlVlvomM,75
|
|
128
|
+
ocrd-3.5.0.dist-info/RECORD,,
|
ocrd_models/constants.py
CHANGED
|
@@ -1,7 +1,11 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Constants for ocrd_models.
|
|
3
3
|
"""
|
|
4
|
+
from re import Pattern
|
|
4
5
|
from enum import Enum, auto
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
from abc import ABC, abstractmethod
|
|
8
|
+
from typing import Any, List, Optional, Union
|
|
5
9
|
from ocrd_utils import resource_string
|
|
6
10
|
|
|
7
11
|
__all__ = [
|
|
@@ -28,6 +32,10 @@ __all__ = [
|
|
|
28
32
|
'TAG_PAGE_TEXTEQUIV',
|
|
29
33
|
'TAG_PAGE_TEXTREGION',
|
|
30
34
|
'METS_PAGE_DIV_ATTRIBUTE',
|
|
35
|
+
'METS_STRUCT_DIV_ATTRIBUTE',
|
|
36
|
+
'METS_DIV_ATTRIBUTE_ATOM_PATTERN',
|
|
37
|
+
'METS_DIV_ATTRIBUTE_RANGE_PATTERN',
|
|
38
|
+
'METS_DIV_ATTRIBUTE_REGEX_PATTERN',
|
|
31
39
|
'PAGE_REGION_TYPES',
|
|
32
40
|
'PAGE_ALTIMG_FEATURES',
|
|
33
41
|
]
|
|
@@ -89,6 +97,7 @@ PAGE_ALTIMG_FEATURES = [
|
|
|
89
97
|
|
|
90
98
|
|
|
91
99
|
class METS_PAGE_DIV_ATTRIBUTE(Enum):
|
|
100
|
+
"""page selection attributes of PHYSICAL mets:structMap//mets:div"""
|
|
92
101
|
ID = auto()
|
|
93
102
|
ORDER = auto()
|
|
94
103
|
ORDERLABEL = auto()
|
|
@@ -98,3 +107,99 @@ class METS_PAGE_DIV_ATTRIBUTE(Enum):
|
|
|
98
107
|
@classmethod
|
|
99
108
|
def names(cls):
|
|
100
109
|
return [x.name for x in cls]
|
|
110
|
+
@classmethod
|
|
111
|
+
def type_prefix(cls):
|
|
112
|
+
"""disambiguation prefix to use for all subtypes"""
|
|
113
|
+
return "physical:"
|
|
114
|
+
def prefix(self):
|
|
115
|
+
"""disambiguation prefix to use for this attribute type"""
|
|
116
|
+
return self.type_prefix() + self.name.lower() + ":"
|
|
117
|
+
|
|
118
|
+
class METS_STRUCT_DIV_ATTRIBUTE(Enum):
|
|
119
|
+
"""page selection attributes of LOGICAL mets:structMap//mets:div"""
|
|
120
|
+
ID = auto()
|
|
121
|
+
DMDID = auto()
|
|
122
|
+
TYPE = auto()
|
|
123
|
+
LABEL = auto()
|
|
124
|
+
|
|
125
|
+
@classmethod
|
|
126
|
+
def names(cls):
|
|
127
|
+
return [x.name for x in cls]
|
|
128
|
+
@classmethod
|
|
129
|
+
def type_prefix(cls):
|
|
130
|
+
"""disambiguation prefix to use for all subtypes"""
|
|
131
|
+
return "logical:"
|
|
132
|
+
def prefix(self):
|
|
133
|
+
"""disambiguation prefix to use for this attribute type"""
|
|
134
|
+
return self.type_prefix() + self.name.lower() + ":"
|
|
135
|
+
|
|
136
|
+
@dataclass
|
|
137
|
+
class METS_DIV_ATTRIBUTE_PATTERN(ABC):
|
|
138
|
+
"""page selection pattern (abstract supertype)"""
|
|
139
|
+
|
|
140
|
+
expr: Any
|
|
141
|
+
"""pattern value to match a mets:div against"""
|
|
142
|
+
attr: List[Union[METS_PAGE_DIV_ATTRIBUTE, METS_STRUCT_DIV_ATTRIBUTE]] = field(
|
|
143
|
+
default_factory=lambda: list(METS_PAGE_DIV_ATTRIBUTE) + list(METS_STRUCT_DIV_ATTRIBUTE))
|
|
144
|
+
"""attribute type(s) to match a mets:div for
|
|
145
|
+
(pre-disambiguated with prefix syntax, or filled upon first match)
|
|
146
|
+
"""
|
|
147
|
+
has_matched: bool = field(init=False, default=False)
|
|
148
|
+
"""whether this pattern has already been matched"""
|
|
149
|
+
|
|
150
|
+
def attr_prefix(self):
|
|
151
|
+
"""attribute type disambiguation prefix corresponding to the current state of disambiguation"""
|
|
152
|
+
if self.attr == list(METS_PAGE_DIV_ATTRIBUTE) + list(METS_STRUCT_DIV_ATTRIBUTE):
|
|
153
|
+
return ""
|
|
154
|
+
if self.attr == list(METS_PAGE_DIV_ATTRIBUTE):
|
|
155
|
+
return METS_PAGE_DIV_ATTRIBUTE.type_prefix()
|
|
156
|
+
if self.attr == list(METS_STRUCT_DIV_ATTRIBUTE):
|
|
157
|
+
return METS_STRUCT_DIV_ATTRIBUTE.type_prefix()
|
|
158
|
+
assert len(self.attr) == 1, "unexpected type ambiguity: %s" % repr(self.attr)
|
|
159
|
+
return self.attr[0].prefix()
|
|
160
|
+
|
|
161
|
+
@abstractmethod
|
|
162
|
+
def _matches(self, input) -> bool:
|
|
163
|
+
return
|
|
164
|
+
def matches(self, input) -> bool:
|
|
165
|
+
"""does the selection pattern match on the given attribute value?"""
|
|
166
|
+
if (matched := self._matches(input)):
|
|
167
|
+
self.has_matched = True
|
|
168
|
+
return matched
|
|
169
|
+
|
|
170
|
+
@dataclass
|
|
171
|
+
class METS_DIV_ATTRIBUTE_ATOM_PATTERN(METS_DIV_ATTRIBUTE_PATTERN):
|
|
172
|
+
"""page selection pattern for literal (single value) matching"""
|
|
173
|
+
|
|
174
|
+
expr: str
|
|
175
|
+
def __repr__(self):
|
|
176
|
+
return "%s%s" % (self.attr_prefix(), self.expr)
|
|
177
|
+
def _matches(self, input):
|
|
178
|
+
return input == self.expr
|
|
179
|
+
|
|
180
|
+
@dataclass
|
|
181
|
+
class METS_DIV_ATTRIBUTE_RANGE_PATTERN(METS_DIV_ATTRIBUTE_PATTERN):
|
|
182
|
+
"""page selection pattern for interval (list expansion) matching"""
|
|
183
|
+
|
|
184
|
+
expr: List[str]
|
|
185
|
+
start: str = field(init=False)
|
|
186
|
+
"""first value of the range after expansion, before matching-exhausting"""
|
|
187
|
+
stop: str = field(init=False)
|
|
188
|
+
"""last value of the range after expansion, before matching-exhausting"""
|
|
189
|
+
def __post_init__(self):
|
|
190
|
+
self.start = self.expr[0]
|
|
191
|
+
self.stop = self.expr[-1]
|
|
192
|
+
def __repr__(self):
|
|
193
|
+
return "%s%s..%s" % (self.attr_prefix(), self.start, self.stop)
|
|
194
|
+
def _matches(self, input):
|
|
195
|
+
return input in self.expr
|
|
196
|
+
|
|
197
|
+
@dataclass
|
|
198
|
+
class METS_DIV_ATTRIBUTE_REGEX_PATTERN(METS_DIV_ATTRIBUTE_PATTERN):
|
|
199
|
+
"""page selection pattern for regular expression matching"""
|
|
200
|
+
|
|
201
|
+
expr: Pattern
|
|
202
|
+
def __repr__(self):
|
|
203
|
+
return "%s//%s" % (self.attr_prefix(), self.expr.pattern)
|
|
204
|
+
def _matches(self, input):
|
|
205
|
+
return bool(self.expr.fullmatch(input))
|
ocrd_models/ocrd_mets.py
CHANGED
|
@@ -29,7 +29,12 @@ from .constants import (
|
|
|
29
29
|
IDENTIFIER_PRIORITY,
|
|
30
30
|
TAG_MODS_IDENTIFIER,
|
|
31
31
|
METS_XML_EMPTY,
|
|
32
|
-
METS_PAGE_DIV_ATTRIBUTE
|
|
32
|
+
METS_PAGE_DIV_ATTRIBUTE,
|
|
33
|
+
METS_STRUCT_DIV_ATTRIBUTE,
|
|
34
|
+
METS_DIV_ATTRIBUTE_PATTERN,
|
|
35
|
+
METS_DIV_ATTRIBUTE_ATOM_PATTERN,
|
|
36
|
+
METS_DIV_ATTRIBUTE_RANGE_PATTERN,
|
|
37
|
+
METS_DIV_ATTRIBUTE_REGEX_PATTERN,
|
|
33
38
|
)
|
|
34
39
|
|
|
35
40
|
from .ocrd_xml_base import OcrdXmlDocument, ET # type: ignore
|
|
@@ -43,9 +48,11 @@ class OcrdMets(OcrdXmlDocument):
|
|
|
43
48
|
API to a single METS file
|
|
44
49
|
"""
|
|
45
50
|
_cache_flag : bool
|
|
46
|
-
# Cache for the pages (mets:div)
|
|
47
|
-
# The dictionary's
|
|
48
|
-
# The dictionary's
|
|
51
|
+
# Cache for the physical pages (mets:div) - two nested dictionaries
|
|
52
|
+
# The outer dictionary's key: attribute type
|
|
53
|
+
# The outer dictionary's value: inner dictionary
|
|
54
|
+
# The inner dictionary's key: attribute value (str)
|
|
55
|
+
# The inner dictionary's value: a 'div' object at some memory location
|
|
49
56
|
_page_cache : Dict[METS_PAGE_DIV_ATTRIBUTE, Dict[str, ET._Element]]
|
|
50
57
|
# Cache for the files (mets:file) - two nested dictionaries
|
|
51
58
|
# The outer dictionary's Key: 'fileGrp.USE'
|
|
@@ -59,6 +66,12 @@ class OcrdMets(OcrdXmlDocument):
|
|
|
59
66
|
# The inner dictionary's Key: 'fptr.FILEID'
|
|
60
67
|
# The inner dictionary's Value: a 'fptr' object at some memory location
|
|
61
68
|
_fptr_cache : Dict[str, Dict[str, ET._Element]]
|
|
69
|
+
# Cache for the logical structural divs (mets:div) - two nested dictionaries
|
|
70
|
+
# The outer dictionary's key: attribute type
|
|
71
|
+
# The outer dictionary's value: inner dictionary
|
|
72
|
+
# The inner dictionary's key: attribute value (str)
|
|
73
|
+
# The inner dictionary's value: a list of corresponding physical div.ID
|
|
74
|
+
_struct_cache : Dict[METS_STRUCT_DIV_ATTRIBUTE, Dict[str, List[str]]]
|
|
62
75
|
|
|
63
76
|
@staticmethod
|
|
64
77
|
def empty_mets(now : Optional[str] = None, cache_flag : bool = False):
|
|
@@ -111,7 +124,6 @@ class OcrdMets(OcrdXmlDocument):
|
|
|
111
124
|
return
|
|
112
125
|
|
|
113
126
|
log = getLogger('ocrd.models.ocrd_mets._fill_caches-files')
|
|
114
|
-
|
|
115
127
|
for el_fileGrp in el_fileSec.findall('mets:fileGrp', NS):
|
|
116
128
|
fileGrp_use = el_fileGrp.get('USE')
|
|
117
129
|
|
|
@@ -124,10 +136,10 @@ class OcrdMets(OcrdXmlDocument):
|
|
|
124
136
|
# log.info("File added to the cache: %s" % file_id)
|
|
125
137
|
|
|
126
138
|
# Fill with pages
|
|
139
|
+
log = getLogger('ocrd.models.ocrd_mets._fill_caches-pages')
|
|
127
140
|
el_div_list = tree_root.findall(".//mets:div[@TYPE='page']", NS)
|
|
128
141
|
if len(el_div_list) == 0:
|
|
129
142
|
return
|
|
130
|
-
log = getLogger('ocrd.models.ocrd_mets._fill_caches-pages')
|
|
131
143
|
|
|
132
144
|
for el_div in el_div_list:
|
|
133
145
|
div_id = el_div.get('ID')
|
|
@@ -148,11 +160,30 @@ class OcrdMets(OcrdXmlDocument):
|
|
|
148
160
|
# log.info("Len of page_cache: %s" % len(self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID]))
|
|
149
161
|
# log.info("Len of fptr_cache: %s" % len(self._fptr_cache))
|
|
150
162
|
|
|
163
|
+
# Fill with logical divs
|
|
164
|
+
log = getLogger('ocrd.models.ocrd_mets._fill_caches-structs')
|
|
165
|
+
el_struct_list = tree_root.findall("mets:structMap[@TYPE='LOGICAL']//mets:div", NS)
|
|
166
|
+
el_smlink_list = tree_root.findall("mets:structLink/mets:smLink", NS)
|
|
167
|
+
if len(el_struct_list) == 0 or len(el_smlink_list) == 0:
|
|
168
|
+
return
|
|
169
|
+
smlink_map = {}
|
|
170
|
+
for link in el_smlink_list:
|
|
171
|
+
link_log = link.get('{%s}from' % NS['xlink'])
|
|
172
|
+
link_phy = link.get('{%s}to' % NS['xlink'])
|
|
173
|
+
smlink_map.setdefault(link_log, list()).append(link_phy)
|
|
174
|
+
for el_div in el_struct_list:
|
|
175
|
+
for attr in METS_STRUCT_DIV_ATTRIBUTE:
|
|
176
|
+
val = self._struct_cache[attr].setdefault(str(el_div.get(attr.name)), list())
|
|
177
|
+
val.extend(smlink_map.get(el_div.get('ID'), []))
|
|
178
|
+
|
|
179
|
+
# log.info("Len of struct_cache: %s" % len(self._struct_cache[METS_STRUCT_DIV_ATTRIBUTE.ID]))
|
|
180
|
+
|
|
151
181
|
def _initialize_caches(self) -> None:
|
|
152
182
|
self._file_cache = {}
|
|
153
183
|
# NOTE we can only guarantee uniqueness for @ID and @ORDER
|
|
154
184
|
self._page_cache = {k : {} for k in METS_PAGE_DIV_ATTRIBUTE}
|
|
155
185
|
self._fptr_cache = {}
|
|
186
|
+
self._struct_cache = {k : {} for k in METS_STRUCT_DIV_ATTRIBUTE}
|
|
156
187
|
|
|
157
188
|
def _refresh_caches(self) -> None:
|
|
158
189
|
if self._cache_flag:
|
|
@@ -253,12 +284,20 @@ class OcrdMets(OcrdXmlDocument):
|
|
|
253
284
|
:py:attr:`url` and :py:attr:`mimetype` parameters can each be either a
|
|
254
285
|
literal string, or a regular expression if the string starts with
|
|
255
286
|
``//`` (double slash).
|
|
287
|
+
|
|
256
288
|
If it is a regex, the leading ``//`` is removed and candidates are matched
|
|
257
289
|
against the regex with `re.fullmatch`. If it is a literal string, comparison
|
|
258
290
|
is done with string equality.
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
291
|
+
|
|
292
|
+
The :py:attr:`pageId` parameter also supports comma-separated lists, as well
|
|
293
|
+
as the numeric range operator ``..`` and the negation operator ``~``.
|
|
294
|
+
|
|
295
|
+
For example, to find all files in pages ``PHYS_0001`` to ``PHYS_0003``, the
|
|
296
|
+
both expressions ``PHYS_0001..PHYS_0003`` and ``PHYS_0001,PHYS_0002,PHYS_0003``
|
|
297
|
+
will be expanded to the same 3 pages. To find all files above that subrange,
|
|
298
|
+
both expressions ``~PHYS_0001..PHYS_0003`` and ``~PHYS_0001,~PHYS_0002,~PHYS_0003``
|
|
299
|
+
will be expanded to ``PHYS_0004`` and upwards.
|
|
300
|
+
|
|
262
301
|
Keyword Args:
|
|
263
302
|
ID (string) : ``@ID`` of the ``mets:file``
|
|
264
303
|
fileGrp (string) : ``@USE`` of the ``mets:fileGrp`` to list files of
|
|
@@ -609,101 +648,73 @@ class OcrdMets(OcrdXmlDocument):
|
|
|
609
648
|
|
|
610
649
|
return self.physical_pages
|
|
611
650
|
|
|
612
|
-
|
|
651
|
+
log = getLogger('ocrd.models.ocrd_mets.get_physical_pages')
|
|
613
652
|
if for_pageIds is not None:
|
|
614
|
-
ret = []
|
|
615
653
|
page_attr_patterns = []
|
|
616
|
-
|
|
617
|
-
for pageId_token in
|
|
654
|
+
page_attr_antipatterns = []
|
|
655
|
+
for pageId_token in re.split(r',', for_pageIds):
|
|
656
|
+
pageId_token_raw = pageId_token
|
|
657
|
+
# prefix for disambiguation of attribute?
|
|
658
|
+
attr = list(METS_PAGE_DIV_ATTRIBUTE) + list(METS_STRUCT_DIV_ATTRIBUTE)
|
|
659
|
+
for attr_type in [METS_STRUCT_DIV_ATTRIBUTE, METS_PAGE_DIV_ATTRIBUTE]:
|
|
660
|
+
if pageId_token.startswith(attr_type.type_prefix()):
|
|
661
|
+
for attr_val in list(attr_type):
|
|
662
|
+
if pageId_token.startswith(attr_val.prefix()):
|
|
663
|
+
# disambiguated to e.g. "logical:label:"
|
|
664
|
+
attr = [attr_val]
|
|
665
|
+
pageId_token = pageId_token[len(attr_val.prefix()):]
|
|
666
|
+
break
|
|
667
|
+
if len(attr) > 1:
|
|
668
|
+
# just "logical:" or "physical:"
|
|
669
|
+
attr = list(attr_type)
|
|
670
|
+
pageId_token = pageId_token[len(attr_type.type_prefix()):]
|
|
671
|
+
break
|
|
672
|
+
if not pageId_token:
|
|
673
|
+
raise ValueError("invalid pageId syntax '%s': empty after type prefix" % pageId_token_raw)
|
|
674
|
+
# negation prefix
|
|
675
|
+
if pageId_token.startswith('~'):
|
|
676
|
+
page_attr_xpatterns = page_attr_antipatterns
|
|
677
|
+
pageId_token = pageId_token[1:]
|
|
678
|
+
else:
|
|
679
|
+
page_attr_xpatterns = page_attr_patterns
|
|
680
|
+
if not pageId_token:
|
|
681
|
+
raise ValueError("invalid pageId syntax '%s': empty after negator prefix" % pageId_token_raw)
|
|
682
|
+
# operator prefix
|
|
618
683
|
if pageId_token.startswith(REGEX_PREFIX):
|
|
619
|
-
|
|
684
|
+
pageId_token = pageId_token[REGEX_PREFIX_LEN:]
|
|
685
|
+
if not pageId_token:
|
|
686
|
+
raise ValueError("invalid pageId syntax '%s': empty after regex prefix" % pageId_token_raw)
|
|
687
|
+
val_expr = re.compile(pageId_token)
|
|
688
|
+
page_attr_xpatterns.append(
|
|
689
|
+
METS_DIV_ATTRIBUTE_REGEX_PATTERN(val_expr, attr))
|
|
620
690
|
elif '..' in pageId_token:
|
|
621
|
-
|
|
622
|
-
|
|
691
|
+
try:
|
|
692
|
+
val_range = generate_range(*pageId_token.split('..', 1))
|
|
693
|
+
except ValueError as e:
|
|
694
|
+
raise ValueError("invalid pageId syntax '%s': %s" % (pageId_token_raw, str(e))) from None
|
|
695
|
+
page_attr_xpatterns.append(
|
|
696
|
+
METS_DIV_ATTRIBUTE_RANGE_PATTERN(val_range, attr))
|
|
623
697
|
else:
|
|
624
|
-
|
|
625
|
-
|
|
698
|
+
if not pageId_token:
|
|
699
|
+
raise ValueError("invalid pageId syntax '%s': empty" % pageId_token_raw)
|
|
700
|
+
page_attr_xpatterns.append(
|
|
701
|
+
METS_DIV_ATTRIBUTE_ATOM_PATTERN(pageId_token, attr))
|
|
702
|
+
log.debug("parsed pattern '%s' to %s", pageId_token_raw, page_attr_xpatterns[-1])
|
|
703
|
+
if not page_attr_patterns and not page_attr_antipatterns:
|
|
626
704
|
return []
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
if self._cache_flag:
|
|
630
|
-
for pat in page_attr_patterns:
|
|
631
|
-
try:
|
|
632
|
-
attr : METS_PAGE_DIV_ATTRIBUTE
|
|
633
|
-
if isinstance(pat, str):
|
|
634
|
-
attr = next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) if pat in self._page_cache[a])
|
|
635
|
-
cache_keys = [pat]
|
|
636
|
-
elif isinstance(pat, list):
|
|
637
|
-
attr = next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) if any(x in self._page_cache[a] for x in pat))
|
|
638
|
-
cache_keys = [v for v in pat if v in self._page_cache[attr]]
|
|
639
|
-
for k in cache_keys:
|
|
640
|
-
pat.remove(k)
|
|
641
|
-
elif isinstance(pat, tuple):
|
|
642
|
-
_, re_pat = pat
|
|
643
|
-
attr = next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) for v in self._page_cache[a] if re_pat.fullmatch(v))
|
|
644
|
-
cache_keys = [v for v in self._page_cache[attr] if re_pat.fullmatch(v)]
|
|
645
|
-
else:
|
|
646
|
-
raise ValueError
|
|
647
|
-
if return_divs:
|
|
648
|
-
ret += [self._page_cache[attr][v] for v in cache_keys]
|
|
649
|
-
else:
|
|
650
|
-
ret += [self._page_cache[attr][v].get('ID') for v in cache_keys]
|
|
651
|
-
except StopIteration:
|
|
652
|
-
raise ValueError(f"{pat} matches none of the keys of any of the _page_caches.")
|
|
705
|
+
if page_attr_patterns:
|
|
706
|
+
divs = self.get_physical_page_patterns(page_attr_patterns)
|
|
653
707
|
else:
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
patterns_exhausted.append(pat)
|
|
665
|
-
elif isinstance(pat, list):
|
|
666
|
-
if not isinstance(pat[0], METS_PAGE_DIV_ATTRIBUTE):
|
|
667
|
-
pat.insert(0, next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) if any(x == page.get(a.name) for x in pat)))
|
|
668
|
-
attr_val = page.get(pat[0].name)
|
|
669
|
-
if attr_val in pat:
|
|
670
|
-
pat.remove(attr_val)
|
|
671
|
-
ret.append(page if return_divs else page.get('ID'))
|
|
672
|
-
if len(pat) == 1:
|
|
673
|
-
patterns_exhausted.append(pat)
|
|
674
|
-
elif isinstance(pat, tuple):
|
|
675
|
-
attr, re_pat = pat
|
|
676
|
-
if not attr:
|
|
677
|
-
attr = next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) if re_pat.fullmatch(page.get(a.name) or ''))
|
|
678
|
-
page_attr_patterns[pat_idx] = (attr, re_pat)
|
|
679
|
-
if re_pat.fullmatch(page.get(attr.name) or ''):
|
|
680
|
-
ret.append(page if return_divs else page.get('ID'))
|
|
681
|
-
else:
|
|
682
|
-
raise ValueError
|
|
683
|
-
page_attr_patterns_matched.append(pat)
|
|
684
|
-
except StopIteration:
|
|
685
|
-
continue
|
|
686
|
-
for p in patterns_exhausted:
|
|
687
|
-
page_attr_patterns.remove(p)
|
|
688
|
-
unmatched = [x for x in page_attr_patterns_copy if x not in page_attr_patterns_matched]
|
|
689
|
-
if unmatched:
|
|
690
|
-
raise ValueError(f"Patterns {unmatched} match none of the pages")
|
|
691
|
-
|
|
692
|
-
ranges_without_start_match = []
|
|
693
|
-
ranges_without_last_match = []
|
|
694
|
-
for idx, pat in enumerate(page_attr_patterns_copy):
|
|
695
|
-
if isinstance(pat, list):
|
|
696
|
-
start, last = range_patterns_first_last[idx]
|
|
697
|
-
if start in pat:
|
|
698
|
-
print(pat, start, last)
|
|
699
|
-
ranges_without_start_match.append(page_attr_patterns_raw[idx])
|
|
700
|
-
# if last in pat:
|
|
701
|
-
# ranges_without_last_match.append(page_attr_patterns_raw[idx])
|
|
702
|
-
if ranges_without_start_match:
|
|
703
|
-
raise ValueError(f"Start of range patterns {ranges_without_start_match} not matched - invalid range")
|
|
704
|
-
# if ranges_without_last_match:
|
|
705
|
-
# raise ValueError(f"End of range patterns {ranges_without_last_match} not matched - invalid range")
|
|
706
|
-
return ret
|
|
708
|
+
all_pages = [METS_DIV_ATTRIBUTE_REGEX_PATTERN(
|
|
709
|
+
re.compile(".*"), [METS_PAGE_DIV_ATTRIBUTE.ID])]
|
|
710
|
+
divs = self.get_physical_page_patterns(all_pages)
|
|
711
|
+
if page_attr_antipatterns:
|
|
712
|
+
antidivs = self.get_physical_page_patterns(page_attr_antipatterns)
|
|
713
|
+
divs = [div for div in divs if div not in antidivs]
|
|
714
|
+
if return_divs:
|
|
715
|
+
return divs
|
|
716
|
+
else:
|
|
717
|
+
return [div.get('ID') for div in divs]
|
|
707
718
|
|
|
708
719
|
if for_fileIds == []:
|
|
709
720
|
return []
|
|
@@ -731,6 +742,129 @@ class OcrdMets(OcrdXmlDocument):
|
|
|
731
742
|
ret[index] = page.get('ID')
|
|
732
743
|
return ret
|
|
733
744
|
|
|
745
|
+
def get_physical_page_patterns(self, page_attr_patterns: List[METS_DIV_ATTRIBUTE_PATTERN]) -> List[ET._Element]:
    """
    Resolve page selection patterns to physical page ``mets:div`` elements.

    Every pattern carries a list of candidate attributes (``pat.attr``), which
    are members of either ``METS_PAGE_DIV_ATTRIBUTE`` (matched against the
    physical page divs) or ``METS_STRUCT_DIV_ATTRIBUTE`` (matched against the
    logical divs and mapped to pages via ``mets:smLink``). For each pattern,
    the first attribute that yields a match wins.

    With caching enabled (``self._cache_flag``), lookups go through
    ``self._page_cache`` / ``self._struct_cache`` and results are collected
    pattern by pattern. Without caching, the logical structMap and the
    structLink section are indexed on the fly, and the physical pages are
    visited in document order. A page is appended once per pattern that
    matches it.

    Note that the patterns are consumed while matching: matched items are
    removed from the ``expr`` list of range patterns, and in uncached mode
    ``pat.attr`` is narrowed to the matching attribute and exhausted patterns
    are removed from ``page_attr_patterns`` itself.

    Arguments:
        page_attr_patterns (list): patterns to match (modified in place)
    Returns:
        list of matching page elements
    Raises:
        ValueError: if a pattern matches nothing, or if the start of a
            range pattern has not been matched
    """
    log = getLogger('ocrd.models.ocrd_mets.get_physical_pages')
    ret = []
    # shallow copy: survives the removal of exhausted patterns below,
    # but still references the very same pattern objects
    page_attr_patterns_copy = list(page_attr_patterns)
    if self._cache_flag:
        for pat in page_attr_patterns:
            # reset for every pattern, so that the check after the loop never
            # sees an unbound name or the result of the previous pattern
            # (when pat.attr is empty or all attributes get skipped)
            cache_keys = []
            for attr in pat.attr:
                if isinstance(attr, METS_PAGE_DIV_ATTRIBUTE):
                    cache = self._page_cache[attr]
                else:
                    cache = self._struct_cache[attr]
                if (isinstance(pat, METS_DIV_ATTRIBUTE_RANGE_PATTERN) and
                    # @TYPE makes no sense in range expressions
                    # @LABEL makes no sense in range expressions
                    attr in [METS_STRUCT_DIV_ATTRIBUTE.TYPE,
                             METS_STRUCT_DIV_ATTRIBUTE.LABEL]):
                    continue
                if cache_keys := [v for v in cache if pat.matches(v)]:
                    if isinstance(attr, METS_PAGE_DIV_ATTRIBUTE):
                        # page cache: attribute value -> page element
                        ret += [cache[v] for v in cache_keys]
                        log.debug('physical matches for %s: %s', pat, str(cache_keys))
                    else:
                        # struct cache: attribute value -> page IDs,
                        # to be resolved via the page cache for @ID
                        for v in cache_keys:
                            ret += [self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID][p]
                                    for p in cache[v]]
                        log.debug('logical matches for %s: %s', pat, str(cache_keys))
                    if isinstance(pat, METS_DIV_ATTRIBUTE_RANGE_PATTERN):
                        # remove matches for final range check
                        for v in cache_keys:
                            pat.expr.remove(v)
                    break  # no more attributes for this pattern
            if not cache_keys:
                raise ValueError(f"{pat} matches none of the keys of any of the _page_caches and _struct_caches.")
    else:
        # cache logical structmap:
        el_struct_list = self._tree.getroot().findall("mets:structMap[@TYPE='LOGICAL']//mets:div", NS)
        el_smlink_list = self._tree.getroot().findall("mets:structLink/mets:smLink", NS)
        # logical div ID (xlink:from) -> physical div IDs (xlink:to)
        smlink_map = {}
        for link in el_smlink_list:
            link_log = link.get('{%s}from' % NS['xlink'])
            link_phy = link.get('{%s}to' % NS['xlink'])
            smlink_map.setdefault(link_log, list()).append(link_phy)
        # logical attribute -> attribute value -> linked physical div IDs
        struct_cache = {k: {} for k in METS_STRUCT_DIV_ATTRIBUTE}
        for el_div in el_struct_list:
            for attr in METS_STRUCT_DIV_ATTRIBUTE:
                if not el_div.get(attr.name):
                    # avoid mapping None indiscriminately
                    continue
                val = struct_cache[attr].setdefault(str(el_div.get(attr.name)), list())
                val.extend(smlink_map.get(el_div.get('ID'), []))
        log.debug("found %d smLink entries for %d logical divs", len(el_smlink_list), len(el_struct_list))
        for page in self._tree.getroot().xpath(
                'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]',
                namespaces=NS):
            patterns_exhausted = []
            for pat in page_attr_patterns:
                for attr in pat.attr:
                    if isinstance(attr, METS_PAGE_DIV_ATTRIBUTE):
                        # single candidate: this page's own attribute value
                        cache = [page.get(attr.name) or '']
                    else:
                        cache = struct_cache[attr]
                    if (isinstance(pat, METS_DIV_ATTRIBUTE_RANGE_PATTERN) and
                        # @TYPE makes no sense in range expressions
                        # @LABEL makes no sense in range expressions
                        attr in [METS_STRUCT_DIV_ATTRIBUTE.TYPE,
                                 METS_STRUCT_DIV_ATTRIBUTE.LABEL]):
                        continue
                    if cache_keys := [v for v in cache if pat.matches(v)]:
                        pat.attr = [attr]  # disambiguate next
                        if isinstance(attr, METS_PAGE_DIV_ATTRIBUTE):
                            ret.append(page)
                            log.debug('physical match for %s on page %s', pat, page.get('ID'))
                            if isinstance(pat, METS_DIV_ATTRIBUTE_ATOM_PATTERN):
                                patterns_exhausted.append(pat)
                            elif isinstance(pat, METS_DIV_ATTRIBUTE_RANGE_PATTERN):
                                # remove for efficiency and final range check
                                pat.expr.remove(cache_keys[0])
                                if not pat.expr:
                                    patterns_exhausted.append(pat)
                        elif cache_key := next((v for v in cache_keys
                                                if page.get('ID') in cache[v]), None):
                            ret.append(page)
                            log.debug('logical match for %s on page %s', pat, page.get('ID'))
                            cache[cache_key].remove(page.get('ID'))
                            # remove for efficiency and final range check
                            if not cache[cache_key]:
                                if isinstance(pat, METS_DIV_ATTRIBUTE_ATOM_PATTERN):
                                    patterns_exhausted.append(pat)
                                elif isinstance(pat, METS_DIV_ATTRIBUTE_RANGE_PATTERN):
                                    pat.expr.remove(cache_key)
                                    if not pat.expr:
                                        patterns_exhausted.append(pat)
                        break  # no more attributes for this pattern
                # keep matching in order to exhaust and consume pattern list
                # (so deliberately no early exit once the page is in ret)
            for p in patterns_exhausted:
                page_attr_patterns.remove(p)
        # NOTE(review): has_matched is presumably set by pat.matches() - confirm
        unmatched = [pat for pat in page_attr_patterns_copy
                     if not pat.has_matched]
        if unmatched:
            raise ValueError(f"Patterns {unmatched} match none of the pages")

    ranges_without_start_match = []
    for pat in page_attr_patterns_copy:
        if isinstance(pat, METS_DIV_ATTRIBUTE_RANGE_PATTERN):
            # range expression, expanded to pattern list
            # list items get consumed (pat.expr.remove) when matched,
            # exhausted patterns also get consumed (page_attr_patterns.remove)
            # (but top-level list copy references the same list objects)
            if pat.start in pat.expr:
                log.debug((pat, pat.expr))
                ranges_without_start_match.append(pat)
            # NOTE: only the start of a range is validated; the analogous
            # check for an unmatched end (pat.stop in pat.expr) is disabled
    if ranges_without_start_match:
        raise ValueError(f"Start of range patterns {ranges_without_start_match} not matched - invalid range")
    return ret
|
|
867
|
+
|
|
734
868
|
def set_physical_page_for_file(self, pageId : str, ocrd_file : OcrdFile,
|
|
735
869
|
order : Optional[str] = None, orderlabel : Optional[str] = None) -> None:
|
|
736
870
|
"""
|
ocrd_network/constants.py
CHANGED
|
@@ -11,12 +11,16 @@ OCRD_ALL_TOOL_JSON = "ocrd-all-tool.json"
|
|
|
11
11
|
SERVER_ALL_PAGES_PLACEHOLDER = "all_pages"
|
|
12
12
|
|
|
13
13
|
|
|
14
|
-
class
|
|
14
|
+
class StrEnum(str, Enum):
    """
    Base class for string-valued enumerations that print as their value.

    Members are ``str`` instances (via the mixin), but the default
    ``Enum.__str__`` would render them as ``ClassName.MEMBER``. Overriding
    it makes ``str(member)`` yield the plain value instead.
    """

    def __str__(self) -> str:
        # the member's value, not the qualified member name
        return self.value
|
|
17
|
+
|
|
18
|
+
class AgentType(StrEnum):
|
|
15
19
|
PROCESSING_WORKER = "worker"
|
|
16
20
|
PROCESSOR_SERVER = "server"
|
|
17
21
|
|
|
18
22
|
|
|
19
|
-
class DeployType(
|
|
23
|
+
class DeployType(StrEnum):
|
|
20
24
|
# Deployed by the Processing Server config file
|
|
21
25
|
DOCKER = "docker"
|
|
22
26
|
NATIVE = "native"
|
|
@@ -26,7 +30,7 @@ class DeployType(str, Enum):
|
|
|
26
30
|
|
|
27
31
|
|
|
28
32
|
# TODO: Make the states uppercase
|
|
29
|
-
class JobState(
|
|
33
|
+
class JobState(StrEnum):
|
|
30
34
|
# The processing job is cached inside the Processing Server requests cache
|
|
31
35
|
cached = "CACHED"
|
|
32
36
|
# The processing job was cancelled due to failed dependencies
|
|
@@ -43,7 +47,7 @@ class JobState(str, Enum):
|
|
|
43
47
|
unset = "UNSET"
|
|
44
48
|
|
|
45
49
|
|
|
46
|
-
class NetworkLoggingDirs(
|
|
50
|
+
class NetworkLoggingDirs(StrEnum):
|
|
47
51
|
METS_SERVERS = "mets_servers"
|
|
48
52
|
PROCESSING_JOBS = "processing_jobs"
|
|
49
53
|
PROCESSING_SERVERS = "processing_servers"
|
|
@@ -51,7 +55,7 @@ class NetworkLoggingDirs(str, Enum):
|
|
|
51
55
|
PROCESSOR_SERVERS = "processor_servers"
|
|
52
56
|
|
|
53
57
|
|
|
54
|
-
class ServerApiTags(
|
|
58
|
+
class ServerApiTags(StrEnum):
|
|
55
59
|
ADMIN = "admin"
|
|
56
60
|
DISCOVERY = "discovery"
|
|
57
61
|
PROCESSING = "processing"
|
ocrd_utils/os.py
CHANGED
|
@@ -254,7 +254,7 @@ def guess_media_type(input_file : str, fallback : str = None, application_xml :
|
|
|
254
254
|
if mimetype is None:
|
|
255
255
|
mimetype = EXT_TO_MIME.get(''.join(Path(input_file).suffixes), fallback)
|
|
256
256
|
if mimetype is None:
|
|
257
|
-
raise ValueError("Could not determine MIME type of input_file
|
|
257
|
+
raise ValueError(f"Could not determine MIME type of {input_file}")
|
|
258
258
|
if mimetype == 'application/xml':
|
|
259
259
|
mimetype = application_xml
|
|
260
260
|
return mimetype
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|