ocrd 3.0.0a2__py3-none-any.whl → 3.0.0b2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ocrd/cli/__init__.py +34 -26
- ocrd/cli/bashlib.py +32 -18
- ocrd/cli/ocrd_tool.py +7 -5
- ocrd/cli/workspace.py +10 -8
- ocrd/decorators/__init__.py +13 -7
- ocrd/decorators/ocrd_cli_options.py +1 -1
- ocrd/lib.bash +3 -0
- ocrd/mets_server.py +3 -4
- ocrd/processor/__init__.py +1 -1
- ocrd/processor/base.py +421 -98
- ocrd/processor/builtin/dummy_processor.py +4 -11
- ocrd/processor/helpers.py +24 -161
- ocrd/processor/ocrd_page_result.py +3 -3
- ocrd/resolver.py +0 -3
- ocrd/resource_manager.py +9 -5
- ocrd/workspace.py +10 -11
- ocrd/workspace_backup.py +1 -1
- {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/METADATA +32 -10
- {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/RECORD +49 -48
- {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/WHEEL +1 -1
- ocrd_modelfactory/__init__.py +1 -1
- ocrd_models/constants.py +0 -1
- ocrd_models/ocrd_exif.py +2 -2
- ocrd_models/ocrd_file.py +2 -2
- ocrd_models/ocrd_mets.py +22 -22
- ocrd_models/ocrd_page.py +0 -1
- ocrd_models/ocrd_xml_base.py +2 -2
- ocrd_network/cli/client.py +134 -30
- ocrd_network/client.py +53 -27
- ocrd_network/client_utils.py +101 -0
- ocrd_network/processing_server.py +1 -1
- ocrd_network/runtime_data/deployer.py +12 -3
- ocrd_network/server_utils.py +12 -10
- ocrd_utils/__init__.py +2 -0
- ocrd_utils/config.py +31 -2
- ocrd_utils/image.py +25 -25
- ocrd_utils/logging.py +20 -20
- ocrd_utils/os.py +4 -5
- ocrd_utils/str.py +10 -3
- ocrd_validators/json_validator.py +1 -3
- ocrd_validators/ocrd_tool_validator.py +2 -2
- ocrd_validators/page_validator.py +56 -56
- ocrd_validators/parameter_validator.py +2 -2
- ocrd_validators/resource_list_validator.py +4 -3
- ocrd_validators/workspace_validator.py +21 -21
- ocrd_validators/xsd_validator.py +1 -1
- {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/LICENSE +0 -0
- {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/entry_points.txt +0 -0
- {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/top_level.txt +0 -0
ocrd/processor/base.py
CHANGED
|
@@ -9,6 +9,7 @@ __all__ = [
|
|
|
9
9
|
'run_processor'
|
|
10
10
|
]
|
|
11
11
|
|
|
12
|
+
from functools import cached_property
|
|
12
13
|
from os.path import exists, join
|
|
13
14
|
from shutil import copyfileobj
|
|
14
15
|
import json
|
|
@@ -20,36 +21,48 @@ import sys
|
|
|
20
21
|
import inspect
|
|
21
22
|
import tarfile
|
|
22
23
|
import io
|
|
23
|
-
|
|
24
|
+
import weakref
|
|
25
|
+
from frozendict import frozendict
|
|
26
|
+
from concurrent.futures import ThreadPoolExecutor, TimeoutError
|
|
27
|
+
|
|
28
|
+
from click import wrap_text
|
|
24
29
|
from deprecated import deprecated
|
|
25
30
|
from requests import HTTPError
|
|
26
31
|
|
|
27
|
-
from
|
|
32
|
+
from ..workspace import Workspace
|
|
33
|
+
from ..mets_server import ClientSideOcrdMets
|
|
28
34
|
from ocrd_models.ocrd_file import OcrdFileType
|
|
29
|
-
from
|
|
35
|
+
from .ocrd_page_result import OcrdPageResult
|
|
30
36
|
from ocrd_utils import (
|
|
31
37
|
VERSION as OCRD_VERSION,
|
|
32
38
|
MIMETYPE_PAGE,
|
|
33
39
|
MIME_TO_EXT,
|
|
34
40
|
config,
|
|
35
41
|
getLogger,
|
|
36
|
-
initLogging,
|
|
37
42
|
list_resource_candidates,
|
|
38
43
|
pushd_popd,
|
|
39
44
|
list_all_resources,
|
|
40
45
|
get_processor_resource_types,
|
|
41
46
|
resource_filename,
|
|
42
|
-
|
|
47
|
+
parse_json_file_with_comments,
|
|
43
48
|
make_file_id,
|
|
44
49
|
deprecation_warning
|
|
45
50
|
)
|
|
46
51
|
from ocrd_validators import ParameterValidator
|
|
47
|
-
from ocrd_models.ocrd_page import
|
|
52
|
+
from ocrd_models.ocrd_page import (
|
|
53
|
+
PageType,
|
|
54
|
+
AlternativeImageType,
|
|
55
|
+
MetadataItemType,
|
|
56
|
+
LabelType,
|
|
57
|
+
LabelsType,
|
|
58
|
+
OcrdPage,
|
|
59
|
+
to_xml,
|
|
60
|
+
)
|
|
48
61
|
from ocrd_modelfactory import page_from_file
|
|
49
62
|
from ocrd_validators.ocrd_tool_validator import OcrdToolValidator
|
|
50
63
|
|
|
51
64
|
# XXX imports must remain for backwards-compatibility
|
|
52
|
-
from .helpers import run_cli, run_processor
|
|
65
|
+
from .helpers import run_cli, run_processor # pylint: disable=unused-import
|
|
53
66
|
|
|
54
67
|
|
|
55
68
|
class ResourceNotFoundError(FileNotFoundError):
|
|
@@ -94,54 +107,163 @@ class MissingInputFile(ValueError):
|
|
|
94
107
|
|
|
95
108
|
class Processor():
|
|
96
109
|
"""
|
|
97
|
-
A processor is a tool that implements the uniform OCR-D
|
|
98
|
-
for run-time data processing.
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
110
|
+
A processor is a tool that implements the uniform OCR-D
|
|
111
|
+
`command-line interface for run-time data processing <https://ocr-d.de/en/spec/cli>`_.
|
|
112
|
+
|
|
113
|
+
That is, it executes a single workflow step, or a combination of workflow steps,
|
|
114
|
+
on the workspace (represented by local METS). It reads input files for all or selected
|
|
115
|
+
physical pages of the input fileGrp(s), computes additional annotation, and writes output
|
|
116
|
+
files for them into the output fileGrp(s). It may take a number of optional or mandatory
|
|
117
|
+
parameters.
|
|
118
|
+
"""
|
|
119
|
+
|
|
120
|
+
max_instances : int = -1
|
|
121
|
+
"""
|
|
122
|
+
maximum number of cached instances (ignored if negative), to be applied on top of
|
|
123
|
+
:py:data:`~ocrd_utils.config.OCRD_MAX_PROCESSOR_CACHE` (i.e. whatever is smaller).
|
|
124
|
+
|
|
125
|
+
(Override this if you know how many instances fit into memory - GPU / CPU RAM - at once.)
|
|
126
|
+
"""
|
|
127
|
+
|
|
128
|
+
max_workers : int = -1
|
|
129
|
+
"""
|
|
130
|
+
maximum number of processor threads for page-parallel processing (ignored if negative),
|
|
131
|
+
to be applied on top of :py:data:`~ocrd_utils.config.OCRD_MAX_PARALLEL_PAGES` (i.e.
|
|
132
|
+
whatever is smaller).
|
|
133
|
+
|
|
134
|
+
(Override this if you know how many pages fit into processing units - GPU shaders / CPU cores
|
|
135
|
+
- at once, or if your class is not thread-safe.)
|
|
136
|
+
"""
|
|
137
|
+
|
|
138
|
+
max_page_seconds : int = -1
|
|
139
|
+
"""
|
|
140
|
+
maximum number of seconds that may be spent processing a single page (ignored if negative),
|
|
141
|
+
to be applied on top of :py:data:`~ocrd_utils.config.OCRD_PROCESSING_PAGE_TIMEOUT`
|
|
142
|
+
(i.e. whatever is smaller).
|
|
143
|
+
|
|
144
|
+
(Override this if you know how costly this processor may be, irrespective of image size
|
|
145
|
+
or complexity of the page.)
|
|
103
146
|
"""
|
|
104
147
|
|
|
105
148
|
@property
|
|
149
|
+
def metadata_filename(self) -> str:
|
|
150
|
+
"""
|
|
151
|
+
Relative location of the ``ocrd-tool.json`` file inside the package.
|
|
152
|
+
|
|
153
|
+
Used by :py:data:`metadata_location`.
|
|
154
|
+
|
|
155
|
+
(Override if ``ocrd-tool.json`` is not in the root of the module,
|
|
156
|
+
e.g. ``namespace/ocrd-tool.json`` or ``data/ocrd-tool.json``).
|
|
157
|
+
"""
|
|
158
|
+
return 'ocrd-tool.json'
|
|
159
|
+
|
|
160
|
+
@cached_property
|
|
161
|
+
def metadata_location(self) -> Path:
|
|
162
|
+
"""
|
|
163
|
+
Absolute path of the ``ocrd-tool.json`` file as distributed with the package.
|
|
164
|
+
|
|
165
|
+
Used by :py:data:`metadata_rawdict`.
|
|
166
|
+
|
|
167
|
+
(Override if ``ocrd-tool.json`` is not distributed with the Python package.)
|
|
168
|
+
"""
|
|
169
|
+
# XXX HACK
|
|
170
|
+
module_tokens = self.__module__.split('.')
|
|
171
|
+
if module_tokens[0] == 'src':
|
|
172
|
+
module_tokens.pop(0)
|
|
173
|
+
return resource_filename(module_tokens[0], self.metadata_filename)
|
|
174
|
+
|
|
175
|
+
@cached_property
|
|
176
|
+
def metadata_rawdict(self) -> dict:
|
|
177
|
+
"""
|
|
178
|
+
Raw (unvalidated, unexpanded) ``ocrd-tool.json`` dict contents of the package.
|
|
179
|
+
|
|
180
|
+
Used by :py:data:`metadata`.
|
|
181
|
+
|
|
182
|
+
(Override if ``ocrd-tool.json`` is not in a file.)
|
|
183
|
+
"""
|
|
184
|
+
return parse_json_file_with_comments(self.metadata_location)
|
|
185
|
+
|
|
186
|
+
@cached_property
|
|
106
187
|
def metadata(self) -> dict:
|
|
107
|
-
"""
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
188
|
+
"""
|
|
189
|
+
The ``ocrd-tool.json`` dict contents of the package, according to the OCR-D
|
|
190
|
+
`spec <https://ocr-d.de/en/spec/ocrd_tool>`_ for processor tools.
|
|
191
|
+
|
|
192
|
+
After deserialisation, it also gets validated against the
|
|
193
|
+
`schema <https://ocr-d.de/en/spec/ocrd_tool#definition>`_ with all defaults
|
|
194
|
+
expanded.
|
|
195
|
+
|
|
196
|
+
Used by :py:data:`ocrd_tool` and :py:data:`version`.
|
|
197
|
+
|
|
198
|
+
(Override if you want to provide metadata programmatically instead of a
|
|
199
|
+
JSON file.)
|
|
200
|
+
"""
|
|
201
|
+
metadata = self.metadata_rawdict
|
|
202
|
+
report = OcrdToolValidator.validate(metadata)
|
|
112
203
|
if not report.is_valid:
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
return self._metadata
|
|
204
|
+
self.logger.error(f"The ocrd-tool.json of this processor is {'problematic' if not report.errors else 'invalid'}:\n"
|
|
205
|
+
f"{report.to_xml()}.\nPlease open an issue at {metadata.get('git_url', 'the website')}.")
|
|
206
|
+
return metadata
|
|
117
207
|
|
|
118
|
-
@
|
|
208
|
+
@cached_property
|
|
119
209
|
def version(self) -> str:
|
|
120
|
-
"""
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
self._version = self.metadata['version']
|
|
124
|
-
return self._version
|
|
210
|
+
"""
|
|
211
|
+
The program version of the package.
|
|
212
|
+
Usually the ``version`` part of :py:data:`metadata`.
|
|
125
213
|
|
|
126
|
-
|
|
214
|
+
(Override if you do not want to use :py:data:`metadata` lookup
|
|
215
|
+
mechanism.)
|
|
216
|
+
"""
|
|
217
|
+
return self.metadata['version']
|
|
218
|
+
|
|
219
|
+
@cached_property
|
|
127
220
|
def executable(self) -> str:
|
|
128
|
-
"""
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
self._executable = os.path.basename(inspect.stack()[-1].filename)
|
|
132
|
-
return self._executable
|
|
221
|
+
"""
|
|
222
|
+
The executable name of this processor tool. Taken from the runtime
|
|
223
|
+
filename.
|
|
133
224
|
|
|
134
|
-
|
|
225
|
+
Used by :py:data:`ocrd_tool` for lookup in :py:data:`metadata`.
|
|
226
|
+
|
|
227
|
+
(Override if your entry-point name deviates from the ``executable``
|
|
228
|
+
name, or the processor gets instantiated from another runtime.)
|
|
229
|
+
"""
|
|
230
|
+
return os.path.basename(inspect.stack()[-1].filename)
|
|
231
|
+
|
|
232
|
+
@cached_property
|
|
135
233
|
def ocrd_tool(self) -> dict:
|
|
136
|
-
"""
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
234
|
+
"""
|
|
235
|
+
The ``ocrd-tool.json`` dict contents of this processor tool.
|
|
236
|
+
Usually the :py:data:`executable` key of the ``tools`` part
|
|
237
|
+
of :py:data:`metadata`.
|
|
238
|
+
|
|
239
|
+
(Override if you do not want to use :py:data:`metadata` lookup
|
|
240
|
+
mechanism.)
|
|
241
|
+
"""
|
|
242
|
+
return self.metadata['tools'][self.executable]
|
|
243
|
+
|
|
244
|
+
@property
|
|
245
|
+
def parameter(self) -> Optional[dict]:
|
|
246
|
+
"""the runtime parameter dict to be used by this processor"""
|
|
247
|
+
if hasattr(self, '_parameter'):
|
|
248
|
+
return self._parameter
|
|
249
|
+
return None
|
|
250
|
+
|
|
251
|
+
@parameter.setter
|
|
252
|
+
def parameter(self, parameter : dict) -> None:
|
|
253
|
+
if self.parameter is not None:
|
|
254
|
+
self.shutdown()
|
|
255
|
+
parameterValidator = ParameterValidator(self.ocrd_tool)
|
|
256
|
+
report = parameterValidator.validate(parameter)
|
|
257
|
+
if not report.is_valid:
|
|
258
|
+
raise ValueError(f'Invalid parameters:\n{report.to_xml()}')
|
|
259
|
+
# make parameter dict read-only
|
|
260
|
+
self._parameter = frozendict(parameter)
|
|
261
|
+
# (re-)run setup to load models etc
|
|
262
|
+
self.setup()
|
|
141
263
|
|
|
142
264
|
def __init__(
|
|
143
265
|
self,
|
|
144
|
-
# FIXME:
|
|
266
|
+
# FIXME: remove in favor of process_workspace(workspace)
|
|
145
267
|
workspace : Optional[Workspace],
|
|
146
268
|
ocrd_tool=None,
|
|
147
269
|
parameter=None,
|
|
@@ -179,12 +301,12 @@ class Processor():
|
|
|
179
301
|
if ocrd_tool is not None:
|
|
180
302
|
deprecation_warning("Passing 'ocrd_tool' as keyword argument to Processor is deprecated - "
|
|
181
303
|
"use or override metadata/executable/ocrd-tool properties instead")
|
|
182
|
-
self.
|
|
183
|
-
self.
|
|
304
|
+
self.ocrd_tool = ocrd_tool
|
|
305
|
+
self.executable = ocrd_tool['executable']
|
|
184
306
|
if version is not None:
|
|
185
307
|
deprecation_warning("Passing 'version' as keyword argument to Processor is deprecated - "
|
|
186
308
|
"use or override metadata/version properties instead")
|
|
187
|
-
self.
|
|
309
|
+
self.version = version
|
|
188
310
|
if workspace is not None:
|
|
189
311
|
deprecation_warning("Passing a workspace argument other than 'None' to Processor "
|
|
190
312
|
"is deprecated - pass as argument to process_workspace instead")
|
|
@@ -204,19 +326,14 @@ class Processor():
|
|
|
204
326
|
"is deprecated - pass as argument to process_workspace instead")
|
|
205
327
|
self.page_id = page_id or None
|
|
206
328
|
self.download = download_files
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
parameterValidator = ParameterValidator(self.ocrd_tool)
|
|
210
|
-
|
|
211
|
-
report = parameterValidator.validate(parameter)
|
|
212
|
-
if not report.is_valid:
|
|
213
|
-
raise ValueError("Invalid parameters %s" % report.errors)
|
|
214
|
-
self.parameter = parameter
|
|
215
|
-
# NOTE: this is the logger to be used by processor implementations,
|
|
216
|
-
# `processor.base` default implementations should use
|
|
217
|
-
# :py:attr:`self._base_logger`
|
|
329
|
+
#: The logger to be used by processor implementations.
|
|
330
|
+
# `ocrd.processor.base` internals should use :py:attr:`self._base_logger`
|
|
218
331
|
self.logger = getLogger(f'ocrd.processor.{self.__class__.__name__}')
|
|
219
332
|
self._base_logger = getLogger('ocrd.processor.base')
|
|
333
|
+
if parameter is not None:
|
|
334
|
+
self.parameter = parameter
|
|
335
|
+
# ensure that shutdown gets called at destruction
|
|
336
|
+
self._finalizer = weakref.finalize(self, self.shutdown)
|
|
220
337
|
# workaround for deprecated#72 (@deprecated decorator does not work for subclasses):
|
|
221
338
|
setattr(self, 'process',
|
|
222
339
|
deprecated(version='3.0', reason='process() should be replaced with process_page() and process_workspace()')(getattr(self, 'process')))
|
|
@@ -254,14 +371,10 @@ class Processor():
|
|
|
254
371
|
assert len(grps) >= minimum, msg % (len(grps), str(spec))
|
|
255
372
|
if maximum > 0:
|
|
256
373
|
assert len(grps) <= maximum, msg % (len(grps), str(spec))
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
"Unexpected number of input file groups %d vs %s")
|
|
262
|
-
if 'output_file_grp_cardinality' in self.ocrd_tool:
|
|
263
|
-
assert_file_grp_cardinality(output_file_grps, self.ocrd_tool['output_file_grp_cardinality'],
|
|
264
|
-
"Unexpected number of output file groups %d vs %s")
|
|
374
|
+
assert_file_grp_cardinality(input_file_grps, self.ocrd_tool['input_file_grp_cardinality'],
|
|
375
|
+
"Unexpected number of input file groups %d vs %s")
|
|
376
|
+
assert_file_grp_cardinality(output_file_grps, self.ocrd_tool['output_file_grp_cardinality'],
|
|
377
|
+
"Unexpected number of output file groups %d vs %s")
|
|
265
378
|
for input_file_grp in input_file_grps:
|
|
266
379
|
assert input_file_grp in self.workspace.mets.file_groups
|
|
267
380
|
# keep this for backwards compatibility:
|
|
@@ -272,14 +385,12 @@ class Processor():
|
|
|
272
385
|
Print :py:attr:`ocrd_tool` on stdout.
|
|
273
386
|
"""
|
|
274
387
|
print(json.dumps(self.ocrd_tool, indent=True))
|
|
275
|
-
return
|
|
276
388
|
|
|
277
389
|
def dump_module_dir(self):
|
|
278
390
|
"""
|
|
279
391
|
Print :py:attr:`moduledir` on stdout.
|
|
280
392
|
"""
|
|
281
393
|
print(self.moduledir)
|
|
282
|
-
return
|
|
283
394
|
|
|
284
395
|
def list_resources(self):
|
|
285
396
|
"""
|
|
@@ -287,7 +398,6 @@ class Processor():
|
|
|
287
398
|
"""
|
|
288
399
|
for res in self.list_all_resources():
|
|
289
400
|
print(res)
|
|
290
|
-
return
|
|
291
401
|
|
|
292
402
|
def setup(self) -> None:
|
|
293
403
|
"""
|
|
@@ -299,6 +409,16 @@ class Processor():
|
|
|
299
409
|
"""
|
|
300
410
|
pass
|
|
301
411
|
|
|
412
|
+
def shutdown(self) -> None:
|
|
413
|
+
"""
|
|
414
|
+
Bring down the processor after data processing,
|
|
415
|
+
after changing back from the workspace directory but
|
|
416
|
+
before exiting (or setting up with different parameters).
|
|
417
|
+
|
|
418
|
+
(Override this to unload models from memory etc.)
|
|
419
|
+
"""
|
|
420
|
+
pass
|
|
421
|
+
|
|
302
422
|
@deprecated(version='3.0', reason='process() should be replaced with process_page_pcgts() or process_page_file() or process_workspace()')
|
|
303
423
|
def process(self) -> None:
|
|
304
424
|
"""
|
|
@@ -330,7 +450,29 @@ class Processor():
|
|
|
330
450
|
self.workspace = workspace
|
|
331
451
|
self.verify()
|
|
332
452
|
try:
|
|
333
|
-
|
|
453
|
+
nr_succeeded = 0
|
|
454
|
+
nr_skipped = 0
|
|
455
|
+
nr_copied = 0
|
|
456
|
+
|
|
457
|
+
# set up multithreading
|
|
458
|
+
if self.max_workers <= 0:
|
|
459
|
+
max_workers = max(0, config.OCRD_MAX_PARALLEL_PAGES)
|
|
460
|
+
else:
|
|
461
|
+
max_workers = max(0, min(config.OCRD_MAX_PARALLEL_PAGES, self.max_workers))
|
|
462
|
+
if max_workers > 1:
|
|
463
|
+
assert isinstance(workspace.mets, ClientSideOcrdMets), \
|
|
464
|
+
"OCRD_MAX_PARALLEL_PAGES>1 requires also using --mets-server-url"
|
|
465
|
+
if self.max_page_seconds <= 0:
|
|
466
|
+
max_seconds = max(0, config.OCRD_PROCESSING_PAGE_TIMEOUT)
|
|
467
|
+
else:
|
|
468
|
+
max_seconds = max(0, min(config.OCRD_PROCESSING_PAGE_TIMEOUT, self.max_page_seconds))
|
|
469
|
+
executor = ThreadPoolExecutor(
|
|
470
|
+
max_workers=max_workers or 1,
|
|
471
|
+
thread_name_prefix=f"pagetask.{workspace.mets.unique_identifier}"
|
|
472
|
+
)
|
|
473
|
+
self._base_logger.debug("started executor %s", str(executor))
|
|
474
|
+
tasks = {}
|
|
475
|
+
|
|
334
476
|
for input_file_tuple in self.zip_input_files(on_error='abort', require_first=False):
|
|
335
477
|
input_files : List[Optional[OcrdFileType]] = [None] * len(input_file_tuple)
|
|
336
478
|
page_id = next(input_file.pageId
|
|
@@ -349,35 +491,55 @@ class Processor():
|
|
|
349
491
|
except (ValueError, FileNotFoundError, HTTPError) as e:
|
|
350
492
|
self._base_logger.error(repr(e))
|
|
351
493
|
self._base_logger.warning(f"failed downloading file {input_file} for page {page_id}")
|
|
494
|
+
# process page
|
|
495
|
+
tasks[executor.submit(self.process_page_file, *input_files)] = (page_id, input_files)
|
|
496
|
+
self._base_logger.debug("submitted %d processing tasks", len(tasks))
|
|
497
|
+
|
|
498
|
+
for task in tasks:
|
|
499
|
+
# wait for results, handle errors
|
|
500
|
+
page_id, input_files = tasks[task]
|
|
352
501
|
# FIXME: differentiate error cases in various ways:
|
|
353
502
|
# - ResourceNotFoundError → use ResourceManager to download (once), then retry
|
|
354
503
|
# - transient (I/O or OOM) error → maybe sleep, retry
|
|
355
504
|
# - persistent (data) error → skip / dummy / raise
|
|
356
505
|
try:
|
|
357
|
-
self.
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
506
|
+
self._base_logger.debug("waiting for output of task %s (page %s) max_seconds=%d", task, page_id, max_seconds)
|
|
507
|
+
task.result(timeout=max_seconds or None)
|
|
508
|
+
nr_succeeded += 1
|
|
509
|
+
# exclude NotImplementedError, so we can try process() below
|
|
510
|
+
except NotImplementedError:
|
|
511
|
+
raise
|
|
512
|
+
# handle input failures separately
|
|
513
|
+
except FileExistsError as err:
|
|
514
|
+
if config.OCRD_EXISTING_OUTPUT == 'ABORT':
|
|
361
515
|
raise err
|
|
362
|
-
if
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
self._base_logger.exception(f"Failure on page {page_id}: {err}")
|
|
516
|
+
if config.OCRD_EXISTING_OUTPUT == 'SKIP':
|
|
517
|
+
continue
|
|
518
|
+
if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE':
|
|
519
|
+
# too late here, must not happen
|
|
520
|
+
raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE")
|
|
521
|
+
# broad coverage of output failures (including TimeoutError)
|
|
522
|
+
except (Exception, TimeoutError) as err:
|
|
523
|
+
# FIXME: add re-usable/actionable logging
|
|
524
|
+
self._base_logger.error(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}")
|
|
372
525
|
if config.OCRD_MISSING_OUTPUT == 'ABORT':
|
|
373
526
|
raise err
|
|
374
527
|
if config.OCRD_MISSING_OUTPUT == 'SKIP':
|
|
528
|
+
nr_skipped += 1
|
|
375
529
|
continue
|
|
376
530
|
if config.OCRD_MISSING_OUTPUT == 'COPY':
|
|
377
531
|
self._copy_page_file(input_files[0])
|
|
532
|
+
nr_copied += 1
|
|
378
533
|
else:
|
|
379
534
|
desc = config.describe('OCRD_MISSING_OUTPUT', wrap_text=False, indent_text=False)
|
|
380
535
|
raise ValueError(f"unknown configuration value {config.OCRD_MISSING_OUTPUT} - {desc}")
|
|
536
|
+
|
|
537
|
+
if nr_skipped > 0 and nr_succeeded / nr_skipped < config.OCRD_MAX_MISSING_OUTPUTS:
|
|
538
|
+
raise Exception(f"too many failures with skipped output ({nr_skipped})")
|
|
539
|
+
if nr_copied > 0 and nr_succeeded / nr_copied < config.OCRD_MAX_MISSING_OUTPUTS:
|
|
540
|
+
raise Exception(f"too many failures with fallback output ({nr_skipped})")
|
|
541
|
+
executor.shutdown()
|
|
542
|
+
|
|
381
543
|
except NotImplementedError:
|
|
382
544
|
# fall back to deprecated method
|
|
383
545
|
self.process()
|
|
@@ -401,13 +563,14 @@ class Processor():
|
|
|
401
563
|
output_file_id = make_file_id(input_file, self.output_file_grp)
|
|
402
564
|
input_pcgts.set_pcGtsId(output_file_id)
|
|
403
565
|
self.add_metadata(input_pcgts)
|
|
404
|
-
self.workspace.add_file(
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
566
|
+
self.workspace.add_file(
|
|
567
|
+
file_id=output_file_id,
|
|
568
|
+
file_grp=self.output_file_grp,
|
|
569
|
+
page_id=input_file.pageId,
|
|
570
|
+
local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'),
|
|
571
|
+
mimetype=MIMETYPE_PAGE,
|
|
572
|
+
content=to_xml(input_pcgts),
|
|
573
|
+
force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
|
|
411
574
|
)
|
|
412
575
|
|
|
413
576
|
def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None:
|
|
@@ -439,7 +602,18 @@ class Processor():
|
|
|
439
602
|
for image_result in result.images:
|
|
440
603
|
image_file_id = f'{output_file_id}_{image_result.file_id_suffix}'
|
|
441
604
|
image_file_path = join(self.output_file_grp, f'{image_file_id}.png')
|
|
442
|
-
image_result.alternative_image
|
|
605
|
+
if isinstance(image_result.alternative_image, PageType):
|
|
606
|
+
# special case: not an alternative image, but replacing the original image
|
|
607
|
+
# (this is needed by certain processors when the original's coordinate system
|
|
608
|
+
# cannot or must not be kept)
|
|
609
|
+
image_result.alternative_image.set_imageFilename(image_file_path)
|
|
610
|
+
image_result.alternative_image.set_imageWidth(image_result.pil.width)
|
|
611
|
+
image_result.alternative_image.set_imageHeight(image_result.pil.height)
|
|
612
|
+
elif isinstance(image_result.alternative_image, AlternativeImageType):
|
|
613
|
+
image_result.alternative_image.set_filename(image_file_path)
|
|
614
|
+
else:
|
|
615
|
+
raise ValueError(f"process_page_pcgts returned an OcrdPageResultImage of unknown type "
|
|
616
|
+
f"{type(image_result.alternative_image)}")
|
|
443
617
|
self.workspace.save_image_file(
|
|
444
618
|
image_result.pil,
|
|
445
619
|
image_file_id,
|
|
@@ -450,13 +624,14 @@ class Processor():
|
|
|
450
624
|
)
|
|
451
625
|
result.pcgts.set_pcGtsId(output_file_id)
|
|
452
626
|
self.add_metadata(result.pcgts)
|
|
453
|
-
self.workspace.add_file(
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
627
|
+
self.workspace.add_file(
|
|
628
|
+
file_id=output_file_id,
|
|
629
|
+
file_grp=self.output_file_grp,
|
|
630
|
+
page_id=page_id,
|
|
631
|
+
local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'),
|
|
632
|
+
mimetype=MIMETYPE_PAGE,
|
|
633
|
+
content=to_xml(result.pcgts),
|
|
634
|
+
force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
|
|
460
635
|
)
|
|
461
636
|
|
|
462
637
|
def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult:
|
|
@@ -668,7 +843,7 @@ class Processor():
|
|
|
668
843
|
# can actually be much more costly than traversing the ltree.
|
|
669
844
|
# This might depend on the number of pages vs number of fileGrps.
|
|
670
845
|
|
|
671
|
-
pages =
|
|
846
|
+
pages = {}
|
|
672
847
|
for i, ifg in enumerate(ifgs):
|
|
673
848
|
files_ = sorted(self.workspace.mets.find_all_files(
|
|
674
849
|
pageId=self.page_id, fileGrp=ifg, mimetype=mimetype),
|
|
@@ -723,7 +898,7 @@ class Processor():
|
|
|
723
898
|
if self.page_id and not any(pages):
|
|
724
899
|
self._base_logger.critical(f"Could not find any files for selected pageId {self.page_id}.\n"
|
|
725
900
|
f"compare '{self.page_id}' with the output of 'orcd workspace list-page'.")
|
|
726
|
-
ifts =
|
|
901
|
+
ifts = []
|
|
727
902
|
for page, ifiles in pages.items():
|
|
728
903
|
for i, ifg in enumerate(ifgs):
|
|
729
904
|
if not ifiles[i]:
|
|
@@ -738,3 +913,151 @@ class Processor():
|
|
|
738
913
|
if ifiles[0] or not require_first:
|
|
739
914
|
ifts.append(tuple(ifiles))
|
|
740
915
|
return ifts
|
|
916
|
+
|
|
917
|
+
def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None):
    """Generate a string describing the full CLI of this processor including params.

    Args:
        ocrd_tool (dict): this processor's ``tools`` section of the module's ``ocrd-tool.json``
        processor_instance (object, optional): the processor implementation
            (for adding any module/class/function docstrings)
        subcommand (string, optional): 'worker' or 'server'; if empty, the
            plain processing CLI is described

    Returns:
        The help text, or ``None`` if ``subcommand`` is non-empty but neither
        'worker' nor 'server'.
    """
    doc_help = ''
    if processor_instance:
        module = inspect.getmodule(processor_instance)
        if module and module.__doc__:
            doc_help += '\n' + inspect.cleandoc(module.__doc__) + '\n'
        if processor_instance.__doc__:
            doc_help += '\n' + inspect.cleandoc(processor_instance.__doc__) + '\n'
        # Pick the docstring of the first method (in this order of preference)
        # that the implementation documents differently from the Processor
        # base class; an identical docstring is treated as merely inherited
        # and carries no implementation-specific information.
        for method in ['process_page_pcgts', 'process_page_file', 'process_workspace', 'process']:
            instance_method = getattr(processor_instance, method)
            superclass_method = getattr(Processor, method)
            if instance_method.__doc__ and instance_method.__doc__ != superclass_method.__doc__:
                doc_help += '\n' + inspect.cleandoc(instance_method.__doc__) + '\n'
                break
        if doc_help:
            doc_help = '\n\n' + wrap_text(doc_help, width=72,
                                          initial_indent='  > ',
                                          subsequent_indent='  > ',
                                          preserve_paragraphs=True)
    subcommands = '''\
    worker      Start a processing worker rather than do local processing
    server      Start a processor server rather than do local processing
'''

    processing_worker_options = '''\
  --queue                         The RabbitMQ server address in format
                                  "amqp://{user}:{pass}@{host}:{port}/{vhost}"
                                  [amqp://admin:admin@localhost:5672]
  --database                      The MongoDB server address in format
                                  "mongodb://{host}:{port}"
                                  [mongodb://localhost:27018]
  --log-filename                  Filename to redirect STDOUT/STDERR to,
                                  if specified.
'''

    processing_server_options = '''\
  --address                       The Processor server address in format
                                  "{host}:{port}"
  --database                      The MongoDB server address in format
                                  "mongodb://{host}:{port}"
                                  [mongodb://localhost:27018]
'''

    processing_options = '''\
  -m, --mets URL-PATH             URL or file path of METS to process [./mets.xml]
  -w, --working-dir PATH          Working directory of local workspace [dirname(URL-PATH)]
  -I, --input-file-grp USE        File group(s) used as input
  -O, --output-file-grp USE       File group(s) used as output
  -g, --page-id ID                Physical page ID(s) to process instead of full document []
  --overwrite                     Remove existing output pages/images
                                  (with "--page-id", remove only those).
                                  Short-hand for OCRD_EXISTING_OUTPUT=OVERWRITE
  --debug                         Abort on any errors with full stack trace.
                                  Short-hand for OCRD_MISSING_OUTPUT=ABORT
  --profile                       Enable profiling
  --profile-file PROF-PATH        Write cProfile stats to PROF-PATH. Implies "--profile"
  -p, --parameter JSON-PATH       Parameters, either verbatim JSON string
                                  or JSON file path
  -P, --param-override KEY VAL    Override a single JSON object key-value pair,
                                  taking precedence over --parameter
  -U, --mets-server-url URL       URL of a METS Server for parallel incremental access to METS
                                  If URL starts with http:// start an HTTP server there,
                                  otherwise URL is a path to an on-demand-created unix socket
  -l, --log-level [OFF|ERROR|WARN|INFO|DEBUG|TRACE]
                                  Override log level globally [INFO]
  --log-filename LOG-PATH         File to redirect stderr logging to (overriding ocrd_logging.conf).
'''

    information_options = '''\
  -C, --show-resource RESNAME     Dump the content of processor resource RESNAME
  -L, --list-resources            List names of processor resources
  -J, --dump-json                 Dump tool description as JSON
  -D, --dump-module-dir           Show the 'module' resource location path for this processor
  -h, --help                      Show this message
  -V, --version                   Show version
'''

    parameter_help = ''
    if 'parameters' not in ocrd_tool or not ocrd_tool['parameters']:
        parameter_help = '  NONE\n'
    else:
        def wrap(s):
            return wrap_text(s, initial_indent=' '*3,
                             subsequent_indent=' '*4,
                             width=72, preserve_paragraphs=True)
        for param_name, param in ocrd_tool['parameters'].items():
            # a required parameter shows no default, even if one is declared
            if param.get('required'):
                qualifier = ' - REQUIRED'
            elif 'default' in param:
                qualifier = ' - %s' % json.dumps(param['default'])
            else:
                qualifier = ''
            parameter_help += wrap('"%s" [%s%s]' % (param_name, param['type'], qualifier))
            parameter_help += '\n ' + wrap(param['description'])
            if 'enum' in param:
                parameter_help += '\n ' + wrap('Possible values: %s' % json.dumps(param['enum']))
            parameter_help += "\n"

    if not subcommand:
        return f'''\
Usage: {ocrd_tool['executable']} [worker|server] [OPTIONS]

  {ocrd_tool['description']}{doc_help}

Subcommands:
{subcommands}
Options for processing:
{processing_options}
Options for information:
{information_options}
Parameters:
{parameter_help}
'''
    if subcommand == 'worker':
        return f'''\
Usage: {ocrd_tool['executable']} worker [OPTIONS]

  Run {ocrd_tool['executable']} as a processing worker.

  {ocrd_tool['description']}{doc_help}

Options:
{processing_worker_options}
'''
    if subcommand == 'server':
        return f'''\
Usage: {ocrd_tool['executable']} server [OPTIONS]

  Run {ocrd_tool['executable']} as a processor server.

  {ocrd_tool['description']}{doc_help}

Options:
{processing_server_options}
'''
    # unknown subcommand: there is no help text to offer
    return None
|