ocrd 3.0.0a2__py3-none-any.whl → 3.0.0b2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. ocrd/cli/__init__.py +34 -26
  2. ocrd/cli/bashlib.py +32 -18
  3. ocrd/cli/ocrd_tool.py +7 -5
  4. ocrd/cli/workspace.py +10 -8
  5. ocrd/decorators/__init__.py +13 -7
  6. ocrd/decorators/ocrd_cli_options.py +1 -1
  7. ocrd/lib.bash +3 -0
  8. ocrd/mets_server.py +3 -4
  9. ocrd/processor/__init__.py +1 -1
  10. ocrd/processor/base.py +421 -98
  11. ocrd/processor/builtin/dummy_processor.py +4 -11
  12. ocrd/processor/helpers.py +24 -161
  13. ocrd/processor/ocrd_page_result.py +3 -3
  14. ocrd/resolver.py +0 -3
  15. ocrd/resource_manager.py +9 -5
  16. ocrd/workspace.py +10 -11
  17. ocrd/workspace_backup.py +1 -1
  18. {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/METADATA +32 -10
  19. {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/RECORD +49 -48
  20. {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/WHEEL +1 -1
  21. ocrd_modelfactory/__init__.py +1 -1
  22. ocrd_models/constants.py +0 -1
  23. ocrd_models/ocrd_exif.py +2 -2
  24. ocrd_models/ocrd_file.py +2 -2
  25. ocrd_models/ocrd_mets.py +22 -22
  26. ocrd_models/ocrd_page.py +0 -1
  27. ocrd_models/ocrd_xml_base.py +2 -2
  28. ocrd_network/cli/client.py +134 -30
  29. ocrd_network/client.py +53 -27
  30. ocrd_network/client_utils.py +101 -0
  31. ocrd_network/processing_server.py +1 -1
  32. ocrd_network/runtime_data/deployer.py +12 -3
  33. ocrd_network/server_utils.py +12 -10
  34. ocrd_utils/__init__.py +2 -0
  35. ocrd_utils/config.py +31 -2
  36. ocrd_utils/image.py +25 -25
  37. ocrd_utils/logging.py +20 -20
  38. ocrd_utils/os.py +4 -5
  39. ocrd_utils/str.py +10 -3
  40. ocrd_validators/json_validator.py +1 -3
  41. ocrd_validators/ocrd_tool_validator.py +2 -2
  42. ocrd_validators/page_validator.py +56 -56
  43. ocrd_validators/parameter_validator.py +2 -2
  44. ocrd_validators/resource_list_validator.py +4 -3
  45. ocrd_validators/workspace_validator.py +21 -21
  46. ocrd_validators/xsd_validator.py +1 -1
  47. {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/LICENSE +0 -0
  48. {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/entry_points.txt +0 -0
  49. {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/top_level.txt +0 -0
ocrd/processor/base.py CHANGED
@@ -9,6 +9,7 @@ __all__ = [
  'run_processor'
  ]

+ from functools import cached_property
  from os.path import exists, join
  from shutil import copyfileobj
  import json
@@ -20,36 +21,48 @@ import sys
  import inspect
  import tarfile
  import io
- from warnings import warn
+ import weakref
+ from frozendict import frozendict
+ from concurrent.futures import ThreadPoolExecutor, TimeoutError
+
+ from click import wrap_text
  from deprecated import deprecated
  from requests import HTTPError

- from ocrd.workspace import Workspace
+ from ..workspace import Workspace
+ from ..mets_server import ClientSideOcrdMets
  from ocrd_models.ocrd_file import OcrdFileType
- from ocrd.processor.ocrd_page_result import OcrdPageResult
+ from .ocrd_page_result import OcrdPageResult
  from ocrd_utils import (
  VERSION as OCRD_VERSION,
  MIMETYPE_PAGE,
  MIME_TO_EXT,
  config,
  getLogger,
- initLogging,
  list_resource_candidates,
  pushd_popd,
  list_all_resources,
  get_processor_resource_types,
  resource_filename,
- resource_string,
+ parse_json_file_with_comments,
  make_file_id,
  deprecation_warning
  )
  from ocrd_validators import ParameterValidator
- from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType, OcrdPage, to_xml
+ from ocrd_models.ocrd_page import (
+ PageType,
+ AlternativeImageType,
+ MetadataItemType,
+ LabelType,
+ LabelsType,
+ OcrdPage,
+ to_xml,
+ )
  from ocrd_modelfactory import page_from_file
  from ocrd_validators.ocrd_tool_validator import OcrdToolValidator

  # XXX imports must remain for backwards-compatibility
- from .helpers import run_cli, run_processor, generate_processor_help # pylint: disable=unused-import
+ from .helpers import run_cli, run_processor # pylint: disable=unused-import


  class Processor():
@@ -94,54 +107,163 @@ class MissingInputFile(ValueError):

  class Processor():
  """
- A processor is a tool that implements the uniform OCR-D command-line interface
- for run-time data processing. That is, it executes a single workflow step,
- or a combination of workflow steps, on the workspace (represented by local METS).
- It reads input files for all or requested physical pages of the input fileGrp(s),
- and writes output files for them into the output fileGrp(s). It may take
- a number of optional or mandatory parameters.
+ A processor is a tool that implements the uniform OCR-D
+ `command-line interface for run-time data processing <https://ocr-d.de/en/spec/cli>`_.
+
+ That is, it executes a single workflow step, or a combination of workflow steps,
+ on the workspace (represented by local METS). It reads input files for all or selected
+ physical pages of the input fileGrp(s), computes additional annotation, and writes output
+ files for them into the output fileGrp(s). It may take a number of optional or mandatory
+ parameters.
+ """
+
+ max_instances : int = -1
+ """
+ maximum number of cached instances (ignored if negative), to be applied on top of
+ :py:data:`~ocrd_utils.config.OCRD_MAX_PROCESSOR_CACHE` (i.e. whatever is smaller).
+
+ (Override this if you know how many instances fit into memory - GPU / CPU RAM - at once.)
+ """
+
+ max_workers : int = -1
+ """
+ maximum number of processor threads for page-parallel processing (ignored if negative),
+ to be applied on top of :py:data:`~ocrd_utils.config.OCRD_MAX_PARALLEL_PAGES` (i.e.
+ whatever is smaller).
+
+ (Override this if you know how many pages fit into processing units - GPU shaders / CPU cores
+ - at once, or if your class is not thread-safe.)
+ """
+
+ max_page_seconds : int = -1
+ """
+ maximum number of seconds may be spent processing a single page (ignored if negative),
+ to be applied on top of :py:data:`~ocrd_utils.config.OCRD_PROCESSING_PAGE_TIMEOUT`
+ (i.e. whatever is smaller).
+
+ (Override this if you know how costly this processor may be, irrespective of image size
+ or complexity of the page.)
  """

  @property
+ def metadata_filename(self) -> str:
+ """
+ Relative location of the ``ocrd-tool.json`` file inside the package.
+
+ Used by :py:data:`metadata_location`.
+
+ (Override if ``ocrd-tool.json`` is not in the root of the module,
+ e.g. ``namespace/ocrd-tool.json`` or ``data/ocrd-tool.json``).
+ """
+ return 'ocrd-tool.json'
+
+ @cached_property
+ def metadata_location(self) -> Path:
+ """
+ Absolute path of the ``ocrd-tool.json`` file as distributed with the package.
+
+ Used by :py:data:`metadata_rawdict`.
+
+ (Override if ``ocrd-tool.json`` is not distributed with the Python package.)
+ """
+ # XXX HACK
+ module_tokens = self.__module__.split('.')
+ if module_tokens[0] == 'src':
+ module_tokens.pop(0)
+ return resource_filename(module_tokens[0], self.metadata_filename)
+
+ @cached_property
+ def metadata_rawdict(self) -> dict:
+ """
+ Raw (unvalidated, unexpanded) ``ocrd-tool.json`` dict contents of the package.
+
+ Used by :py:data:`metadata`.
+
+ (Override if ``ocrd-tool.json`` is not in a file.)
+ """
+ return parse_json_file_with_comments(self.metadata_location)
+
+ @cached_property
  def metadata(self) -> dict:
- """the ocrd-tool.json dict of the package"""
- if hasattr(self, '_metadata'):
- return self._metadata
- self._metadata = json.loads(resource_string(self.__module__.split('.')[0], 'ocrd-tool.json'))
- report = OcrdToolValidator.validate(self._metadata)
+ """
+ The ``ocrd-tool.json`` dict contents of the package, according to the OCR-D
+ `spec <https://ocr-d.de/en/spec/ocrd_tool>`_ for processor tools.
+
+ After deserialisation, it also gets validated against the
+ `schema <https://ocr-d.de/en/spec/ocrd_tool#definition>`_ with all defaults
+ expanded.
+
+ Used by :py:data:`ocrd_tool` and :py:data:`version`.
+
+ (Override if you want to provide metadata programmatically instead of a
+ JSON file.)
+ """
+ metadata = self.metadata_rawdict
+ report = OcrdToolValidator.validate(metadata)
  if not report.is_valid:
- # FIXME: remove when bertsky/core#10 is merged
- self.logger = getLogger(f'ocrd.processor.{self.__class__.__name__}')
- self.logger.error(f"The ocrd-tool.json of this processor is {'problematic' if not report.errors else 'invalid'}:\n{report.to_xml()}.\nPlease open an issue at {self._metadata['git_url']}.")
- return self._metadata
+ self.logger.error(f"The ocrd-tool.json of this processor is {'problematic' if not report.errors else 'invalid'}:\n"
+ f"{report.to_xml()}.\nPlease open an issue at {metadata.get('git_url', 'the website')}.")
+ return metadata

- @property
+ @cached_property
  def version(self) -> str:
- """the version of the package"""
- if hasattr(self, '_version'):
- return self._version
- self._version = self.metadata['version']
- return self._version
+ """
+ The program version of the package.
+ Usually the ``version`` part of :py:data:`metadata`.

- @property
+ (Override if you do not want to use :py:data:`metadata` lookup
+ mechanism.)
+ """
+ return self.metadata['version']
+
+ @cached_property
  def executable(self) -> str:
- """the executable name of this processor tool"""
- if hasattr(self, '_executable'):
- return self._executable
- self._executable = os.path.basename(inspect.stack()[-1].filename)
- return self._executable
+ """
+ The executable name of this processor tool. Taken from the runtime
+ filename.

- @property
+ Used by :py:data:`ocrd_tool` for lookup in :py:data:`metadata`.
+
+ (Override if your entry-point name deviates from the ``executable``
+ name, or the processor gets instantiated from another runtime.)
+ """
+ return os.path.basename(inspect.stack()[-1].filename)
+
+ @cached_property
  def ocrd_tool(self) -> dict:
- """the ocrd-tool.json dict of this processor tool"""
- if hasattr(self, '_ocrd_tool'):
- return self._ocrd_tool
- self._ocrd_tool = self.metadata['tools'][self.executable]
- return self._ocrd_tool
+ """
+ The ``ocrd-tool.json`` dict contents of this processor tool.
+ Usually the :py:data:`executable` key of the ``tools`` part
+ of :py:data:`metadata`.
+
+ (Override if you do not want to use :py:data:`metadata` lookup
+ mechanism.)
+ """
+ return self.metadata['tools'][self.executable]
+
+ @property
+ def parameter(self) -> Optional[dict]:
+ """the runtime parameter dict to be used by this processor"""
+ if hasattr(self, '_parameter'):
+ return self._parameter
+ return None
+
+ @parameter.setter
+ def parameter(self, parameter : dict) -> None:
+ if self.parameter is not None:
+ self.shutdown()
+ parameterValidator = ParameterValidator(self.ocrd_tool)
+ report = parameterValidator.validate(parameter)
+ if not report.is_valid:
+ raise ValueError(f'Invalid parameters:\n{report.to_xml()}')
+ # make parameter dict read-only
+ self._parameter = frozendict(parameter)
+ # (re-)run setup to load models etc
+ self.setup()

  def __init__(
  self,
- # FIXME: deprecate in favor of process_workspace(workspace)
+ # FIXME: remove in favor of process_workspace(workspace)
  workspace : Optional[Workspace],
  ocrd_tool=None,
  parameter=None,
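The hunk above replaces the former ad-hoc caching via _metadata, _version and _ocrd_tool with functools.cached_property, splits metadata loading into overridable steps (metadata_filename → metadata_location → metadata_rawdict → metadata), and adds the class attributes max_instances, max_workers and max_page_seconds. A minimal sketch of how a downstream processor might use these hooks (the class name, the 'model' parameter and the load_model helper are hypothetical, not part of this release):

    from ocrd import Processor

    def load_model(name):
        ...  # hypothetical stand-in for an actual model loader

    class MyBinarizer(Processor):
        # hypothetical limits; the effective values are still capped by
        # OCRD_MAX_PARALLEL_PAGES and OCRD_PROCESSING_PAGE_TIMEOUT
        max_workers = 4
        max_page_seconds = 300

        @property
        def metadata_filename(self):
            # ocrd-tool.json is packaged under data/ instead of the module root
            return 'data/ocrd-tool.json'

        @property
        def executable(self):
            # fixed entry-point name instead of deriving it from the runtime filename
            return 'ocrd-my-binarizer'

        def setup(self):
            # runs after self.parameter has been validated and frozen
            self.model = load_model(self.parameter['model'])

        def shutdown(self):
            # runs before re-setup with new parameters and at finalization
            del self.model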
@@ -179,12 +301,12 @@ class Processor():
  if ocrd_tool is not None:
  deprecation_warning("Passing 'ocrd_tool' as keyword argument to Processor is deprecated - "
  "use or override metadata/executable/ocrd-tool properties instead")
- self._ocrd_tool = ocrd_tool
- self._executable = ocrd_tool['executable']
+ self.ocrd_tool = ocrd_tool
+ self.executable = ocrd_tool['executable']
  if version is not None:
  deprecation_warning("Passing 'version' as keyword argument to Processor is deprecated - "
  "use or override metadata/version properties instead")
- self._version = version
+ self.version = version
  if workspace is not None:
  deprecation_warning("Passing a workspace argument other than 'None' to Processor "
  "is deprecated - pass as argument to process_workspace instead")
@@ -204,19 +326,14 @@
  "is deprecated - pass as argument to process_workspace instead")
  self.page_id = page_id or None
  self.download = download_files
- if parameter is None:
- parameter = {}
- parameterValidator = ParameterValidator(self.ocrd_tool)
-
- report = parameterValidator.validate(parameter)
- if not report.is_valid:
- raise ValueError("Invalid parameters %s" % report.errors)
- self.parameter = parameter
- # NOTE: this is the logger to be used by processor implementations,
- # `processor.base` default implementations should use
- # :py:attr:`self._base_logger`
+ #: The logger to be used by processor implementations.
+ # `ocrd.processor.base` internals should use :py:attr:`self._base_logger`
  self.logger = getLogger(f'ocrd.processor.{self.__class__.__name__}')
  self._base_logger = getLogger('ocrd.processor.base')
+ if parameter is not None:
+ self.parameter = parameter
+ # ensure that shutdown gets called at destruction
+ self._finalizer = weakref.finalize(self, self.shutdown)
  # workaround for deprecated#72 (@deprecated decorator does not work for subclasses):
  setattr(self, 'process',
  deprecated(version='3.0', reason='process() should be replaced with process_page() and process_workspace()')(getattr(self, 'process')))
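With the constructor changes above, parameter validation no longer happens inline in __init__: assignment goes through the new parameter property, which validates against ocrd_tool, freezes the dict (frozendict) and (re-)runs setup(), while weakref.finalize ensures shutdown() is called at finalization. A sketch of the resulting lifecycle, reusing the hypothetical MyBinarizer and its made-up 'model' parameter from the sketch above:

    proc = MyBinarizer(None)               # no workspace, no parameter: setup() not called yet
    proc.parameter = {'model': 'default'}  # validated, frozen, then setup() runs
    proc.parameter = {'model': 'other'}    # shutdown() first, then validate/freeze/setup() again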
@@ -254,14 +371,10 @@
  assert len(grps) >= minimum, msg % (len(grps), str(spec))
  if maximum > 0:
  assert len(grps) <= maximum, msg % (len(grps), str(spec))
- # FIXME: maybe we should enforce the cardinality properties to be specified or apply default=1 here
- # (but we already have ocrd-tool validation, and these first need to be adopted by implementors)
- if 'input_file_grp_cardinality' in self.ocrd_tool:
- assert_file_grp_cardinality(input_file_grps, self.ocrd_tool['input_file_grp_cardinality'],
- "Unexpected number of input file groups %d vs %s")
- if 'output_file_grp_cardinality' in self.ocrd_tool:
- assert_file_grp_cardinality(output_file_grps, self.ocrd_tool['output_file_grp_cardinality'],
- "Unexpected number of output file groups %d vs %s")
+ assert_file_grp_cardinality(input_file_grps, self.ocrd_tool['input_file_grp_cardinality'],
+ "Unexpected number of input file groups %d vs %s")
+ assert_file_grp_cardinality(output_file_grps, self.ocrd_tool['output_file_grp_cardinality'],
+ "Unexpected number of output file groups %d vs %s")
  for input_file_grp in input_file_grps:
  assert input_file_grp in self.workspace.mets.file_groups
  # keep this for backwards compatibility:
@@ -272,14 +385,12 @@
  Print :py:attr:`ocrd_tool` on stdout.
  """
  print(json.dumps(self.ocrd_tool, indent=True))
- return

  def dump_module_dir(self):
  """
  Print :py:attr:`moduledir` on stdout.
  """
  print(self.moduledir)
- return

  def list_resources(self):
  """
@@ -287,7 +398,6 @@
  """
  for res in self.list_all_resources():
  print(res)
- return

  def setup(self) -> None:
  """
@@ -299,6 +409,16 @@
  """
  pass

+ def shutdown(self) -> None:
+ """
+ Bring down the processor after data processing,
+ after to changing back from the workspace directory but
+ before exiting (or setting up with different parameters).
+
+ (Override this to unload models from memory etc.)
+ """
+ pass
+
  @deprecated(version='3.0', reason='process() should be replaced with process_page_pcgts() or process_page_file() or process_workspace()')
  def process(self) -> None:
  """
@@ -330,7 +450,29 @@
  self.workspace = workspace
  self.verify()
  try:
- # FIXME: add page parallelization by running multiprocessing.Pool (#322)
+ nr_succeeded = 0
+ nr_skipped = 0
+ nr_copied = 0
+
+ # set up multithreading
+ if self.max_workers <= 0:
+ max_workers = max(0, config.OCRD_MAX_PARALLEL_PAGES)
+ else:
+ max_workers = max(0, min(config.OCRD_MAX_PARALLEL_PAGES, self.max_workers))
+ if max_workers > 1:
+ assert isinstance(workspace.mets, ClientSideOcrdMets), \
+ "OCRD_MAX_PARALLEL_PAGES>1 requires also using --mets-server-url"
+ if self.max_page_seconds <= 0:
+ max_seconds = max(0, config.OCRD_PROCESSING_PAGE_TIMEOUT)
+ else:
+ max_seconds = max(0, min(config.OCRD_PROCESSING_PAGE_TIMEOUT, self.max_page_seconds))
+ executor = ThreadPoolExecutor(
+ max_workers=max_workers or 1,
+ thread_name_prefix=f"pagetask.{workspace.mets.unique_identifier}"
+ )
+ self._base_logger.debug("started executor %s", str(executor))
+ tasks = {}
+
  for input_file_tuple in self.zip_input_files(on_error='abort', require_first=False):
  input_files : List[Optional[OcrdFileType]] = [None] * len(input_file_tuple)
  page_id = next(input_file.pageId
@@ -349,35 +491,55 @@
  except (ValueError, FileNotFoundError, HTTPError) as e:
  self._base_logger.error(repr(e))
  self._base_logger.warning(f"failed downloading file {input_file} for page {page_id}")
+ # process page
+ tasks[executor.submit(self.process_page_file, *input_files)] = (page_id, input_files)
+ self._base_logger.debug("submitted %d processing tasks", len(tasks))
+
+ for task in tasks:
+ # wait for results, handle errors
+ page_id, input_files = tasks[task]
  # FIXME: differentiate error cases in various ways:
  # - ResourceNotFoundError → use ResourceManager to download (once), then retry
  # - transient (I/O or OOM) error → maybe sleep, retry
  # - persistent (data) error → skip / dummy / raise
  try:
- self.process_page_file(*input_files)
- except Exception as err:
- # we have to be broad here, but want to exclude NotImplementedError
- if isinstance(err, NotImplementedError):
+ self._base_logger.debug("waiting for output of task %s (page %s) max_seconds=%d", task, page_id, max_seconds)
+ task.result(timeout=max_seconds or None)
+ nr_succeeded += 1
+ # exclude NotImplementedError, so we can try process() below
+ except NotImplementedError:
+ raise
+ # handle input failures separately
+ except FileExistsError as err:
+ if config.OCRD_EXISTING_OUTPUT == 'ABORT':
  raise err
- if isinstance(err, FileExistsError):
- if config.OCRD_EXISTING_OUTPUT == 'ABORT':
- raise err
- if config.OCRD_EXISTING_OUTPUT == 'SKIP':
- continue
- if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE':
- # too late here, must not happen
- raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE")
- # FIXME: re-usable/actionable logging
- self._base_logger.exception(f"Failure on page {page_id}: {err}")
+ if config.OCRD_EXISTING_OUTPUT == 'SKIP':
+ continue
+ if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE':
+ # too late here, must not happen
+ raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE")
+ # broad coverage of output failures (including TimeoutError)
+ except (Exception, TimeoutError) as err:
+ # FIXME: add re-usable/actionable logging
+ self._base_logger.error(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}")
  if config.OCRD_MISSING_OUTPUT == 'ABORT':
  raise err
  if config.OCRD_MISSING_OUTPUT == 'SKIP':
+ nr_skipped += 1
  continue
  if config.OCRD_MISSING_OUTPUT == 'COPY':
  self._copy_page_file(input_files[0])
+ nr_copied += 1
  else:
  desc = config.describe('OCRD_MISSING_OUTPUT', wrap_text=False, indent_text=False)
  raise ValueError(f"unknown configuration value {config.OCRD_MISSING_OUTPUT} - {desc}")
+
+ if nr_skipped > 0 and nr_succeeded / nr_skipped < config.OCRD_MAX_MISSING_OUTPUTS:
+ raise Exception(f"too many failures with skipped output ({nr_skipped})")
+ if nr_copied > 0 and nr_succeeded / nr_copied < config.OCRD_MAX_MISSING_OUTPUTS:
+ raise Exception(f"too many failures with fallback output ({nr_skipped})")
+ executor.shutdown()
+
  except NotImplementedError:
  # fall back to deprecated method
  self.process()
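The two hunks above add page-parallel processing: each page is submitted to a ThreadPoolExecutor and collected with a per-page timeout, OCRD_MAX_PARALLEL_PAGES > 1 additionally requires a METS Server (--mets-server-url), and OCRD_MAX_MISSING_OUTPUTS turns too high a skip/copy failure ratio into a hard error. A standalone sketch of the submit-then-collect pattern used here (standard library only; page_task and the page IDs are stand-ins, not OCR-D API):

    from concurrent.futures import ThreadPoolExecutor, TimeoutError

    def page_task(page_id):
        ...  # stand-in for process_page_file on one page's input files

    with ThreadPoolExecutor(max_workers=4) as executor:
        # map each future back to the page it belongs to
        tasks = {executor.submit(page_task, page_id): page_id
                 for page_id in ['PHYS_0001', 'PHYS_0002']}
        for task, page_id in tasks.items():
            try:
                task.result(timeout=120)  # raises TimeoutError if the page takes too long
            except TimeoutError:
                print(f"page {page_id} timed out")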
@@ -401,13 +563,14 @@
  output_file_id = make_file_id(input_file, self.output_file_grp)
  input_pcgts.set_pcGtsId(output_file_id)
  self.add_metadata(input_pcgts)
- self.workspace.add_file(file_id=output_file_id,
- file_grp=self.output_file_grp,
- page_id=input_file.pageId,
- local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'),
- mimetype=MIMETYPE_PAGE,
- content=to_xml(input_pcgts),
- force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
+ self.workspace.add_file(
+ file_id=output_file_id,
+ file_grp=self.output_file_grp,
+ page_id=input_file.pageId,
+ local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'),
+ mimetype=MIMETYPE_PAGE,
+ content=to_xml(input_pcgts),
+ force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
  )

  def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None:
@@ -439,7 +602,18 @@
  for image_result in result.images:
  image_file_id = f'{output_file_id}_{image_result.file_id_suffix}'
  image_file_path = join(self.output_file_grp, f'{image_file_id}.png')
- image_result.alternative_image.set_filename(image_file_path)
+ if isinstance(image_result.alternative_image, PageType):
+ # special case: not an alternative image, but replacing the original image
+ # (this is needed by certain processors when the original's coordinate system
+ # cannot or must not be kept)
+ image_result.alternative_image.set_imageFilename(image_file_path)
+ image_result.alternative_image.set_imageWidth(image_result.pil.width)
+ image_result.alternative_image.set_imageHeight(image_result.pil.height)
+ elif isinstance(image_result.alternative_image, AlternativeImageType):
+ image_result.alternative_image.set_filename(image_file_path)
+ else:
+ raise ValueError(f"process_page_pcgts returned an OcrdPageResultImage of unknown type "
+ f"{type(image_result.alternative_image)}")
  self.workspace.save_image_file(
  image_result.pil,
  image_file_id,
@@ -450,13 +624,14 @@
  )
  result.pcgts.set_pcGtsId(output_file_id)
  self.add_metadata(result.pcgts)
- self.workspace.add_file(file_id=output_file_id,
- file_grp=self.output_file_grp,
- page_id=page_id,
- local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'),
- mimetype=MIMETYPE_PAGE,
- content=to_xml(result.pcgts),
- force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
+ self.workspace.add_file(
+ file_id=output_file_id,
+ file_grp=self.output_file_grp,
+ page_id=page_id,
+ local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'),
+ mimetype=MIMETYPE_PAGE,
+ content=to_xml(result.pcgts),
+ force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
  )

  def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult:
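process_page_file (above) persists whatever an implementation returns from process_page_pcgts: the PAGE-XML tree plus any derived images, whose alternative_image may now also be a PageType when the original image itself is replaced. A minimal sketch of the producing side, assuming OcrdPageResultImage is defined alongside OcrdPageResult and takes (pil, file_id_suffix, alternative_image) in that order, as the attribute accesses above suggest (class and variable names are illustrative):

    from ocrd import Processor
    from ocrd.processor.ocrd_page_result import OcrdPageResult, OcrdPageResultImage
    from ocrd_models.ocrd_page import AlternativeImageType

    class MyCropper(Processor):  # hypothetical
        def process_page_pcgts(self, *input_pcgts, page_id=None):
            pcgts = input_pcgts[0]
            page = pcgts.get_Page()
            cropped_pil = ...  # some PIL.Image derived from the page image
            alternative = AlternativeImageType(comments='cropped')
            page.add_AlternativeImage(alternative)
            result = OcrdPageResult(pcgts)
            # process_page_file will save the PNG and fill in the filename
            result.images.append(OcrdPageResultImage(cropped_pil, 'IMG-CROP', alternative))
            return result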
@@ -668,7 +843,7 @@
  # can actually be much more costly than traversing the ltree.
  # This might depend on the number of pages vs number of fileGrps.

- pages = dict()
+ pages = {}
  for i, ifg in enumerate(ifgs):
  files_ = sorted(self.workspace.mets.find_all_files(
  pageId=self.page_id, fileGrp=ifg, mimetype=mimetype),
@@ -723,7 +898,7 @@
  if self.page_id and not any(pages):
  self._base_logger.critical(f"Could not find any files for selected pageId {self.page_id}.\n"
  f"compare '{self.page_id}' with the output of 'orcd workspace list-page'.")
- ifts = list()
+ ifts = []
  for page, ifiles in pages.items():
  for i, ifg in enumerate(ifgs):
  if not ifiles[i]:
@@ -738,3 +913,151 @@
  if ifiles[0] or not require_first:
  ifts.append(tuple(ifiles))
  return ifts
+
+ def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None):
+ """Generate a string describing the full CLI of this processor including params.
+
+ Args:
+ ocrd_tool (dict): this processor's ``tools`` section of the module's ``ocrd-tool.json``
+ processor_instance (object, optional): the processor implementation
+ (for adding any module/class/function docstrings)
+ subcommand (string): 'worker' or 'server'
+ """
+ doc_help = ''
+ if processor_instance:
+ module = inspect.getmodule(processor_instance)
+ if module and module.__doc__:
+ doc_help += '\n' + inspect.cleandoc(module.__doc__) + '\n'
+ if processor_instance.__doc__:
+ doc_help += '\n' + inspect.cleandoc(processor_instance.__doc__) + '\n'
+ # Try to find the most concrete docstring among the various methods that an implementation
+ # could overload, first serving.
+ # In doing so, compare with Processor to avoid a glitch in the way py>=3.5 inherits docstrings.
+ # (They are supposed to only repeat information inspect.getdoc, rather than inherit __doc__ itself.)
+ for method in ['process_page_pcgts', 'process_page_file', 'process_workspace', 'process']:
+ instance_method = getattr(processor_instance, method)
+ superclass_method = getattr(Processor, method)
+ if instance_method.__doc__ and instance_method.__doc__ != superclass_method.__doc__:
+ doc_help += '\n' + inspect.cleandoc(instance_method.__doc__) + '\n'
+ break
+ if doc_help:
+ doc_help = '\n\n' + wrap_text(doc_help, width=72,
+ initial_indent=' > ',
+ subsequent_indent=' > ',
+ preserve_paragraphs=True)
+ subcommands = '''\
+ worker Start a processing worker rather than do local processing
+ server Start a processor server rather than do local processing
+ '''
+
+ processing_worker_options = '''\
+ --queue The RabbitMQ server address in format
+ "amqp://{user}:{pass}@{host}:{port}/{vhost}"
+ [amqp://admin:admin@localhost:5672]
+ --database The MongoDB server address in format
+ "mongodb://{host}:{port}"
+ [mongodb://localhost:27018]
+ --log-filename Filename to redirect STDOUT/STDERR to,
+ if specified.
+ '''
+
+ processing_server_options = '''\
+ --address The Processor server address in format
+ "{host}:{port}"
+ --database The MongoDB server address in format
+ "mongodb://{host}:{port}"
+ [mongodb://localhost:27018]
+ '''
+
+ processing_options = '''\
+ -m, --mets URL-PATH URL or file path of METS to process [./mets.xml]
+ -w, --working-dir PATH Working directory of local workspace [dirname(URL-PATH)]
+ -I, --input-file-grp USE File group(s) used as input
+ -O, --output-file-grp USE File group(s) used as output
+ -g, --page-id ID Physical page ID(s) to process instead of full document []
+ --overwrite Remove existing output pages/images
+ (with "--page-id", remove only those).
+ Short-hand for OCRD_EXISTING_OUTPUT=OVERWRITE
+ --debug Abort on any errors with full stack trace.
+ Short-hand for OCRD_MISSING_OUTPUT=ABORT
+ --profile Enable profiling
+ --profile-file PROF-PATH Write cProfile stats to PROF-PATH. Implies "--profile"
+ -p, --parameter JSON-PATH Parameters, either verbatim JSON string
+ or JSON file path
+ -P, --param-override KEY VAL Override a single JSON object key-value pair,
+ taking precedence over --parameter
+ -U, --mets-server-url URL URL of a METS Server for parallel incremental access to METS
+ If URL starts with http:// start an HTTP server there,
+ otherwise URL is a path to an on-demand-created unix socket
+ -l, --log-level [OFF|ERROR|WARN|INFO|DEBUG|TRACE]
+ Override log level globally [INFO]
+ --log-filename LOG-PATH File to redirect stderr logging to (overriding ocrd_logging.conf).
+ '''
+
+ information_options = '''\
+ -C, --show-resource RESNAME Dump the content of processor resource RESNAME
+ -L, --list-resources List names of processor resources
+ -J, --dump-json Dump tool description as JSON
+ -D, --dump-module-dir Show the 'module' resource location path for this processor
+ -h, --help Show this message
+ -V, --version Show version
+ '''
+
+ parameter_help = ''
+ if 'parameters' not in ocrd_tool or not ocrd_tool['parameters']:
+ parameter_help = ' NONE\n'
+ else:
+ def wrap(s):
+ return wrap_text(s, initial_indent=' '*3,
+ subsequent_indent=' '*4,
+ width=72, preserve_paragraphs=True)
+ for param_name, param in ocrd_tool['parameters'].items():
+ parameter_help += wrap('"%s" [%s%s]' % (
+ param_name,
+ param['type'],
+ ' - REQUIRED' if 'required' in param and param['required'] else
+ ' - %s' % json.dumps(param['default']) if 'default' in param else ''))
+ parameter_help += '\n ' + wrap(param['description'])
+ if 'enum' in param:
+ parameter_help += '\n ' + wrap('Possible values: %s' % json.dumps(param['enum']))
+ parameter_help += "\n"
+
+ if not subcommand:
+ return f'''\
+ Usage: {ocrd_tool['executable']} [worker|server] [OPTIONS]
+
+ {ocrd_tool['description']}{doc_help}
+
+ Subcommands:
+ {subcommands}
+ Options for processing:
+ {processing_options}
+ Options for information:
+ {information_options}
+ Parameters:
+ {parameter_help}
+ '''
+ elif subcommand == 'worker':
+ return f'''\
+ Usage: {ocrd_tool['executable']} worker [OPTIONS]
+
+ Run {ocrd_tool['executable']} as a processing worker.
+
+ {ocrd_tool['description']}{doc_help}
+
+ Options:
+ {processing_worker_options}
+ '''
+ elif subcommand == 'server':
+ return f'''\
+ Usage: {ocrd_tool['executable']} server [OPTIONS]
+
+ Run {ocrd_tool['executable']} as a processor sever.
+
+ {ocrd_tool['description']}{doc_help}
+
+ Options:
+ {processing_server_options}
+ '''
+ else:
+ pass
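generate_processor_help, previously provided by ocrd.processor.helpers and only re-exported here, is now defined in this module at top level. A quick sketch of calling it directly with a minimal tool description (all dict values are made up for illustration):

    from ocrd.processor.base import generate_processor_help

    minimal_tool = {
        'executable': 'ocrd-dummy-example',
        'description': 'An example processor',
        'parameters': {
            'level': {'type': 'string', 'default': 'page', 'description': 'operation level'},
        },
    }
    print(generate_processor_help(minimal_tool))                       # full CLI help
    print(generate_processor_help(minimal_tool, subcommand='worker'))  # worker-specific help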