ocrd 3.0.0b5__tar.gz → 3.0.0b7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (156)
  1. {ocrd-3.0.0b5/src/ocrd.egg-info → ocrd-3.0.0b7}/PKG-INFO +2 -1
  2. ocrd-3.0.0b7/VERSION +1 -0
  3. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/requirements.txt +1 -0
  4. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/cli/workspace.py +21 -11
  5. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/decorators/__init__.py +6 -6
  6. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/processor/base.py +302 -84
  7. ocrd-3.0.0b7/src/ocrd/processor/concurrent.py +909 -0
  8. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/processor/helpers.py +10 -2
  9. {ocrd-3.0.0b5 → ocrd-3.0.0b7/src/ocrd.egg-info}/PKG-INFO +2 -1
  10. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd.egg-info/SOURCES.txt +1 -0
  11. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd.egg-info/requires.txt +1 -0
  12. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_models/ocrd_mets.py +9 -0
  13. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_utils/logging.py +27 -52
  14. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_utils/ocrd_logging.conf +14 -16
  15. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/tests/test_decorators.py +7 -10
  16. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/tests/test_logging.py +6 -0
  17. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/tests/test_logging_conf.py +21 -28
  18. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/tests/test_mets_server.py +19 -9
  19. ocrd-3.0.0b5/VERSION +0 -1
  20. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/LICENSE +0 -0
  21. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/MANIFEST.in +0 -0
  22. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/README.md +0 -0
  23. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/README_bashlib.md +0 -0
  24. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/README_ocrd.md +0 -0
  25. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/README_ocrd_modelfactory.md +0 -0
  26. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/README_ocrd_models.md +0 -0
  27. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/README_ocrd_network.md +0 -0
  28. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/README_ocrd_utils.md +0 -0
  29. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/README_ocrd_validators.md +0 -0
  30. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/pyproject.toml +0 -0
  31. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/setup.cfg +0 -0
  32. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/__init__.py +0 -0
  33. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/cli/__init__.py +0 -0
  34. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/cli/bashlib.py +0 -0
  35. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/cli/log.py +0 -0
  36. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/cli/network.py +0 -0
  37. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/cli/ocrd_tool.py +0 -0
  38. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/cli/process.py +0 -0
  39. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/cli/resmgr.py +0 -0
  40. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/cli/validate.py +0 -0
  41. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/cli/zip.py +0 -0
  42. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/constants.py +0 -0
  43. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/decorators/loglevel_option.py +0 -0
  44. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/decorators/mets_find_options.py +0 -0
  45. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/decorators/ocrd_cli_options.py +0 -0
  46. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/decorators/parameter_option.py +0 -0
  47. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/lib.bash +0 -0
  48. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/mets_server.py +0 -0
  49. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/ocrd-all-tool.json +0 -0
  50. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/processor/__init__.py +0 -0
  51. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/processor/builtin/__init__.py +0 -0
  52. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/processor/builtin/dummy/__init__.py +0 -0
  53. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/processor/builtin/dummy/ocrd-tool.json +0 -0
  54. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/processor/builtin/dummy_processor.py +0 -0
  55. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/processor/ocrd_page_result.py +0 -0
  56. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/resolver.py +0 -0
  57. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/resource_list.yml +0 -0
  58. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/resource_manager.py +0 -0
  59. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/task_sequence.py +0 -0
  60. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/workspace.py +0 -0
  61. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/workspace_backup.py +0 -0
  62. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/workspace_bagger.py +0 -0
  63. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd.egg-info/dependency_links.txt +0 -0
  64. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd.egg-info/entry_points.txt +0 -0
  65. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd.egg-info/top_level.txt +0 -0
  66. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_modelfactory/__init__.py +0 -0
  67. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_models/__init__.py +0 -0
  68. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_models/constants.py +0 -0
  69. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_models/mets-empty.xml +0 -0
  70. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_models/ocrd_agent.py +0 -0
  71. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_models/ocrd_exif.py +0 -0
  72. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_models/ocrd_file.py +0 -0
  73. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_models/ocrd_page.py +0 -0
  74. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_models/ocrd_page_generateds.py +0 -0
  75. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_models/ocrd_xml_base.py +0 -0
  76. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_models/report.py +0 -0
  77. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_models/utils.py +0 -0
  78. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/__init__.py +0 -0
  79. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/cli/__init__.py +0 -0
  80. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/cli/client.py +0 -0
  81. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/cli/processing_server.py +0 -0
  82. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/cli/processing_worker.py +0 -0
  83. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/cli/processor_server.py +0 -0
  84. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/client.py +0 -0
  85. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/client_utils.py +0 -0
  86. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/constants.py +0 -0
  87. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/database.py +0 -0
  88. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/logging_utils.py +0 -0
  89. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/models/__init__.py +0 -0
  90. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/models/job.py +0 -0
  91. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/models/messages.py +0 -0
  92. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/models/ocrd_tool.py +0 -0
  93. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/models/workflow.py +0 -0
  94. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/models/workspace.py +0 -0
  95. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/param_validators.py +0 -0
  96. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/process_helpers.py +0 -0
  97. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/processing_server.py +0 -0
  98. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/processing_worker.py +0 -0
  99. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/processor_server.py +0 -0
  100. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/rabbitmq_utils/__init__.py +0 -0
  101. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/rabbitmq_utils/connector.py +0 -0
  102. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/rabbitmq_utils/constants.py +0 -0
  103. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/rabbitmq_utils/consumer.py +0 -0
  104. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/rabbitmq_utils/helpers.py +0 -0
  105. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/rabbitmq_utils/ocrd_messages.py +0 -0
  106. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/rabbitmq_utils/publisher.py +0 -0
  107. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/runtime_data/__init__.py +0 -0
  108. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/runtime_data/config_parser.py +0 -0
  109. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/runtime_data/connection_clients.py +0 -0
  110. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/runtime_data/deployer.py +0 -0
  111. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/runtime_data/hosts.py +0 -0
  112. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/runtime_data/network_agents.py +0 -0
  113. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/runtime_data/network_services.py +0 -0
  114. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/server_cache.py +0 -0
  115. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/server_utils.py +0 -0
  116. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/tcp_to_uds_mets_proxy.py +0 -0
  117. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/utils.py +0 -0
  118. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_utils/__init__.py +0 -0
  119. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_utils/config.py +0 -0
  120. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_utils/constants.py +0 -0
  121. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_utils/deprecate.py +0 -0
  122. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_utils/image.py +0 -0
  123. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_utils/introspect.py +0 -0
  124. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_utils/os.py +0 -0
  125. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_utils/str.py +0 -0
  126. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_validators/__init__.py +0 -0
  127. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_validators/bagit-profile.yml +0 -0
  128. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_validators/constants.py +0 -0
  129. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_validators/json_validator.py +0 -0
  130. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_validators/message_processing.schema.yml +0 -0
  131. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_validators/message_result.schema.yml +0 -0
  132. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_validators/mets.xsd +0 -0
  133. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_validators/ocrd_network_message_validator.py +0 -0
  134. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_validators/ocrd_tool.schema.yml +0 -0
  135. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_validators/ocrd_tool_validator.py +0 -0
  136. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_validators/ocrd_zip_validator.py +0 -0
  137. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_validators/page.xsd +0 -0
  138. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_validators/page_validator.py +0 -0
  139. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_validators/parameter_validator.py +0 -0
  140. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_validators/processing_server_config.schema.yml +0 -0
  141. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_validators/processing_server_config_validator.py +0 -0
  142. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_validators/resource_list_validator.py +0 -0
  143. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_validators/workspace_validator.py +0 -0
  144. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_validators/xlink.xsd +0 -0
  145. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_validators/xsd_mets_validator.py +0 -0
  146. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_validators/xsd_page_validator.py +0 -0
  147. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_validators/xsd_validator.py +0 -0
  148. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/tests/test_model_factory.py +0 -0
  149. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/tests/test_resolver.py +0 -0
  150. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/tests/test_resolver_oai.py +0 -0
  151. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/tests/test_resource_manager.py +0 -0
  152. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/tests/test_task_sequence.py +0 -0
  153. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/tests/test_utils.py +0 -0
  154. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/tests/test_version.py +0 -0
  155. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/tests/test_workspace.py +0 -0
  156. {ocrd-3.0.0b5 → ocrd-3.0.0b7}/tests/test_workspace_remove.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ocrd
3
- Version: 3.0.0b5
3
+ Version: 3.0.0b7
4
4
  Summary: OCR-D framework
5
5
  Author-email: Konstantin Baierer <unixprog@gmail.com>
6
6
  License: Apache License 2.0
@@ -26,6 +26,7 @@ Requires-Dist: httpx>=0.22.0
26
26
  Requires-Dist: importlib_metadata; python_version < "3.8"
27
27
  Requires-Dist: importlib_resources; python_version < "3.10"
28
28
  Requires-Dist: jsonschema>=4
29
+ Requires-Dist: loky
29
30
  Requires-Dist: lxml
30
31
  Requires-Dist: memory-profiler>=0.58.0
31
32
  Requires-Dist: numpy
ocrd-3.0.0b7/VERSION ADDED
@@ -0,0 +1 @@
1
+ 3.0.0b7
@@ -13,6 +13,7 @@ httpx>=0.22.0
13
13
  importlib_metadata ; python_version < '3.8'
14
14
  importlib_resources ; python_version < '3.10'
15
15
  jsonschema>=4
16
+ loky
16
17
  lxml
17
18
  memory-profiler >= 0.58.0
18
19
  # XXX explicitly do not restrict the numpy version because different
@@ -149,7 +149,8 @@ def workspace_clone(ctx, clobber_mets, download, file_grp, file_id, page_id, mim
149
149
  LOG.warning(DeprecationWarning("Use 'ocrd workspace --directory DIR clone' instead of argument 'WORKSPACE_DIR' ('%s')" % workspace_dir))
150
150
  ctx.directory = workspace_dir
151
151
 
152
- assert not ctx.mets_server_url
152
+ assert not ctx.mets_server_url, \
153
+ f"clone cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
153
154
  workspace = ctx.resolver.workspace_from_url(
154
155
  mets_url,
155
156
  dst_dir=ctx.directory,
@@ -185,7 +186,8 @@ def workspace_init(ctx, clobber_mets, directory):
185
186
  if directory:
186
187
  LOG.warning(DeprecationWarning("Use 'ocrd workspace --directory DIR init' instead of argument 'DIRECTORY' ('%s')" % directory))
187
188
  ctx.directory = directory
188
- assert not ctx.mets_server_url
189
+ assert not ctx.mets_server_url, \
190
+ f"init cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
189
191
  workspace = ctx.resolver.workspace_from_nothing(
190
192
  directory=ctx.directory,
191
193
  mets_basename=ctx.mets_basename,
@@ -506,6 +508,8 @@ def workspace_remove_file(ctx, id, force, keep_file): # pylint: disable=redefin
506
508
  (If any ``ID`` starts with ``//``, then its remainder
507
509
  will be interpreted as a regular expression.)
508
510
  """
511
+ assert not ctx.mets_server_url, \
512
+ f"remove cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
509
513
  workspace = ctx.workspace()
510
514
  for i in id:
511
515
  workspace.remove_file(i, force=force, keep_file=keep_file)
@@ -524,6 +528,8 @@ def rename_group(ctx, old, new):
524
528
  """
525
529
  Rename fileGrp (USE attribute ``NEW`` to ``OLD``).
526
530
  """
531
+ assert not ctx.mets_server_url, \
532
+ f"rename-group cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
527
533
  workspace = ctx.workspace()
528
534
  workspace.rename_file_group(old, new)
529
535
  workspace.save_mets()
@@ -545,6 +551,8 @@ def remove_group(ctx, group, recursive, force, keep_files):
545
551
  (If any ``GROUP`` starts with ``//``, then its remainder
546
552
  will be interpreted as a regular expression.)
547
553
  """
554
+ assert not ctx.mets_server_url, \
555
+ f"remove-group cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
548
556
  workspace = ctx.workspace()
549
557
  for g in group:
550
558
  workspace.remove_file_group(g, recursive=recursive, force=force, keep_files=keep_files)
@@ -567,6 +575,8 @@ def prune_files(ctx, file_grp, mimetype, page_id, file_id):
567
575
  (If any ``FILTER`` starts with ``//``, then its remainder
568
576
  will be interpreted as a regular expression.)
569
577
  """
578
+ assert not ctx.mets_server_url, \
579
+ f"prune-files cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
570
580
  workspace = ctx.workspace()
571
581
  with pushd_popd(workspace.directory):
572
582
  for f in workspace.find_files(
@@ -673,19 +683,15 @@ def list_pages(ctx, output_field, output_format, chunk_number, chunk_index, page
673
683
  will be interpreted as a regular expression.)
674
684
  """
675
685
  workspace = ctx.workspace()
676
- find_kwargs = {}
677
- if page_id_range and 'ID' in output_field:
678
- find_kwargs['pageId'] = page_id_range
679
- page_ids = sorted({x.pageId for x in workspace.mets.find_files(**find_kwargs) if x.pageId})
680
686
  ret = []
681
-
682
- if output_field == ['ID']:
683
- ret = [[x] for x in page_ids]
684
- else:
685
- for i, page_div in enumerate(workspace.mets.get_physical_pages(for_pageIds=','.join(page_ids), return_divs=True)):
687
+ if page_id_range or list(output_field) != ['ID']:
688
+ for i, page_div in enumerate(workspace.mets.get_physical_pages(for_pageIds=page_id_range, return_divs=True)):
686
689
  ret.append([])
687
690
  for k in output_field:
688
691
  ret[i].append(page_div.get(k, 'None'))
692
+ else:
693
+ for page_id in workspace.mets.physical_pages:
694
+ ret.append([page_id])
689
695
 
690
696
  if numeric_range:
691
697
  start, end = map(int, numeric_range.split('..'))
@@ -762,6 +768,8 @@ def update_page(ctx, attr_value_pairs, order, orderlabel, contentids, page_id):
762
768
  if contentids:
763
769
  update_kwargs['CONTENTIDS'] = contentids
764
770
  try:
771
+ assert not ctx.mets_server_url, \
772
+ f"update-page cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
765
773
  workspace = ctx.workspace()
766
774
  workspace.mets.update_physical_page_attributes(page_id, **update_kwargs)
767
775
  workspace.save_mets()
@@ -800,6 +808,8 @@ def merge(ctx, overwrite, force, copy_files, filegrp_mapping, fileid_mapping, pa
800
808
  mets_path = Path(mets_path)
801
809
  if filegrp_mapping:
802
810
  filegrp_mapping = loads(filegrp_mapping)
811
+ assert not ctx.mets_server_url, \
812
+ f"merge cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
803
813
  workspace = ctx.workspace()
804
814
  other_workspace = Workspace(ctx.resolver, directory=str(mets_path.parent), mets_basename=str(mets_path.name))
805
815
  workspace.merge(
@@ -13,7 +13,6 @@ from ocrd_utils import (
13
13
  redirect_stderr_and_stdout_to_file,
14
14
  )
15
15
  from ocrd_validators import WorkspaceValidator
16
- from ocrd_network import ProcessingWorker, ProcessorServer, AgentType
17
16
 
18
17
  from ..resolver import Resolver
19
18
  from ..processor.base import ResourceNotFoundError, run_processor
@@ -23,8 +22,6 @@ from .parameter_option import parameter_option, parameter_override_option
23
22
  from .ocrd_cli_options import ocrd_cli_options
24
23
  from .mets_find_options import mets_find_options
25
24
 
26
- SUBCOMMANDS = [AgentType.PROCESSING_WORKER, AgentType.PROCESSOR_SERVER]
27
-
28
25
 
29
26
  def ocrd_cli_wrap_processor(
30
27
  processorClass,
@@ -88,11 +85,9 @@ def ocrd_cli_wrap_processor(
88
85
  if list_resources:
89
86
  processor.list_resources()
90
87
  sys.exit()
91
- if subcommand:
88
+ if subcommand or address or queue or database:
92
89
  # Used for checking/starting network agents for the WebAPI architecture
93
90
  check_and_run_network_agent(processorClass, subcommand, address, database, queue)
94
- elif address or queue or database:
95
- raise ValueError(f"Subcommand options --address --queue and --database are only valid for subcommands: {SUBCOMMANDS}")
96
91
 
97
92
  # from here: single-run processing context
98
93
  initLogging()
@@ -162,6 +157,11 @@ def ocrd_cli_wrap_processor(
162
157
  def check_and_run_network_agent(ProcessorClass, subcommand: str, address: str, database: str, queue: str):
163
158
  """
164
159
  """
160
+ from ocrd_network import ProcessingWorker, ProcessorServer, AgentType
161
+ SUBCOMMANDS = [AgentType.PROCESSING_WORKER, AgentType.PROCESSOR_SERVER]
162
+
163
+ if not subcommand:
164
+ raise ValueError(f"Subcommand options --address --queue and --database are only valid for subcommands: {SUBCOMMANDS}")
165
165
  if subcommand not in SUBCOMMANDS:
166
166
  raise ValueError(f"SUBCOMMAND can only be one of {SUBCOMMANDS}")
167
167
 
@@ -16,14 +16,21 @@ import json
16
16
  import os
17
17
  from os import getcwd
18
18
  from pathlib import Path
19
- from typing import Any, List, Optional, Union, get_args
19
+ from typing import Any, Dict, List, Optional, Tuple, Union, get_args
20
20
  import sys
21
+ import logging
22
+ import logging.handlers
21
23
  import inspect
22
24
  import tarfile
23
25
  import io
24
- import weakref
26
+ from collections import defaultdict
25
27
  from frozendict import frozendict
26
- from concurrent.futures import ThreadPoolExecutor, TimeoutError
28
+ # concurrent.futures is buggy in py38,
29
+ # this is where the fixes came from:
30
+ from loky import Future, ProcessPoolExecutor
31
+ import multiprocessing as mp
32
+ from threading import Timer
33
+ from _thread import interrupt_main
27
34
 
28
35
  from click import wrap_text
29
36
  from deprecated import deprecated
@@ -105,6 +112,31 @@ class MissingInputFile(ValueError):
105
112
  f"and pageId {pageId} under mimetype {mimetype or 'PAGE+image(s)'}")
106
113
  super().__init__(self.message)
107
114
 
115
+ class DummyFuture:
116
+ """
117
+ Mimics some of `concurrent.futures.Future` but runs immediately.
118
+ """
119
+ def __init__(self, fn, *args, **kwargs):
120
+ self.fn = fn
121
+ self.args = args
122
+ self.kwargs = kwargs
123
+ def result(self):
124
+ return self.fn(*self.args, **self.kwargs)
125
+ class DummyExecutor:
126
+ """
127
+ Mimics some of `concurrent.futures.ProcessPoolExecutor` but runs
128
+ everything immediately in this process.
129
+ """
130
+ def __init__(self, initializer=None, initargs=(), **kwargs):
131
+ initializer(*initargs)
132
+ def shutdown(self, **kwargs):
133
+ pass
134
+ def submit(self, fn, *args, **kwargs) -> DummyFuture:
135
+ return DummyFuture(fn, *args, **kwargs)
136
+
137
+ TFuture = Union[DummyFuture, Future]
138
+ TExecutor = Union[DummyExecutor, ProcessPoolExecutor]
139
+
108
140
  class Processor():
109
141
  """
110
142
  A processor is a tool that implements the uniform OCR-D
@@ -127,12 +159,12 @@ class Processor():
127
159
 
128
160
  max_workers : int = -1
129
161
  """
130
- maximum number of processor threads for page-parallel processing (ignored if negative),
162
+ maximum number of processor forks for page-parallel processing (ignored if negative),
131
163
  to be applied on top of :py:data:`~ocrd_utils.config.OCRD_MAX_PARALLEL_PAGES` (i.e.
132
164
  whatever is smaller).
133
165
 
134
166
  (Override this if you know how many pages fit into processing units - GPU shaders / CPU cores
135
- - at once, or if your class is not thread-safe.)
167
+ - at once, or if your class already creates threads prior to forking, e.g. during ``setup``.)
136
168
  """
137
169
 
138
170
  max_page_seconds : int = -1
@@ -335,12 +367,14 @@ class Processor():
335
367
  self._base_logger = getLogger('ocrd.processor.base')
336
368
  if parameter is not None:
337
369
  self.parameter = parameter
338
- # ensure that shutdown gets called at destruction
339
- self._finalizer = weakref.finalize(self, self.shutdown)
340
370
  # workaround for deprecated#72 (@deprecated decorator does not work for subclasses):
341
371
  setattr(self, 'process',
342
372
  deprecated(version='3.0', reason='process() should be replaced with process_page_pcgts() or process_page_file() or process_workspace()')(getattr(self, 'process')))
343
373
 
374
+ def __del__(self):
375
+ self._base_logger.debug("shutting down")
376
+ self.shutdown()
377
+
344
378
  def show_help(self, subcommand=None):
345
379
  """
346
380
  Print a usage description including the standard CLI and all of this processor's ocrd-tool
@@ -456,6 +490,9 @@ class Processor():
456
490
  for the given :py:data:`page_id` (or all pages)
457
491
  under the given :py:data:`parameter`.
458
492
 
493
+ Delegates to :py:meth:`.process_workspace_submit_tasks`
494
+ and :py:meth:`.process_workspace_handle_tasks`.
495
+
459
496
  (This will iterate over pages and files, calling
460
497
  :py:meth:`.process_page_file` and handling exceptions.
461
498
  It should be overridden by subclasses to handle cases
@@ -465,11 +502,7 @@ class Processor():
465
502
  self.workspace = workspace
466
503
  self.verify()
467
504
  try:
468
- nr_succeeded = 0
469
- nr_skipped = 0
470
- nr_copied = 0
471
-
472
- # set up multithreading
505
+ # set up multitasking
473
506
  max_workers = max(0, config.OCRD_MAX_PARALLEL_PAGES)
474
507
  if self.max_workers > 0 and self.max_workers < config.OCRD_MAX_PARALLEL_PAGES:
475
508
  self._base_logger.info("limiting number of threads from %d to %d", max_workers, self.max_workers)
@@ -481,80 +514,34 @@ class Processor():
481
514
  if self.max_page_seconds > 0 and self.max_page_seconds < config.OCRD_PROCESSING_PAGE_TIMEOUT:
482
515
  self._base_logger.info("limiting page timeout from %d to %d sec", max_seconds, self.max_page_seconds)
483
516
  max_seconds = self.max_page_seconds
484
- executor = ThreadPoolExecutor(
517
+
518
+ if max_workers > 1:
519
+ executor_cls = ProcessPoolExecutor
520
+ log_queue = mp.Queue()
521
+ # forward messages from log queue (in subprocesses) to all root handlers
522
+ log_listener = logging.handlers.QueueListener(log_queue, *logging.root.handlers, respect_handler_level=True)
523
+ else:
524
+ executor_cls = DummyExecutor
525
+ log_queue = None
526
+ log_listener = None
527
+ executor = executor_cls(
485
528
  max_workers=max_workers or 1,
486
- thread_name_prefix=f"pagetask.{workspace.mets.unique_identifier}"
529
+ # only forking method avoids pickling
530
+ context=mp.get_context('fork'),
531
+ # share processor instance as global to avoid pickling
532
+ initializer=_page_worker_set_ctxt,
533
+ initargs=(self, log_queue),
487
534
  )
488
- self._base_logger.debug("started executor %s with %d workers", str(executor), max_workers or 1)
489
- tasks = {}
490
-
491
- for input_file_tuple in self.zip_input_files(on_error='abort', require_first=False):
492
- input_files : List[Optional[OcrdFileType]] = [None] * len(input_file_tuple)
493
- page_id = next(input_file.pageId
494
- for input_file in input_file_tuple
495
- if input_file)
496
- self._base_logger.info(f"preparing page {page_id}")
497
- for i, input_file in enumerate(input_file_tuple):
498
- if input_file is None:
499
- # file/page not found in this file grp
500
- continue
501
- input_files[i] = input_file
502
- if not self.download:
503
- continue
504
- try:
505
- input_files[i] = self.workspace.download_file(input_file)
506
- except (ValueError, FileNotFoundError, HTTPError) as e:
507
- self._base_logger.error(repr(e))
508
- self._base_logger.warning(f"failed downloading file {input_file} for page {page_id}")
509
- # process page
510
- tasks[executor.submit(self.process_page_file, *input_files)] = (page_id, input_files)
511
- self._base_logger.debug("submitted %d processing tasks", len(tasks))
512
-
513
- for task in tasks:
514
- # wait for results, handle errors
515
- page_id, input_files = tasks[task]
516
- # FIXME: differentiate error cases in various ways:
517
- # - ResourceNotFoundError → use ResourceManager to download (once), then retry
518
- # - transient (I/O or OOM) error → maybe sleep, retry
519
- # - persistent (data) error → skip / dummy / raise
520
- try:
521
- self._base_logger.debug("waiting for output of task %s (page %s) max_seconds=%d", task, page_id, max_seconds)
522
- task.result(timeout=max_seconds or None)
523
- nr_succeeded += 1
524
- # exclude NotImplementedError, so we can try process() below
525
- except NotImplementedError:
526
- raise
527
- # handle input failures separately
528
- except FileExistsError as err:
529
- if config.OCRD_EXISTING_OUTPUT == 'ABORT':
530
- raise err
531
- if config.OCRD_EXISTING_OUTPUT == 'SKIP':
532
- continue
533
- if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE':
534
- # too late here, must not happen
535
- raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE")
536
- # broad coverage of output failures (including TimeoutError)
537
- except (Exception, TimeoutError) as err:
538
- # FIXME: add re-usable/actionable logging
539
- if config.OCRD_MISSING_OUTPUT == 'ABORT':
540
- self._base_logger.error(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}")
541
- raise err
542
- self._base_logger.exception(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}")
543
- if config.OCRD_MISSING_OUTPUT == 'SKIP':
544
- nr_skipped += 1
545
- continue
546
- if config.OCRD_MISSING_OUTPUT == 'COPY':
547
- self._copy_page_file(input_files[0])
548
- nr_copied += 1
549
- else:
550
- desc = config.describe('OCRD_MISSING_OUTPUT', wrap_text=False, indent_text=False)
551
- raise ValueError(f"unknown configuration value {config.OCRD_MISSING_OUTPUT} - {desc}")
552
-
553
- if nr_skipped > 0 and nr_succeeded / nr_skipped < config.OCRD_MAX_MISSING_OUTPUTS:
554
- raise Exception(f"too many failures with skipped output ({nr_skipped})")
555
- if nr_copied > 0 and nr_succeeded / nr_copied < config.OCRD_MAX_MISSING_OUTPUTS:
556
- raise Exception(f"too many failures with fallback output ({nr_skipped})")
557
- executor.shutdown()
535
+ if max_workers > 1:
536
+ log_listener.start()
537
+ try:
538
+ self._base_logger.debug("started executor %s with %d workers", str(executor), max_workers or 1)
539
+ tasks = self.process_workspace_submit_tasks(executor, max_seconds)
540
+ stats = self.process_workspace_handle_tasks(tasks)
541
+ finally:
542
+ executor.shutdown(kill_workers=True, wait=False)
543
+ if max_workers > 1:
544
+ log_listener.stop()
558
545
 
559
546
  except NotImplementedError:
560
547
  # fall back to deprecated method
@@ -564,6 +551,190 @@ class Processor():
564
551
  # suppress the NotImplementedError context
565
552
  raise err from None
566
553
 
554
+ def process_workspace_submit_tasks(self, executor : TExecutor, max_seconds : int) -> Dict[TFuture, Tuple[str, List[Optional[OcrdFileType]]]]:
555
+ """
556
+ Look up all input files of the given ``workspace``
557
+ from the given :py:data:`input_file_grp`
558
+ for the given :py:data:`page_id` (or all pages),
559
+ and schedules calling :py:meth:`.process_page_file`
560
+ on them for each page via `executor` (enforcing
561
+ a per-page time limit of `max_seconds`).
562
+
563
+ When running with `OCRD_MAX_PARALLEL_PAGES>1` and
564
+ the workspace via METS Server, the executor will fork
565
+ this many worker parallel subprocesses each processing
566
+ one page at a time. (Interprocess communication is
567
+ done via task and result queues.)
568
+
569
+ Otherwise, tasks are run sequentially in the
570
+ current process.
571
+
572
+ Delegates to :py:meth:`.zip_input_files` to get
573
+ the input files for each page, and then calls
574
+ :py:meth:`.process_workspace_submit_page_task`.
575
+
576
+ Returns a dict mapping the per-page tasks
577
+ (i.e. futures submitted to the executor)
578
+ to their corresponding pageId and input files.
579
+ """
580
+ tasks = {}
581
+ for input_file_tuple in self.zip_input_files(on_error='abort', require_first=False):
582
+ task, page_id, input_files = self.process_workspace_submit_page_task(executor, max_seconds, input_file_tuple)
583
+ tasks[task] = (page_id, input_files)
584
+ self._base_logger.debug("submitted %d processing tasks", len(tasks))
585
+ return tasks
586
+
587
+ def process_workspace_submit_page_task(self, executor : TExecutor, max_seconds : int, input_file_tuple : List[Optional[OcrdFileType]]) -> Tuple[TFuture, str, List[Optional[OcrdFileType]]]:
588
+ """
589
+ Ensure all input files for a single page are
590
+ downloaded to the workspace, then schedule
591
+ :py:meth:`.process_process_file` to be run on
592
+ them via `executor` (enforcing a per-page time
593
+ limit of `max_seconds`).
594
+
595
+ Delegates to :py:meth:`.process_page_file`
596
+ (wrapped in :py:func:`_page_worker` to share
597
+ the processor instance across forked processes).
598
+
599
+ \b
600
+ Returns a tuple of:
601
+ - the scheduled future object,
602
+ - the corresponding pageId,
603
+ - the corresponding input files.
604
+ """
605
+ input_files : List[Optional[OcrdFileType]] = [None] * len(input_file_tuple)
606
+ page_id = next(input_file.pageId
607
+ for input_file in input_file_tuple
608
+ if input_file)
609
+ self._base_logger.info(f"preparing page {page_id}")
610
+ for i, input_file in enumerate(input_file_tuple):
611
+ if input_file is None:
612
+ # file/page not found in this file grp
613
+ continue
614
+ input_files[i] = input_file
615
+ if not self.download:
616
+ continue
617
+ try:
618
+ input_files[i] = self.workspace.download_file(input_file)
619
+ except (ValueError, FileNotFoundError, HTTPError) as e:
620
+ self._base_logger.error(repr(e))
621
+ self._base_logger.warning(f"failed downloading file {input_file} for page {page_id}")
622
+ # process page
623
+ #executor.submit(self.process_page_file, *input_files)
624
+ return executor.submit(_page_worker, max_seconds, *input_files), page_id, input_files
625
+
626
+ def process_workspace_handle_tasks(self, tasks : Dict[TFuture, Tuple[str, List[Optional[OcrdFileType]]]]) -> Tuple[int, int, Dict[str, int], int]:
627
+ """
628
+ Look up scheduled per-page futures one by one,
629
+ handle errors (exceptions) and gather results.
630
+
631
+ \b
632
+ Enforces policies configured by the following
633
+ environment variables:
634
+ - `OCRD_EXISTING_OUTPUT` (abort/skip/overwrite)
635
+ - `OCRD_MISSING_OUTPUT` (abort/skip/fallback-copy)
636
+ - `OCRD_MAX_MISSING_OUTPUTS` (abort after all).
637
+
638
+ \b
639
+ Returns a tuple of:
640
+ - the number of successfully processed pages
641
+ - the number of failed (i.e. skipped or copied) pages
642
+ - a dict of the type and corresponding number of exceptions seen
643
+ - the number of total requested pages (i.e. success+fail+existing).
644
+
645
+ Delegates to :py:meth:`.process_workspace_handle_page_task`
646
+ for each page.
647
+ """
648
+ # aggregate info for logging:
649
+ nr_succeeded = 0
650
+ nr_failed = 0
651
+ nr_errors = defaultdict(int) # count causes
652
+ if config.OCRD_MISSING_OUTPUT == 'SKIP':
653
+ reason = "skipped"
654
+ elif config.OCRD_MISSING_OUTPUT == 'COPY':
655
+ reason = "fallback-copied"
656
+ for task in tasks:
657
+ # wait for results, handle errors
658
+ page_id, input_files = tasks[task]
659
+ result = self.process_workspace_handle_page_task(page_id, input_files, task)
660
+ if isinstance(result, Exception):
661
+ nr_errors[result.__class__.__name__] += 1
662
+ nr_failed += 1
663
+ # FIXME: this is just prospective, because len(tasks)==nr_failed+nr_succeeded is not guaranteed
664
+ if config.OCRD_MAX_MISSING_OUTPUTS > 0 and nr_failed / len(tasks) > config.OCRD_MAX_MISSING_OUTPUTS:
665
+ # already irredeemably many failures, stop short
666
+ nr_errors = dict(nr_errors)
667
+ raise Exception(f"too many failures with {reason} output ({nr_failed} of {nr_failed+nr_succeeded}, {str(nr_errors)})")
668
+ elif result:
669
+ nr_succeeded += 1
670
+ # else skipped - already exists
671
+ nr_errors = dict(nr_errors)
672
+ if nr_failed > 0:
673
+ nr_all = nr_succeeded + nr_failed
674
+ if config.OCRD_MAX_MISSING_OUTPUTS > 0 and nr_failed / nr_all > config.OCRD_MAX_MISSING_OUTPUTS:
675
+ raise Exception(f"too many failures with {reason} output ({nr_failed} of {nr_all}, {str(nr_errors)})")
676
+ self._base_logger.warning("%s %d of %d pages due to %s", reason, nr_failed, nr_all, str(nr_errors))
677
+ return nr_succeeded, nr_failed, nr_errors, len(tasks)
678
+
679
+ def process_workspace_handle_page_task(self, page_id : str, input_files : List[Optional[OcrdFileType]], task : TFuture) -> Union[bool, Exception]:
680
+ """
681
+ \b
682
+ Await a single page result and handle errors (exceptions),
683
+ enforcing policies configured by the following
684
+ environment variables:
685
+ - `OCRD_EXISTING_OUTPUT` (abort/skip/overwrite)
686
+ - `OCRD_MISSING_OUTPUT` (abort/skip/fallback-copy)
687
+ - `OCRD_MAX_MISSING_OUTPUTS` (abort after all).
688
+
689
+ \b
690
+ Returns
691
+ - true in case of success
692
+ - false in case the output already exists
693
+ - the exception in case of failure
694
+ """
695
+ # FIXME: differentiate error cases in various ways:
696
+ # - ResourceNotFoundError → use ResourceManager to download (once), then retry
697
+ # - transient (I/O or OOM) error → maybe sleep, retry
698
+ # - persistent (data) error → skip / dummy / raise
699
+ try:
700
+ self._base_logger.debug("waiting for output of task %s (page %s)", task, page_id)
701
+ # timeout kwarg on future is useless: it only raises TimeoutError here,
702
+ # but does not stop the running process/thread, and executor itself
703
+ # offers nothing to that effect:
704
+ # task.result(timeout=max_seconds or None)
705
+ # so we instead applied the timeout within the worker function
706
+ task.result()
707
+ return True
708
+ except NotImplementedError:
709
+ # exclude NotImplementedError, so we can try process() below
710
+ raise
711
+ # handle input failures separately
712
+ except FileExistsError as err:
713
+ if config.OCRD_EXISTING_OUTPUT == 'ABORT':
714
+ raise err
715
+ if config.OCRD_EXISTING_OUTPUT == 'SKIP':
716
+ return False
717
+ if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE':
718
+ # too late here, must not happen
719
+ raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE")
720
+ except KeyboardInterrupt:
721
+ raise
722
+ # broad coverage of output failures (including TimeoutError)
723
+ except Exception as err:
724
+ # FIXME: add re-usable/actionable logging
725
+ if config.OCRD_MISSING_OUTPUT == 'ABORT':
726
+ self._base_logger.error(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}")
727
+ raise err
728
+ self._base_logger.exception(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}")
729
+ if config.OCRD_MISSING_OUTPUT == 'SKIP':
730
+ pass
731
+ elif config.OCRD_MISSING_OUTPUT == 'COPY':
732
+ self._copy_page_file(input_files[0])
733
+ else:
734
+ desc = config.describe('OCRD_MISSING_OUTPUT', wrap_text=False, indent_text=False)
735
+ raise ValueError(f"unknown configuration value {config.OCRD_MISSING_OUTPUT} - {desc}")
736
+ return err
737
+
567
738
  def _copy_page_file(self, input_file : OcrdFileType) -> None:
568
739
  """
569
740
  Copy the given ``input_file`` of the :py:data:`workspace`,
@@ -618,6 +789,12 @@ class Processor():
618
789
  # not PAGE and not an image to generate PAGE for
619
790
  self._base_logger.error(f"non-PAGE input for page {page_id}: {err}")
620
791
  output_file_id = make_file_id(input_files[0], self.output_file_grp)
792
+ output_file = next(self.workspace.mets.find_files(ID=output_file_id), None)
793
+ if output_file and config.OCRD_EXISTING_OUTPUT != 'OVERWRITE':
794
+ # short-cut avoiding useless computation:
795
+ raise FileExistsError(
796
+ f"A file with ID=={output_file_id} already exists {output_file} and neither force nor ignore are set"
797
+ )
621
798
  result = self.process_page_pcgts(*input_pcgts, page_id=page_id)
622
799
  for image_result in result.images:
623
800
  image_file_id = f'{output_file_id}_{image_result.file_id_suffix}'
@@ -934,6 +1111,47 @@ class Processor():
934
1111
  ifts.append(tuple(ifiles))
935
1112
  return ifts
936
1113
 
1114
+ _page_worker_processor = None
1115
+ """
1116
+ This global binding for the processor is required to avoid
1117
+ squeezing the processor through a mp.Queue (which is impossible
1118
+ due to unpicklable attributes like .workspace.mets._tree anyway)
1119
+ when calling Processor.process_page_file as page worker processes
1120
+ in Processor.process_workspace. Forking allows inheriting global
1121
+ objects, and with the METS Server we do not mutate the local
1122
+ processor instance anyway.
1123
+ """
1124
+ def _page_worker_set_ctxt(processor, log_queue):
1125
+ """
1126
+ Overwrites `ocrd.processor.base._page_worker_processor` instance
1127
+ for sharing with subprocesses in ProcessPoolExecutor initializer.
1128
+ """
1129
+ global _page_worker_processor
1130
+ _page_worker_processor = processor
1131
+ if log_queue:
1132
+ # replace all log handlers with just one queue handler
1133
+ logging.root.handlers = [logging.handlers.QueueHandler(log_queue)]
1134
+
1135
+ def _page_worker(timeout, *input_files):
1136
+ """
1137
+ Wraps a `Processor.process_page_file` call as payload (call target)
1138
+ of the ProcessPoolExecutor workers, but also enforces the given timeout.
1139
+ """
1140
+ page_id = next((file.pageId for file in input_files
1141
+ if hasattr(file, 'pageId')), "")
1142
+ if timeout > 0:
1143
+ timer = Timer(timeout, interrupt_main)
1144
+ timer.start()
1145
+ try:
1146
+ _page_worker_processor.process_page_file(*input_files)
1147
+ _page_worker_processor.logger.debug("page worker completed for page %s", page_id)
1148
+ except KeyboardInterrupt:
1149
+ _page_worker_processor.logger.debug("page worker timed out for page %s", page_id)
1150
+ raise TimeoutError()
1151
+ finally:
1152
+ if timeout > 0:
1153
+ timer.cancel()
1154
+
937
1155
  def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None):
938
1156
  """Generate a string describing the full CLI of this processor including params.
939
1157