ocrd 3.0.0b5__tar.gz → 3.0.0b6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155)
  1. {ocrd-3.0.0b5/src/ocrd.egg-info → ocrd-3.0.0b6}/PKG-INFO +2 -1
  2. ocrd-3.0.0b6/VERSION +1 -0
  3. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/requirements.txt +1 -0
  4. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/cli/workspace.py +21 -11
  5. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/decorators/__init__.py +6 -6
  6. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/processor/base.py +282 -79
  7. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/processor/helpers.py +10 -2
  8. {ocrd-3.0.0b5 → ocrd-3.0.0b6/src/ocrd.egg-info}/PKG-INFO +2 -1
  9. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd.egg-info/requires.txt +1 -0
  10. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_models/ocrd_mets.py +9 -0
  11. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_utils/logging.py +6 -2
  12. ocrd-3.0.0b5/VERSION +0 -1
  13. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/LICENSE +0 -0
  14. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/MANIFEST.in +0 -0
  15. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/README.md +0 -0
  16. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/README_bashlib.md +0 -0
  17. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/README_ocrd.md +0 -0
  18. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/README_ocrd_modelfactory.md +0 -0
  19. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/README_ocrd_models.md +0 -0
  20. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/README_ocrd_network.md +0 -0
  21. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/README_ocrd_utils.md +0 -0
  22. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/README_ocrd_validators.md +0 -0
  23. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/pyproject.toml +0 -0
  24. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/setup.cfg +0 -0
  25. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/__init__.py +0 -0
  26. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/cli/__init__.py +0 -0
  27. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/cli/bashlib.py +0 -0
  28. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/cli/log.py +0 -0
  29. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/cli/network.py +0 -0
  30. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/cli/ocrd_tool.py +0 -0
  31. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/cli/process.py +0 -0
  32. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/cli/resmgr.py +0 -0
  33. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/cli/validate.py +0 -0
  34. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/cli/zip.py +0 -0
  35. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/constants.py +0 -0
  36. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/decorators/loglevel_option.py +0 -0
  37. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/decorators/mets_find_options.py +0 -0
  38. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/decorators/ocrd_cli_options.py +0 -0
  39. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/decorators/parameter_option.py +0 -0
  40. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/lib.bash +0 -0
  41. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/mets_server.py +0 -0
  42. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/ocrd-all-tool.json +0 -0
  43. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/processor/__init__.py +0 -0
  44. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/processor/builtin/__init__.py +0 -0
  45. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/processor/builtin/dummy/__init__.py +0 -0
  46. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/processor/builtin/dummy/ocrd-tool.json +0 -0
  47. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/processor/builtin/dummy_processor.py +0 -0
  48. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/processor/ocrd_page_result.py +0 -0
  49. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/resolver.py +0 -0
  50. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/resource_list.yml +0 -0
  51. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/resource_manager.py +0 -0
  52. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/task_sequence.py +0 -0
  53. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/workspace.py +0 -0
  54. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/workspace_backup.py +0 -0
  55. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/workspace_bagger.py +0 -0
  56. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd.egg-info/SOURCES.txt +0 -0
  57. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd.egg-info/dependency_links.txt +0 -0
  58. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd.egg-info/entry_points.txt +0 -0
  59. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd.egg-info/top_level.txt +0 -0
  60. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_modelfactory/__init__.py +0 -0
  61. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_models/__init__.py +0 -0
  62. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_models/constants.py +0 -0
  63. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_models/mets-empty.xml +0 -0
  64. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_models/ocrd_agent.py +0 -0
  65. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_models/ocrd_exif.py +0 -0
  66. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_models/ocrd_file.py +0 -0
  67. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_models/ocrd_page.py +0 -0
  68. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_models/ocrd_page_generateds.py +0 -0
  69. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_models/ocrd_xml_base.py +0 -0
  70. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_models/report.py +0 -0
  71. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_models/utils.py +0 -0
  72. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/__init__.py +0 -0
  73. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/cli/__init__.py +0 -0
  74. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/cli/client.py +0 -0
  75. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/cli/processing_server.py +0 -0
  76. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/cli/processing_worker.py +0 -0
  77. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/cli/processor_server.py +0 -0
  78. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/client.py +0 -0
  79. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/client_utils.py +0 -0
  80. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/constants.py +0 -0
  81. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/database.py +0 -0
  82. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/logging_utils.py +0 -0
  83. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/models/__init__.py +0 -0
  84. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/models/job.py +0 -0
  85. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/models/messages.py +0 -0
  86. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/models/ocrd_tool.py +0 -0
  87. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/models/workflow.py +0 -0
  88. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/models/workspace.py +0 -0
  89. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/param_validators.py +0 -0
  90. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/process_helpers.py +0 -0
  91. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/processing_server.py +0 -0
  92. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/processing_worker.py +0 -0
  93. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/processor_server.py +0 -0
  94. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/rabbitmq_utils/__init__.py +0 -0
  95. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/rabbitmq_utils/connector.py +0 -0
  96. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/rabbitmq_utils/constants.py +0 -0
  97. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/rabbitmq_utils/consumer.py +0 -0
  98. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/rabbitmq_utils/helpers.py +0 -0
  99. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/rabbitmq_utils/ocrd_messages.py +0 -0
  100. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/rabbitmq_utils/publisher.py +0 -0
  101. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/runtime_data/__init__.py +0 -0
  102. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/runtime_data/config_parser.py +0 -0
  103. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/runtime_data/connection_clients.py +0 -0
  104. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/runtime_data/deployer.py +0 -0
  105. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/runtime_data/hosts.py +0 -0
  106. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/runtime_data/network_agents.py +0 -0
  107. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/runtime_data/network_services.py +0 -0
  108. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/server_cache.py +0 -0
  109. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/server_utils.py +0 -0
  110. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/tcp_to_uds_mets_proxy.py +0 -0
  111. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/utils.py +0 -0
  112. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_utils/__init__.py +0 -0
  113. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_utils/config.py +0 -0
  114. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_utils/constants.py +0 -0
  115. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_utils/deprecate.py +0 -0
  116. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_utils/image.py +0 -0
  117. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_utils/introspect.py +0 -0
  118. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_utils/ocrd_logging.conf +0 -0
  119. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_utils/os.py +0 -0
  120. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_utils/str.py +0 -0
  121. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_validators/__init__.py +0 -0
  122. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_validators/bagit-profile.yml +0 -0
  123. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_validators/constants.py +0 -0
  124. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_validators/json_validator.py +0 -0
  125. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_validators/message_processing.schema.yml +0 -0
  126. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_validators/message_result.schema.yml +0 -0
  127. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_validators/mets.xsd +0 -0
  128. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_validators/ocrd_network_message_validator.py +0 -0
  129. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_validators/ocrd_tool.schema.yml +0 -0
  130. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_validators/ocrd_tool_validator.py +0 -0
  131. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_validators/ocrd_zip_validator.py +0 -0
  132. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_validators/page.xsd +0 -0
  133. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_validators/page_validator.py +0 -0
  134. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_validators/parameter_validator.py +0 -0
  135. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_validators/processing_server_config.schema.yml +0 -0
  136. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_validators/processing_server_config_validator.py +0 -0
  137. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_validators/resource_list_validator.py +0 -0
  138. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_validators/workspace_validator.py +0 -0
  139. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_validators/xlink.xsd +0 -0
  140. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_validators/xsd_mets_validator.py +0 -0
  141. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_validators/xsd_page_validator.py +0 -0
  142. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_validators/xsd_validator.py +0 -0
  143. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/tests/test_decorators.py +0 -0
  144. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/tests/test_logging.py +0 -0
  145. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/tests/test_logging_conf.py +0 -0
  146. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/tests/test_mets_server.py +0 -0
  147. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/tests/test_model_factory.py +0 -0
  148. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/tests/test_resolver.py +0 -0
  149. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/tests/test_resolver_oai.py +0 -0
  150. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/tests/test_resource_manager.py +0 -0
  151. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/tests/test_task_sequence.py +0 -0
  152. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/tests/test_utils.py +0 -0
  153. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/tests/test_version.py +0 -0
  154. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/tests/test_workspace.py +0 -0
  155. {ocrd-3.0.0b5 → ocrd-3.0.0b6}/tests/test_workspace_remove.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ocrd
3
- Version: 3.0.0b5
3
+ Version: 3.0.0b6
4
4
  Summary: OCR-D framework
5
5
  Author-email: Konstantin Baierer <unixprog@gmail.com>
6
6
  License: Apache License 2.0
@@ -26,6 +26,7 @@ Requires-Dist: httpx>=0.22.0
26
26
  Requires-Dist: importlib_metadata; python_version < "3.8"
27
27
  Requires-Dist: importlib_resources; python_version < "3.10"
28
28
  Requires-Dist: jsonschema>=4
29
+ Requires-Dist: loky
29
30
  Requires-Dist: lxml
30
31
  Requires-Dist: memory-profiler>=0.58.0
31
32
  Requires-Dist: numpy
ocrd-3.0.0b6/VERSION ADDED
@@ -0,0 +1 @@
1
+ 3.0.0b6
@@ -13,6 +13,7 @@ httpx>=0.22.0
13
13
  importlib_metadata ; python_version < '3.8'
14
14
  importlib_resources ; python_version < '3.10'
15
15
  jsonschema>=4
16
+ loky
16
17
  lxml
17
18
  memory-profiler >= 0.58.0
18
19
  # XXX explicitly do not restrict the numpy version because different
@@ -149,7 +149,8 @@ def workspace_clone(ctx, clobber_mets, download, file_grp, file_id, page_id, mim
149
149
  LOG.warning(DeprecationWarning("Use 'ocrd workspace --directory DIR clone' instead of argument 'WORKSPACE_DIR' ('%s')" % workspace_dir))
150
150
  ctx.directory = workspace_dir
151
151
 
152
- assert not ctx.mets_server_url
152
+ assert not ctx.mets_server_url, \
153
+ f"clone cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
153
154
  workspace = ctx.resolver.workspace_from_url(
154
155
  mets_url,
155
156
  dst_dir=ctx.directory,
@@ -185,7 +186,8 @@ def workspace_init(ctx, clobber_mets, directory):
185
186
  if directory:
186
187
  LOG.warning(DeprecationWarning("Use 'ocrd workspace --directory DIR init' instead of argument 'DIRECTORY' ('%s')" % directory))
187
188
  ctx.directory = directory
188
- assert not ctx.mets_server_url
189
+ assert not ctx.mets_server_url, \
190
+ f"init cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
189
191
  workspace = ctx.resolver.workspace_from_nothing(
190
192
  directory=ctx.directory,
191
193
  mets_basename=ctx.mets_basename,
@@ -506,6 +508,8 @@ def workspace_remove_file(ctx, id, force, keep_file): # pylint: disable=redefin
506
508
  (If any ``ID`` starts with ``//``, then its remainder
507
509
  will be interpreted as a regular expression.)
508
510
  """
511
+ assert not ctx.mets_server_url, \
512
+ f"remove cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
509
513
  workspace = ctx.workspace()
510
514
  for i in id:
511
515
  workspace.remove_file(i, force=force, keep_file=keep_file)
@@ -524,6 +528,8 @@ def rename_group(ctx, old, new):
524
528
  """
525
529
  Rename fileGrp (USE attribute ``NEW`` to ``OLD``).
526
530
  """
531
+ assert not ctx.mets_server_url, \
532
+ f"rename-group cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
527
533
  workspace = ctx.workspace()
528
534
  workspace.rename_file_group(old, new)
529
535
  workspace.save_mets()
@@ -545,6 +551,8 @@ def remove_group(ctx, group, recursive, force, keep_files):
545
551
  (If any ``GROUP`` starts with ``//``, then its remainder
546
552
  will be interpreted as a regular expression.)
547
553
  """
554
+ assert not ctx.mets_server_url, \
555
+ f"remove-group cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
548
556
  workspace = ctx.workspace()
549
557
  for g in group:
550
558
  workspace.remove_file_group(g, recursive=recursive, force=force, keep_files=keep_files)
@@ -567,6 +575,8 @@ def prune_files(ctx, file_grp, mimetype, page_id, file_id):
567
575
  (If any ``FILTER`` starts with ``//``, then its remainder
568
576
  will be interpreted as a regular expression.)
569
577
  """
578
+ assert not ctx.mets_server_url, \
579
+ f"prune-files cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
570
580
  workspace = ctx.workspace()
571
581
  with pushd_popd(workspace.directory):
572
582
  for f in workspace.find_files(
@@ -673,19 +683,15 @@ def list_pages(ctx, output_field, output_format, chunk_number, chunk_index, page
673
683
  will be interpreted as a regular expression.)
674
684
  """
675
685
  workspace = ctx.workspace()
676
- find_kwargs = {}
677
- if page_id_range and 'ID' in output_field:
678
- find_kwargs['pageId'] = page_id_range
679
- page_ids = sorted({x.pageId for x in workspace.mets.find_files(**find_kwargs) if x.pageId})
680
686
  ret = []
681
-
682
- if output_field == ['ID']:
683
- ret = [[x] for x in page_ids]
684
- else:
685
- for i, page_div in enumerate(workspace.mets.get_physical_pages(for_pageIds=','.join(page_ids), return_divs=True)):
687
+ if page_id_range or list(output_field) != ['ID']:
688
+ for i, page_div in enumerate(workspace.mets.get_physical_pages(for_pageIds=page_id_range, return_divs=True)):
686
689
  ret.append([])
687
690
  for k in output_field:
688
691
  ret[i].append(page_div.get(k, 'None'))
692
+ else:
693
+ for page_id in workspace.mets.physical_pages:
694
+ ret.append([page_id])
689
695
 
690
696
  if numeric_range:
691
697
  start, end = map(int, numeric_range.split('..'))
@@ -762,6 +768,8 @@ def update_page(ctx, attr_value_pairs, order, orderlabel, contentids, page_id):
762
768
  if contentids:
763
769
  update_kwargs['CONTENTIDS'] = contentids
764
770
  try:
771
+ assert not ctx.mets_server_url, \
772
+ f"update-page cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
765
773
  workspace = ctx.workspace()
766
774
  workspace.mets.update_physical_page_attributes(page_id, **update_kwargs)
767
775
  workspace.save_mets()
@@ -800,6 +808,8 @@ def merge(ctx, overwrite, force, copy_files, filegrp_mapping, fileid_mapping, pa
800
808
  mets_path = Path(mets_path)
801
809
  if filegrp_mapping:
802
810
  filegrp_mapping = loads(filegrp_mapping)
811
+ assert not ctx.mets_server_url, \
812
+ f"merge cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
803
813
  workspace = ctx.workspace()
804
814
  other_workspace = Workspace(ctx.resolver, directory=str(mets_path.parent), mets_basename=str(mets_path.name))
805
815
  workspace.merge(
@@ -13,7 +13,6 @@ from ocrd_utils import (
13
13
  redirect_stderr_and_stdout_to_file,
14
14
  )
15
15
  from ocrd_validators import WorkspaceValidator
16
- from ocrd_network import ProcessingWorker, ProcessorServer, AgentType
17
16
 
18
17
  from ..resolver import Resolver
19
18
  from ..processor.base import ResourceNotFoundError, run_processor
@@ -23,8 +22,6 @@ from .parameter_option import parameter_option, parameter_override_option
23
22
  from .ocrd_cli_options import ocrd_cli_options
24
23
  from .mets_find_options import mets_find_options
25
24
 
26
- SUBCOMMANDS = [AgentType.PROCESSING_WORKER, AgentType.PROCESSOR_SERVER]
27
-
28
25
 
29
26
  def ocrd_cli_wrap_processor(
30
27
  processorClass,
@@ -88,11 +85,9 @@ def ocrd_cli_wrap_processor(
88
85
  if list_resources:
89
86
  processor.list_resources()
90
87
  sys.exit()
91
- if subcommand:
88
+ if subcommand or address or queue or database:
92
89
  # Used for checking/starting network agents for the WebAPI architecture
93
90
  check_and_run_network_agent(processorClass, subcommand, address, database, queue)
94
- elif address or queue or database:
95
- raise ValueError(f"Subcommand options --address --queue and --database are only valid for subcommands: {SUBCOMMANDS}")
96
91
 
97
92
  # from here: single-run processing context
98
93
  initLogging()
@@ -162,6 +157,11 @@ def ocrd_cli_wrap_processor(
162
157
  def check_and_run_network_agent(ProcessorClass, subcommand: str, address: str, database: str, queue: str):
163
158
  """
164
159
  """
160
+ from ocrd_network import ProcessingWorker, ProcessorServer, AgentType
161
+ SUBCOMMANDS = [AgentType.PROCESSING_WORKER, AgentType.PROCESSOR_SERVER]
162
+
163
+ if not subcommand:
164
+ raise ValueError(f"Subcommand options --address --queue and --database are only valid for subcommands: {SUBCOMMANDS}")
165
165
  if subcommand not in SUBCOMMANDS:
166
166
  raise ValueError(f"SUBCOMMAND can only be one of {SUBCOMMANDS}")
167
167
 
@@ -16,14 +16,20 @@ import json
16
16
  import os
17
17
  from os import getcwd
18
18
  from pathlib import Path
19
- from typing import Any, List, Optional, Union, get_args
19
+ from typing import Any, Dict, List, Optional, Tuple, Union, get_args
20
20
  import sys
21
21
  import inspect
22
22
  import tarfile
23
23
  import io
24
24
  import weakref
25
+ from collections import defaultdict
25
26
  from frozendict import frozendict
26
- from concurrent.futures import ThreadPoolExecutor, TimeoutError
27
+ # concurrent.futures is buggy in py38,
28
+ # this is where the fixes came from:
29
+ from loky import Future, ProcessPoolExecutor
30
+ import multiprocessing as mp
31
+ from threading import Timer
32
+ from _thread import interrupt_main
27
33
 
28
34
  from click import wrap_text
29
35
  from deprecated import deprecated
@@ -105,6 +111,31 @@ class MissingInputFile(ValueError):
105
111
  f"and pageId {pageId} under mimetype {mimetype or 'PAGE+image(s)'}")
106
112
  super().__init__(self.message)
107
113
 
114
+ class DummyFuture:
115
+ """
116
+ Mimics some of `concurrent.futures.Future` but runs immediately.
117
+ """
118
+ def __init__(self, fn, *args, **kwargs):
119
+ self.fn = fn
120
+ self.args = args
121
+ self.kwargs = kwargs
122
+ def result(self):
123
+ return self.fn(*self.args, **self.kwargs)
124
+ class DummyExecutor:
125
+ """
126
+ Mimics some of `concurrent.futures.ProcessPoolExecutor` but runs
127
+ everything immediately in this process.
128
+ """
129
+ def __init__(self, initializer=None, initargs=(), **kwargs):
130
+ initializer(*initargs)
131
+ def shutdown(self, **kwargs):
132
+ pass
133
+ def submit(self, fn, *args, **kwargs) -> DummyFuture:
134
+ return DummyFuture(fn, *args, **kwargs)
135
+
136
+ TFuture = Union[DummyFuture, Future]
137
+ TExecutor = Union[DummyExecutor, ProcessPoolExecutor]
138
+
108
139
  class Processor():
109
140
  """
110
141
  A processor is a tool that implements the uniform OCR-D
@@ -456,6 +487,9 @@ class Processor():
456
487
  for the given :py:data:`page_id` (or all pages)
457
488
  under the given :py:data:`parameter`.
458
489
 
490
+ Delegates to :py:meth:`.process_workspace_submit_tasks`
491
+ and :py:meth:`.process_workspace_handle_tasks`.
492
+
459
493
  (This will iterate over pages and files, calling
460
494
  :py:meth:`.process_page_file` and handling exceptions.
461
495
  It should be overridden by subclasses to handle cases
@@ -465,11 +499,7 @@ class Processor():
465
499
  self.workspace = workspace
466
500
  self.verify()
467
501
  try:
468
- nr_succeeded = 0
469
- nr_skipped = 0
470
- nr_copied = 0
471
-
472
- # set up multithreading
502
+ # set up multitasking
473
503
  max_workers = max(0, config.OCRD_MAX_PARALLEL_PAGES)
474
504
  if self.max_workers > 0 and self.max_workers < config.OCRD_MAX_PARALLEL_PAGES:
475
505
  self._base_logger.info("limiting number of threads from %d to %d", max_workers, self.max_workers)
@@ -481,80 +511,25 @@ class Processor():
481
511
  if self.max_page_seconds > 0 and self.max_page_seconds < config.OCRD_PROCESSING_PAGE_TIMEOUT:
482
512
  self._base_logger.info("limiting page timeout from %d to %d sec", max_seconds, self.max_page_seconds)
483
513
  max_seconds = self.max_page_seconds
484
- executor = ThreadPoolExecutor(
514
+
515
+ if max_workers > 1:
516
+ executor_cls = ProcessPoolExecutor
517
+ else:
518
+ executor_cls = DummyExecutor
519
+ executor = executor_cls(
485
520
  max_workers=max_workers or 1,
486
- thread_name_prefix=f"pagetask.{workspace.mets.unique_identifier}"
521
+ # only forking method avoids pickling
522
+ context=mp.get_context('fork'),
523
+ # share processor instance as global to avoid pickling
524
+ initializer=_page_worker_set_ctxt,
525
+ initargs=(self,),
487
526
  )
488
- self._base_logger.debug("started executor %s with %d workers", str(executor), max_workers or 1)
489
- tasks = {}
490
-
491
- for input_file_tuple in self.zip_input_files(on_error='abort', require_first=False):
492
- input_files : List[Optional[OcrdFileType]] = [None] * len(input_file_tuple)
493
- page_id = next(input_file.pageId
494
- for input_file in input_file_tuple
495
- if input_file)
496
- self._base_logger.info(f"preparing page {page_id}")
497
- for i, input_file in enumerate(input_file_tuple):
498
- if input_file is None:
499
- # file/page not found in this file grp
500
- continue
501
- input_files[i] = input_file
502
- if not self.download:
503
- continue
504
- try:
505
- input_files[i] = self.workspace.download_file(input_file)
506
- except (ValueError, FileNotFoundError, HTTPError) as e:
507
- self._base_logger.error(repr(e))
508
- self._base_logger.warning(f"failed downloading file {input_file} for page {page_id}")
509
- # process page
510
- tasks[executor.submit(self.process_page_file, *input_files)] = (page_id, input_files)
511
- self._base_logger.debug("submitted %d processing tasks", len(tasks))
512
-
513
- for task in tasks:
514
- # wait for results, handle errors
515
- page_id, input_files = tasks[task]
516
- # FIXME: differentiate error cases in various ways:
517
- # - ResourceNotFoundError → use ResourceManager to download (once), then retry
518
- # - transient (I/O or OOM) error → maybe sleep, retry
519
- # - persistent (data) error → skip / dummy / raise
520
- try:
521
- self._base_logger.debug("waiting for output of task %s (page %s) max_seconds=%d", task, page_id, max_seconds)
522
- task.result(timeout=max_seconds or None)
523
- nr_succeeded += 1
524
- # exclude NotImplementedError, so we can try process() below
525
- except NotImplementedError:
526
- raise
527
- # handle input failures separately
528
- except FileExistsError as err:
529
- if config.OCRD_EXISTING_OUTPUT == 'ABORT':
530
- raise err
531
- if config.OCRD_EXISTING_OUTPUT == 'SKIP':
532
- continue
533
- if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE':
534
- # too late here, must not happen
535
- raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE")
536
- # broad coverage of output failures (including TimeoutError)
537
- except (Exception, TimeoutError) as err:
538
- # FIXME: add re-usable/actionable logging
539
- if config.OCRD_MISSING_OUTPUT == 'ABORT':
540
- self._base_logger.error(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}")
541
- raise err
542
- self._base_logger.exception(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}")
543
- if config.OCRD_MISSING_OUTPUT == 'SKIP':
544
- nr_skipped += 1
545
- continue
546
- if config.OCRD_MISSING_OUTPUT == 'COPY':
547
- self._copy_page_file(input_files[0])
548
- nr_copied += 1
549
- else:
550
- desc = config.describe('OCRD_MISSING_OUTPUT', wrap_text=False, indent_text=False)
551
- raise ValueError(f"unknown configuration value {config.OCRD_MISSING_OUTPUT} - {desc}")
552
-
553
- if nr_skipped > 0 and nr_succeeded / nr_skipped < config.OCRD_MAX_MISSING_OUTPUTS:
554
- raise Exception(f"too many failures with skipped output ({nr_skipped})")
555
- if nr_copied > 0 and nr_succeeded / nr_copied < config.OCRD_MAX_MISSING_OUTPUTS:
556
- raise Exception(f"too many failures with fallback output ({nr_skipped})")
557
- executor.shutdown()
527
+ try:
528
+ self._base_logger.debug("started executor %s with %d workers", str(executor), max_workers or 1)
529
+ tasks = self.process_workspace_submit_tasks(executor, max_seconds)
530
+ stats = self.process_workspace_handle_tasks(tasks)
531
+ finally:
532
+ executor.shutdown(kill_workers=True, wait=False)
558
533
 
559
534
  except NotImplementedError:
560
535
  # fall back to deprecated method
@@ -564,6 +539,190 @@ class Processor():
564
539
  # suppress the NotImplementedError context
565
540
  raise err from None
566
541
 
542
+ def process_workspace_submit_tasks(self, executor : TExecutor, max_seconds : int) -> Dict[TFuture, Tuple[str, List[Optional[OcrdFileType]]]]:
543
+ """
544
+ Look up all input files of the given ``workspace``
545
+ from the given :py:data:`input_file_grp`
546
+ for the given :py:data:`page_id` (or all pages),
547
+ and schedules calling :py:meth:`.process_page_file`
548
+ on them for each page via `executor` (enforcing
549
+ a per-page time limit of `max_seconds`).
550
+
551
+ When running with `OCRD_MAX_PARALLEL_PAGES>1` and
552
+ the workspace via METS Server, the executor will fork
553
+ this many worker parallel subprocesses each processing
554
+ one page at a time. (Interprocess communication is
555
+ done via task and result queues.)
556
+
557
+ Otherwise, tasks are run sequentially in the
558
+ current process.
559
+
560
+ Delegates to :py:meth:`.zip_input_files` to get
561
+ the input files for each page, and then calls
562
+ :py:meth:`.process_workspace_submit_page_task`.
563
+
564
+ Returns a dict mapping the per-page tasks
565
+ (i.e. futures submitted to the executor)
566
+ to their corresponding pageId and input files.
567
+ """
568
+ tasks = {}
569
+ for input_file_tuple in self.zip_input_files(on_error='abort', require_first=False):
570
+ task, page_id, input_files = self.process_workspace_submit_page_task(executor, max_seconds, input_file_tuple)
571
+ tasks[task] = (page_id, input_files)
572
+ self._base_logger.debug("submitted %d processing tasks", len(tasks))
573
+ return tasks
574
+
575
+ def process_workspace_submit_page_task(self, executor : TExecutor, max_seconds : int, input_file_tuple : List[Optional[OcrdFileType]]) -> Tuple[TFuture, str, List[Optional[OcrdFileType]]]:
576
+ """
577
+ Ensure all input files for a single page are
578
+ downloaded to the workspace, then schedule
579
+ :py:meth:`.process_process_file` to be run on
580
+ them via `executor` (enforcing a per-page time
581
+ limit of `max_seconds`).
582
+
583
+ Delegates to :py:meth:`.process_page_file`
584
+ (wrapped in :py:func:`_page_worker` to share
585
+ the processor instance across forked processes).
586
+
587
+ \b
588
+ Returns a tuple of:
589
+ - the scheduled future object,
590
+ - the corresponding pageId,
591
+ - the corresponding input files.
592
+ """
593
+ input_files : List[Optional[OcrdFileType]] = [None] * len(input_file_tuple)
594
+ page_id = next(input_file.pageId
595
+ for input_file in input_file_tuple
596
+ if input_file)
597
+ self._base_logger.info(f"preparing page {page_id}")
598
+ for i, input_file in enumerate(input_file_tuple):
599
+ if input_file is None:
600
+ # file/page not found in this file grp
601
+ continue
602
+ input_files[i] = input_file
603
+ if not self.download:
604
+ continue
605
+ try:
606
+ input_files[i] = self.workspace.download_file(input_file)
607
+ except (ValueError, FileNotFoundError, HTTPError) as e:
608
+ self._base_logger.error(repr(e))
609
+ self._base_logger.warning(f"failed downloading file {input_file} for page {page_id}")
610
+ # process page
611
+ #executor.submit(self.process_page_file, *input_files)
612
+ return executor.submit(_page_worker, max_seconds, *input_files), page_id, input_files
613
+
614
+ def process_workspace_handle_tasks(self, tasks : Dict[TFuture, Tuple[str, List[Optional[OcrdFileType]]]]) -> Tuple[int, int, Dict[str, int], int]:
615
+ """
616
+ Look up scheduled per-page futures one by one,
617
+ handle errors (exceptions) and gather results.
618
+
619
+ \b
620
+ Enforces policies configured by the following
621
+ environment variables:
622
+ - `OCRD_EXISTING_OUTPUT` (abort/skip/overwrite)
623
+ - `OCRD_MISSING_OUTPUT` (abort/skip/fallback-copy)
624
+ - `OCRD_MAX_MISSING_OUTPUTS` (abort after all).
625
+
626
+ \b
627
+ Returns a tuple of:
628
+ - the number of successfully processed pages
629
+ - the number of failed (i.e. skipped or copied) pages
630
+ - a dict of the type and corresponding number of exceptions seen
631
+ - the number of total requested pages (i.e. success+fail+existing).
632
+
633
+ Delegates to :py:meth:`.process_workspace_handle_page_task`
634
+ for each page.
635
+ """
636
+ # aggregate info for logging:
637
+ nr_succeeded = 0
638
+ nr_failed = 0
639
+ nr_errors = defaultdict(int) # count causes
640
+ if config.OCRD_MISSING_OUTPUT == 'SKIP':
641
+ reason = "skipped"
642
+ elif config.OCRD_MISSING_OUTPUT == 'COPY':
643
+ reason = "fallback-copied"
644
+ for task in tasks:
645
+ # wait for results, handle errors
646
+ page_id, input_files = tasks[task]
647
+ result = self.process_workspace_handle_page_task(page_id, input_files, task)
648
+ if isinstance(result, Exception):
649
+ nr_errors[result.__class__.__name__] += 1
650
+ nr_failed += 1
651
+ # FIXME: this is just prospective, because len(tasks)==nr_failed+nr_succeeded is not guaranteed
652
+ if config.OCRD_MAX_MISSING_OUTPUTS > 0 and nr_failed / len(tasks) > config.OCRD_MAX_MISSING_OUTPUTS:
653
+ # already irredeemably many failures, stop short
654
+ nr_errors = dict(nr_errors)
655
+ raise Exception(f"too many failures with {reason} output ({nr_failed} of {nr_failed+nr_succeeded}, {str(nr_errors)})")
656
+ elif result:
657
+ nr_succeeded += 1
658
+ # else skipped - already exists
659
+ nr_errors = dict(nr_errors)
660
+ if nr_failed > 0:
661
+ nr_all = nr_succeeded + nr_failed
662
+ if config.OCRD_MAX_MISSING_OUTPUTS > 0 and nr_failed / nr_all > config.OCRD_MAX_MISSING_OUTPUTS:
663
+ raise Exception(f"too many failures with {reason} output ({nr_failed} of {nr_all}, {str(nr_errors)})")
664
+ self._base_logger.warning("%s %d of %d pages due to %s", reason, nr_failed, nr_all, str(nr_errors))
665
+ return nr_succeeded, nr_failed, nr_errors, len(tasks)
666
+
667
+ def process_workspace_handle_page_task(self, page_id : str, input_files : List[Optional[OcrdFileType]], task : TFuture) -> Union[bool, Exception]:
668
+ """
669
+ \b
670
+ Await a single page result and handle errors (exceptions),
671
+ enforcing policies configured by the following
672
+ environment variables:
673
+ - `OCRD_EXISTING_OUTPUT` (abort/skip/overwrite)
674
+ - `OCRD_MISSING_OUTPUT` (abort/skip/fallback-copy)
675
+ - `OCRD_MAX_MISSING_OUTPUTS` (abort after all).
676
+
677
+ \b
678
+ Returns
679
+ - true in case of success
680
+ - false in case the output already exists
681
+ - the exception in case of failure
682
+ """
683
+ # FIXME: differentiate error cases in various ways:
684
+ # - ResourceNotFoundError → use ResourceManager to download (once), then retry
685
+ # - transient (I/O or OOM) error → maybe sleep, retry
686
+ # - persistent (data) error → skip / dummy / raise
687
+ try:
688
+ self._base_logger.debug("waiting for output of task %s (page %s)", task, page_id)
689
+ # timeout kwarg on future is useless: it only raises TimeoutError here,
690
+ # but does not stop the running process/thread, and executor itself
691
+ # offers nothing to that effect:
692
+ # task.result(timeout=max_seconds or None)
693
+ # so we instead applied the timeout within the worker function
694
+ task.result()
695
+ return True
696
+ except NotImplementedError:
697
+ # exclude NotImplementedError, so we can try process() below
698
+ raise
699
+ # handle input failures separately
700
+ except FileExistsError as err:
701
+ if config.OCRD_EXISTING_OUTPUT == 'ABORT':
702
+ raise err
703
+ if config.OCRD_EXISTING_OUTPUT == 'SKIP':
704
+ return False
705
+ if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE':
706
+ # too late here, must not happen
707
+ raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE")
708
+ except KeyboardInterrupt:
709
+ raise
710
+ # broad coverage of output failures (including TimeoutError)
711
+ except Exception as err:
712
+ # FIXME: add re-usable/actionable logging
713
+ if config.OCRD_MISSING_OUTPUT == 'ABORT':
714
+ self._base_logger.error(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}")
715
+ raise err
716
+ self._base_logger.exception(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}")
717
+ if config.OCRD_MISSING_OUTPUT == 'SKIP':
718
+ pass
719
+ elif config.OCRD_MISSING_OUTPUT == 'COPY':
720
+ self._copy_page_file(input_files[0])
721
+ else:
722
+ desc = config.describe('OCRD_MISSING_OUTPUT', wrap_text=False, indent_text=False)
723
+ raise ValueError(f"unknown configuration value {config.OCRD_MISSING_OUTPUT} - {desc}")
724
+ return err
725
+
567
726
  def _copy_page_file(self, input_file : OcrdFileType) -> None:
568
727
  """
569
728
  Copy the given ``input_file`` of the :py:data:`workspace`,
@@ -618,6 +777,12 @@ class Processor():
618
777
  # not PAGE and not an image to generate PAGE for
619
778
  self._base_logger.error(f"non-PAGE input for page {page_id}: {err}")
620
779
  output_file_id = make_file_id(input_files[0], self.output_file_grp)
780
+ output_file = next(self.workspace.mets.find_files(ID=output_file_id), None)
781
+ if output_file and config.OCRD_EXISTING_OUTPUT != 'OVERWRITE':
782
+ # short-cut avoiding useless computation:
783
+ raise FileExistsError(
784
+ f"A file with ID=={output_file_id} already exists {output_file} and neither force nor ignore are set"
785
+ )
621
786
  result = self.process_page_pcgts(*input_pcgts, page_id=page_id)
622
787
  for image_result in result.images:
623
788
  image_file_id = f'{output_file_id}_{image_result.file_id_suffix}'
@@ -934,6 +1099,44 @@ class Processor():
934
1099
  ifts.append(tuple(ifiles))
935
1100
  return ifts
936
1101
 
1102
+ _page_worker_processor = None
1103
+ """
1104
+ This global binding for the processor is required to avoid
1105
+ squeezing the processor through a mp.Queue (which is impossible
1106
+ due to unpicklable attributes like .workspace.mets._tree anyway)
1107
+ when calling Processor.process_page_file as page worker processes
1108
+ in Processor.process_workspace. Forking allows inheriting global
1109
+ objects, and with the METS Server we do not mutate the local
1110
+ processor instance anyway.
1111
+ """
1112
+ def _page_worker_set_ctxt(processor):
1113
+ """
1114
+ Overwrites `ocrd.processor.base._page_worker_processor` instance
1115
+ for sharing with subprocesses in ProcessPoolExecutor initializer.
1116
+ """
1117
+ global _page_worker_processor
1118
+ _page_worker_processor = processor
1119
+
1120
+ def _page_worker(timeout, *input_files):
1121
+ """
1122
+ Wraps a `Processor.process_page_file` call as payload (call target)
1123
+ of the ProcessPoolExecutor workers, but also enforces the given timeout.
1124
+ """
1125
+ page_id = next((file.pageId for file in input_files
1126
+ if hasattr(file, 'pageId')), "")
1127
+ if timeout > 0:
1128
+ timer = Timer(timeout, interrupt_main)
1129
+ timer.start()
1130
+ try:
1131
+ _page_worker_processor.process_page_file(*input_files)
1132
+ _page_worker_processor.logger.debug("page worker completed for page %s", page_id)
1133
+ except KeyboardInterrupt:
1134
+ _page_worker_processor.logger.debug("page worker timed out for page %s", page_id)
1135
+ raise TimeoutError()
1136
+ finally:
1137
+ if timeout > 0:
1138
+ timer.cancel()
1139
+
937
1140
  def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None):
938
1141
  """Generate a string describing the full CLI of this processor including params.
939
1142
 
@@ -2,6 +2,7 @@
2
2
  Helper methods for running and documenting processors
3
3
  """
4
4
  from time import perf_counter, process_time
5
+ from os import times
5
6
  from functools import lru_cache
6
7
  import json
7
8
  import inspect
@@ -94,6 +95,7 @@ def run_processor(
94
95
  log.debug("Processor instance %s (%s doing %s)", processor, name, otherrole)
95
96
  t0_wall = perf_counter()
96
97
  t0_cpu = process_time()
98
+ t0_os = times()
97
99
  if any(x in config.OCRD_PROFILE for x in ['RSS', 'PSS']):
98
100
  backend = 'psutil_pss' if 'PSS' in config.OCRD_PROFILE else 'psutil'
99
101
  from memory_profiler import memory_usage # pylint: disable=import-outside-toplevel
@@ -123,7 +125,13 @@ def run_processor(
123
125
 
124
126
  t1_wall = perf_counter() - t0_wall
125
127
  t1_cpu = process_time() - t0_cpu
126
- logProfile.info("Executing processor '%s' took %fs (wall) %fs (CPU)( [--input-file-grp='%s' --output-file-grp='%s' --parameter='%s' --page-id='%s']" % (
128
+ t1_os = times()
129
+ # add CPU time from child processes (page worker etc)
130
+ t1_cpu += t1_os.children_user - t0_os.children_user
131
+ t1_cpu += t1_os.children_system - t0_os.children_system
132
+ logProfile.info(
133
+ "Executing processor '%s' took %fs (wall) %fs (CPU)( "
134
+ "[--input-file-grp='%s' --output-file-grp='%s' --parameter='%s' --page-id='%s']",
127
135
  ocrd_tool['executable'],
128
136
  t1_wall,
129
137
  t1_cpu,
@@ -131,7 +139,7 @@ def run_processor(
131
139
  processor.output_file_grp or '',
132
140
  json.dumps(processor.parameter) or '',
133
141
  processor.page_id or ''
134
- ))
142
+ )
135
143
  workspace.mets.add_agent(
136
144
  name=name,
137
145
  _type='OTHER',
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ocrd
3
- Version: 3.0.0b5
3
+ Version: 3.0.0b6
4
4
  Summary: OCR-D framework
5
5
  Author-email: Konstantin Baierer <unixprog@gmail.com>
6
6
  License: Apache License 2.0
@@ -26,6 +26,7 @@ Requires-Dist: httpx>=0.22.0
26
26
  Requires-Dist: importlib_metadata; python_version < "3.8"
27
27
  Requires-Dist: importlib_resources; python_version < "3.10"
28
28
  Requires-Dist: jsonschema>=4
29
+ Requires-Dist: loky
29
30
  Requires-Dist: lxml
30
31
  Requires-Dist: memory-profiler>=0.58.0
31
32
  Requires-Dist: numpy
@@ -11,6 +11,7 @@ frozendict>=2.3.4
11
11
  gdown
12
12
  httpx>=0.22.0
13
13
  jsonschema>=4
14
+ loky
14
15
  lxml
15
16
  memory-profiler>=0.58.0
16
17
  numpy
@@ -599,7 +599,16 @@ class OcrdMets(OcrdXmlDocument):
599
599
  If return_divs is set, returns div memory objects instead of strings of ids
600
600
  """
601
601
  if for_fileIds is None and for_pageIds is None:
602
+ if return_divs:
603
+ if self._cache_flag:
604
+ return list(self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID].values())
605
+
606
+ return [x for x in self._tree.getroot().xpath(
607
+ 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]',
608
+ namespaces=NS)]
609
+
602
610
  return self.physical_pages
611
+
603
612
  # log = getLogger('ocrd.models.ocrd_mets.get_physical_pages')
604
613
  if for_pageIds is not None:
605
614
  ret = []