ocrd 3.0.0b1__tar.gz → 3.0.0b3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155) hide show
  1. {ocrd-3.0.0b1/src/ocrd.egg-info → ocrd-3.0.0b3}/PKG-INFO +32 -10
  2. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/README.md +31 -9
  3. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/README_bashlib.md +9 -1
  4. ocrd-3.0.0b3/VERSION +1 -0
  5. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd/cli/__init__.py +3 -3
  6. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd/lib.bash +1 -0
  7. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd/mets_server.py +1 -1
  8. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd/processor/__init__.py +1 -1
  9. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd/processor/base.py +263 -38
  10. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd/processor/helpers.py +1 -144
  11. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd/workspace.py +2 -2
  12. {ocrd-3.0.0b1 → ocrd-3.0.0b3/src/ocrd.egg-info}/PKG-INFO +32 -10
  13. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_utils/config.py +28 -1
  14. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_utils/logging.py +3 -1
  15. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/tests/test_decorators.py +1 -0
  16. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/tests/test_mets_server.py +29 -19
  17. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/tests/test_workspace.py +3 -3
  18. ocrd-3.0.0b1/VERSION +0 -1
  19. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/LICENSE +0 -0
  20. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/MANIFEST.in +0 -0
  21. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/README_ocrd.md +0 -0
  22. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/README_ocrd_modelfactory.md +0 -0
  23. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/README_ocrd_models.md +0 -0
  24. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/README_ocrd_network.md +0 -0
  25. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/README_ocrd_utils.md +0 -0
  26. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/README_ocrd_validators.md +0 -0
  27. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/pyproject.toml +0 -0
  28. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/requirements.txt +0 -0
  29. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/setup.cfg +0 -0
  30. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd/__init__.py +0 -0
  31. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd/cli/bashlib.py +0 -0
  32. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd/cli/log.py +0 -0
  33. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd/cli/network.py +0 -0
  34. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd/cli/ocrd_tool.py +0 -0
  35. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd/cli/process.py +0 -0
  36. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd/cli/resmgr.py +0 -0
  37. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd/cli/validate.py +0 -0
  38. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd/cli/workspace.py +0 -0
  39. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd/cli/zip.py +0 -0
  40. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd/constants.py +0 -0
  41. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd/decorators/__init__.py +0 -0
  42. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd/decorators/loglevel_option.py +0 -0
  43. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd/decorators/mets_find_options.py +0 -0
  44. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd/decorators/ocrd_cli_options.py +1 -1
  45. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd/decorators/parameter_option.py +0 -0
  46. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd/ocrd-all-tool.json +0 -0
  47. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd/processor/builtin/__init__.py +0 -0
  48. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd/processor/builtin/dummy/__init__.py +0 -0
  49. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd/processor/builtin/dummy/ocrd-tool.json +0 -0
  50. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd/processor/builtin/dummy_processor.py +0 -0
  51. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd/processor/ocrd_page_result.py +0 -0
  52. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd/resolver.py +0 -0
  53. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd/resource_list.yml +0 -0
  54. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd/resource_manager.py +0 -0
  55. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd/task_sequence.py +0 -0
  56. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd/workspace_backup.py +0 -0
  57. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd/workspace_bagger.py +0 -0
  58. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd.egg-info/SOURCES.txt +0 -0
  59. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd.egg-info/dependency_links.txt +0 -0
  60. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd.egg-info/entry_points.txt +0 -0
  61. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd.egg-info/requires.txt +0 -0
  62. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd.egg-info/top_level.txt +0 -0
  63. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_modelfactory/__init__.py +0 -0
  64. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_models/__init__.py +0 -0
  65. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_models/constants.py +0 -0
  66. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_models/mets-empty.xml +0 -0
  67. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_models/ocrd_agent.py +0 -0
  68. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_models/ocrd_exif.py +0 -0
  69. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_models/ocrd_file.py +0 -0
  70. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_models/ocrd_mets.py +0 -0
  71. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_models/ocrd_page.py +0 -0
  72. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_models/ocrd_page_generateds.py +0 -0
  73. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_models/ocrd_xml_base.py +0 -0
  74. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_models/report.py +0 -0
  75. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_models/utils.py +0 -0
  76. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_network/__init__.py +0 -0
  77. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_network/cli/__init__.py +0 -0
  78. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_network/cli/client.py +0 -0
  79. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_network/cli/processing_server.py +0 -0
  80. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_network/cli/processing_worker.py +0 -0
  81. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_network/cli/processor_server.py +0 -0
  82. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_network/client.py +0 -0
  83. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_network/client_utils.py +0 -0
  84. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_network/constants.py +0 -0
  85. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_network/database.py +0 -0
  86. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_network/logging_utils.py +0 -0
  87. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_network/models/__init__.py +0 -0
  88. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_network/models/job.py +0 -0
  89. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_network/models/messages.py +0 -0
  90. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_network/models/ocrd_tool.py +0 -0
  91. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_network/models/workflow.py +0 -0
  92. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_network/models/workspace.py +0 -0
  93. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_network/param_validators.py +0 -0
  94. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_network/process_helpers.py +0 -0
  95. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_network/processing_server.py +0 -0
  96. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_network/processing_worker.py +0 -0
  97. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_network/processor_server.py +0 -0
  98. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_network/rabbitmq_utils/__init__.py +0 -0
  99. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_network/rabbitmq_utils/connector.py +0 -0
  100. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_network/rabbitmq_utils/constants.py +0 -0
  101. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_network/rabbitmq_utils/consumer.py +0 -0
  102. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_network/rabbitmq_utils/helpers.py +0 -0
  103. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_network/rabbitmq_utils/ocrd_messages.py +0 -0
  104. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_network/rabbitmq_utils/publisher.py +0 -0
  105. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_network/runtime_data/__init__.py +0 -0
  106. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_network/runtime_data/config_parser.py +0 -0
  107. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_network/runtime_data/connection_clients.py +0 -0
  108. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_network/runtime_data/deployer.py +0 -0
  109. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_network/runtime_data/hosts.py +0 -0
  110. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_network/runtime_data/network_agents.py +0 -0
  111. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_network/runtime_data/network_services.py +0 -0
  112. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_network/server_cache.py +0 -0
  113. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_network/server_utils.py +0 -0
  114. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_network/tcp_to_uds_mets_proxy.py +0 -0
  115. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_network/utils.py +0 -0
  116. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_utils/__init__.py +0 -0
  117. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_utils/constants.py +0 -0
  118. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_utils/deprecate.py +0 -0
  119. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_utils/image.py +0 -0
  120. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_utils/introspect.py +0 -0
  121. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_utils/ocrd_logging.conf +0 -0
  122. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_utils/os.py +0 -0
  123. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_utils/str.py +0 -0
  124. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_validators/__init__.py +0 -0
  125. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_validators/bagit-profile.yml +0 -0
  126. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_validators/constants.py +0 -0
  127. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_validators/json_validator.py +0 -0
  128. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_validators/message_processing.schema.yml +0 -0
  129. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_validators/message_result.schema.yml +0 -0
  130. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_validators/mets.xsd +0 -0
  131. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_validators/ocrd_network_message_validator.py +0 -0
  132. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_validators/ocrd_tool.schema.yml +0 -0
  133. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_validators/ocrd_tool_validator.py +0 -0
  134. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_validators/ocrd_zip_validator.py +0 -0
  135. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_validators/page.xsd +0 -0
  136. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_validators/page_validator.py +0 -0
  137. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_validators/parameter_validator.py +0 -0
  138. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_validators/processing_server_config.schema.yml +0 -0
  139. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_validators/processing_server_config_validator.py +0 -0
  140. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_validators/resource_list_validator.py +0 -0
  141. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_validators/workspace_validator.py +0 -0
  142. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_validators/xlink.xsd +0 -0
  143. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_validators/xsd_mets_validator.py +0 -0
  144. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_validators/xsd_page_validator.py +0 -0
  145. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/src/ocrd_validators/xsd_validator.py +0 -0
  146. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/tests/test_logging.py +0 -0
  147. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/tests/test_logging_conf.py +0 -0
  148. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/tests/test_model_factory.py +0 -0
  149. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/tests/test_resolver.py +0 -0
  150. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/tests/test_resolver_oai.py +0 -0
  151. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/tests/test_resource_manager.py +0 -0
  152. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/tests/test_task_sequence.py +0 -0
  153. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/tests/test_utils.py +0 -0
  154. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/tests/test_version.py +0 -0
  155. {ocrd-3.0.0b1 → ocrd-3.0.0b3}/tests/test_workspace_remove.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ocrd
3
- Version: 3.0.0b1
3
+ Version: 3.0.0b3
4
4
  Summary: OCR-D framework
5
5
  Author-email: Konstantin Baierer <unixprog@gmail.com>
6
6
  License: Apache License 2.0
@@ -94,17 +94,12 @@ complete stack of OCR-D-related software.
94
94
 
95
95
  The easiest way to install is via `pip`:
96
96
 
97
- ```sh
98
- pip install ocrd
97
+ pip install ocrd
99
98
 
100
- # or just the functionality you need, e.g.
101
-
102
- pip install ocrd_modelfactory
103
- ```
104
99
 
105
100
  All Python software released by [OCR-D](https://github.com/OCR-D) requires Python 3.8 or higher.
106
101
 
107
- **NOTE** Some OCR-D-Tools (or even test cases) _might_ reveal an unintended behavior if you have specific environment modifications, like:
102
+ > **NOTE** Some OCR-D tools (or even test cases) _might_ reveal an unintended behavior if you have specific environment modifications, like:
108
103
  * using a custom build of [ImageMagick](https://github.com/ImageMagick/ImageMagick), whose format delegates are different from what OCR-D supposes
109
104
  * custom Python logging configurations in your personal account
110
105
 
@@ -129,7 +124,6 @@ Almost all behaviour of the OCR-D/core software is configured via CLI options an
129
124
 
130
125
  Some parts of the software are configured via environment variables:
131
126
 
132
- * `OCRD_METS_CACHING`: If set to `true`, access to the METS file is cached, speeding in-memory search and modification.
133
127
  * `OCRD_PROFILE`: This variable configures the built-in CPU and memory profiling. If empty, no profiling is done. Otherwise expected to contain any of the following tokens:
134
128
  * `CPU`: Enable CPU profiling of processor runs
135
129
  * `RSS`: Enable RSS memory profiling
@@ -142,18 +136,46 @@ Some parts of the software are configured via environment variables:
142
136
  * `XDG_CONFIG_HOME`: Directory to look for `./ocrd/resources.yml` (i.e. `ocrd resmgr` user database) – defaults to `$HOME/.config`.
143
137
  * `XDG_DATA_HOME`: Directory to look for `./ocrd-resources/*` (i.e. `ocrd resmgr` data location) – defaults to `$HOME/.local/share`.
144
138
 
145
- * `OCRD_DOWNLOAD_RETRIES`: Number of times to retry failed attempts for downloads of workspace files.
139
+ * `OCRD_DOWNLOAD_RETRIES`: Number of times to retry failed attempts for downloads of resources or workspace files.
146
140
  * `OCRD_DOWNLOAD_TIMEOUT`: Timeout in seconds for connecting or reading (comma-separated) when downloading.
147
141
 
142
+ * `OCRD_MISSING_INPUT`: How to deal with missing input files (for some fileGrp/pageId) during processing:
143
+ * `SKIP`: ignore and proceed with next page's input
144
+ * `ABORT`: throw `MissingInputFile` exception
145
+
146
+ * `OCRD_MISSING_OUTPUT`: How to deal with missing output files (for some fileGrp/pageId) during processing:
147
+ * `SKIP`: ignore and proceed processing next page
148
+ * `COPY`: fall back to copying input PAGE to output fileGrp for page
149
+ * `ABORT`: re-throw whatever caused processing to fail
150
+
151
+ * `OCRD_MAX_MISSING_OUTPUTS`: Maximal rate of skipped/fallback pages among all processed pages before aborting (decimal fraction, ignored if negative).
152
+
153
+ * `OCRD_EXISTING_OUTPUT`: How to deal with already existing output files (for some fileGrp/pageId) during processing:
154
+ * `SKIP`: ignore and proceed processing next page
155
+ * `OVERWRITE`: force writing result to output fileGrp for page
156
+ * `ABORT`: re-throw `FileExistsError` exception
157
+
158
+
148
159
  * `OCRD_METS_CACHING`: Whether to enable in-memory storage of OcrdMets data structures for speedup during processing or workspace operations.
149
160
 
150
161
  * `OCRD_MAX_PROCESSOR_CACHE`: Maximum number of processor instances (for each set of parameters) to be kept in memory (including loaded models) for processing workers or processor servers.
151
162
 
163
+ * `OCRD_MAX_PARALLEL_PAGES`: Maximum number of processor threads for page-parallel processing (within each Processor's selected page range, independent of the number of Processing Workers or Processor Servers). If set `>1`, then a METS Server must be used for METS synchronisation.
164
+
165
+ * `OCRD_PROCESSING_PAGE_TIMEOUT`: Timeout in seconds for processing a single page. If set to a value `>0` and the timeout is exceeded, the same handling as for `OCRD_MISSING_OUTPUT` applies.
166
+
152
167
  * `OCRD_NETWORK_SERVER_ADDR_PROCESSING`: Default address of Processing Server to connect to (for `ocrd network client processing`).
153
168
  * `OCRD_NETWORK_SERVER_ADDR_WORKFLOW`: Default address of Workflow Server to connect to (for `ocrd network client workflow`).
154
169
  * `OCRD_NETWORK_SERVER_ADDR_WORKSPACE`: Default address of Workspace Server to connect to (for `ocrd network client workspace`).
155
170
  * `OCRD_NETWORK_RABBITMQ_CLIENT_CONNECT_ATTEMPTS`: Number of attempts for a worker to create its queue. Helpful if the rabbitmq-server needs time to be fully started.
156
171
 
172
+ * `OCRD_NETWORK_CLIENT_POLLING_SLEEP`: How many seconds to sleep before trying `ocrd network client` again.
173
+ * `OCRD_NETWORK_CLIENT_POLLING_TIMEOUT`: Timeout for a blocking `ocrd network client` (in seconds).
174
+
175
+ * `OCRD_NETWORK_SOCKETS_ROOT_DIR`: The root directory where all mets server related socket files are created.
176
+ * `OCRD_NETWORK_LOGS_ROOT_DIR`: The root directory where all ocrd_network related file logs are stored.
177
+
178
+
157
179
 
158
180
  ## Packages
159
181
 
@@ -47,17 +47,12 @@ complete stack of OCR-D-related software.
47
47
 
48
48
  The easiest way to install is via `pip`:
49
49
 
50
- ```sh
51
- pip install ocrd
50
+ pip install ocrd
52
51
 
53
- # or just the functionality you need, e.g.
54
-
55
- pip install ocrd_modelfactory
56
- ```
57
52
 
58
53
  All Python software released by [OCR-D](https://github.com/OCR-D) requires Python 3.8 or higher.
59
54
 
60
- **NOTE** Some OCR-D-Tools (or even test cases) _might_ reveal an unintended behavior if you have specific environment modifications, like:
55
+ > **NOTE** Some OCR-D tools (or even test cases) _might_ reveal an unintended behavior if you have specific environment modifications, like:
61
56
  * using a custom build of [ImageMagick](https://github.com/ImageMagick/ImageMagick), whose format delegates are different from what OCR-D supposes
62
57
  * custom Python logging configurations in your personal account
63
58
 
@@ -82,7 +77,6 @@ Almost all behaviour of the OCR-D/core software is configured via CLI options an
82
77
 
83
78
  Some parts of the software are configured via environment variables:
84
79
 
85
- * `OCRD_METS_CACHING`: If set to `true`, access to the METS file is cached, speeding in-memory search and modification.
86
80
  * `OCRD_PROFILE`: This variable configures the built-in CPU and memory profiling. If empty, no profiling is done. Otherwise expected to contain any of the following tokens:
87
81
  * `CPU`: Enable CPU profiling of processor runs
88
82
  * `RSS`: Enable RSS memory profiling
@@ -95,18 +89,46 @@ Some parts of the software are configured via environment variables:
95
89
  * `XDG_CONFIG_HOME`: Directory to look for `./ocrd/resources.yml` (i.e. `ocrd resmgr` user database) – defaults to `$HOME/.config`.
96
90
  * `XDG_DATA_HOME`: Directory to look for `./ocrd-resources/*` (i.e. `ocrd resmgr` data location) – defaults to `$HOME/.local/share`.
97
91
 
98
- * `OCRD_DOWNLOAD_RETRIES`: Number of times to retry failed attempts for downloads of workspace files.
92
+ * `OCRD_DOWNLOAD_RETRIES`: Number of times to retry failed attempts for downloads of resources or workspace files.
99
93
  * `OCRD_DOWNLOAD_TIMEOUT`: Timeout in seconds for connecting or reading (comma-separated) when downloading.
100
94
 
95
+ * `OCRD_MISSING_INPUT`: How to deal with missing input files (for some fileGrp/pageId) during processing:
96
+ * `SKIP`: ignore and proceed with next page's input
97
+ * `ABORT`: throw `MissingInputFile` exception
98
+
99
+ * `OCRD_MISSING_OUTPUT`: How to deal with missing output files (for some fileGrp/pageId) during processing:
100
+ * `SKIP`: ignore and proceed processing next page
101
+ * `COPY`: fall back to copying input PAGE to output fileGrp for page
102
+ * `ABORT`: re-throw whatever caused processing to fail
103
+
104
+ * `OCRD_MAX_MISSING_OUTPUTS`: Maximal rate of skipped/fallback pages among all processed pages before aborting (decimal fraction, ignored if negative).
105
+
106
+ * `OCRD_EXISTING_OUTPUT`: How to deal with already existing output files (for some fileGrp/pageId) during processing:
107
+ * `SKIP`: ignore and proceed processing next page
108
+ * `OVERWRITE`: force writing result to output fileGrp for page
109
+ * `ABORT`: re-throw `FileExistsError` exception
110
+
111
+
101
112
  * `OCRD_METS_CACHING`: Whether to enable in-memory storage of OcrdMets data structures for speedup during processing or workspace operations.
102
113
 
103
114
  * `OCRD_MAX_PROCESSOR_CACHE`: Maximum number of processor instances (for each set of parameters) to be kept in memory (including loaded models) for processing workers or processor servers.
104
115
 
116
+ * `OCRD_MAX_PARALLEL_PAGES`: Maximum number of processor threads for page-parallel processing (within each Processor's selected page range, independent of the number of Processing Workers or Processor Servers). If set `>1`, then a METS Server must be used for METS synchronisation.
117
+
118
+ * `OCRD_PROCESSING_PAGE_TIMEOUT`: Timeout in seconds for processing a single page. If set to a value `>0` and the timeout is exceeded, the same handling as for `OCRD_MISSING_OUTPUT` applies.
119
+
105
120
  * `OCRD_NETWORK_SERVER_ADDR_PROCESSING`: Default address of Processing Server to connect to (for `ocrd network client processing`).
106
121
  * `OCRD_NETWORK_SERVER_ADDR_WORKFLOW`: Default address of Workflow Server to connect to (for `ocrd network client workflow`).
107
122
  * `OCRD_NETWORK_SERVER_ADDR_WORKSPACE`: Default address of Workspace Server to connect to (for `ocrd network client workspace`).
108
123
  * `OCRD_NETWORK_RABBITMQ_CLIENT_CONNECT_ATTEMPTS`: Number of attempts for a worker to create its queue. Helpful if the rabbitmq-server needs time to be fully started.
109
124
 
125
+ * `OCRD_NETWORK_CLIENT_POLLING_SLEEP`: How many seconds to sleep before trying `ocrd network client` again.
126
+ * `OCRD_NETWORK_CLIENT_POLLING_TIMEOUT`: Timeout for a blocking `ocrd network client` (in seconds).
127
+
128
+ * `OCRD_NETWORK_SOCKETS_ROOT_DIR`: The root directory where all mets server related socket files are created.
129
+ * `OCRD_NETWORK_LOGS_ROOT_DIR`: The root directory where all ocrd_network related file logs are stored.
130
+
131
+
110
132
 
111
133
  ## Packages
112
134
 
@@ -21,6 +21,9 @@ For example:
21
21
  * [`ocrd__log`](#ocrd__log)
22
22
  * [`ocrd__minversion`](#ocrd__minversion)
23
23
  * [`ocrd__dumpjson`](#ocrd__dumpjson)
24
+ * [`ocrd__resolve_resource`](#ocrd__resolve_resource)
25
+ * [`ocrd__show_resource`](#ocrd__show_resource)
26
+ * [`ocrd__list_resources`](#ocrd__list_resources)
24
27
  * [`ocrd__usage`](#ocrd__usage)
25
28
  * [`ocrd__parse_argv`](#ocrd__parse_argv)
26
29
  <!-- END-MARKDOWN-TOC -->
@@ -56,6 +59,10 @@ export OCRD_TOOL_NAME=ocrd-foo-bar
56
59
 
57
60
  (Which you automatically get from [`ocrd__wrap`](#ocrd__wrap).)
58
61
 
62
+ ### `ocrd__resolve_resource`
63
+
64
+ Output given resource file's path.
65
+
59
66
  ### `ocrd__show_resource`
60
67
 
61
68
  Output given resource file's content.
@@ -88,6 +95,7 @@ This will be filled by the parser along the following keys:
88
95
  - `profile`: whether `--profile` is enabled
89
96
  - `profile_file`: the argument of `--profile-file`
90
97
  - `log_level`: the argument of `--log-level`
98
+ - `mets_server_url`: the argument of `--mets-server-url`
91
99
  - `mets_file`: absolute path of the `--mets` argument
92
100
  - `working_dir`: absolute path of the `--working-dir` argument or the parent of `mets_file`
93
101
  - `page_id`: the argument of `--page-id`
@@ -95,7 +103,7 @@ This will be filled by the parser along the following keys:
95
103
  - `output_file_grp`: the argument of `--output-file-grp`
96
104
 
97
105
  Moreover, there will be an associative array **`params`**
98
- with the fully expanded runtime values of the ocrd-tool.json parameters.
106
+ with the fully validated and default-expanded runtime values of the `ocrd-tool.json` parameters.
99
107
 
100
108
  ### `ocrd__wrap`
101
109
 
ocrd-3.0.0b3/VERSION ADDED
@@ -0,0 +1 @@
1
+ 3.0.0b3
@@ -61,11 +61,11 @@ Variables:
61
61
  \b
62
62
  {config.describe('OCRD_DOWNLOAD_INPUT')}
63
63
  \b
64
- {config.describe('OCRD_MISSING_INPUT')}
64
+ {config.describe('OCRD_MISSING_INPUT', wrap_text=False)}
65
65
  \b
66
- {config.describe('OCRD_MISSING_OUTPUT')}
66
+ {config.describe('OCRD_MISSING_OUTPUT', wrap_text=False)}
67
67
  \b
68
- {config.describe('OCRD_EXISTING_OUTPUT')}
68
+ {config.describe('OCRD_EXISTING_OUTPUT', wrap_text=False)}
69
69
  \b
70
70
  {config.describe('OCRD_METS_CACHING')}
71
71
  \b
@@ -156,6 +156,7 @@ ocrd__parse_argv () {
156
156
  while [[ "${1:-}" = -* ]];do
157
157
  case "$1" in
158
158
  -l|--log-level) ocrd__argv[log_level]=$2 ; shift ;;
159
+ --log-filename) exec 2> "$2" ; shift ;;
159
160
  -h|--help|--usage) ocrd__usage; exit ;;
160
161
  -J|--dump-json) ocrd__dumpjson; exit ;;
161
162
  -D|--dump-module-dir) echo $(dirname "$OCRD_TOOL_JSON"); exit ;;
@@ -120,7 +120,7 @@ class ClientSideOcrdMets:
120
120
 
121
121
  def __init__(self, url, workspace_path: Optional[str] = None):
122
122
  self.protocol = "tcp" if url.startswith("http://") else "uds"
123
- self.log = getLogger(f"ocrd.mets_client[{url}]")
123
+ self.log = getLogger(f"ocrd.models.ocrd_mets.client.{url}")
124
124
  self.url = url if self.protocol == "tcp" else f'http+unix://{url.replace("/", "%2F")}'
125
125
  self.ws_dir_path = workspace_path if workspace_path else None
126
126
 
@@ -3,6 +3,7 @@ from .base import (
3
3
  ResourceNotFoundError,
4
4
  NonUniqueInputFile,
5
5
  MissingInputFile,
6
+ generate_processor_help,
6
7
  )
7
8
  from .ocrd_page_result import (
8
9
  OcrdPageResult,
@@ -11,5 +12,4 @@ from .ocrd_page_result import (
11
12
  from .helpers import (
12
13
  run_cli,
13
14
  run_processor,
14
- generate_processor_help
15
15
  )
@@ -23,12 +23,16 @@ import tarfile
23
23
  import io
24
24
  import weakref
25
25
  from frozendict import frozendict
26
+ from concurrent.futures import ThreadPoolExecutor, TimeoutError
27
+
28
+ from click import wrap_text
26
29
  from deprecated import deprecated
27
30
  from requests import HTTPError
28
31
 
29
- from ocrd.workspace import Workspace
32
+ from ..workspace import Workspace
33
+ from ..mets_server import ClientSideOcrdMets
30
34
  from ocrd_models.ocrd_file import OcrdFileType
31
- from ocrd.processor.ocrd_page_result import OcrdPageResult
35
+ from .ocrd_page_result import OcrdPageResult
32
36
  from ocrd_utils import (
33
37
  VERSION as OCRD_VERSION,
34
38
  MIMETYPE_PAGE,
@@ -58,7 +62,7 @@ from ocrd_modelfactory import page_from_file
58
62
  from ocrd_validators.ocrd_tool_validator import OcrdToolValidator
59
63
 
60
64
  # XXX imports must remain for backwards-compatibility
61
- from .helpers import run_cli, run_processor, generate_processor_help # pylint: disable=unused-import
65
+ from .helpers import run_cli, run_processor # pylint: disable=unused-import
62
66
 
63
67
 
64
68
  class ResourceNotFoundError(FileNotFoundError):
@@ -118,7 +122,27 @@ class Processor():
118
122
  maximum number of cached instances (ignored if negative), to be applied on top of
119
123
  :py:data:`~ocrd_utils.config.OCRD_MAX_PROCESSOR_CACHE` (i.e. whatever is smaller).
120
124
 
121
- (Override this if you know how many instances fit into memory at once.)
125
+ (Override this if you know how many instances fit into memory - GPU / CPU RAM - at once.)
126
+ """
127
+
128
+ max_workers : int = -1
129
+ """
130
+ maximum number of processor threads for page-parallel processing (ignored if negative),
131
+ to be applied on top of :py:data:`~ocrd_utils.config.OCRD_MAX_PARALLEL_PAGES` (i.e.
132
+ whatever is smaller).
133
+
134
+ (Override this if you know how many pages fit into processing units - GPU shaders / CPU cores
135
+ - at once, or if your class is not thread-safe.)
136
+ """
137
+
138
+ max_page_seconds : int = -1
139
+ """
140
+ maximum number of seconds that may be spent processing a single page (ignored if negative),
141
+ to be applied on top of :py:data:`~ocrd_utils.config.OCRD_PROCESSING_PAGE_TIMEOUT`
142
+ (i.e. whatever is smaller).
143
+
144
+ (Override this if you know how costly this processor may be, irrespective of image size
145
+ or complexity of the page.)
122
146
  """
123
147
 
124
148
  @property
@@ -142,7 +166,11 @@ class Processor():
142
166
 
143
167
  (Override if ``ocrd-tool.json`` is not distributed with the Python package.)
144
168
  """
145
- return resource_filename(self.__module__.split('.')[0], self.metadata_filename)
169
+ # XXX HACK
170
+ module_tokens = self.__module__.split('.')
171
+ if module_tokens[0] == 'src':
172
+ module_tokens.pop(0)
173
+ return resource_filename(module_tokens[0], self.metadata_filename)
146
174
 
147
175
  @cached_property
148
176
  def metadata_rawdict(self) -> dict:
@@ -273,12 +301,12 @@ class Processor():
273
301
  if ocrd_tool is not None:
274
302
  deprecation_warning("Passing 'ocrd_tool' as keyword argument to Processor is deprecated - "
275
303
  "use or override metadata/executable/ocrd-tool properties instead")
276
- self._ocrd_tool = ocrd_tool
277
- self._executable = ocrd_tool['executable']
304
+ self.ocrd_tool = ocrd_tool
305
+ self.executable = ocrd_tool['executable']
278
306
  if version is not None:
279
307
  deprecation_warning("Passing 'version' as keyword argument to Processor is deprecated - "
280
308
  "use or override metadata/version properties instead")
281
- self._version = version
309
+ self.version = version
282
310
  if workspace is not None:
283
311
  deprecation_warning("Passing a workspace argument other than 'None' to Processor "
284
312
  "is deprecated - pass as argument to process_workspace instead")
@@ -422,13 +450,35 @@ class Processor():
422
450
  self.workspace = workspace
423
451
  self.verify()
424
452
  try:
425
- # FIXME: add page parallelization by running multiprocessing.Pool (#322)
453
+ nr_succeeded = 0
454
+ nr_skipped = 0
455
+ nr_copied = 0
456
+
457
+ # set up multithreading
458
+ if self.max_workers <= 0:
459
+ max_workers = max(0, config.OCRD_MAX_PARALLEL_PAGES)
460
+ else:
461
+ max_workers = max(0, min(config.OCRD_MAX_PARALLEL_PAGES, self.max_workers))
462
+ if max_workers > 1:
463
+ assert isinstance(workspace.mets, ClientSideOcrdMets), \
464
+ "OCRD_MAX_PARALLEL_PAGES>1 requires also using --mets-server-url"
465
+ if self.max_page_seconds <= 0:
466
+ max_seconds = max(0, config.OCRD_PROCESSING_PAGE_TIMEOUT)
467
+ else:
468
+ max_seconds = max(0, min(config.OCRD_PROCESSING_PAGE_TIMEOUT, self.max_page_seconds))
469
+ executor = ThreadPoolExecutor(
470
+ max_workers=max_workers or 1,
471
+ thread_name_prefix=f"pagetask.{workspace.mets.unique_identifier}"
472
+ )
473
+ self._base_logger.debug("started executor %s with %d workers", str(executor), max_workers or 1)
474
+ tasks = {}
475
+
426
476
  for input_file_tuple in self.zip_input_files(on_error='abort', require_first=False):
427
477
  input_files : List[Optional[OcrdFileType]] = [None] * len(input_file_tuple)
428
478
  page_id = next(input_file.pageId
429
479
  for input_file in input_file_tuple
430
480
  if input_file)
431
- self._base_logger.info(f"processing page {page_id}")
481
+ self._base_logger.info(f"preparing page {page_id}")
432
482
  for i, input_file in enumerate(input_file_tuple):
433
483
  if input_file is None:
434
484
  # file/page not found in this file grp
@@ -441,35 +491,56 @@ class Processor():
441
491
  except (ValueError, FileNotFoundError, HTTPError) as e:
442
492
  self._base_logger.error(repr(e))
443
493
  self._base_logger.warning(f"failed downloading file {input_file} for page {page_id}")
494
+ # process page
495
+ tasks[executor.submit(self.process_page_file, *input_files)] = (page_id, input_files)
496
+ self._base_logger.debug("submitted %d processing tasks", len(tasks))
497
+
498
+ for task in tasks:
499
+ # wait for results, handle errors
500
+ page_id, input_files = tasks[task]
444
501
  # FIXME: differentiate error cases in various ways:
445
502
  # - ResourceNotFoundError → use ResourceManager to download (once), then retry
446
503
  # - transient (I/O or OOM) error → maybe sleep, retry
447
504
  # - persistent (data) error → skip / dummy / raise
448
505
  try:
449
- self.process_page_file(*input_files)
450
- except Exception as err:
451
- # we have to be broad here, but want to exclude NotImplementedError
452
- if isinstance(err, NotImplementedError):
506
+ self._base_logger.debug("waiting for output of task %s (page %s) max_seconds=%d", task, page_id, max_seconds)
507
+ task.result(timeout=max_seconds or None)
508
+ nr_succeeded += 1
509
+ # exclude NotImplementedError, so we can try process() below
510
+ except NotImplementedError:
511
+ raise
512
+ # handle conflicts with already existing output files separately
513
+ except FileExistsError as err:
514
+ if config.OCRD_EXISTING_OUTPUT == 'ABORT':
453
515
  raise err
454
- if isinstance(err, FileExistsError):
455
- if config.OCRD_EXISTING_OUTPUT == 'ABORT':
456
- raise err
457
- if config.OCRD_EXISTING_OUTPUT == 'SKIP':
458
- continue
459
- if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE':
460
- # too late here, must not happen
461
- raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE")
462
- # FIXME: re-usable/actionable logging
463
- self._base_logger.exception(f"Failure on page {page_id}: {err}")
516
+ if config.OCRD_EXISTING_OUTPUT == 'SKIP':
517
+ continue
518
+ if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE':
519
+ # too late here, must not happen
520
+ raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE")
521
+ # broad coverage of output failures (including TimeoutError)
522
+ except (Exception, TimeoutError) as err:
523
+ # FIXME: add re-usable/actionable logging
464
524
  if config.OCRD_MISSING_OUTPUT == 'ABORT':
525
+ self._base_logger.error(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}")
465
526
  raise err
527
+ self._base_logger.exception(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}")
466
528
  if config.OCRD_MISSING_OUTPUT == 'SKIP':
529
+ nr_skipped += 1
467
530
  continue
468
531
  if config.OCRD_MISSING_OUTPUT == 'COPY':
469
532
  self._copy_page_file(input_files[0])
533
+ nr_copied += 1
470
534
  else:
471
535
  desc = config.describe('OCRD_MISSING_OUTPUT', wrap_text=False, indent_text=False)
472
536
  raise ValueError(f"unknown configuration value {config.OCRD_MISSING_OUTPUT} - {desc}")
537
+
538
+ if nr_skipped > 0 and nr_succeeded / nr_skipped < config.OCRD_MAX_MISSING_OUTPUTS:
539
+ raise Exception(f"too many failures with skipped output ({nr_skipped})")
540
+ if nr_copied > 0 and nr_succeeded / nr_copied < config.OCRD_MAX_MISSING_OUTPUTS:
541
+ raise Exception(f"too many failures with fallback output ({nr_skipped})")
542
+ executor.shutdown()
543
+
473
544
  except NotImplementedError:
474
545
  # fall back to deprecated method
475
546
  self.process()
@@ -493,13 +564,14 @@ class Processor():
493
564
  output_file_id = make_file_id(input_file, self.output_file_grp)
494
565
  input_pcgts.set_pcGtsId(output_file_id)
495
566
  self.add_metadata(input_pcgts)
496
- self.workspace.add_file(file_id=output_file_id,
497
- file_grp=self.output_file_grp,
498
- page_id=input_file.pageId,
499
- local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'),
500
- mimetype=MIMETYPE_PAGE,
501
- content=to_xml(input_pcgts),
502
- force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
567
+ self.workspace.add_file(
568
+ file_id=output_file_id,
569
+ file_grp=self.output_file_grp,
570
+ page_id=input_file.pageId,
571
+ local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'),
572
+ mimetype=MIMETYPE_PAGE,
573
+ content=to_xml(input_pcgts),
574
+ force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
503
575
  )
504
576
 
505
577
  def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None:
@@ -516,6 +588,7 @@ class Processor():
516
588
  input_pcgts : List[Optional[OcrdPage]] = [None] * len(input_files)
517
589
  assert isinstance(input_files[0], get_args(OcrdFileType))
518
590
  page_id = input_files[0].pageId
591
+ self._base_logger.info("processing page %s", page_id)
519
592
  for i, input_file in enumerate(input_files):
520
593
  assert isinstance(input_file, get_args(OcrdFileType))
521
594
  self._base_logger.debug(f"parsing file {input_file.ID} for page {page_id}")
@@ -532,6 +605,9 @@ class Processor():
532
605
  image_file_id = f'{output_file_id}_{image_result.file_id_suffix}'
533
606
  image_file_path = join(self.output_file_grp, f'{image_file_id}.png')
534
607
  if isinstance(image_result.alternative_image, PageType):
608
+ # special case: not an alternative image, but replacing the original image
609
+ # (this is needed by certain processors when the original's coordinate system
610
+ # cannot or must not be kept)
535
611
  image_result.alternative_image.set_imageFilename(image_file_path)
536
612
  image_result.alternative_image.set_imageWidth(image_result.pil.width)
537
613
  image_result.alternative_image.set_imageHeight(image_result.pil.height)
@@ -550,13 +626,14 @@ class Processor():
550
626
  )
551
627
  result.pcgts.set_pcGtsId(output_file_id)
552
628
  self.add_metadata(result.pcgts)
553
- self.workspace.add_file(file_id=output_file_id,
554
- file_grp=self.output_file_grp,
555
- page_id=page_id,
556
- local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'),
557
- mimetype=MIMETYPE_PAGE,
558
- content=to_xml(result.pcgts),
559
- force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
629
+ self.workspace.add_file(
630
+ file_id=output_file_id,
631
+ file_grp=self.output_file_grp,
632
+ page_id=page_id,
633
+ local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'),
634
+ mimetype=MIMETYPE_PAGE,
635
+ content=to_xml(result.pcgts),
636
+ force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
560
637
  )
561
638
 
562
639
  def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult:
@@ -838,3 +915,151 @@ class Processor():
838
915
  if ifiles[0] or not require_first:
839
916
  ifts.append(tuple(ifiles))
840
917
  return ifts
918
+
919
+ def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None):
920
+ """Generate a string describing the full CLI of this processor including params.
921
+
922
+ Args:
923
+ ocrd_tool (dict): this processor's ``tools`` section of the module's ``ocrd-tool.json``
924
+ processor_instance (object, optional): the processor implementation
925
+ (for adding any module/class/function docstrings)
926
+ subcommand (string): 'worker' or 'server'
927
+ """
928
+ doc_help = ''
929
+ if processor_instance:
930
+ module = inspect.getmodule(processor_instance)
931
+ if module and module.__doc__:
932
+ doc_help += '\n' + inspect.cleandoc(module.__doc__) + '\n'
933
+ if processor_instance.__doc__:
934
+ doc_help += '\n' + inspect.cleandoc(processor_instance.__doc__) + '\n'
935
+ # Try to find the most concrete docstring among the various methods that an implementation
936
+ # could overload; the first match wins.
937
+ # In doing so, compare with Processor to avoid a glitch in the way py>=3.5 inherits docstrings.
938
+ # (They are supposed to only repeat information via inspect.getdoc, rather than inherit __doc__ itself.)
939
+ for method in ['process_page_pcgts', 'process_page_file', 'process_workspace', 'process']:
940
+ instance_method = getattr(processor_instance, method)
941
+ superclass_method = getattr(Processor, method)
942
+ if instance_method.__doc__ and instance_method.__doc__ != superclass_method.__doc__:
943
+ doc_help += '\n' + inspect.cleandoc(instance_method.__doc__) + '\n'
944
+ break
945
+ if doc_help:
946
+ doc_help = '\n\n' + wrap_text(doc_help, width=72,
947
+ initial_indent=' > ',
948
+ subsequent_indent=' > ',
949
+ preserve_paragraphs=True)
950
+ subcommands = '''\
951
+ worker Start a processing worker rather than do local processing
952
+ server Start a processor server rather than do local processing
953
+ '''
954
+
955
+ processing_worker_options = '''\
956
+ --queue The RabbitMQ server address in format
957
+ "amqp://{user}:{pass}@{host}:{port}/{vhost}"
958
+ [amqp://admin:admin@localhost:5672]
959
+ --database The MongoDB server address in format
960
+ "mongodb://{host}:{port}"
961
+ [mongodb://localhost:27018]
962
+ --log-filename Filename to redirect STDOUT/STDERR to,
963
+ if specified.
964
+ '''
965
+
966
+ processing_server_options = '''\
967
+ --address The Processor server address in format
968
+ "{host}:{port}"
969
+ --database The MongoDB server address in format
970
+ "mongodb://{host}:{port}"
971
+ [mongodb://localhost:27018]
972
+ '''
973
+
974
+ processing_options = '''\
975
+ -m, --mets URL-PATH URL or file path of METS to process [./mets.xml]
976
+ -w, --working-dir PATH Working directory of local workspace [dirname(URL-PATH)]
977
+ -I, --input-file-grp USE File group(s) used as input
978
+ -O, --output-file-grp USE File group(s) used as output
979
+ -g, --page-id ID Physical page ID(s) to process instead of full document []
980
+ --overwrite Remove existing output pages/images
981
+ (with "--page-id", remove only those).
982
+ Short-hand for OCRD_EXISTING_OUTPUT=OVERWRITE
983
+ --debug Abort on any errors with full stack trace.
984
+ Short-hand for OCRD_MISSING_OUTPUT=ABORT
985
+ --profile Enable profiling
986
+ --profile-file PROF-PATH Write cProfile stats to PROF-PATH. Implies "--profile"
987
+ -p, --parameter JSON-PATH Parameters, either verbatim JSON string
988
+ or JSON file path
989
+ -P, --param-override KEY VAL Override a single JSON object key-value pair,
990
+ taking precedence over --parameter
991
+ -U, --mets-server-url URL URL of a METS Server for parallel incremental access to METS
992
+ If URL starts with http:// start an HTTP server there,
993
+ otherwise URL is a path to an on-demand-created unix socket
994
+ -l, --log-level [OFF|ERROR|WARN|INFO|DEBUG|TRACE]
995
+ Override log level globally [INFO]
996
+ --log-filename LOG-PATH File to redirect stderr logging to (overriding ocrd_logging.conf).
997
+ '''
998
+
999
+ information_options = '''\
1000
+ -C, --show-resource RESNAME Dump the content of processor resource RESNAME
1001
+ -L, --list-resources List names of processor resources
1002
+ -J, --dump-json Dump tool description as JSON
1003
+ -D, --dump-module-dir Show the 'module' resource location path for this processor
1004
+ -h, --help Show this message
1005
+ -V, --version Show version
1006
+ '''
1007
+
1008
+ parameter_help = ''
1009
+ if 'parameters' not in ocrd_tool or not ocrd_tool['parameters']:
1010
+ parameter_help = ' NONE\n'
1011
+ else:
1012
+ def wrap(s):
1013
+ return wrap_text(s, initial_indent=' '*3,
1014
+ subsequent_indent=' '*4,
1015
+ width=72, preserve_paragraphs=True)
1016
+ for param_name, param in ocrd_tool['parameters'].items():
1017
+ parameter_help += wrap('"%s" [%s%s]' % (
1018
+ param_name,
1019
+ param['type'],
1020
+ ' - REQUIRED' if 'required' in param and param['required'] else
1021
+ ' - %s' % json.dumps(param['default']) if 'default' in param else ''))
1022
+ parameter_help += '\n ' + wrap(param['description'])
1023
+ if 'enum' in param:
1024
+ parameter_help += '\n ' + wrap('Possible values: %s' % json.dumps(param['enum']))
1025
+ parameter_help += "\n"
1026
+
1027
+ if not subcommand:
1028
+ return f'''\
1029
+ Usage: {ocrd_tool['executable']} [worker|server] [OPTIONS]
1030
+
1031
+ {ocrd_tool['description']}{doc_help}
1032
+
1033
+ Subcommands:
1034
+ {subcommands}
1035
+ Options for processing:
1036
+ {processing_options}
1037
+ Options for information:
1038
+ {information_options}
1039
+ Parameters:
1040
+ {parameter_help}
1041
+ '''
1042
+ elif subcommand == 'worker':
1043
+ return f'''\
1044
+ Usage: {ocrd_tool['executable']} worker [OPTIONS]
1045
+
1046
+ Run {ocrd_tool['executable']} as a processing worker.
1047
+
1048
+ {ocrd_tool['description']}{doc_help}
1049
+
1050
+ Options:
1051
+ {processing_worker_options}
1052
+ '''
1053
+ elif subcommand == 'server':
1054
+ return f'''\
1055
+ Usage: {ocrd_tool['executable']} server [OPTIONS]
1056
+
1057
+ Run {ocrd_tool['executable']} as a processor server.
1058
+
1059
+ {ocrd_tool['description']}{doc_help}
1060
+
1061
+ Options:
1062
+ {processing_server_options}
1063
+ '''
1064
+ else:
1065
+ pass