ocrd 3.0.0b1__tar.gz → 3.0.0b2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ocrd-3.0.0b1/src/ocrd.egg-info → ocrd-3.0.0b2}/PKG-INFO +32 -10
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/README.md +31 -9
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/README_bashlib.md +9 -1
- ocrd-3.0.0b2/VERSION +1 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd/lib.bash +1 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd/mets_server.py +1 -1
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd/processor/__init__.py +1 -1
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd/processor/base.py +260 -37
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd/processor/helpers.py +1 -144
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd/workspace.py +2 -2
- {ocrd-3.0.0b1 → ocrd-3.0.0b2/src/ocrd.egg-info}/PKG-INFO +32 -10
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_utils/config.py +15 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_utils/logging.py +3 -1
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/tests/test_decorators.py +1 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/tests/test_mets_server.py +29 -19
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/tests/test_workspace.py +3 -3
- ocrd-3.0.0b1/VERSION +0 -1
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/LICENSE +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/MANIFEST.in +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/README_ocrd.md +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/README_ocrd_modelfactory.md +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/README_ocrd_models.md +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/README_ocrd_network.md +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/README_ocrd_utils.md +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/README_ocrd_validators.md +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/pyproject.toml +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/requirements.txt +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/setup.cfg +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd/__init__.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd/cli/__init__.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd/cli/bashlib.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd/cli/log.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd/cli/network.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd/cli/ocrd_tool.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd/cli/process.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd/cli/resmgr.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd/cli/validate.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd/cli/workspace.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd/cli/zip.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd/constants.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd/decorators/__init__.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd/decorators/loglevel_option.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd/decorators/mets_find_options.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd/decorators/ocrd_cli_options.py +1 -1
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd/decorators/parameter_option.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd/ocrd-all-tool.json +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd/processor/builtin/__init__.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd/processor/builtin/dummy/__init__.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd/processor/builtin/dummy/ocrd-tool.json +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd/processor/builtin/dummy_processor.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd/processor/ocrd_page_result.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd/resolver.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd/resource_list.yml +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd/resource_manager.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd/task_sequence.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd/workspace_backup.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd/workspace_bagger.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd.egg-info/SOURCES.txt +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd.egg-info/dependency_links.txt +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd.egg-info/entry_points.txt +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd.egg-info/requires.txt +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd.egg-info/top_level.txt +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_modelfactory/__init__.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_models/__init__.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_models/constants.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_models/mets-empty.xml +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_models/ocrd_agent.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_models/ocrd_exif.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_models/ocrd_file.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_models/ocrd_mets.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_models/ocrd_page.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_models/ocrd_page_generateds.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_models/ocrd_xml_base.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_models/report.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_models/utils.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_network/__init__.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_network/cli/__init__.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_network/cli/client.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_network/cli/processing_server.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_network/cli/processing_worker.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_network/cli/processor_server.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_network/client.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_network/client_utils.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_network/constants.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_network/database.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_network/logging_utils.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_network/models/__init__.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_network/models/job.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_network/models/messages.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_network/models/ocrd_tool.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_network/models/workflow.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_network/models/workspace.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_network/param_validators.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_network/process_helpers.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_network/processing_server.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_network/processing_worker.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_network/processor_server.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_network/rabbitmq_utils/__init__.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_network/rabbitmq_utils/connector.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_network/rabbitmq_utils/constants.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_network/rabbitmq_utils/consumer.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_network/rabbitmq_utils/helpers.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_network/rabbitmq_utils/ocrd_messages.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_network/rabbitmq_utils/publisher.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_network/runtime_data/__init__.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_network/runtime_data/config_parser.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_network/runtime_data/connection_clients.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_network/runtime_data/deployer.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_network/runtime_data/hosts.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_network/runtime_data/network_agents.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_network/runtime_data/network_services.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_network/server_cache.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_network/server_utils.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_network/tcp_to_uds_mets_proxy.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_network/utils.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_utils/__init__.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_utils/constants.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_utils/deprecate.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_utils/image.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_utils/introspect.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_utils/ocrd_logging.conf +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_utils/os.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_utils/str.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_validators/__init__.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_validators/bagit-profile.yml +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_validators/constants.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_validators/json_validator.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_validators/message_processing.schema.yml +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_validators/message_result.schema.yml +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_validators/mets.xsd +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_validators/ocrd_network_message_validator.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_validators/ocrd_tool.schema.yml +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_validators/ocrd_tool_validator.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_validators/ocrd_zip_validator.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_validators/page.xsd +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_validators/page_validator.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_validators/parameter_validator.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_validators/processing_server_config.schema.yml +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_validators/processing_server_config_validator.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_validators/resource_list_validator.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_validators/workspace_validator.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_validators/xlink.xsd +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_validators/xsd_mets_validator.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_validators/xsd_page_validator.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/src/ocrd_validators/xsd_validator.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/tests/test_logging.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/tests/test_logging_conf.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/tests/test_model_factory.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/tests/test_resolver.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/tests/test_resolver_oai.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/tests/test_resource_manager.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/tests/test_task_sequence.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/tests/test_utils.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/tests/test_version.py +0 -0
- {ocrd-3.0.0b1 → ocrd-3.0.0b2}/tests/test_workspace_remove.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: ocrd
|
|
3
|
-
Version: 3.0.0b1
|
|
3
|
+
Version: 3.0.0b2
|
|
4
4
|
Summary: OCR-D framework
|
|
5
5
|
Author-email: Konstantin Baierer <unixprog@gmail.com>
|
|
6
6
|
License: Apache License 2.0
|
|
@@ -94,17 +94,12 @@ complete stack of OCR-D-related software.
|
|
|
94
94
|
|
|
95
95
|
The easiest way to install is via `pip`:
|
|
96
96
|
|
|
97
|
-
|
|
98
|
-
pip install ocrd
|
|
97
|
+
pip install ocrd
|
|
99
98
|
|
|
100
|
-
# or just the functionality you need, e.g.
|
|
101
|
-
|
|
102
|
-
pip install ocrd_modelfactory
|
|
103
|
-
```
|
|
104
99
|
|
|
105
100
|
All Python software released by [OCR-D](https://github.com/OCR-D) requires Python 3.8 or higher.
|
|
106
101
|
|
|
107
|
-
**NOTE** Some OCR-D
|
|
102
|
+
> **NOTE** Some OCR-D tools (or even test cases) _might_ reveal an unintended behavior if you have specific environment modifications, like:
|
|
108
103
|
* using a custom build of [ImageMagick](https://github.com/ImageMagick/ImageMagick), whose format delegates are different from what OCR-D supposes
|
|
109
104
|
* custom Python logging configurations in your personal account
|
|
110
105
|
|
|
@@ -129,7 +124,6 @@ Almost all behaviour of the OCR-D/core software is configured via CLI options an
|
|
|
129
124
|
|
|
130
125
|
Some parts of the software are configured via environment variables:
|
|
131
126
|
|
|
132
|
-
* `OCRD_METS_CACHING`: If set to `true`, access to the METS file is cached, speeding in-memory search and modification.
|
|
133
127
|
* `OCRD_PROFILE`: This variable configures the built-in CPU and memory profiling. If empty, no profiling is done. Otherwise expected to contain any of the following tokens:
|
|
134
128
|
* `CPU`: Enable CPU profiling of processor runs
|
|
135
129
|
* `RSS`: Enable RSS memory profiling
|
|
@@ -142,18 +136,46 @@ Some parts of the software are configured via environment variables:
|
|
|
142
136
|
* `XDG_CONFIG_HOME`: Directory to look for `./ocrd/resources.yml` (i.e. `ocrd resmgr` user database) – defaults to `$HOME/.config`.
|
|
143
137
|
* `XDG_DATA_HOME`: Directory to look for `./ocrd-resources/*` (i.e. `ocrd resmgr` data location) – defaults to `$HOME/.local/share`.
|
|
144
138
|
|
|
145
|
-
* `OCRD_DOWNLOAD_RETRIES`: Number of times to retry failed attempts for downloads of workspace files.
|
|
139
|
+
* `OCRD_DOWNLOAD_RETRIES`: Number of times to retry failed attempts for downloads of resources or workspace files.
|
|
146
140
|
* `OCRD_DOWNLOAD_TIMEOUT`: Timeout in seconds for connecting or reading (comma-separated) when downloading.
|
|
147
141
|
|
|
142
|
+
* `OCRD_MISSING_INPUT`: How to deal with missing input files (for some fileGrp/pageId) during processing:
|
|
143
|
+
* `SKIP`: ignore and proceed with next page's input
|
|
144
|
+
* `ABORT`: throw `MissingInputFile` exception
|
|
145
|
+
|
|
146
|
+
* `OCRD_MISSING_OUTPUT`: How to deal with missing output files (for some fileGrp/pageId) during processing:
|
|
147
|
+
* `SKIP`: ignore and proceed processing next page
|
|
148
|
+
* `COPY`: fall back to copying input PAGE to output fileGrp for page
|
|
149
|
+
* `ABORT`: re-throw whatever caused processing to fail
|
|
150
|
+
|
|
151
|
+
* `OCRD_MAX_MISSING_OUTPUTS`: Maximal rate of skipped/fallback pages among all processed pages before aborting (decimal fraction, ignored if negative).
|
|
152
|
+
|
|
153
|
+
* `OCRD_EXISTING_OUTPUT`: How to deal with already existing output files (for some fileGrp/pageId) during processing:
|
|
154
|
+
* `SKIP`: ignore and proceed processing next page
|
|
155
|
+
* `OVERWRITE`: force writing result to output fileGrp for page
|
|
156
|
+
* `ABORT`: re-throw `FileExistsError` exception
|
|
157
|
+
|
|
158
|
+
|
|
148
159
|
* `OCRD_METS_CACHING`: Whether to enable in-memory storage of OcrdMets data structures for speedup during processing or workspace operations.
|
|
149
160
|
|
|
150
161
|
* `OCRD_MAX_PROCESSOR_CACHE`: Maximum number of processor instances (for each set of parameters) to be kept in memory (including loaded models) for processing workers or processor servers.
|
|
151
162
|
|
|
163
|
+
* `OCRD_MAX_PARALLEL_PAGES`: Maximum number of processor threads for page-parallel processing (within each Processor's selected page range, independent of the number of Processing Workers or Processor Servers). If set `>1`, then a METS Server must be used for METS synchronisation.
|
|
164
|
+
|
|
165
|
+
* `OCRD_PROCESSING_PAGE_TIMEOUT`: Timeout in seconds for processing a single page. If set >0, when exceeded, the same as OCRD_MISSING_OUTPUT applies.
|
|
166
|
+
|
|
152
167
|
* `OCRD_NETWORK_SERVER_ADDR_PROCESSING`: Default address of Processing Server to connect to (for `ocrd network client processing`).
|
|
153
168
|
* `OCRD_NETWORK_SERVER_ADDR_WORKFLOW`: Default address of Workflow Server to connect to (for `ocrd network client workflow`).
|
|
154
169
|
* `OCRD_NETWORK_SERVER_ADDR_WORKSPACE`: Default address of Workspace Server to connect to (for `ocrd network client workspace`).
|
|
155
170
|
* `OCRD_NETWORK_RABBITMQ_CLIENT_CONNECT_ATTEMPTS`: Number of attempts for a worker to create its queue. Helpful if the rabbitmq-server needs time to be fully started.
|
|
156
171
|
|
|
172
|
+
* `OCRD_NETWORK_CLIENT_POLLING_SLEEP`: How many seconds to sleep before trying `ocrd network client` again.
|
|
173
|
+
* `OCRD_NETWORK_CLIENT_POLLING_TIMEOUT`: Timeout for a blocking `ocrd network client` (in seconds).
|
|
174
|
+
|
|
175
|
+
* `OCRD_NETWORK_SOCKETS_ROOT_DIR`: The root directory where all mets server related socket files are created.
|
|
176
|
+
* `OCRD_NETWORK_LOGS_ROOT_DIR`: The root directory where all ocrd_network related file logs are stored.
|
|
177
|
+
|
|
178
|
+
|
|
157
179
|
|
|
158
180
|
## Packages
|
|
159
181
|
|
|
@@ -47,17 +47,12 @@ complete stack of OCR-D-related software.
|
|
|
47
47
|
|
|
48
48
|
The easiest way to install is via `pip`:
|
|
49
49
|
|
|
50
|
-
|
|
51
|
-
pip install ocrd
|
|
50
|
+
pip install ocrd
|
|
52
51
|
|
|
53
|
-
# or just the functionality you need, e.g.
|
|
54
|
-
|
|
55
|
-
pip install ocrd_modelfactory
|
|
56
|
-
```
|
|
57
52
|
|
|
58
53
|
All Python software released by [OCR-D](https://github.com/OCR-D) requires Python 3.8 or higher.
|
|
59
54
|
|
|
60
|
-
**NOTE** Some OCR-D
|
|
55
|
+
> **NOTE** Some OCR-D tools (or even test cases) _might_ reveal an unintended behavior if you have specific environment modifications, like:
|
|
61
56
|
* using a custom build of [ImageMagick](https://github.com/ImageMagick/ImageMagick), whose format delegates are different from what OCR-D supposes
|
|
62
57
|
* custom Python logging configurations in your personal account
|
|
63
58
|
|
|
@@ -82,7 +77,6 @@ Almost all behaviour of the OCR-D/core software is configured via CLI options an
|
|
|
82
77
|
|
|
83
78
|
Some parts of the software are configured via environment variables:
|
|
84
79
|
|
|
85
|
-
* `OCRD_METS_CACHING`: If set to `true`, access to the METS file is cached, speeding in-memory search and modification.
|
|
86
80
|
* `OCRD_PROFILE`: This variable configures the built-in CPU and memory profiling. If empty, no profiling is done. Otherwise expected to contain any of the following tokens:
|
|
87
81
|
* `CPU`: Enable CPU profiling of processor runs
|
|
88
82
|
* `RSS`: Enable RSS memory profiling
|
|
@@ -95,18 +89,46 @@ Some parts of the software are configured via environment variables:
|
|
|
95
89
|
* `XDG_CONFIG_HOME`: Directory to look for `./ocrd/resources.yml` (i.e. `ocrd resmgr` user database) – defaults to `$HOME/.config`.
|
|
96
90
|
* `XDG_DATA_HOME`: Directory to look for `./ocrd-resources/*` (i.e. `ocrd resmgr` data location) – defaults to `$HOME/.local/share`.
|
|
97
91
|
|
|
98
|
-
* `OCRD_DOWNLOAD_RETRIES`: Number of times to retry failed attempts for downloads of workspace files.
|
|
92
|
+
* `OCRD_DOWNLOAD_RETRIES`: Number of times to retry failed attempts for downloads of resources or workspace files.
|
|
99
93
|
* `OCRD_DOWNLOAD_TIMEOUT`: Timeout in seconds for connecting or reading (comma-separated) when downloading.
|
|
100
94
|
|
|
95
|
+
* `OCRD_MISSING_INPUT`: How to deal with missing input files (for some fileGrp/pageId) during processing:
|
|
96
|
+
* `SKIP`: ignore and proceed with next page's input
|
|
97
|
+
* `ABORT`: throw `MissingInputFile` exception
|
|
98
|
+
|
|
99
|
+
* `OCRD_MISSING_OUTPUT`: How to deal with missing output files (for some fileGrp/pageId) during processing:
|
|
100
|
+
* `SKIP`: ignore and proceed processing next page
|
|
101
|
+
* `COPY`: fall back to copying input PAGE to output fileGrp for page
|
|
102
|
+
* `ABORT`: re-throw whatever caused processing to fail
|
|
103
|
+
|
|
104
|
+
* `OCRD_MAX_MISSING_OUTPUTS`: Maximal rate of skipped/fallback pages among all processed pages before aborting (decimal fraction, ignored if negative).
|
|
105
|
+
|
|
106
|
+
* `OCRD_EXISTING_OUTPUT`: How to deal with already existing output files (for some fileGrp/pageId) during processing:
|
|
107
|
+
* `SKIP`: ignore and proceed processing next page
|
|
108
|
+
* `OVERWRITE`: force writing result to output fileGrp for page
|
|
109
|
+
* `ABORT`: re-throw `FileExistsError` exception
|
|
110
|
+
|
|
111
|
+
|
|
101
112
|
* `OCRD_METS_CACHING`: Whether to enable in-memory storage of OcrdMets data structures for speedup during processing or workspace operations.
|
|
102
113
|
|
|
103
114
|
* `OCRD_MAX_PROCESSOR_CACHE`: Maximum number of processor instances (for each set of parameters) to be kept in memory (including loaded models) for processing workers or processor servers.
|
|
104
115
|
|
|
116
|
+
* `OCRD_MAX_PARALLEL_PAGES`: Maximum number of processor threads for page-parallel processing (within each Processor's selected page range, independent of the number of Processing Workers or Processor Servers). If set `>1`, then a METS Server must be used for METS synchronisation.
|
|
117
|
+
|
|
118
|
+
* `OCRD_PROCESSING_PAGE_TIMEOUT`: Timeout in seconds for processing a single page. If set >0, when exceeded, the same as OCRD_MISSING_OUTPUT applies.
|
|
119
|
+
|
|
105
120
|
* `OCRD_NETWORK_SERVER_ADDR_PROCESSING`: Default address of Processing Server to connect to (for `ocrd network client processing`).
|
|
106
121
|
* `OCRD_NETWORK_SERVER_ADDR_WORKFLOW`: Default address of Workflow Server to connect to (for `ocrd network client workflow`).
|
|
107
122
|
* `OCRD_NETWORK_SERVER_ADDR_WORKSPACE`: Default address of Workspace Server to connect to (for `ocrd network client workspace`).
|
|
108
123
|
* `OCRD_NETWORK_RABBITMQ_CLIENT_CONNECT_ATTEMPTS`: Number of attempts for a worker to create its queue. Helpful if the rabbitmq-server needs time to be fully started.
|
|
109
124
|
|
|
125
|
+
* `OCRD_NETWORK_CLIENT_POLLING_SLEEP`: How many seconds to sleep before trying `ocrd network client` again.
|
|
126
|
+
* `OCRD_NETWORK_CLIENT_POLLING_TIMEOUT`: Timeout for a blocking `ocrd network client` (in seconds).
|
|
127
|
+
|
|
128
|
+
* `OCRD_NETWORK_SOCKETS_ROOT_DIR`: The root directory where all mets server related socket files are created.
|
|
129
|
+
* `OCRD_NETWORK_LOGS_ROOT_DIR`: The root directory where all ocrd_network related file logs are stored.
|
|
130
|
+
|
|
131
|
+
|
|
110
132
|
|
|
111
133
|
## Packages
|
|
112
134
|
|
|
@@ -21,6 +21,9 @@ For example:
|
|
|
21
21
|
* [`ocrd__log`](#ocrd__log)
|
|
22
22
|
* [`ocrd__minversion`](#ocrd__minversion)
|
|
23
23
|
* [`ocrd__dumpjson`](#ocrd__dumpjson)
|
|
24
|
+
* [`ocrd__resolve_resource`](#ocrd__resolve_resource)
|
|
25
|
+
* [`ocrd__show_resource`](#ocrd__show_resource)
|
|
26
|
+
* [`ocrd__list_resources`](#ocrd__list_resources)
|
|
24
27
|
* [`ocrd__usage`](#ocrd__usage)
|
|
25
28
|
* [`ocrd__parse_argv`](#ocrd__parse_argv)
|
|
26
29
|
<!-- END-MARKDOWN-TOC -->
|
|
@@ -56,6 +59,10 @@ export OCRD_TOOL_NAME=ocrd-foo-bar
|
|
|
56
59
|
|
|
57
60
|
(Which you automatically get from [`ocrd__wrap`](#ocrd__wrap).)
|
|
58
61
|
|
|
62
|
+
### `ocrd__resolve_resource`
|
|
63
|
+
|
|
64
|
+
Output given resource file's path.
|
|
65
|
+
|
|
59
66
|
### `ocrd__show_resource`
|
|
60
67
|
|
|
61
68
|
Output given resource file's content.
|
|
@@ -88,6 +95,7 @@ This will be filled by the parser along the following keys:
|
|
|
88
95
|
- `profile`: whether `--profile` is enabled
|
|
89
96
|
- `profile_file`: the argument of `--profile-file`
|
|
90
97
|
- `log_level`: the argument of `--log-level`
|
|
98
|
+
- `mets_server_url`: the argument of `--mets-server-url` argument
|
|
91
99
|
- `mets_file`: absolute path of the `--mets` argument
|
|
92
100
|
- `working_dir`: absolute path of the `--working-dir` argument or the parent of `mets_file`
|
|
93
101
|
- `page_id`: the argument of `--page-id`
|
|
@@ -95,7 +103,7 @@ This will be filled by the parser along the following keys:
|
|
|
95
103
|
- `output_file_grp`: the argument of `--output-file-grp`
|
|
96
104
|
|
|
97
105
|
Moreover, there will be an associative array **`params`**
|
|
98
|
-
with the fully expanded runtime values of the ocrd-tool.json parameters.
|
|
106
|
+
with the fully validated and default-expanded runtime values of the `ocrd-tool.json` parameters.
|
|
99
107
|
|
|
100
108
|
### `ocrd__wrap`
|
|
101
109
|
|
ocrd-3.0.0b2/VERSION
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.0.0b2
|
|
@@ -156,6 +156,7 @@ ocrd__parse_argv () {
|
|
|
156
156
|
while [[ "${1:-}" = -* ]];do
|
|
157
157
|
case "$1" in
|
|
158
158
|
-l|--log-level) ocrd__argv[log_level]=$2 ; shift ;;
|
|
159
|
+
--log-filename) exec 2> "$2" ; shift ;;
|
|
159
160
|
-h|--help|--usage) ocrd__usage; exit ;;
|
|
160
161
|
-J|--dump-json) ocrd__dumpjson; exit ;;
|
|
161
162
|
-D|--dump-module-dir) echo $(dirname "$OCRD_TOOL_JSON"); exit ;;
|
|
@@ -120,7 +120,7 @@ class ClientSideOcrdMets:
|
|
|
120
120
|
|
|
121
121
|
def __init__(self, url, workspace_path: Optional[str] = None):
|
|
122
122
|
self.protocol = "tcp" if url.startswith("http://") else "uds"
|
|
123
|
-
self.log = getLogger(f"ocrd.
|
|
123
|
+
self.log = getLogger(f"ocrd.models.ocrd_mets.client.{url}")
|
|
124
124
|
self.url = url if self.protocol == "tcp" else f'http+unix://{url.replace("/", "%2F")}'
|
|
125
125
|
self.ws_dir_path = workspace_path if workspace_path else None
|
|
126
126
|
|
|
@@ -3,6 +3,7 @@ from .base import (
|
|
|
3
3
|
ResourceNotFoundError,
|
|
4
4
|
NonUniqueInputFile,
|
|
5
5
|
MissingInputFile,
|
|
6
|
+
generate_processor_help,
|
|
6
7
|
)
|
|
7
8
|
from .ocrd_page_result import (
|
|
8
9
|
OcrdPageResult,
|
|
@@ -11,5 +12,4 @@ from .ocrd_page_result import (
|
|
|
11
12
|
from .helpers import (
|
|
12
13
|
run_cli,
|
|
13
14
|
run_processor,
|
|
14
|
-
generate_processor_help
|
|
15
15
|
)
|
|
@@ -23,12 +23,16 @@ import tarfile
|
|
|
23
23
|
import io
|
|
24
24
|
import weakref
|
|
25
25
|
from frozendict import frozendict
|
|
26
|
+
from concurrent.futures import ThreadPoolExecutor, TimeoutError
|
|
27
|
+
|
|
28
|
+
from click import wrap_text
|
|
26
29
|
from deprecated import deprecated
|
|
27
30
|
from requests import HTTPError
|
|
28
31
|
|
|
29
|
-
from
|
|
32
|
+
from ..workspace import Workspace
|
|
33
|
+
from ..mets_server import ClientSideOcrdMets
|
|
30
34
|
from ocrd_models.ocrd_file import OcrdFileType
|
|
31
|
-
from
|
|
35
|
+
from .ocrd_page_result import OcrdPageResult
|
|
32
36
|
from ocrd_utils import (
|
|
33
37
|
VERSION as OCRD_VERSION,
|
|
34
38
|
MIMETYPE_PAGE,
|
|
@@ -58,7 +62,7 @@ from ocrd_modelfactory import page_from_file
|
|
|
58
62
|
from ocrd_validators.ocrd_tool_validator import OcrdToolValidator
|
|
59
63
|
|
|
60
64
|
# XXX imports must remain for backwards-compatibility
|
|
61
|
-
from .helpers import run_cli, run_processor
|
|
65
|
+
from .helpers import run_cli, run_processor # pylint: disable=unused-import
|
|
62
66
|
|
|
63
67
|
|
|
64
68
|
class ResourceNotFoundError(FileNotFoundError):
|
|
@@ -118,7 +122,27 @@ class Processor():
|
|
|
118
122
|
maximum number of cached instances (ignored if negative), to be applied on top of
|
|
119
123
|
:py:data:`~ocrd_utils.config.OCRD_MAX_PROCESSOR_CACHE` (i.e. whatever is smaller).
|
|
120
124
|
|
|
121
|
-
(Override this if you know how many instances fit into memory at once.)
|
|
125
|
+
(Override this if you know how many instances fit into memory - GPU / CPU RAM - at once.)
|
|
126
|
+
"""
|
|
127
|
+
|
|
128
|
+
max_workers : int = -1
|
|
129
|
+
"""
|
|
130
|
+
maximum number of processor threads for page-parallel processing (ignored if negative),
|
|
131
|
+
to be applied on top of :py:data:`~ocrd_utils.config.OCRD_MAX_PARALLEL_PAGES` (i.e.
|
|
132
|
+
whatever is smaller).
|
|
133
|
+
|
|
134
|
+
(Override this if you know how many pages fit into processing units - GPU shaders / CPU cores
|
|
135
|
+
- at once, or if your class is not thread-safe.)
|
|
136
|
+
"""
|
|
137
|
+
|
|
138
|
+
max_page_seconds : int = -1
|
|
139
|
+
"""
|
|
140
|
+
maximum number of seconds may be spent processing a single page (ignored if negative),
|
|
141
|
+
to be applied on top of :py:data:`~ocrd_utils.config.OCRD_PROCESSING_PAGE_TIMEOUT`
|
|
142
|
+
(i.e. whatever is smaller).
|
|
143
|
+
|
|
144
|
+
(Override this if you know how costly this processor may be, irrespective of image size
|
|
145
|
+
or complexity of the page.)
|
|
122
146
|
"""
|
|
123
147
|
|
|
124
148
|
@property
|
|
@@ -142,7 +166,11 @@ class Processor():
|
|
|
142
166
|
|
|
143
167
|
(Override if ``ocrd-tool.json`` is not distributed with the Python package.)
|
|
144
168
|
"""
|
|
145
|
-
|
|
169
|
+
# XXX HACK
|
|
170
|
+
module_tokens = self.__module__.split('.')
|
|
171
|
+
if module_tokens[0] == 'src':
|
|
172
|
+
module_tokens.pop(0)
|
|
173
|
+
return resource_filename(module_tokens[0], self.metadata_filename)
|
|
146
174
|
|
|
147
175
|
@cached_property
|
|
148
176
|
def metadata_rawdict(self) -> dict:
|
|
@@ -273,12 +301,12 @@ class Processor():
|
|
|
273
301
|
if ocrd_tool is not None:
|
|
274
302
|
deprecation_warning("Passing 'ocrd_tool' as keyword argument to Processor is deprecated - "
|
|
275
303
|
"use or override metadata/executable/ocrd-tool properties instead")
|
|
276
|
-
self.
|
|
277
|
-
self.
|
|
304
|
+
self.ocrd_tool = ocrd_tool
|
|
305
|
+
self.executable = ocrd_tool['executable']
|
|
278
306
|
if version is not None:
|
|
279
307
|
deprecation_warning("Passing 'version' as keyword argument to Processor is deprecated - "
|
|
280
308
|
"use or override metadata/version properties instead")
|
|
281
|
-
self.
|
|
309
|
+
self.version = version
|
|
282
310
|
if workspace is not None:
|
|
283
311
|
deprecation_warning("Passing a workspace argument other than 'None' to Processor "
|
|
284
312
|
"is deprecated - pass as argument to process_workspace instead")
|
|
@@ -422,7 +450,29 @@ class Processor():
|
|
|
422
450
|
self.workspace = workspace
|
|
423
451
|
self.verify()
|
|
424
452
|
try:
|
|
425
|
-
|
|
453
|
+
nr_succeeded = 0
|
|
454
|
+
nr_skipped = 0
|
|
455
|
+
nr_copied = 0
|
|
456
|
+
|
|
457
|
+
# set up multithreading
|
|
458
|
+
if self.max_workers <= 0:
|
|
459
|
+
max_workers = max(0, config.OCRD_MAX_PARALLEL_PAGES)
|
|
460
|
+
else:
|
|
461
|
+
max_workers = max(0, min(config.OCRD_MAX_PARALLEL_PAGES, self.max_workers))
|
|
462
|
+
if max_workers > 1:
|
|
463
|
+
assert isinstance(workspace.mets, ClientSideOcrdMets), \
|
|
464
|
+
"OCRD_MAX_PARALLEL_PAGES>1 requires also using --mets-server-url"
|
|
465
|
+
if self.max_page_seconds <= 0:
|
|
466
|
+
max_seconds = max(0, config.OCRD_PROCESSING_PAGE_TIMEOUT)
|
|
467
|
+
else:
|
|
468
|
+
max_seconds = max(0, min(config.OCRD_PROCESSING_PAGE_TIMEOUT, self.max_page_seconds))
|
|
469
|
+
executor = ThreadPoolExecutor(
|
|
470
|
+
max_workers=max_workers or 1,
|
|
471
|
+
thread_name_prefix=f"pagetask.{workspace.mets.unique_identifier}"
|
|
472
|
+
)
|
|
473
|
+
self._base_logger.debug("started executor %s", str(executor))
|
|
474
|
+
tasks = {}
|
|
475
|
+
|
|
426
476
|
for input_file_tuple in self.zip_input_files(on_error='abort', require_first=False):
|
|
427
477
|
input_files : List[Optional[OcrdFileType]] = [None] * len(input_file_tuple)
|
|
428
478
|
page_id = next(input_file.pageId
|
|
@@ -441,35 +491,55 @@ class Processor():
|
|
|
441
491
|
except (ValueError, FileNotFoundError, HTTPError) as e:
|
|
442
492
|
self._base_logger.error(repr(e))
|
|
443
493
|
self._base_logger.warning(f"failed downloading file {input_file} for page {page_id}")
|
|
494
|
+
# process page
|
|
495
|
+
tasks[executor.submit(self.process_page_file, *input_files)] = (page_id, input_files)
|
|
496
|
+
self._base_logger.debug("submitted %d processing tasks", len(tasks))
|
|
497
|
+
|
|
498
|
+
for task in tasks:
|
|
499
|
+
# wait for results, handle errors
|
|
500
|
+
page_id, input_files = tasks[task]
|
|
444
501
|
# FIXME: differentiate error cases in various ways:
|
|
445
502
|
# - ResourceNotFoundError → use ResourceManager to download (once), then retry
|
|
446
503
|
# - transient (I/O or OOM) error → maybe sleep, retry
|
|
447
504
|
# - persistent (data) error → skip / dummy / raise
|
|
448
505
|
try:
|
|
449
|
-
self.
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
506
|
+
self._base_logger.debug("waiting for output of task %s (page %s) max_seconds=%d", task, page_id, max_seconds)
|
|
507
|
+
task.result(timeout=max_seconds or None)
|
|
508
|
+
nr_succeeded += 1
|
|
509
|
+
# exclude NotImplementedError, so we can try process() below
|
|
510
|
+
except NotImplementedError:
|
|
511
|
+
raise
|
|
512
|
+
# handle input failures separately
|
|
513
|
+
except FileExistsError as err:
|
|
514
|
+
if config.OCRD_EXISTING_OUTPUT == 'ABORT':
|
|
453
515
|
raise err
|
|
454
|
-
if
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
self._base_logger.exception(f"Failure on page {page_id}: {err}")
|
|
516
|
+
if config.OCRD_EXISTING_OUTPUT == 'SKIP':
|
|
517
|
+
continue
|
|
518
|
+
if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE':
|
|
519
|
+
# too late here, must not happen
|
|
520
|
+
raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE")
|
|
521
|
+
# broad coverage of output failures (including TimeoutError)
|
|
522
|
+
except (Exception, TimeoutError) as err:
|
|
523
|
+
# FIXME: add re-usable/actionable logging
|
|
524
|
+
self._base_logger.error(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}")
|
|
464
525
|
if config.OCRD_MISSING_OUTPUT == 'ABORT':
|
|
465
526
|
raise err
|
|
466
527
|
if config.OCRD_MISSING_OUTPUT == 'SKIP':
|
|
528
|
+
nr_skipped += 1
|
|
467
529
|
continue
|
|
468
530
|
if config.OCRD_MISSING_OUTPUT == 'COPY':
|
|
469
531
|
self._copy_page_file(input_files[0])
|
|
532
|
+
nr_copied += 1
|
|
470
533
|
else:
|
|
471
534
|
desc = config.describe('OCRD_MISSING_OUTPUT', wrap_text=False, indent_text=False)
|
|
472
535
|
raise ValueError(f"unknown configuration value {config.OCRD_MISSING_OUTPUT} - {desc}")
|
|
536
|
+
|
|
537
|
+
if nr_skipped > 0 and nr_succeeded / nr_skipped < config.OCRD_MAX_MISSING_OUTPUTS:
|
|
538
|
+
raise Exception(f"too many failures with skipped output ({nr_skipped})")
|
|
539
|
+
if nr_copied > 0 and nr_succeeded / nr_copied < config.OCRD_MAX_MISSING_OUTPUTS:
|
|
540
|
+
raise Exception(f"too many failures with fallback output ({nr_skipped})")
|
|
541
|
+
executor.shutdown()
|
|
542
|
+
|
|
473
543
|
except NotImplementedError:
|
|
474
544
|
# fall back to deprecated method
|
|
475
545
|
self.process()
|
|
@@ -493,13 +563,14 @@ class Processor():
|
|
|
493
563
|
output_file_id = make_file_id(input_file, self.output_file_grp)
|
|
494
564
|
input_pcgts.set_pcGtsId(output_file_id)
|
|
495
565
|
self.add_metadata(input_pcgts)
|
|
496
|
-
self.workspace.add_file(
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
566
|
+
self.workspace.add_file(
|
|
567
|
+
file_id=output_file_id,
|
|
568
|
+
file_grp=self.output_file_grp,
|
|
569
|
+
page_id=input_file.pageId,
|
|
570
|
+
local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'),
|
|
571
|
+
mimetype=MIMETYPE_PAGE,
|
|
572
|
+
content=to_xml(input_pcgts),
|
|
573
|
+
force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
|
|
503
574
|
)
|
|
504
575
|
|
|
505
576
|
def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None:
|
|
@@ -532,6 +603,9 @@ class Processor():
|
|
|
532
603
|
image_file_id = f'{output_file_id}_{image_result.file_id_suffix}'
|
|
533
604
|
image_file_path = join(self.output_file_grp, f'{image_file_id}.png')
|
|
534
605
|
if isinstance(image_result.alternative_image, PageType):
|
|
606
|
+
# special case: not an alternative image, but replacing the original image
|
|
607
|
+
# (this is needed by certain processors when the original's coordinate system
|
|
608
|
+
# cannot or must not be kept)
|
|
535
609
|
image_result.alternative_image.set_imageFilename(image_file_path)
|
|
536
610
|
image_result.alternative_image.set_imageWidth(image_result.pil.width)
|
|
537
611
|
image_result.alternative_image.set_imageHeight(image_result.pil.height)
|
|
@@ -550,13 +624,14 @@ class Processor():
|
|
|
550
624
|
)
|
|
551
625
|
result.pcgts.set_pcGtsId(output_file_id)
|
|
552
626
|
self.add_metadata(result.pcgts)
|
|
553
|
-
self.workspace.add_file(
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
627
|
+
self.workspace.add_file(
|
|
628
|
+
file_id=output_file_id,
|
|
629
|
+
file_grp=self.output_file_grp,
|
|
630
|
+
page_id=page_id,
|
|
631
|
+
local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'),
|
|
632
|
+
mimetype=MIMETYPE_PAGE,
|
|
633
|
+
content=to_xml(result.pcgts),
|
|
634
|
+
force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
|
|
560
635
|
)
|
|
561
636
|
|
|
562
637
|
def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult:
|
|
@@ -838,3 +913,151 @@ class Processor():
|
|
|
838
913
|
if ifiles[0] or not require_first:
|
|
839
914
|
ifts.append(tuple(ifiles))
|
|
840
915
|
return ifts
|
|
916
|
+
|
|
917
|
+
def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None):
    """Generate a string describing the full CLI of this processor including params.

    Args:
        ocrd_tool (dict): this processor's ``tools`` section of the module's ``ocrd-tool.json``
        processor_instance (object, optional): the processor implementation
            (for adding any module/class/function docstrings)
        subcommand (string, optional): ``'worker'`` or ``'server'`` to get the help
            of that subcommand; any falsy value yields the general processing help

    Returns:
        The help text as a string, or ``None`` if ``subcommand`` is neither falsy
        nor one of the known subcommands.
    """
    # Collect free-text documentation from the implementation (module, class, method).
    doc_help = ''
    if processor_instance:
        module = inspect.getmodule(processor_instance)
        if module and module.__doc__:
            doc_help += '\n' + inspect.cleandoc(module.__doc__) + '\n'
        if processor_instance.__doc__:
            doc_help += '\n' + inspect.cleandoc(processor_instance.__doc__) + '\n'
        # Use the docstring of the most specific method the implementation may have
        # overridden (first match wins). A docstring identical to the one on Processor
        # is treated as not overridden, so the base class documentation is not repeated.
        for method in ['process_page_pcgts', 'process_page_file', 'process_workspace', 'process']:
            instance_method = getattr(processor_instance, method)
            superclass_method = getattr(Processor, method)
            if instance_method.__doc__ and instance_method.__doc__ != superclass_method.__doc__:
                doc_help += '\n' + inspect.cleandoc(instance_method.__doc__) + '\n'
                break
        if doc_help:
            doc_help = '\n\n' + wrap_text(doc_help, width=72,
                                          initial_indent='  > ',
                                          subsequent_indent='  > ',
                                          preserve_paragraphs=True)

    subcommands = '''\
    worker      Start a processing worker rather than do local processing
    server      Start a processor server rather than do local processing
'''

    processing_worker_options = '''\
  --queue                         The RabbitMQ server address in format
                                  "amqp://{user}:{pass}@{host}:{port}/{vhost}"
                                  [amqp://admin:admin@localhost:5672]
  --database                      The MongoDB server address in format
                                  "mongodb://{host}:{port}"
                                  [mongodb://localhost:27018]
  --log-filename                  Filename to redirect STDOUT/STDERR to,
                                  if specified.
'''

    processing_server_options = '''\
  --address                       The Processor server address in format
                                  "{host}:{port}"
  --database                      The MongoDB server address in format
                                  "mongodb://{host}:{port}"
                                  [mongodb://localhost:27018]
'''

    processing_options = '''\
  -m, --mets URL-PATH             URL or file path of METS to process [./mets.xml]
  -w, --working-dir PATH          Working directory of local workspace [dirname(URL-PATH)]
  -I, --input-file-grp USE        File group(s) used as input
  -O, --output-file-grp USE       File group(s) used as output
  -g, --page-id ID                Physical page ID(s) to process instead of full document []
  --overwrite                     Remove existing output pages/images
                                  (with "--page-id", remove only those).
                                  Short-hand for OCRD_EXISTING_OUTPUT=OVERWRITE
  --debug                         Abort on any errors with full stack trace.
                                  Short-hand for OCRD_MISSING_OUTPUT=ABORT
  --profile                       Enable profiling
  --profile-file PROF-PATH        Write cProfile stats to PROF-PATH. Implies "--profile"
  -p, --parameter JSON-PATH       Parameters, either verbatim JSON string
                                  or JSON file path
  -P, --param-override KEY VAL    Override a single JSON object key-value pair,
                                  taking precedence over --parameter
  -U, --mets-server-url URL       URL of a METS Server for parallel incremental access to METS
                                  If URL starts with http:// start an HTTP server there,
                                  otherwise URL is a path to an on-demand-created unix socket
  -l, --log-level [OFF|ERROR|WARN|INFO|DEBUG|TRACE]
                                  Override log level globally [INFO]
  --log-filename LOG-PATH         File to redirect stderr logging to (overriding ocrd_logging.conf).
'''

    information_options = '''\
  -C, --show-resource RESNAME     Dump the content of processor resource RESNAME
  -L, --list-resources            List names of processor resources
  -J, --dump-json                 Dump tool description as JSON
  -D, --dump-module-dir           Show the 'module' resource location path for this processor
  -h, --help                      Show this message
  -V, --version                   Show version
'''

    # Render one entry per declared parameter: name, type, required/default, description, enum.
    parameter_help = ''
    if 'parameters' not in ocrd_tool or not ocrd_tool['parameters']:
        parameter_help = '  NONE\n'
    else:
        def wrap(s):
            return wrap_text(s, initial_indent=' '*3,
                             subsequent_indent=' '*4,
                             width=72, preserve_paragraphs=True)
        for param_name, param in ocrd_tool['parameters'].items():
            # "required" takes precedence over showing a default value
            if param.get('required'):
                qualifier = ' - REQUIRED'
            elif 'default' in param:
                qualifier = ' - %s' % json.dumps(param['default'])
            else:
                qualifier = ''
            parameter_help += wrap('"%s" [%s%s]' % (param_name, param['type'], qualifier))
            parameter_help += '\n ' + wrap(param['description'])
            if 'enum' in param:
                parameter_help += '\n ' + wrap('Possible values: %s' % json.dumps(param['enum']))
            parameter_help += "\n"

    if not subcommand:
        return f'''\
Usage: {ocrd_tool['executable']} [worker|server] [OPTIONS]

  {ocrd_tool['description']}{doc_help}

Subcommands:
{subcommands}
Options for processing:
{processing_options}
Options for information:
{information_options}
Parameters:
{parameter_help}
'''
    if subcommand == 'worker':
        return f'''\
Usage: {ocrd_tool['executable']} worker [OPTIONS]

  Run {ocrd_tool['executable']} as a processing worker.

  {ocrd_tool['description']}{doc_help}

Options:
{processing_worker_options}
'''
    if subcommand == 'server':
        return f'''\
Usage: {ocrd_tool['executable']} server [OPTIONS]

  Run {ocrd_tool['executable']} as a processor server.

  {ocrd_tool['description']}{doc_help}

Options:
{processing_server_options}
'''
    # Unknown subcommand: no help text is available (kept as None for backward compatibility).
    return None
|