ocrd 3.0.0a2__tar.gz → 3.0.0b2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ocrd-3.0.0a2/src/ocrd.egg-info → ocrd-3.0.0b2}/PKG-INFO +32 -10
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/README.md +31 -9
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/README_bashlib.md +9 -1
- ocrd-3.0.0b2/VERSION +1 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd/cli/__init__.py +34 -26
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd/cli/bashlib.py +32 -18
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd/cli/ocrd_tool.py +7 -5
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd/cli/workspace.py +10 -8
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd/decorators/__init__.py +13 -7
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd/lib.bash +3 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd/mets_server.py +3 -4
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd/processor/__init__.py +1 -1
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd/processor/base.py +421 -98
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd/processor/builtin/dummy_processor.py +4 -11
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd/processor/helpers.py +24 -161
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd/processor/ocrd_page_result.py +3 -3
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd/resolver.py +0 -3
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd/resource_manager.py +9 -5
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd/workspace.py +10 -11
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd/workspace_backup.py +1 -1
- {ocrd-3.0.0a2 → ocrd-3.0.0b2/src/ocrd.egg-info}/PKG-INFO +32 -10
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd.egg-info/SOURCES.txt +1 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_modelfactory/__init__.py +1 -1
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_models/constants.py +0 -1
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_models/ocrd_exif.py +2 -2
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_models/ocrd_file.py +2 -2
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_models/ocrd_mets.py +22 -22
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_models/ocrd_page.py +0 -1
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_models/ocrd_xml_base.py +2 -2
- ocrd-3.0.0b2/src/ocrd_network/cli/client.py +203 -0
- ocrd-3.0.0b2/src/ocrd_network/client.py +63 -0
- ocrd-3.0.0b2/src/ocrd_network/client_utils.py +101 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_network/processing_server.py +1 -1
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_network/runtime_data/deployer.py +12 -3
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_network/server_utils.py +12 -10
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_utils/__init__.py +2 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_utils/config.py +31 -2
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_utils/image.py +25 -25
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_utils/logging.py +20 -20
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_utils/os.py +4 -5
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_utils/str.py +10 -3
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_validators/json_validator.py +1 -3
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_validators/ocrd_tool_validator.py +2 -2
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_validators/page_validator.py +56 -56
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_validators/parameter_validator.py +2 -2
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_validators/resource_list_validator.py +4 -3
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_validators/workspace_validator.py +21 -21
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_validators/xsd_validator.py +1 -1
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/tests/test_decorators.py +1 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/tests/test_mets_server.py +29 -19
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/tests/test_workspace.py +4 -4
- ocrd-3.0.0a2/VERSION +0 -1
- ocrd-3.0.0a2/src/ocrd_network/cli/client.py +0 -99
- ocrd-3.0.0a2/src/ocrd_network/client.py +0 -37
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/LICENSE +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/MANIFEST.in +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/README_ocrd.md +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/README_ocrd_modelfactory.md +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/README_ocrd_models.md +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/README_ocrd_network.md +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/README_ocrd_utils.md +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/README_ocrd_validators.md +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/pyproject.toml +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/requirements.txt +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/setup.cfg +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd/__init__.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd/cli/log.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd/cli/network.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd/cli/process.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd/cli/resmgr.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd/cli/validate.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd/cli/zip.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd/constants.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd/decorators/loglevel_option.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd/decorators/mets_find_options.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd/decorators/ocrd_cli_options.py +1 -1
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd/decorators/parameter_option.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd/ocrd-all-tool.json +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd/processor/builtin/__init__.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd/processor/builtin/dummy/__init__.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd/processor/builtin/dummy/ocrd-tool.json +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd/resource_list.yml +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd/task_sequence.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd/workspace_bagger.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd.egg-info/dependency_links.txt +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd.egg-info/entry_points.txt +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd.egg-info/requires.txt +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd.egg-info/top_level.txt +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_models/__init__.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_models/mets-empty.xml +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_models/ocrd_agent.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_models/ocrd_page_generateds.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_models/report.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_models/utils.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_network/__init__.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_network/cli/__init__.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_network/cli/processing_server.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_network/cli/processing_worker.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_network/cli/processor_server.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_network/constants.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_network/database.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_network/logging_utils.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_network/models/__init__.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_network/models/job.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_network/models/messages.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_network/models/ocrd_tool.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_network/models/workflow.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_network/models/workspace.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_network/param_validators.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_network/process_helpers.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_network/processing_worker.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_network/processor_server.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_network/rabbitmq_utils/__init__.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_network/rabbitmq_utils/connector.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_network/rabbitmq_utils/constants.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_network/rabbitmq_utils/consumer.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_network/rabbitmq_utils/helpers.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_network/rabbitmq_utils/ocrd_messages.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_network/rabbitmq_utils/publisher.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_network/runtime_data/__init__.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_network/runtime_data/config_parser.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_network/runtime_data/connection_clients.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_network/runtime_data/hosts.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_network/runtime_data/network_agents.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_network/runtime_data/network_services.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_network/server_cache.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_network/tcp_to_uds_mets_proxy.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_network/utils.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_utils/constants.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_utils/deprecate.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_utils/introspect.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_utils/ocrd_logging.conf +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_validators/__init__.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_validators/bagit-profile.yml +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_validators/constants.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_validators/message_processing.schema.yml +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_validators/message_result.schema.yml +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_validators/mets.xsd +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_validators/ocrd_network_message_validator.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_validators/ocrd_tool.schema.yml +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_validators/ocrd_zip_validator.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_validators/page.xsd +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_validators/processing_server_config.schema.yml +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_validators/processing_server_config_validator.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_validators/xlink.xsd +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_validators/xsd_mets_validator.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/src/ocrd_validators/xsd_page_validator.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/tests/test_logging.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/tests/test_logging_conf.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/tests/test_model_factory.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/tests/test_resolver.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/tests/test_resolver_oai.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/tests/test_resource_manager.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/tests/test_task_sequence.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/tests/test_utils.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/tests/test_version.py +0 -0
- {ocrd-3.0.0a2 → ocrd-3.0.0b2}/tests/test_workspace_remove.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: ocrd
|
|
3
|
-
Version: 3.0.0a2
|
|
3
|
+
Version: 3.0.0b2
|
|
4
4
|
Summary: OCR-D framework
|
|
5
5
|
Author-email: Konstantin Baierer <unixprog@gmail.com>
|
|
6
6
|
License: Apache License 2.0
|
|
@@ -94,17 +94,12 @@ complete stack of OCR-D-related software.
|
|
|
94
94
|
|
|
95
95
|
The easiest way to install is via `pip`:
|
|
96
96
|
|
|
97
|
-
|
|
98
|
-
pip install ocrd
|
|
97
|
+
pip install ocrd
|
|
99
98
|
|
|
100
|
-
# or just the functionality you need, e.g.
|
|
101
|
-
|
|
102
|
-
pip install ocrd_modelfactory
|
|
103
|
-
```
|
|
104
99
|
|
|
105
100
|
All Python software released by [OCR-D](https://github.com/OCR-D) requires Python 3.8 or higher.
|
|
106
101
|
|
|
107
|
-
**NOTE** Some OCR-D
|
|
102
|
+
> **NOTE** Some OCR-D tools (or even test cases) _might_ reveal an unintended behavior if you have specific environment modifications, like:
|
|
108
103
|
* using a custom build of [ImageMagick](https://github.com/ImageMagick/ImageMagick), whose format delegates are different from what OCR-D supposes
|
|
109
104
|
* custom Python logging configurations in your personal account
|
|
110
105
|
|
|
@@ -129,7 +124,6 @@ Almost all behaviour of the OCR-D/core software is configured via CLI options an
|
|
|
129
124
|
|
|
130
125
|
Some parts of the software are configured via environment variables:
|
|
131
126
|
|
|
132
|
-
* `OCRD_METS_CACHING`: If set to `true`, access to the METS file is cached, speeding in-memory search and modification.
|
|
133
127
|
* `OCRD_PROFILE`: This variable configures the built-in CPU and memory profiling. If empty, no profiling is done. Otherwise expected to contain any of the following tokens:
|
|
134
128
|
* `CPU`: Enable CPU profiling of processor runs
|
|
135
129
|
* `RSS`: Enable RSS memory profiling
|
|
@@ -142,18 +136,46 @@ Some parts of the software are configured via environment variables:
|
|
|
142
136
|
* `XDG_CONFIG_HOME`: Directory to look for `./ocrd/resources.yml` (i.e. `ocrd resmgr` user database) – defaults to `$HOME/.config`.
|
|
143
137
|
* `XDG_DATA_HOME`: Directory to look for `./ocrd-resources/*` (i.e. `ocrd resmgr` data location) – defaults to `$HOME/.local/share`.
|
|
144
138
|
|
|
145
|
-
* `OCRD_DOWNLOAD_RETRIES`: Number of times to retry failed attempts for downloads of workspace files.
|
|
139
|
+
* `OCRD_DOWNLOAD_RETRIES`: Number of times to retry failed attempts for downloads of resources or workspace files.
|
|
146
140
|
* `OCRD_DOWNLOAD_TIMEOUT`: Timeout in seconds for connecting or reading (comma-separated) when downloading.
|
|
147
141
|
|
|
142
|
+
* `OCRD_MISSING_INPUT`: How to deal with missing input files (for some fileGrp/pageId) during processing:
|
|
143
|
+
* `SKIP`: ignore and proceed with next page's input
|
|
144
|
+
* `ABORT`: throw `MissingInputFile` exception
|
|
145
|
+
|
|
146
|
+
* `OCRD_MISSING_OUTPUT`: How to deal with missing output files (for some fileGrp/pageId) during processing:
|
|
147
|
+
* `SKIP`: ignore and proceed processing next page
|
|
148
|
+
* `COPY`: fall back to copying input PAGE to output fileGrp for page
|
|
149
|
+
* `ABORT`: re-throw whatever caused processing to fail
|
|
150
|
+
|
|
151
|
+
* `OCRD_MAX_MISSING_OUTPUTS`: Maximal rate of skipped/fallback pages among all processed pages before aborting (decimal fraction, ignored if negative).
|
|
152
|
+
|
|
153
|
+
* `OCRD_EXISTING_OUTPUT`: How to deal with already existing output files (for some fileGrp/pageId) during processing:
|
|
154
|
+
* `SKIP`: ignore and proceed processing next page
|
|
155
|
+
* `OVERWRITE`: force writing result to output fileGrp for page
|
|
156
|
+
* `ABORT`: re-throw `FileExistsError` exception
|
|
157
|
+
|
|
158
|
+
|
|
148
159
|
* `OCRD_METS_CACHING`: Whether to enable in-memory storage of OcrdMets data structures for speedup during processing or workspace operations.
|
|
149
160
|
|
|
150
161
|
* `OCRD_MAX_PROCESSOR_CACHE`: Maximum number of processor instances (for each set of parameters) to be kept in memory (including loaded models) for processing workers or processor servers.
|
|
151
162
|
|
|
163
|
+
* `OCRD_MAX_PARALLEL_PAGES`: Maximum number of processor threads for page-parallel processing (within each Processor's selected page range, independent of the number of Processing Workers or Processor Servers). If set `>1`, then a METS Server must be used for METS synchronisation.
|
|
164
|
+
|
|
165
|
+
* `OCRD_PROCESSING_PAGE_TIMEOUT`: Timeout in seconds for processing a single page. If set `>0` and exceeded, the same handling as for `OCRD_MISSING_OUTPUT` applies.
|
|
166
|
+
|
|
152
167
|
* `OCRD_NETWORK_SERVER_ADDR_PROCESSING`: Default address of Processing Server to connect to (for `ocrd network client processing`).
|
|
153
168
|
* `OCRD_NETWORK_SERVER_ADDR_WORKFLOW`: Default address of Workflow Server to connect to (for `ocrd network client workflow`).
|
|
154
169
|
* `OCRD_NETWORK_SERVER_ADDR_WORKSPACE`: Default address of Workspace Server to connect to (for `ocrd network client workspace`).
|
|
155
170
|
* `OCRD_NETWORK_RABBITMQ_CLIENT_CONNECT_ATTEMPTS`: Number of attempts for a worker to create its queue. Helpful if the rabbitmq-server needs time to be fully started.
|
|
156
171
|
|
|
172
|
+
* `OCRD_NETWORK_CLIENT_POLLING_SLEEP`: How many seconds to sleep before trying `ocrd network client` again.
|
|
173
|
+
* `OCRD_NETWORK_CLIENT_POLLING_TIMEOUT`: Timeout for a blocking `ocrd network client` (in seconds).
|
|
174
|
+
|
|
175
|
+
* `OCRD_NETWORK_SOCKETS_ROOT_DIR`: The root directory where all mets server related socket files are created.
|
|
176
|
+
* `OCRD_NETWORK_LOGS_ROOT_DIR`: The root directory where all ocrd_network related file logs are stored.
|
|
177
|
+
|
|
178
|
+
|
|
157
179
|
|
|
158
180
|
## Packages
|
|
159
181
|
|
|
@@ -47,17 +47,12 @@ complete stack of OCR-D-related software.
|
|
|
47
47
|
|
|
48
48
|
The easiest way to install is via `pip`:
|
|
49
49
|
|
|
50
|
-
|
|
51
|
-
pip install ocrd
|
|
50
|
+
pip install ocrd
|
|
52
51
|
|
|
53
|
-
# or just the functionality you need, e.g.
|
|
54
|
-
|
|
55
|
-
pip install ocrd_modelfactory
|
|
56
|
-
```
|
|
57
52
|
|
|
58
53
|
All Python software released by [OCR-D](https://github.com/OCR-D) requires Python 3.8 or higher.
|
|
59
54
|
|
|
60
|
-
**NOTE** Some OCR-D
|
|
55
|
+
> **NOTE** Some OCR-D tools (or even test cases) _might_ reveal an unintended behavior if you have specific environment modifications, like:
|
|
61
56
|
* using a custom build of [ImageMagick](https://github.com/ImageMagick/ImageMagick), whose format delegates are different from what OCR-D supposes
|
|
62
57
|
* custom Python logging configurations in your personal account
|
|
63
58
|
|
|
@@ -82,7 +77,6 @@ Almost all behaviour of the OCR-D/core software is configured via CLI options an
|
|
|
82
77
|
|
|
83
78
|
Some parts of the software are configured via environment variables:
|
|
84
79
|
|
|
85
|
-
* `OCRD_METS_CACHING`: If set to `true`, access to the METS file is cached, speeding in-memory search and modification.
|
|
86
80
|
* `OCRD_PROFILE`: This variable configures the built-in CPU and memory profiling. If empty, no profiling is done. Otherwise expected to contain any of the following tokens:
|
|
87
81
|
* `CPU`: Enable CPU profiling of processor runs
|
|
88
82
|
* `RSS`: Enable RSS memory profiling
|
|
@@ -95,18 +89,46 @@ Some parts of the software are configured via environment variables:
|
|
|
95
89
|
* `XDG_CONFIG_HOME`: Directory to look for `./ocrd/resources.yml` (i.e. `ocrd resmgr` user database) – defaults to `$HOME/.config`.
|
|
96
90
|
* `XDG_DATA_HOME`: Directory to look for `./ocrd-resources/*` (i.e. `ocrd resmgr` data location) – defaults to `$HOME/.local/share`.
|
|
97
91
|
|
|
98
|
-
* `OCRD_DOWNLOAD_RETRIES`: Number of times to retry failed attempts for downloads of workspace files.
|
|
92
|
+
* `OCRD_DOWNLOAD_RETRIES`: Number of times to retry failed attempts for downloads of resources or workspace files.
|
|
99
93
|
* `OCRD_DOWNLOAD_TIMEOUT`: Timeout in seconds for connecting or reading (comma-separated) when downloading.
|
|
100
94
|
|
|
95
|
+
* `OCRD_MISSING_INPUT`: How to deal with missing input files (for some fileGrp/pageId) during processing:
|
|
96
|
+
* `SKIP`: ignore and proceed with next page's input
|
|
97
|
+
* `ABORT`: throw `MissingInputFile` exception
|
|
98
|
+
|
|
99
|
+
* `OCRD_MISSING_OUTPUT`: How to deal with missing output files (for some fileGrp/pageId) during processing:
|
|
100
|
+
* `SKIP`: ignore and proceed processing next page
|
|
101
|
+
* `COPY`: fall back to copying input PAGE to output fileGrp for page
|
|
102
|
+
* `ABORT`: re-throw whatever caused processing to fail
|
|
103
|
+
|
|
104
|
+
* `OCRD_MAX_MISSING_OUTPUTS`: Maximal rate of skipped/fallback pages among all processed pages before aborting (decimal fraction, ignored if negative).
|
|
105
|
+
|
|
106
|
+
* `OCRD_EXISTING_OUTPUT`: How to deal with already existing output files (for some fileGrp/pageId) during processing:
|
|
107
|
+
* `SKIP`: ignore and proceed processing next page
|
|
108
|
+
* `OVERWRITE`: force writing result to output fileGrp for page
|
|
109
|
+
* `ABORT`: re-throw `FileExistsError` exception
|
|
110
|
+
|
|
111
|
+
|
|
101
112
|
* `OCRD_METS_CACHING`: Whether to enable in-memory storage of OcrdMets data structures for speedup during processing or workspace operations.
|
|
102
113
|
|
|
103
114
|
* `OCRD_MAX_PROCESSOR_CACHE`: Maximum number of processor instances (for each set of parameters) to be kept in memory (including loaded models) for processing workers or processor servers.
|
|
104
115
|
|
|
116
|
+
* `OCRD_MAX_PARALLEL_PAGES`: Maximum number of processor threads for page-parallel processing (within each Processor's selected page range, independent of the number of Processing Workers or Processor Servers). If set `>1`, then a METS Server must be used for METS synchronisation.
|
|
117
|
+
|
|
118
|
+
* `OCRD_PROCESSING_PAGE_TIMEOUT`: Timeout in seconds for processing a single page. If set `>0` and exceeded, the same handling as for `OCRD_MISSING_OUTPUT` applies.
|
|
119
|
+
|
|
105
120
|
* `OCRD_NETWORK_SERVER_ADDR_PROCESSING`: Default address of Processing Server to connect to (for `ocrd network client processing`).
|
|
106
121
|
* `OCRD_NETWORK_SERVER_ADDR_WORKFLOW`: Default address of Workflow Server to connect to (for `ocrd network client workflow`).
|
|
107
122
|
* `OCRD_NETWORK_SERVER_ADDR_WORKSPACE`: Default address of Workspace Server to connect to (for `ocrd network client workspace`).
|
|
108
123
|
* `OCRD_NETWORK_RABBITMQ_CLIENT_CONNECT_ATTEMPTS`: Number of attempts for a worker to create its queue. Helpful if the rabbitmq-server needs time to be fully started.
|
|
109
124
|
|
|
125
|
+
* `OCRD_NETWORK_CLIENT_POLLING_SLEEP`: How many seconds to sleep before trying `ocrd network client` again.
|
|
126
|
+
* `OCRD_NETWORK_CLIENT_POLLING_TIMEOUT`: Timeout for a blocking `ocrd network client` (in seconds).
|
|
127
|
+
|
|
128
|
+
* `OCRD_NETWORK_SOCKETS_ROOT_DIR`: The root directory where all mets server related socket files are created.
|
|
129
|
+
* `OCRD_NETWORK_LOGS_ROOT_DIR`: The root directory where all ocrd_network related file logs are stored.
|
|
130
|
+
|
|
131
|
+
|
|
110
132
|
|
|
111
133
|
## Packages
|
|
112
134
|
|
|
@@ -21,6 +21,9 @@ For example:
|
|
|
21
21
|
* [`ocrd__log`](#ocrd__log)
|
|
22
22
|
* [`ocrd__minversion`](#ocrd__minversion)
|
|
23
23
|
* [`ocrd__dumpjson`](#ocrd__dumpjson)
|
|
24
|
+
* [`ocrd__resolve_resource`](#ocrd__resolve_resource)
|
|
25
|
+
* [`ocrd__show_resource`](#ocrd__show_resource)
|
|
26
|
+
* [`ocrd__list_resources`](#ocrd__list_resources)
|
|
24
27
|
* [`ocrd__usage`](#ocrd__usage)
|
|
25
28
|
* [`ocrd__parse_argv`](#ocrd__parse_argv)
|
|
26
29
|
<!-- END-MARKDOWN-TOC -->
|
|
@@ -56,6 +59,10 @@ export OCRD_TOOL_NAME=ocrd-foo-bar
|
|
|
56
59
|
|
|
57
60
|
(Which you automatically get from [`ocrd__wrap`](#ocrd__wrap).)
|
|
58
61
|
|
|
62
|
+
### `ocrd__resolve_resource`
|
|
63
|
+
|
|
64
|
+
Output given resource file's path.
|
|
65
|
+
|
|
59
66
|
### `ocrd__show_resource`
|
|
60
67
|
|
|
61
68
|
Output given resource file's content.
|
|
@@ -88,6 +95,7 @@ This will be filled by the parser along the following keys:
|
|
|
88
95
|
- `profile`: whether `--profile` is enabled
|
|
89
96
|
- `profile_file`: the argument of `--profile-file`
|
|
90
97
|
- `log_level`: the argument of `--log-level`
|
|
98
|
+
- `mets_server_url`: the argument of `--mets-server-url`
|
|
91
99
|
- `mets_file`: absolute path of the `--mets` argument
|
|
92
100
|
- `working_dir`: absolute path of the `--working-dir` argument or the parent of `mets_file`
|
|
93
101
|
- `page_id`: the argument of `--page-id`
|
|
@@ -95,7 +103,7 @@ This will be filled by the parser along the following keys:
|
|
|
95
103
|
- `output_file_grp`: the argument of `--output-file-grp`
|
|
96
104
|
|
|
97
105
|
Moreover, there will be an associative array **`params`**
|
|
98
|
-
with the fully expanded runtime values of the ocrd-tool.json parameters.
|
|
106
|
+
with the fully validated and default-expanded runtime values of the `ocrd-tool.json` parameters.
|
|
99
107
|
|
|
100
108
|
### `ocrd__wrap`
|
|
101
109
|
|
ocrd-3.0.0b2/VERSION
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.0.0b2
|
|
@@ -10,6 +10,36 @@ import click
|
|
|
10
10
|
|
|
11
11
|
from ocrd_utils import config
|
|
12
12
|
|
|
13
|
+
# pylint: disable=wrong-import-position
|
|
14
|
+
|
|
15
|
+
def command_with_replaced_help(*replacements):
|
|
16
|
+
|
|
17
|
+
class CommandWithReplacedHelp(click.Command):
|
|
18
|
+
def get_help(self, ctx):
|
|
19
|
+
newhelp = super().get_help(ctx)
|
|
20
|
+
for replacement in replacements:
|
|
21
|
+
newhelp = re.sub(*replacement, newhelp)
|
|
22
|
+
# print(newhelp)
|
|
23
|
+
return newhelp
|
|
24
|
+
|
|
25
|
+
return CommandWithReplacedHelp
|
|
26
|
+
|
|
27
|
+
# pylint: enable=wrong-import-position
|
|
28
|
+
|
|
29
|
+
from ..decorators import ocrd_loglevel
|
|
30
|
+
from .ocrd_tool import ocrd_tool_cli
|
|
31
|
+
from .workspace import workspace_cli
|
|
32
|
+
from .process import process_cli
|
|
33
|
+
from .bashlib import bashlib_cli
|
|
34
|
+
from .validate import validate_cli
|
|
35
|
+
from .resmgr import resmgr_cli
|
|
36
|
+
from .zip import zip_cli
|
|
37
|
+
from .log import log_cli
|
|
38
|
+
from .network import network_cli
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
__all__ = ['cli']
|
|
42
|
+
|
|
13
43
|
_epilog = f"""
|
|
14
44
|
|
|
15
45
|
\b
|
|
@@ -41,6 +71,10 @@ Variables:
|
|
|
41
71
|
\b
|
|
42
72
|
{config.describe('OCRD_MAX_PROCESSOR_CACHE')}
|
|
43
73
|
\b
|
|
74
|
+
{config.describe('OCRD_NETWORK_CLIENT_POLLING_SLEEP')}
|
|
75
|
+
\b
|
|
76
|
+
{config.describe('OCRD_NETWORK_CLIENT_POLLING_TIMEOUT')}
|
|
77
|
+
\b
|
|
44
78
|
{config.describe('OCRD_NETWORK_SERVER_ADDR_PROCESSING')}
|
|
45
79
|
\b
|
|
46
80
|
{config.describe('OCRD_NETWORK_SERVER_ADDR_WORKFLOW')}
|
|
@@ -60,30 +94,6 @@ Variables:
|
|
|
60
94
|
{config.describe('OCRD_LOGGING_DEBUG')}
|
|
61
95
|
"""
|
|
62
96
|
|
|
63
|
-
def command_with_replaced_help(*replacements):
|
|
64
|
-
|
|
65
|
-
class CommandWithReplacedHelp(click.Command):
|
|
66
|
-
def get_help(self, ctx):
|
|
67
|
-
help = super().get_help(ctx)
|
|
68
|
-
for replacement in replacements:
|
|
69
|
-
help = re.sub(*replacement, help)
|
|
70
|
-
# print(help)
|
|
71
|
-
return help
|
|
72
|
-
|
|
73
|
-
return CommandWithReplacedHelp
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
from ..decorators import ocrd_loglevel
|
|
77
|
-
from .ocrd_tool import ocrd_tool_cli
|
|
78
|
-
from .workspace import workspace_cli
|
|
79
|
-
from .process import process_cli
|
|
80
|
-
from .bashlib import bashlib_cli
|
|
81
|
-
from .validate import validate_cli
|
|
82
|
-
from .resmgr import resmgr_cli
|
|
83
|
-
from .zip import zip_cli
|
|
84
|
-
from .log import log_cli
|
|
85
|
-
from .network import network_cli
|
|
86
|
-
|
|
87
97
|
@click.group(epilog=_epilog)
|
|
88
98
|
@click.version_option(package_name='ocrd')
|
|
89
99
|
@ocrd_loglevel
|
|
@@ -101,5 +111,3 @@ cli.add_command(validate_cli)
|
|
|
101
111
|
cli.add_command(log_cli)
|
|
102
112
|
cli.add_command(resmgr_cli)
|
|
103
113
|
cli.add_command(network_cli)
|
|
104
|
-
|
|
105
|
-
__all__ = ['cli']
|
|
@@ -8,7 +8,6 @@ OCR-D CLI: bash library
|
|
|
8
8
|
"""
|
|
9
9
|
from __future__ import print_function
|
|
10
10
|
import sys
|
|
11
|
-
from os.path import isfile
|
|
12
11
|
import click
|
|
13
12
|
|
|
14
13
|
from ocrd.constants import BASHLIB_FILENAME
|
|
@@ -23,15 +22,7 @@ from ocrd.decorators import (
|
|
|
23
22
|
ocrd_loglevel,
|
|
24
23
|
ocrd_cli_wrap_processor
|
|
25
24
|
)
|
|
26
|
-
from ocrd_utils import (
|
|
27
|
-
is_local_filename,
|
|
28
|
-
get_local_filename,
|
|
29
|
-
initLogging,
|
|
30
|
-
getLogger,
|
|
31
|
-
make_file_id,
|
|
32
|
-
config
|
|
33
|
-
)
|
|
34
|
-
from ocrd.resolver import Resolver
|
|
25
|
+
from ocrd_utils import make_file_id
|
|
35
26
|
from ocrd.processor import Processor
|
|
36
27
|
|
|
37
28
|
# ----------------------------------------------------------------------
|
|
@@ -82,6 +73,8 @@ def bashlib_constants(name):
|
|
|
82
73
|
print(val)
|
|
83
74
|
|
|
84
75
|
@bashlib_cli.command('input-files')
|
|
76
|
+
@click.option('--ocrd-tool', help="path to ocrd-tool.json of processor to feed", default=None)
|
|
77
|
+
@click.option('--executable', help="name of processor executable in ocrd-tool.json", default=None)
|
|
85
78
|
@click.option('-m', '--mets', help="METS to process", default=DEFAULT_METS_BASENAME)
|
|
86
79
|
@click.option('-w', '--working-dir', help="Working Directory")
|
|
87
80
|
@click.option('-I', '--input-file-grp', help='File group(s) used as input.', default=None)
|
|
@@ -96,7 +89,7 @@ def bashlib_constants(name):
|
|
|
96
89
|
@parameter_option
|
|
97
90
|
@parameter_override_option
|
|
98
91
|
@ocrd_loglevel
|
|
99
|
-
def bashlib_input_files(**kwargs):
|
|
92
|
+
def bashlib_input_files(ocrd_tool, executable, **kwargs):
|
|
100
93
|
"""
|
|
101
94
|
List input files for processing
|
|
102
95
|
|
|
@@ -108,12 +101,6 @@ def bashlib_input_files(**kwargs):
|
|
|
108
101
|
(The printing format is one associative array initializer per line.)
|
|
109
102
|
"""
|
|
110
103
|
class BashlibProcessor(Processor):
|
|
111
|
-
@property
|
|
112
|
-
def ocrd_tool(self):
|
|
113
|
-
return {'executable': '', 'steps': ['']}
|
|
114
|
-
@property
|
|
115
|
-
def version(self):
|
|
116
|
-
return '1.0'
|
|
117
104
|
# go half way of the normal run_processor / process_workspace call tree
|
|
118
105
|
# by just delegating to process_workspace, overriding process_page_file
|
|
119
106
|
# to ensure all input files exist locally (without persisting them in the METS)
|
|
@@ -129,4 +116,31 @@ def bashlib_input_files(**kwargs):
|
|
|
129
116
|
print(f"[{field}]='{value}'", end=' ')
|
|
130
117
|
output_file_id = make_file_id(input_files[0], kwargs['output_file_grp'])
|
|
131
118
|
print(f"[outputFileId]='{output_file_id}'")
|
|
132
|
-
|
|
119
|
+
if ocrd_tool and executable:
|
|
120
|
+
class FullBashlibProcessor(BashlibProcessor):
|
|
121
|
+
@property
|
|
122
|
+
def metadata_location(self):
|
|
123
|
+
# needed for metadata loading and validation mechanism
|
|
124
|
+
return ocrd_tool
|
|
125
|
+
@property
|
|
126
|
+
def executable(self):
|
|
127
|
+
# needed for ocrd_tool lookup
|
|
128
|
+
return executable
|
|
129
|
+
else:
|
|
130
|
+
# we have no true metadata file, so fill in just to make it work
|
|
131
|
+
class FullBashlibProcessor(BashlibProcessor):
|
|
132
|
+
@property
|
|
133
|
+
def ocrd_tool(self):
|
|
134
|
+
# needed to satisfy the validator
|
|
135
|
+
return {'executable': '',
|
|
136
|
+
# required now
|
|
137
|
+
'input_file_grp_cardinality': 1,
|
|
138
|
+
'output_file_grp_cardinality': 1,
|
|
139
|
+
'steps': ['']
|
|
140
|
+
}
|
|
141
|
+
@property
|
|
142
|
+
def version(self):
|
|
143
|
+
# needed to satisfy the validator and wrapper
|
|
144
|
+
return '1.0'
|
|
145
|
+
|
|
146
|
+
ocrd_cli_wrap_processor(FullBashlibProcessor, **kwargs)
|
|
@@ -17,7 +17,6 @@ from ocrd.decorators import parameter_option, parameter_override_option
|
|
|
17
17
|
from ocrd.processor import Processor
|
|
18
18
|
from ocrd_utils import (
|
|
19
19
|
set_json_key_value_overrides,
|
|
20
|
-
VERSION as OCRD_VERSION,
|
|
21
20
|
parse_json_string_or_file,
|
|
22
21
|
parse_json_string_with_comments as loads
|
|
23
22
|
)
|
|
@@ -29,23 +28,26 @@ class OcrdToolCtx():
|
|
|
29
28
|
self.filename = filename
|
|
30
29
|
with codecs.open(filename, encoding='utf-8') as f:
|
|
31
30
|
self.content = f.read()
|
|
31
|
+
# perhaps the validator should _always_ run (for default expansion)
|
|
32
|
+
# so validate command only for the report?
|
|
32
33
|
self.json = loads(self.content)
|
|
34
|
+
self.tool_name = ''
|
|
33
35
|
|
|
34
36
|
class BashProcessor(Processor):
|
|
35
37
|
@property
|
|
36
|
-
def metadata(inner_self):
|
|
38
|
+
def metadata(inner_self): # pylint: disable=no-self-argument,arguments-renamed
|
|
37
39
|
return self.json
|
|
38
40
|
@property
|
|
39
|
-
def executable(inner_self):
|
|
41
|
+
def executable(inner_self): # pylint: disable=no-self-argument,arguments-renamed
|
|
40
42
|
return self.tool_name
|
|
41
43
|
@property
|
|
42
|
-
def moduledir(inner_self):
|
|
44
|
+
def moduledir(inner_self): # pylint: disable=no-self-argument,arguments-renamed
|
|
43
45
|
return os.path.dirname(self.filename)
|
|
44
46
|
# set docstrings to empty
|
|
45
47
|
__doc__ = None
|
|
46
48
|
# HACK: override the module-level docstring, too
|
|
47
49
|
getmodule(OcrdToolCtx).__doc__ = None
|
|
48
|
-
def process(inner_self):
|
|
50
|
+
def process(inner_self): # pylint: disable=no-self-argument,arguments-renamed
|
|
49
51
|
return super()
|
|
50
52
|
|
|
51
53
|
self.processor = BashProcessor
|
|
@@ -6,7 +6,7 @@ OCR-D CLI: workspace management
|
|
|
6
6
|
:nested: full
|
|
7
7
|
"""
|
|
8
8
|
import os
|
|
9
|
-
from os import
|
|
9
|
+
from os import rmdir, unlink
|
|
10
10
|
from os.path import dirname, relpath, normpath, exists, join, isabs, isdir
|
|
11
11
|
from pathlib import Path
|
|
12
12
|
from json import loads, dumps
|
|
@@ -14,7 +14,6 @@ import sys
|
|
|
14
14
|
from glob import glob # XXX pathlib.Path.glob does not support absolute globs
|
|
15
15
|
import re
|
|
16
16
|
import time
|
|
17
|
-
import numpy as np
|
|
18
17
|
|
|
19
18
|
import click
|
|
20
19
|
|
|
@@ -118,7 +117,7 @@ def workspace_validate(ctx, mets_url, download, skip, page_textequiv_consistency
|
|
|
118
117
|
@workspace_cli.command('clone', cls=command_with_replaced_help(
|
|
119
118
|
(r' \[WORKSPACE_DIR\]', ''))) # XXX deprecated argument
|
|
120
119
|
@click.option('-f', '--clobber-mets', help="Overwrite existing METS file", default=False, is_flag=True)
|
|
121
|
-
@click.option('-a', '--download', is_flag=True, help="Download all files and
|
|
120
|
+
@click.option('-a', '--download', is_flag=True, help="Download all selected files and add local path references in METS file afterwards")
|
|
122
121
|
@click.argument('mets_url')
|
|
123
122
|
@mets_find_options
|
|
124
123
|
# XXX deprecated
|
|
@@ -129,8 +128,10 @@ def workspace_clone(ctx, clobber_mets, download, file_grp, file_id, page_id, mim
|
|
|
129
128
|
Create a workspace from METS_URL and return the directory
|
|
130
129
|
|
|
131
130
|
METS_URL can be a URL, an absolute path or a path relative to $PWD.
|
|
132
|
-
If METS_URL is not provided, use --mets accordingly.
|
|
133
131
|
METS_URL can also be an OAI-PMH GetRecord URL wrapping a METS file.
|
|
132
|
+
|
|
133
|
+
Additional options pertain to the selection of files / fileGrps / pages
|
|
134
|
+
to be downloaded, if --download is used.
|
|
134
135
|
"""
|
|
135
136
|
LOG = getLogger('ocrd.cli.workspace.clone')
|
|
136
137
|
if workspace_dir:
|
|
@@ -143,6 +144,7 @@ def workspace_clone(ctx, clobber_mets, download, file_grp, file_id, page_id, mim
|
|
|
143
144
|
mets_basename=ctx.mets_basename,
|
|
144
145
|
clobber_mets=clobber_mets,
|
|
145
146
|
download=download,
|
|
147
|
+
fileGrp=file_grp,
|
|
146
148
|
ID=file_id,
|
|
147
149
|
pageId=page_id,
|
|
148
150
|
mimetype=mimetype,
|
|
@@ -408,7 +410,7 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, local_fi
|
|
|
408
410
|
if dry_run:
|
|
409
411
|
log.info('workspace.add_file(%s)' % file_dict)
|
|
410
412
|
else:
|
|
411
|
-
workspace.add_file(fileGrp, ignore=ignore, force=force, **file_dict)
|
|
413
|
+
workspace.add_file(fileGrp, ignore=ignore, force=force, **file_dict) # pylint: disable=redundant-keyword-arg
|
|
412
414
|
|
|
413
415
|
# save changes to disk
|
|
414
416
|
workspace.save_mets()
|
|
@@ -452,7 +454,7 @@ def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, incl
|
|
|
452
454
|
snake_to_camel = {"file_id": "ID", "page_id": "pageId", "file_grp": "fileGrp"}
|
|
453
455
|
output_field = [snake_to_camel.get(x, x) for x in output_field]
|
|
454
456
|
modified_mets = False
|
|
455
|
-
ret =
|
|
457
|
+
ret = []
|
|
456
458
|
workspace = Workspace(
|
|
457
459
|
ctx.resolver,
|
|
458
460
|
directory=ctx.directory,
|
|
@@ -748,7 +750,7 @@ def set_id(ctx, id): # pylint: disable=redefined-builtin
|
|
|
748
750
|
|
|
749
751
|
@workspace_cli.command('update-page')
|
|
750
752
|
@click.option('--set', 'attr_value_pairs', help=f"set mets:div ATTR to VALUE. possible keys: {METS_PAGE_DIV_ATTRIBUTE.names()}", metavar="ATTR VALUE", nargs=2, multiple=True)
|
|
751
|
-
@click.option('--order', help="[DEPRECATED - use --set ATTR VALUE", metavar='ORDER')
|
|
753
|
+
@click.option('--order', help="[DEPRECATED - use --set ATTR VALUE", metavar='ORDER')
|
|
752
754
|
@click.option('--orderlabel', help="DEPRECATED - use --set ATTR VALUE", metavar='ORDERLABEL')
|
|
753
755
|
@click.option('--contentids', help="DEPRECATED - use --set ATTR VALUE", metavar='ORDERLABEL')
|
|
754
756
|
@click.argument('PAGE_ID')
|
|
@@ -757,7 +759,7 @@ def update_page(ctx, attr_value_pairs, order, orderlabel, contentids, page_id):
|
|
|
757
759
|
"""
|
|
758
760
|
Update the @ID, @ORDER, @ORDERLABEL, @LABEL or @CONTENTIDS attributes of the mets:div with @ID=PAGE_ID
|
|
759
761
|
"""
|
|
760
|
-
update_kwargs =
|
|
762
|
+
update_kwargs = dict(attr_value_pairs)
|
|
761
763
|
if order:
|
|
762
764
|
update_kwargs['ORDER'] = order
|
|
763
765
|
if orderlabel:
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import sys
|
|
2
|
+
from contextlib import nullcontext
|
|
2
3
|
|
|
3
4
|
from ocrd_utils import (
|
|
4
5
|
config,
|
|
@@ -9,6 +10,7 @@ from ocrd_utils import (
|
|
|
9
10
|
parse_json_string_with_comments,
|
|
10
11
|
set_json_key_value_overrides,
|
|
11
12
|
parse_json_string_or_file,
|
|
13
|
+
redirect_stderr_and_stdout_to_file,
|
|
12
14
|
)
|
|
13
15
|
from ocrd_validators import WorkspaceValidator
|
|
14
16
|
from ocrd_network import ProcessingWorker, ProcessorServer, AgentType
|
|
@@ -104,10 +106,10 @@ def ocrd_cli_wrap_processor(
|
|
|
104
106
|
kwargs['parameter'] = parse_json_string_or_file(*kwargs['parameter'],
|
|
105
107
|
resolve_preset_file=resolve)
|
|
106
108
|
else:
|
|
107
|
-
kwargs['parameter'] =
|
|
109
|
+
kwargs['parameter'] = {}
|
|
108
110
|
# Merge parameter overrides and parameters
|
|
109
111
|
if 'parameter_override' in kwargs:
|
|
110
|
-
set_json_key_value_overrides(kwargs['parameter'], *kwargs
|
|
112
|
+
set_json_key_value_overrides(kwargs['parameter'], *kwargs.pop('parameter_override'))
|
|
111
113
|
# Assert -I / -O
|
|
112
114
|
if not kwargs['input_file_grp']:
|
|
113
115
|
raise ValueError('-I/--input-file-grp is required')
|
|
@@ -140,17 +142,21 @@ def ocrd_cli_wrap_processor(
|
|
|
140
142
|
print("Profiling...")
|
|
141
143
|
pr = cProfile.Profile()
|
|
142
144
|
pr.enable()
|
|
143
|
-
def
|
|
145
|
+
def goexit():
|
|
144
146
|
pr.disable()
|
|
145
147
|
print("Profiling completed")
|
|
146
148
|
if profile_file:
|
|
147
|
-
|
|
148
|
-
pr.dump_stats(profile_file)
|
|
149
|
+
pr.dump_stats(profile_file)
|
|
149
150
|
s = io.StringIO()
|
|
150
151
|
pstats.Stats(pr, stream=s).sort_stats("cumulative").print_stats()
|
|
151
152
|
print(s.getvalue())
|
|
152
|
-
atexit.register(
|
|
153
|
-
|
|
153
|
+
atexit.register(goexit)
|
|
154
|
+
if log_filename:
|
|
155
|
+
log_ctx = redirect_stderr_and_stdout_to_file(log_filename)
|
|
156
|
+
else:
|
|
157
|
+
log_ctx = nullcontext()
|
|
158
|
+
with log_ctx:
|
|
159
|
+
run_processor(processorClass, mets_url=mets, workspace=workspace, **kwargs)
|
|
154
160
|
|
|
155
161
|
|
|
156
162
|
def check_and_run_network_agent(ProcessorClass, subcommand: str, address: str, database: str, queue: str):
|
|
@@ -156,6 +156,7 @@ ocrd__parse_argv () {
|
|
|
156
156
|
while [[ "${1:-}" = -* ]];do
|
|
157
157
|
case "$1" in
|
|
158
158
|
-l|--log-level) ocrd__argv[log_level]=$2 ; shift ;;
|
|
159
|
+
--log-filename) exec 2> "$2" ; shift ;;
|
|
159
160
|
-h|--help|--usage) ocrd__usage; exit ;;
|
|
160
161
|
-J|--dump-json) ocrd__dumpjson; exit ;;
|
|
161
162
|
-D|--dump-module-dir) echo $(dirname "$OCRD_TOOL_JSON"); exit ;;
|
|
@@ -299,6 +300,8 @@ ocrd__wrap () {
|
|
|
299
300
|
eval "ocrd__files[$i]=ocrd__file$i"
|
|
300
301
|
let ++i
|
|
301
302
|
done < <(ocrd bashlib input-files \
|
|
303
|
+
--ocrd-tool $OCRD_TOOL_JSON \
|
|
304
|
+
--executable $OCRD_TOOL_NAME \
|
|
302
305
|
-m "${ocrd__argv[mets_file]}" \
|
|
303
306
|
-I "${ocrd__argv[input_file_grp]}" \
|
|
304
307
|
-O "${ocrd__argv[output_file_grp]}" \
|
|
@@ -21,7 +21,7 @@ from pydantic import BaseModel, Field, ValidationError
|
|
|
21
21
|
import uvicorn
|
|
22
22
|
|
|
23
23
|
from ocrd_models import OcrdFile, ClientSideOcrdFile, OcrdAgent, ClientSideOcrdAgent
|
|
24
|
-
from ocrd_utils import getLogger
|
|
24
|
+
from ocrd_utils import getLogger
|
|
25
25
|
|
|
26
26
|
|
|
27
27
|
#
|
|
@@ -120,7 +120,7 @@ class ClientSideOcrdMets:
|
|
|
120
120
|
|
|
121
121
|
def __init__(self, url, workspace_path: Optional[str] = None):
|
|
122
122
|
self.protocol = "tcp" if url.startswith("http://") else "uds"
|
|
123
|
-
self.log = getLogger(f"ocrd.
|
|
123
|
+
self.log = getLogger(f"ocrd.models.ocrd_mets.client.{url}")
|
|
124
124
|
self.url = url if self.protocol == "tcp" else f'http+unix://{url.replace("/", "%2F")}'
|
|
125
125
|
self.ws_dir_path = workspace_path if workspace_path else None
|
|
126
126
|
|
|
@@ -236,7 +236,7 @@ class ClientSideOcrdMets:
|
|
|
236
236
|
agent_dict["_type"] = agent_dict.pop("type")
|
|
237
237
|
return [ClientSideOcrdAgent(None, **agent_dict) for agent_dict in agent_dicts]
|
|
238
238
|
|
|
239
|
-
def add_agent(self,
|
|
239
|
+
def add_agent(self, **kwargs):
|
|
240
240
|
if not self.multiplexing_mode:
|
|
241
241
|
return self.session.request("POST", f"{self.url}/agent", json=OcrdAgentModel.create(**kwargs).dict())
|
|
242
242
|
else:
|
|
@@ -403,7 +403,6 @@ class OcrdMetsServer:
|
|
|
403
403
|
@staticmethod
|
|
404
404
|
def kill_process(mets_server_pid: int):
|
|
405
405
|
subprocess_run(args=["kill", "-s", "SIGINT", f"{mets_server_pid}"], shell=False, universal_newlines=True)
|
|
406
|
-
return
|
|
407
406
|
|
|
408
407
|
def shutdown(self):
|
|
409
408
|
if self.is_uds:
|
|
@@ -3,6 +3,7 @@ from .base import (
|
|
|
3
3
|
ResourceNotFoundError,
|
|
4
4
|
NonUniqueInputFile,
|
|
5
5
|
MissingInputFile,
|
|
6
|
+
generate_processor_help,
|
|
6
7
|
)
|
|
7
8
|
from .ocrd_page_result import (
|
|
8
9
|
OcrdPageResult,
|
|
@@ -11,5 +12,4 @@ from .ocrd_page_result import (
|
|
|
11
12
|
from .helpers import (
|
|
12
13
|
run_cli,
|
|
13
14
|
run_processor,
|
|
14
|
-
generate_processor_help
|
|
15
15
|
)
|