ocrd 3.6.0__tar.gz → 3.8.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ocrd-3.6.0/src/ocrd.egg-info → ocrd-3.8.0}/PKG-INFO +23 -10
- {ocrd-3.6.0 → ocrd-3.8.0}/README.md +19 -7
- ocrd-3.8.0/VERSION +1 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/pyproject.toml +2 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/requirements.txt +3 -2
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd/cli/__init__.py +2 -4
- ocrd-3.8.0/src/ocrd/cli/bashlib.py +42 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd/cli/network.py +2 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd/cli/resmgr.py +29 -65
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd/constants.py +0 -2
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd/mets_server.py +5 -5
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd/processor/base.py +6 -16
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd/processor/builtin/dummy/ocrd-tool.json +25 -0
- ocrd-3.8.0/src/ocrd/processor/builtin/merge_processor.py +131 -0
- ocrd-3.8.0/src/ocrd/processor/builtin/param_command_header2unordered.json +7 -0
- ocrd-3.8.0/src/ocrd/processor/builtin/param_command_heading2unordered.json +7 -0
- ocrd-3.8.0/src/ocrd/processor/builtin/param_command_lines2orientation.json +6 -0
- ocrd-3.8.0/src/ocrd/processor/builtin/param_command_page-update-version.json +5 -0
- ocrd-3.8.0/src/ocrd/processor/builtin/param_command_transkribus-to-prima.json +8 -0
- ocrd-3.8.0/src/ocrd/processor/builtin/shell_processor.py +128 -0
- ocrd-3.8.0/src/ocrd/resource_manager.py +469 -0
- {ocrd-3.6.0 → ocrd-3.8.0/src/ocrd.egg-info}/PKG-INFO +23 -10
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd.egg-info/SOURCES.txt +9 -4
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd.egg-info/entry_points.txt +2 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd.egg-info/requires.txt +3 -2
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_models/ocrd_agent.py +3 -3
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_network/__init__.py +1 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_network/cli/__init__.py +2 -0
- ocrd-3.8.0/src/ocrd_network/cli/resmgr_server.py +23 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_network/constants.py +3 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_network/logging_utils.py +5 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_network/models/job.py +29 -28
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_network/models/messages.py +3 -2
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_network/models/workspace.py +4 -4
- ocrd-3.8.0/src/ocrd_network/resource_manager_server.py +182 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_network/runtime_data/connection_clients.py +1 -1
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_network/runtime_data/hosts.py +43 -16
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_network/runtime_data/network_agents.py +15 -1
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_utils/__init__.py +5 -1
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_utils/constants.py +5 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_utils/logging.py +3 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_utils/os.py +142 -62
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_validators/ocrd_tool.schema.yml +7 -4
- {ocrd-3.6.0 → ocrd-3.8.0}/tests/test_resource_manager.py +37 -16
- ocrd-3.6.0/README_bashlib.md +0 -177
- ocrd-3.6.0/VERSION +0 -1
- ocrd-3.6.0/src/ocrd/cli/bashlib.py +0 -153
- ocrd-3.6.0/src/ocrd/cli/log.py +0 -56
- ocrd-3.6.0/src/ocrd/lib.bash +0 -310
- ocrd-3.6.0/src/ocrd/resource_list.yml +0 -61
- ocrd-3.6.0/src/ocrd/resource_manager.py +0 -380
- {ocrd-3.6.0 → ocrd-3.8.0}/LICENSE +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/MANIFEST.in +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/README_ocrd.md +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/README_ocrd_modelfactory.md +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/README_ocrd_models.md +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/README_ocrd_network.md +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/README_ocrd_utils.md +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/README_ocrd_validators.md +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/setup.cfg +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd/__init__.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd/cli/ocrd_tool.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd/cli/process.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd/cli/validate.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd/cli/workspace.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd/cli/zip.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd/decorators/__init__.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd/decorators/loglevel_option.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd/decorators/mets_find_options.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd/decorators/ocrd_cli_options.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd/decorators/parameter_option.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd/ocrd-all-tool.json +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd/processor/__init__.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd/processor/builtin/__init__.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd/processor/builtin/dummy/__init__.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd/processor/builtin/dummy_processor.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd/processor/builtin/filter_processor.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd/processor/helpers.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd/processor/ocrd_page_result.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd/resolver.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd/task_sequence.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd/workspace.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd/workspace_backup.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd/workspace_bagger.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd.egg-info/dependency_links.txt +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd.egg-info/top_level.txt +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_modelfactory/__init__.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_models/__init__.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_models/constants.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_models/mets-empty.xml +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_models/ocrd_exif.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_models/ocrd_file.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_models/ocrd_mets.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_models/ocrd_page.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_models/ocrd_page_generateds.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_models/ocrd_xml_base.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_models/report.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_models/utils.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_models/xpath_functions.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_network/cli/client.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_network/cli/processing_server.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_network/cli/processing_worker.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_network/client.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_network/client_utils.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_network/database.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_network/models/__init__.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_network/models/workflow.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_network/param_validators.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_network/process_helpers.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_network/processing_server.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_network/processing_worker.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_network/rabbitmq_utils/__init__.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_network/rabbitmq_utils/connector.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_network/rabbitmq_utils/constants.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_network/rabbitmq_utils/consumer.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_network/rabbitmq_utils/helpers.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_network/rabbitmq_utils/ocrd_messages.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_network/rabbitmq_utils/publisher.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_network/runtime_data/__init__.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_network/runtime_data/config_parser.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_network/runtime_data/deployer.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_network/runtime_data/network_services.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_network/server_cache.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_network/server_utils.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_network/tcp_to_uds_mets_proxy.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_network/utils.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_utils/config.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_utils/deprecate.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_utils/image.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_utils/introspect.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_utils/ocrd_logging.conf +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_utils/str.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_validators/__init__.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_validators/bagit-profile.yml +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_validators/constants.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_validators/json_validator.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_validators/message_processing.schema.yml +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_validators/message_result.schema.yml +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_validators/mets.xsd +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_validators/ocrd_network_message_validator.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_validators/ocrd_tool_validator.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_validators/ocrd_zip_validator.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_validators/page.xsd +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_validators/page_validator.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_validators/parameter_validator.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_validators/processing_server_config.schema.yml +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_validators/processing_server_config_validator.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_validators/resource_list_validator.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_validators/workspace_validator.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_validators/xlink.xsd +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_validators/xsd_mets_validator.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_validators/xsd_page_validator.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/src/ocrd_validators/xsd_validator.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/tests/test_decorators.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/tests/test_logging.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/tests/test_logging_conf.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/tests/test_mets_server.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/tests/test_model_factory.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/tests/test_resolver.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/tests/test_resolver_oai.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/tests/test_task_sequence.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/tests/test_utils.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/tests/test_version.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/tests/test_workspace.py +0 -0
- {ocrd-3.6.0 → ocrd-3.8.0}/tests/test_workspace_remove.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: ocrd
|
|
3
|
-
Version: 3.6.0
|
|
3
|
+
Version: 3.8.0
|
|
4
4
|
Summary: OCR-D framework
|
|
5
5
|
Author-email: Konstantin Baierer <unixprog@gmail.com>
|
|
6
6
|
License: Apache License 2.0
|
|
@@ -16,12 +16,13 @@ Requires-Dist: beanie~=1.7
|
|
|
16
16
|
Requires-Dist: click>=7
|
|
17
17
|
Requires-Dist: cryptography<43.0.0
|
|
18
18
|
Requires-Dist: Deprecated==1.2.0
|
|
19
|
-
Requires-Dist: docker
|
|
19
|
+
Requires-Dist: docker>=7.1.0
|
|
20
20
|
Requires-Dist: elementpath
|
|
21
21
|
Requires-Dist: fastapi>=0.78.0
|
|
22
22
|
Requires-Dist: filetype
|
|
23
23
|
Requires-Dist: Flask
|
|
24
24
|
Requires-Dist: frozendict>=2.4.0
|
|
25
|
+
Requires-Dist: gitpython
|
|
25
26
|
Requires-Dist: gdown
|
|
26
27
|
Requires-Dist: httpx>=0.22.0
|
|
27
28
|
Requires-Dist: importlib_metadata; python_version < "3.8"
|
|
@@ -37,7 +38,7 @@ Requires-Dist: opencv-python-headless
|
|
|
37
38
|
Requires-Dist: paramiko
|
|
38
39
|
Requires-Dist: pika>=1.2.0
|
|
39
40
|
Requires-Dist: Pillow>=7.2.0
|
|
40
|
-
Requires-Dist: pydantic
|
|
41
|
+
Requires-Dist: pydantic>=2.0.0
|
|
41
42
|
Requires-Dist: python-magic
|
|
42
43
|
Requires-Dist: python-multipart
|
|
43
44
|
Requires-Dist: pyyaml
|
|
@@ -68,6 +69,9 @@ Requires-Dist: uvicorn>=0.17.6
|
|
|
68
69
|
* [Command line tools](#command-line-tools)
|
|
69
70
|
* [`ocrd` CLI](#ocrd-cli)
|
|
70
71
|
* [`ocrd-dummy` CLI](#ocrd-dummy-cli)
|
|
72
|
+
* [`ocrd-filter` CLI](#ocrd-filter-cli)
|
|
73
|
+
* [`ocrd-command` CLI](#ocrd-command-cli)
|
|
74
|
+
* [`ocrd-merge` CLI](#ocrd-merge-cli)
|
|
71
75
|
* [Configuration](#configuration)
|
|
72
76
|
* [Packages](#packages)
|
|
73
77
|
* [ocrd_utils](#ocrd_utils)
|
|
@@ -76,7 +80,6 @@ Requires-Dist: uvicorn>=0.17.6
|
|
|
76
80
|
* [ocrd_validators](#ocrd_validators)
|
|
77
81
|
* [ocrd_network](#ocrd_network)
|
|
78
82
|
* [ocrd](#ocrd)
|
|
79
|
-
* [bash library](#bash-library)
|
|
80
83
|
* [Testing](#testing)
|
|
81
84
|
* [See Also](#see-also)
|
|
82
85
|
|
|
@@ -121,6 +124,22 @@ supported flags, options and arguments.
|
|
|
121
124
|
|
|
122
125
|
A minimal [OCR-D processor](https://ocr-d.de/en/user_guide#using-the-ocr-d-processors) that copies from `-I/-input-file-grp` to `-O/-output-file-grp`
|
|
123
126
|
|
|
127
|
+
### `ocrd-filter` CLI
|
|
128
|
+
|
|
129
|
+
A simple [OCR-D processor](https://ocr-d.de/en/user_guide#using-the-ocr-d-processors) that removes segments in PAGE-XML files from `-I/-input-file-grp` to `-O/-output-file-grp` with arbitrary selection based on powerful XPath 2.0 expressions.
|
|
130
|
+
|
|
131
|
+
### `ocrd-command` CLI
|
|
132
|
+
|
|
133
|
+
A simple [OCR-D processor](https://ocr-d.de/en/user_guide#using-the-ocr-d-processors) that runs arbitrary shell commands to transform PAGE-XML files from `-I/-input-file-grp` to `-O/-output-file-grp` (in effect "wrapping" them for OCR-D).
|
|
134
|
+
|
|
135
|
+
### `ocrd-merge` CLI
|
|
136
|
+
|
|
137
|
+
A simple [OCR-D processor](https://ocr-d.de/en/user_guide#using-the-ocr-d-processors) that (for every page) joins PAGE-XML files from multiple `-I/-input-file-grp` into a single `-O/-output-file-grp`, ensuring that
|
|
138
|
+
- `Border` polygons are joined
|
|
139
|
+
- all regions are concatenated, while
|
|
140
|
+
- ensuring segment identifiers do not clash,
|
|
141
|
+
- and the reading order simply gets concatenated.
|
|
142
|
+
|
|
124
143
|
## Configuration
|
|
125
144
|
|
|
126
145
|
Almost all behaviour of the OCR-D/core software is configured via CLI options and flags, which can be listed with the `--help` flag that all CLI support.
|
|
@@ -220,12 +239,6 @@ Also contains the command line tool `ocrd`.
|
|
|
220
239
|
|
|
221
240
|
See [README for `ocrd`](./README_ocrd.md) for further information.
|
|
222
241
|
|
|
223
|
-
## bash library
|
|
224
|
-
|
|
225
|
-
Builds a bash script that can be sourced by other bash scripts to create OCRD-compliant CLI.
|
|
226
|
-
|
|
227
|
-
See [README for `bashlib`](./README_bashlib.md) for further information.
|
|
228
|
-
|
|
229
242
|
## Testing
|
|
230
243
|
|
|
231
244
|
Download assets (`make assets`)
|
|
@@ -18,6 +18,9 @@
|
|
|
18
18
|
* [Command line tools](#command-line-tools)
|
|
19
19
|
* [`ocrd` CLI](#ocrd-cli)
|
|
20
20
|
* [`ocrd-dummy` CLI](#ocrd-dummy-cli)
|
|
21
|
+
* [`ocrd-filter` CLI](#ocrd-filter-cli)
|
|
22
|
+
* [`ocrd-command` CLI](#ocrd-command-cli)
|
|
23
|
+
* [`ocrd-merge` CLI](#ocrd-merge-cli)
|
|
21
24
|
* [Configuration](#configuration)
|
|
22
25
|
* [Packages](#packages)
|
|
23
26
|
* [ocrd_utils](#ocrd_utils)
|
|
@@ -26,7 +29,6 @@
|
|
|
26
29
|
* [ocrd_validators](#ocrd_validators)
|
|
27
30
|
* [ocrd_network](#ocrd_network)
|
|
28
31
|
* [ocrd](#ocrd)
|
|
29
|
-
* [bash library](#bash-library)
|
|
30
32
|
* [Testing](#testing)
|
|
31
33
|
* [See Also](#see-also)
|
|
32
34
|
|
|
@@ -71,6 +73,22 @@ supported flags, options and arguments.
|
|
|
71
73
|
|
|
72
74
|
A minimal [OCR-D processor](https://ocr-d.de/en/user_guide#using-the-ocr-d-processors) that copies from `-I/-input-file-grp` to `-O/-output-file-grp`
|
|
73
75
|
|
|
76
|
+
### `ocrd-filter` CLI
|
|
77
|
+
|
|
78
|
+
A simple [OCR-D processor](https://ocr-d.de/en/user_guide#using-the-ocr-d-processors) that removes segments in PAGE-XML files from `-I/-input-file-grp` to `-O/-output-file-grp` with arbitrary selection based on powerful XPath 2.0 expressions.
|
|
79
|
+
|
|
80
|
+
### `ocrd-command` CLI
|
|
81
|
+
|
|
82
|
+
A simple [OCR-D processor](https://ocr-d.de/en/user_guide#using-the-ocr-d-processors) that runs arbitrary shell commands to transform PAGE-XML files from `-I/-input-file-grp` to `-O/-output-file-grp` (in effect "wrapping" them for OCR-D).
|
|
83
|
+
|
|
84
|
+
### `ocrd-merge` CLI
|
|
85
|
+
|
|
86
|
+
A simple [OCR-D processor](https://ocr-d.de/en/user_guide#using-the-ocr-d-processors) that (for every page) joins PAGE-XML files from multiple `-I/-input-file-grp` into a single `-O/-output-file-grp`, ensuring that
|
|
87
|
+
- `Border` polygons are joined
|
|
88
|
+
- all regions are concatenated, while
|
|
89
|
+
- ensuring segment identifiers do not clash,
|
|
90
|
+
- and the reading order simply gets concatenated.
|
|
91
|
+
|
|
74
92
|
## Configuration
|
|
75
93
|
|
|
76
94
|
Almost all behaviour of the OCR-D/core software is configured via CLI options and flags, which can be listed with the `--help` flag that all CLI support.
|
|
@@ -170,12 +188,6 @@ Also contains the command line tool `ocrd`.
|
|
|
170
188
|
|
|
171
189
|
See [README for `ocrd`](./README_ocrd.md) for further information.
|
|
172
190
|
|
|
173
|
-
## bash library
|
|
174
|
-
|
|
175
|
-
Builds a bash script that can be sourced by other bash scripts to create OCRD-compliant CLI.
|
|
176
|
-
|
|
177
|
-
See [README for `bashlib`](./README_bashlib.md) for further information.
|
|
178
|
-
|
|
179
191
|
## Testing
|
|
180
192
|
|
|
181
193
|
Download assets (`make assets`)
|
ocrd-3.8.0/VERSION
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.8.0
|
|
@@ -35,6 +35,8 @@ Issues = "https://github.com/OCR-D/core/issues"
|
|
|
35
35
|
ocrd = "ocrd.cli:cli"
|
|
36
36
|
ocrd-dummy = "ocrd.processor.builtin.dummy_processor:cli"
|
|
37
37
|
ocrd-filter = "ocrd.processor.builtin.filter_processor:cli"
|
|
38
|
+
ocrd-command = "ocrd.processor.builtin.shell_processor:cli"
|
|
39
|
+
ocrd-merge = "ocrd.processor.builtin.merge_processor:cli"
|
|
38
40
|
|
|
39
41
|
[tool.setuptools]
|
|
40
42
|
include-package-data = true
|
|
@@ -3,12 +3,13 @@ beanie~=1.7
|
|
|
3
3
|
click >=7
|
|
4
4
|
cryptography < 43.0.0
|
|
5
5
|
Deprecated == 1.2.0
|
|
6
|
-
docker
|
|
6
|
+
docker>=7.1.0
|
|
7
7
|
elementpath
|
|
8
8
|
fastapi>=0.78.0
|
|
9
9
|
filetype
|
|
10
10
|
Flask
|
|
11
11
|
frozendict>=2.4.0
|
|
12
|
+
gitpython
|
|
12
13
|
gdown
|
|
13
14
|
httpx>=0.22.0
|
|
14
15
|
importlib_metadata ; python_version < '3.8'
|
|
@@ -26,7 +27,7 @@ opencv-python-headless
|
|
|
26
27
|
paramiko
|
|
27
28
|
pika>=1.2.0
|
|
28
29
|
Pillow >= 7.2.0
|
|
29
|
-
pydantic
|
|
30
|
+
pydantic >= 2.0.0
|
|
30
31
|
python-magic
|
|
31
32
|
python-multipart
|
|
32
33
|
pyyaml
|
|
@@ -32,12 +32,11 @@ from ..decorators import ocrd_loglevel
|
|
|
32
32
|
from .ocrd_tool import ocrd_tool_cli
|
|
33
33
|
from .workspace import workspace_cli
|
|
34
34
|
from .process import process_cli
|
|
35
|
-
from .bashlib import bashlib_cli
|
|
36
35
|
from .validate import validate_cli
|
|
37
36
|
from .resmgr import resmgr_cli
|
|
38
37
|
from .zip import zip_cli
|
|
39
|
-
from .log import log_cli
|
|
40
38
|
from .network import network_cli
|
|
39
|
+
from .bashlib import bashlib_cli
|
|
41
40
|
|
|
42
41
|
|
|
43
42
|
__all__ = ['cli']
|
|
@@ -117,9 +116,8 @@ def cli(**kwargs): # pylint: disable=unused-argument
|
|
|
117
116
|
cli.add_command(ocrd_tool_cli)
|
|
118
117
|
cli.add_command(workspace_cli)
|
|
119
118
|
cli.add_command(process_cli)
|
|
120
|
-
cli.add_command(bashlib_cli)
|
|
121
119
|
cli.add_command(zip_cli)
|
|
122
120
|
cli.add_command(validate_cli)
|
|
123
|
-
cli.add_command(log_cli)
|
|
124
121
|
cli.add_command(resmgr_cli)
|
|
125
122
|
cli.add_command(network_cli)
|
|
123
|
+
cli.add_command(bashlib_cli)
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""
|
|
2
|
+
OCR-D CLI: bash library
|
|
3
|
+
|
|
4
|
+
.. click:: ocrd.cli.bashlib:bashlib_cli
|
|
5
|
+
:prog: ocrd bashlib
|
|
6
|
+
:nested: full
|
|
7
|
+
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
# WARNING: bashlib processors have been deprecated as of v3 of the OCR-D/core API
|
|
11
|
+
# and will be removed in v3.7.0. We retain the `ocrd bashlib` CLI only
|
|
12
|
+
# to not break the `ocrd bashlib filename` command, which is used in CD
|
|
13
|
+
# scripts to get the `share` directory of the core installation.
|
|
14
|
+
|
|
15
|
+
import click
|
|
16
|
+
from ocrd.constants import BASHLIB_FILENAME
|
|
17
|
+
|
|
18
|
+
# ----------------------------------------------------------------------
|
|
19
|
+
# ocrd bashlib
|
|
20
|
+
# ----------------------------------------------------------------------
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@click.group('bashlib')
|
|
24
|
+
def bashlib_cli():
|
|
25
|
+
"""
|
|
26
|
+
Work with bash library
|
|
27
|
+
"""
|
|
28
|
+
|
|
29
|
+
# ----------------------------------------------------------------------
|
|
30
|
+
# ocrd bashlib filename
|
|
31
|
+
# ----------------------------------------------------------------------
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@bashlib_cli.command('filename')
|
|
35
|
+
def bashlib_filename():
|
|
36
|
+
"""
|
|
37
|
+
Dump the bash library filename for sourcing by shell scripts
|
|
38
|
+
|
|
39
|
+
For functions exported by bashlib, see `<../../README.md>`_
|
|
40
|
+
"""
|
|
41
|
+
print(BASHLIB_FILENAME)
|
|
42
|
+
|
|
@@ -12,6 +12,7 @@ from ocrd_network.cli import (
|
|
|
12
12
|
client_cli,
|
|
13
13
|
processing_server_cli,
|
|
14
14
|
processing_worker_cli,
|
|
15
|
+
resource_manager_server_cli
|
|
15
16
|
)
|
|
16
17
|
|
|
17
18
|
|
|
@@ -26,3 +27,4 @@ def network_cli():
|
|
|
26
27
|
network_cli.add_command(client_cli)
|
|
27
28
|
network_cli.add_command(processing_server_cli)
|
|
28
29
|
network_cli.add_command(processing_worker_cli)
|
|
30
|
+
network_cli.add_command(resource_manager_server_cli)
|
|
@@ -20,6 +20,7 @@ from ocrd_utils import (
|
|
|
20
20
|
get_ocrd_tool_json,
|
|
21
21
|
initLogging,
|
|
22
22
|
RESOURCE_LOCATIONS,
|
|
23
|
+
RESOURCE_TYPES
|
|
23
24
|
)
|
|
24
25
|
from ocrd.constants import RESOURCE_USER_LIST_COMMENT
|
|
25
26
|
|
|
@@ -70,16 +71,16 @@ def list_installed(executable=None):
|
|
|
70
71
|
@resmgr_cli.command('download')
|
|
71
72
|
@click.option('-n', '--any-url', default='', help='URL of unregistered resource to download/copy from')
|
|
72
73
|
@click.option('-D', '--no-dynamic', default=False, is_flag=True,
|
|
73
|
-
help="
|
|
74
|
-
@click.option('-t', '--resource-type', type=click.Choice(
|
|
75
|
-
help='Type of resource',)
|
|
76
|
-
@click.option('-P', '--path-in-archive', default='.', help='Path to extract in case of archive type')
|
|
74
|
+
help="Skip looking into each processor's --dump-{json,module-dir} module-registered resources")
|
|
75
|
+
@click.option('-t', '--resource-type', type=click.Choice(RESOURCE_TYPES), default='file',
|
|
76
|
+
help='Type of resource (when unregistered or incomplete)',)
|
|
77
|
+
@click.option('-P', '--path-in-archive', default='.', help='Path to extract in case of archive type (when unregistered or incomplete)')
|
|
77
78
|
@click.option('-a', '--allow-uninstalled', is_flag=True,
|
|
78
|
-
help="Allow installing resources for
|
|
79
|
+
help="Allow installing resources for not installed processors",)
|
|
79
80
|
@click.option('-o', '--overwrite', help='Overwrite existing resources', is_flag=True)
|
|
80
|
-
@click.option('-l', '--location', type=click.Choice(RESOURCE_LOCATIONS),
|
|
81
|
+
@click.option('-l', '--location', type=click.Choice(RESOURCE_LOCATIONS),
|
|
81
82
|
help="Where to store resources - defaults to first location in processor's 'resource_locations' "
|
|
82
|
-
"list
|
|
83
|
+
"list, i.e. usually 'data'")
|
|
83
84
|
@click.argument('executable', required=True)
|
|
84
85
|
@click.argument('name', required=False)
|
|
85
86
|
def download(any_url, no_dynamic, resource_type, path_in_archive, allow_uninstalled, overwrite, location, executable,
|
|
@@ -106,8 +107,6 @@ def download(any_url, no_dynamic, resource_type, path_in_archive, allow_uninstal
|
|
|
106
107
|
executable = None
|
|
107
108
|
if name == '*':
|
|
108
109
|
name = None
|
|
109
|
-
is_url = (any_url.startswith('https://') or any_url.startswith('http://')) if any_url else False
|
|
110
|
-
is_filename = Path(any_url).exists() if any_url else False
|
|
111
110
|
if executable and not which(executable):
|
|
112
111
|
if not allow_uninstalled:
|
|
113
112
|
log.error(f"Executable '{executable}' is not installed. "
|
|
@@ -126,65 +125,30 @@ def download(any_url, no_dynamic, resource_type, path_in_archive, allow_uninstal
|
|
|
126
125
|
'path_in_archive': path_in_archive}]
|
|
127
126
|
)]
|
|
128
127
|
for this_executable, this_reslist in reslist:
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
log.warning(f"Cannot download user resource {resdict['name']}")
|
|
138
|
-
continue
|
|
139
|
-
if resdict['url'].startswith('https://') or resdict['url'].startswith('http://'):
|
|
140
|
-
log.info(f"Downloading {registered} resource '{resdict['name']}' ({resdict['url']})")
|
|
141
|
-
if 'size' not in resdict:
|
|
142
|
-
with requests.head(resdict['url']) as r:
|
|
143
|
-
resdict['size'] = int(r.headers.get('content-length', 0))
|
|
144
|
-
else:
|
|
145
|
-
log.info(f"Copying {registered} resource '{resdict['name']}' ({resdict['url']})")
|
|
146
|
-
urlpath = Path(resdict['url'])
|
|
147
|
-
resdict['url'] = str(urlpath.resolve())
|
|
148
|
-
if Path(urlpath).is_dir():
|
|
149
|
-
resdict['size'] = directory_size(urlpath)
|
|
150
|
-
else:
|
|
151
|
-
resdict['size'] = urlpath.stat().st_size
|
|
152
|
-
if not location:
|
|
153
|
-
location = get_ocrd_tool_json(this_executable)['resource_locations'][0]
|
|
154
|
-
elif location not in get_ocrd_tool_json(this_executable)['resource_locations']:
|
|
155
|
-
log.error(f"The selected --location {location} is not in the {this_executable}'s resource search path, "
|
|
156
|
-
f"refusing to install to invalid location")
|
|
157
|
-
sys.exit(1)
|
|
158
|
-
if location != 'module':
|
|
159
|
-
basedir = resmgr.location_to_resource_dir(location)
|
|
160
|
-
else:
|
|
161
|
-
basedir = get_moduledir(this_executable)
|
|
162
|
-
if not basedir:
|
|
163
|
-
basedir = resmgr.location_to_resource_dir('data')
|
|
164
|
-
|
|
128
|
+
resource_locations = get_ocrd_tool_json(this_executable)['resource_locations']
|
|
129
|
+
if not location:
|
|
130
|
+
location = resource_locations[0]
|
|
131
|
+
elif location not in resource_locations:
|
|
132
|
+
log.warning(f"The selected --location {location} is not in the {this_executable}'s resource search path, "
|
|
133
|
+
f"refusing to install to invalid location. Instead installing to: {resource_locations[0]}")
|
|
134
|
+
res_dest_dir = resmgr.build_resource_dest_dir(location=location, executable=this_executable)
|
|
135
|
+
for res_dict in this_reslist:
|
|
165
136
|
try:
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
)
|
|
178
|
-
if registered == 'unregistered':
|
|
179
|
-
log.info(f"{this_executable} resource '{name}' ({any_url}) not a known resource, creating stub "
|
|
180
|
-
f"in {resmgr.user_list}'")
|
|
181
|
-
resmgr.add_to_user_database(this_executable, fpath, url=any_url)
|
|
182
|
-
resmgr.save_user_list()
|
|
183
|
-
log.info(f"Installed resource {resdict['url']} under {fpath}")
|
|
137
|
+
fpath = resmgr.handle_resource(
|
|
138
|
+
res_dict=res_dict,
|
|
139
|
+
executable=this_executable,
|
|
140
|
+
dest_dir=res_dest_dir,
|
|
141
|
+
any_url=any_url,
|
|
142
|
+
overwrite=overwrite,
|
|
143
|
+
resource_type=resource_type,
|
|
144
|
+
path_in_archive=path_in_archive
|
|
145
|
+
)
|
|
146
|
+
if not fpath:
|
|
147
|
+
continue
|
|
184
148
|
except FileExistsError as exc:
|
|
185
149
|
log.info(str(exc))
|
|
186
|
-
|
|
187
|
-
|
|
150
|
+
usage = res_dict.get('parameter_usage', 'as-is')
|
|
151
|
+
log.info(f"Use in parameters as '{resmgr.parameter_usage(res_dict['name'], usage)}'")
|
|
188
152
|
|
|
189
153
|
|
|
190
154
|
@resmgr_cli.command('migrate')
|
|
@@ -9,7 +9,6 @@ __all__ = [
|
|
|
9
9
|
'DOWNLOAD_DIR',
|
|
10
10
|
'DEFAULT_REPOSITORY_URL',
|
|
11
11
|
'BASHLIB_FILENAME',
|
|
12
|
-
'RESOURCE_LIST_FILENAME',
|
|
13
12
|
'BACKUP_DIR',
|
|
14
13
|
'RESOURCE_USER_LIST_COMMENT',
|
|
15
14
|
]
|
|
@@ -19,6 +18,5 @@ DEFAULT_UPLOAD_FOLDER = '/tmp/uploads-ocrd-core'
|
|
|
19
18
|
DOWNLOAD_DIR = '/tmp/ocrd-core-downloads'
|
|
20
19
|
DEFAULT_REPOSITORY_URL = 'http://localhost:5000/'
|
|
21
20
|
BASHLIB_FILENAME = resource_filename(__package__, 'lib.bash')
|
|
22
|
-
RESOURCE_LIST_FILENAME = resource_filename(__package__, 'resource_list.yml')
|
|
23
21
|
RESOURCE_USER_LIST_COMMENT = "# OCR-D private resource list (consider sending a PR with your own resources to OCR-D/core)"
|
|
24
22
|
BACKUP_DIR = '.backup'
|
|
@@ -258,12 +258,12 @@ class ClientSideOcrdMets:
|
|
|
258
258
|
|
|
259
259
|
def add_agent(self, **kwargs):
|
|
260
260
|
if not self.multiplexing_mode:
|
|
261
|
-
return self.session.request("POST", f"{self.url}/agent", json=OcrdAgentModel.create(**kwargs).
|
|
261
|
+
return self.session.request("POST", f"{self.url}/agent", json=OcrdAgentModel.create(**kwargs).model_dump())
|
|
262
262
|
else:
|
|
263
263
|
self.session.request(
|
|
264
264
|
"POST",
|
|
265
265
|
self.url,
|
|
266
|
-
json=MpxReq.add_agent(self.ws_dir_path, OcrdAgentModel.create(**kwargs).
|
|
266
|
+
json=MpxReq.add_agent(self.ws_dir_path, OcrdAgentModel.create(**kwargs).model_dump())
|
|
267
267
|
).json()
|
|
268
268
|
return OcrdAgentModel.create(**kwargs)
|
|
269
269
|
|
|
@@ -305,7 +305,7 @@ class ClientSideOcrdMets:
|
|
|
305
305
|
mimetype=mimetype, url=url, local_filename=local_filename
|
|
306
306
|
)
|
|
307
307
|
# add force+ignore
|
|
308
|
-
kwargs = {**kwargs, **data.
|
|
308
|
+
kwargs = {**kwargs, **data.model_dump()}
|
|
309
309
|
|
|
310
310
|
if not self.multiplexing_mode:
|
|
311
311
|
r = self.session.request("POST", f"{self.url}/file", data=kwargs)
|
|
@@ -530,7 +530,7 @@ class OcrdMetsServer:
|
|
|
530
530
|
|
|
531
531
|
@app.post(path='/agent', response_model=OcrdAgentModel)
|
|
532
532
|
async def add_agent(agent: OcrdAgentModel):
|
|
533
|
-
kwargs = agent.
|
|
533
|
+
kwargs = agent.model_dump()
|
|
534
534
|
kwargs['_type'] = kwargs.pop('type')
|
|
535
535
|
workspace.mets.add_agent(**kwargs)
|
|
536
536
|
response = agent
|
|
@@ -575,7 +575,7 @@ class OcrdMetsServer:
|
|
|
575
575
|
local_filename=local_filename
|
|
576
576
|
)
|
|
577
577
|
# Add to workspace
|
|
578
|
-
kwargs = file_resource.
|
|
578
|
+
kwargs = file_resource.model_dump()
|
|
579
579
|
workspace.add_file(**kwargs, force=force)
|
|
580
580
|
response = file_resource
|
|
581
581
|
self.log.debug(f"POST /file -> {response.__dict__}")
|
|
@@ -42,15 +42,14 @@ from .ocrd_page_result import OcrdPageResult
|
|
|
42
42
|
from ocrd_utils import (
|
|
43
43
|
VERSION as OCRD_VERSION,
|
|
44
44
|
MIMETYPE_PAGE,
|
|
45
|
-
MIME_TO_EXT,
|
|
46
45
|
config,
|
|
47
46
|
getLogger,
|
|
48
47
|
list_resource_candidates,
|
|
49
|
-
pushd_popd,
|
|
50
48
|
list_all_resources,
|
|
51
49
|
get_processor_resource_types,
|
|
52
50
|
resource_filename,
|
|
53
51
|
parse_json_file_with_comments,
|
|
52
|
+
pushd_popd,
|
|
54
53
|
make_file_id,
|
|
55
54
|
deprecation_warning
|
|
56
55
|
)
|
|
@@ -608,7 +607,7 @@ class Processor():
|
|
|
608
607
|
"""
|
|
609
608
|
Ensure all input files for a single page are
|
|
610
609
|
downloaded to the workspace, then schedule
|
|
611
|
-
:py:meth:`.
|
|
610
|
+
:py:meth:`.process_page_file` to be run on
|
|
612
611
|
them via `executor` (enforcing a per-page time
|
|
613
612
|
limit of `max_seconds`).
|
|
614
613
|
|
|
@@ -935,9 +934,8 @@ class Processor():
|
|
|
935
934
|
cwd = self.old_pwd
|
|
936
935
|
else:
|
|
937
936
|
cwd = getcwd()
|
|
938
|
-
ret =
|
|
939
|
-
|
|
940
|
-
if exists(cand)]
|
|
937
|
+
ret = list(filter(exists, list_resource_candidates(executable, val,
|
|
938
|
+
cwd=cwd, moduled=self.moduledir)))
|
|
941
939
|
if ret:
|
|
942
940
|
self._base_logger.debug("Resolved %s to absolute path %s" % (val, ret[0]))
|
|
943
941
|
return ret[0]
|
|
@@ -968,17 +966,9 @@ class Processor():
|
|
|
968
966
|
"""
|
|
969
967
|
List all resources found in the filesystem and matching content-type by filename suffix
|
|
970
968
|
"""
|
|
971
|
-
|
|
972
|
-
for res in list_all_resources(self.ocrd_tool['executable'], moduled=self.moduledir):
|
|
969
|
+
for res in list_all_resources(self.executable, ocrd_tool=self.ocrd_tool, moduled=self.moduledir):
|
|
973
970
|
res = Path(res)
|
|
974
|
-
|
|
975
|
-
if res.is_dir() and 'text/directory' not in mimetypes:
|
|
976
|
-
continue
|
|
977
|
-
# if we do not know all MIME types, then keep the file, otherwise require suffix match
|
|
978
|
-
if res.is_file() and not any(res.suffix == MIME_TO_EXT.get(mime, res.suffix)
|
|
979
|
-
for mime in mimetypes):
|
|
980
|
-
continue
|
|
981
|
-
yield res
|
|
971
|
+
yield res.name
|
|
982
972
|
|
|
983
973
|
@property
|
|
984
974
|
def module(self):
|
|
@@ -37,6 +37,31 @@
|
|
|
37
37
|
"description": "Whether to extract an image for each filtered segment and write to the output fileGrp."
|
|
38
38
|
}
|
|
39
39
|
}
|
|
40
|
+
},
|
|
41
|
+
"ocrd-command": {
|
|
42
|
+
"executable": "ocrd-command",
|
|
43
|
+
"description": "Bare-bones processor runs shell commands to process PAGE files",
|
|
44
|
+
"steps": ["recognition/text-recognition", "recognition/font-identification", "recognition/post-correction", "layout/segmentation", "layout/analysis"],
|
|
45
|
+
"categories": [],
|
|
46
|
+
"input_file_grp_cardinality": [1, -1],
|
|
47
|
+
"output_file_grp_cardinality": 1,
|
|
48
|
+
"parameters": {
|
|
49
|
+
"command": {
|
|
50
|
+
"type": "string",
|
|
51
|
+
"default": "cat @INFILE > @OUTFILE",
|
|
52
|
+
"description": "Shell command to operate on PAGE files, with @INFILE as place-holder for the input file path(s), and @OUTFILE as place-holder for the output file path. If running on multiple input fileGrps, then @INFILE must be repeated as many times."
|
|
53
|
+
}
|
|
54
|
+
}
|
|
55
|
+
},
|
|
56
|
+
"ocrd-merge": {
|
|
57
|
+
"executable": "ocrd-merge",
|
|
58
|
+
"description": "Bare-bones processor merges annotations from multiple fileGrps",
|
|
59
|
+
"steps": ["layout/segmentation"],
|
|
60
|
+
"categories": [],
|
|
61
|
+
"input_file_grp_cardinality": [1, -1],
|
|
62
|
+
"output_file_grp_cardinality": 1,
|
|
63
|
+
"parameters": {
|
|
64
|
+
}
|
|
40
65
|
}
|
|
41
66
|
}
|
|
42
67
|
}
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
# pylint: disable=missing-module-docstring,invalid-name
|
|
2
|
+
from typing import Optional
|
|
3
|
+
from itertools import count
|
|
4
|
+
from collections import OrderedDict as odict
|
|
5
|
+
|
|
6
|
+
import click
|
|
7
|
+
|
|
8
|
+
from ocrd import Processor, OcrdPageResult
|
|
9
|
+
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
|
|
10
|
+
from ocrd_modelfactory import page_from_file
|
|
11
|
+
from ocrd_models import OcrdPage
|
|
12
|
+
from ocrd_models.ocrd_page import (
|
|
13
|
+
BorderType,
|
|
14
|
+
CoordsType,
|
|
15
|
+
ReadingOrderType,
|
|
16
|
+
UnorderedGroupType,
|
|
17
|
+
)
|
|
18
|
+
from ocrd_utils import bbox_from_points
|
|
19
|
+
|
|
20
|
+
# Names of the PAGE elements that count as part of the segment hierarchy.
# `rename_segments` compares these against the class name of each PAGE object
# after removing the substring 'Type' from it; objects whose name is not
# listed here keep their identifier.
_SEGTYPES = [
    "NoiseRegion",
    "LineDrawingRegion",
    "AdvertRegion",
    "ImageRegion",
    "ChartRegion",
    "MusicRegion",
    "GraphicRegion",
    "UnknownRegion",
    "CustomRegion",
    "SeparatorRegion",
    "MathsRegion",
    "TextRegion",
    "MapRegion",
    "ChemRegion",
    "TableRegion",
    "TextLine",
    "Word",
    "Glyph"
]
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def get_border_bbox(pcgts):
    """Return the bounding box of the page border of a PAGE document.

    If the page has a `Border`, the box is derived from the border's
    coordinate points via `bbox_from_points`. Otherwise the full image
    extent `[0, 0, imageWidth, imageHeight]` is returned.
    """
    page = pcgts.Page
    border = page.Border
    if border is not None:
        return bbox_from_points(border.Coords.points)
    # no explicit border: fall back to the whole image
    return [0, 0, page.imageWidth, page.imageHeight]
|
|
46
|
+
|
|
47
|
+
def rename_segments(pcgts, start=1):
    """Give all hierarchy segments of a PAGE document new, numbered identifiers.

    Every element carrying an ``@id`` whose PAGE class is listed in
    ``_SEGTYPES`` is renamed to ``seg%011d``, numbered consecutively from
    `start` in the order returned by the XPath query. For regions that are
    referenced in the reading order, the referencing ``regionRef`` is updated
    to the new identifier as well.

    Args:
        pcgts: the PAGE document to modify in place.
        start: the number to use for the first renamed segment.

    Returns:
        The next unused number, i.e. ``start`` plus the number of renamed
        segments. Passing it as `start` for the next document keeps the
        identifiers of both documents distinct, even if this document did
        not contain any segment at all.
    """
    renamed = {}
    rodict = pcgts.Page.get_ReadingOrderGroups()
    # get everything that has an identifier
    nodes = pcgts.xpath("//*[@id]")
    # get PAGE objects from matching etree nodes
    # but allow only hierarchy segments
    segments = [segment for segment in map(pcgts.revmap.get, nodes)
                if segment.__class__.__name__.replace('Type', '') in _SEGTYPES]
    # count segments and rename them
    # fixme: or perhaps better to have each segment type named and counted differently?
    for num, segment in zip(count(start=start), segments):
        newname = "seg%011d" % num
        assert segment.id not in renamed, "duplicate identifier %s" % segment.id
        if segment.original_tagname_.endswith('Region') and segment.id in rodict:
            # update reading order
            rodict[segment.id].regionRef = newname
        renamed[segment.id] = newname
        segment.id = newname
    # Return the first free number rather than the last one used: the caller
    # feeds this value back in as `start`, so returning the last used number
    # would hand out the same identifier twice (and an empty document would
    # reset the numbering altogether).
    return start + len(segments)
|
|
73
|
+
|
|
74
|
+
class MergeProcessor(Processor):
    # Processor behind the `ocrd-merge` executable (see `executable` below):
    # combines the PAGE annotations of all input fileGrps of a page into a
    # single output PAGE.

    def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult:
        """
        Merge PAGE segment hierarchy elements from all input file groups.

        For each page, open and deserialise PAGE input files. Rename all elements
        of the segment hierarchy to new (clash-free) identifiers. Redefine the
        `Border` coordinates as the bounding box enclosing all input borders
        (using the full image for inputs without a border). Then add all
        regions from all input files, concatenating them into a single `ReadingOrder`
        in the order of input file groups.

        Produce a new PAGE output file by serialising the resulting hierarchy.

        Raise a ValueError if there is no input file for the page, or if the
        input files do not all reference the same image.
        """
        actual_pcgts = list(filter(None, input_pcgts))
        # validate explicitly instead of via `assert`, which is skipped
        # entirely when Python runs with -O
        if not actual_pcgts:
            raise ValueError("no input files to merge for page %s" % page_id)
        image_filenames = set(pcgts.Page.imageFilename for pcgts in actual_pcgts)
        if len(image_filenames) != 1:
            raise ValueError("input files must all reference the same @imageFilename")
        # create new PAGE for image
        result = OcrdPageResult(page_from_file(actual_pcgts[0].Page.imageFilename))
        # unify Border
        borders = [get_border_bbox(pcgts) for pcgts in actual_pcgts]
        # transpose the list of boxes into one tuple per box component
        minx, miny, maxx, maxy = zip(*borders)
        minx = min(minx)
        miny = min(miny)
        maxx = max(maxx)
        maxy = max(maxy)
        result.pcgts.Page.set_Border(
            BorderType(CoordsType(
                points=f"{minx},{miny} {maxx},{miny} {maxx},{maxy} {minx},{maxy}")))
        # rename all segments (numbering continues across input files)
        num = 1
        for pcgts in actual_pcgts:
            num = rename_segments(pcgts, num)
        # concatenate all regions
        ug = UnorderedGroupType(id="merged")
        result.pcgts.Page.set_ReadingOrder(ReadingOrderType(UnorderedGroup=ug))
        for pcgts in actual_pcgts:
            # NOTE(review): get_AllRegions() is called with its defaults here;
            # confirm that it yields top-level regions only - if it also
            # descends into nested regions, those would be added a second
            # time at page level.
            for region in pcgts.Page.get_AllRegions():
                add_region = getattr(result.pcgts.Page, 'add_' + region.original_tagname_)
                add_region(region)
            reading_order = pcgts.Page.ReadingOrder
            if not reading_order:
                continue
            group = reading_order.OrderedGroup or reading_order.UnorderedGroup
            if group is None:
                # ReadingOrder without any group: nothing to carry over
                continue
            # nest the input's top-level group under the merged group
            add_group = getattr(ug, 'add_' + group.original_tagname_)
            add_group(group)
        return result

    @property
    def metadata_filename(self):
        # the tool description is shared with the other builtin processors
        return 'processor/builtin/dummy/ocrd-tool.json'

    @property
    def executable(self):
        return 'ocrd-merge'
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
@click.command()
@ocrd_cli_options
def cli(*args, **kwargs):
    # Command-line entry point: hands the MergeProcessor class and whatever
    # arguments this function is called with to `ocrd_cli_wrap_processor`
    # and returns its result.
    return ocrd_cli_wrap_processor(MergeProcessor, *args, **kwargs)
|