ocrd 3.0.0b7__tar.gz → 3.0.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ocrd-3.0.0b7/src/ocrd.egg-info → ocrd-3.0.2}/PKG-INFO +3 -2
- ocrd-3.0.2/VERSION +1 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/pyproject.toml +1 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/requirements.txt +1 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd/cli/__init__.py +3 -1
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd/decorators/__init__.py +3 -2
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd/decorators/ocrd_cli_options.py +5 -5
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd/mets_server.py +62 -42
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd/processor/base.py +8 -7
- ocrd-3.0.2/src/ocrd/processor/builtin/dummy/ocrd-tool.json +41 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd/processor/builtin/dummy_processor.py +0 -3
- ocrd-3.0.2/src/ocrd/processor/builtin/filter_processor.py +108 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd/resource_manager.py +4 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd/workspace.py +0 -2
- {ocrd-3.0.0b7 → ocrd-3.0.2/src/ocrd.egg-info}/PKG-INFO +3 -2
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd.egg-info/SOURCES.txt +2 -1
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd.egg-info/entry_points.txt +1 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd.egg-info/requires.txt +1 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_modelfactory/__init__.py +7 -1
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_models/ocrd_exif.py +2 -2
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_models/ocrd_page.py +22 -3
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_models/ocrd_page_generateds.py +2813 -1438
- ocrd-3.0.2/src/ocrd_models/xpath_functions.py +51 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_network/cli/client.py +28 -9
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_network/client.py +9 -6
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_network/client_utils.py +25 -14
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_network/processing_server.py +27 -15
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_network/processing_worker.py +7 -4
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_network/processor_server.py +2 -1
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_network/rabbitmq_utils/connector.py +2 -2
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_network/runtime_data/deployer.py +28 -18
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_network/server_cache.py +26 -23
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_network/server_utils.py +40 -4
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_network/tcp_to_uds_mets_proxy.py +8 -5
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_network/utils.py +19 -15
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_utils/config.py +39 -17
- {ocrd-3.0.0b7 → ocrd-3.0.2}/tests/test_resolver.py +1 -1
- {ocrd-3.0.0b7 → ocrd-3.0.2}/tests/test_resource_manager.py +1 -1
- ocrd-3.0.0b7/VERSION +0 -1
- ocrd-3.0.0b7/src/ocrd/processor/builtin/dummy/ocrd-tool.json +0 -21
- ocrd-3.0.0b7/src/ocrd/processor/concurrent.py +0 -909
- {ocrd-3.0.0b7 → ocrd-3.0.2}/LICENSE +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/MANIFEST.in +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/README.md +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/README_bashlib.md +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/README_ocrd.md +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/README_ocrd_modelfactory.md +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/README_ocrd_models.md +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/README_ocrd_network.md +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/README_ocrd_utils.md +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/README_ocrd_validators.md +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/setup.cfg +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd/__init__.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd/cli/bashlib.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd/cli/log.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd/cli/network.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd/cli/ocrd_tool.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd/cli/process.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd/cli/resmgr.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd/cli/validate.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd/cli/workspace.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd/cli/zip.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd/constants.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd/decorators/loglevel_option.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd/decorators/mets_find_options.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd/decorators/parameter_option.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd/lib.bash +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd/ocrd-all-tool.json +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd/processor/__init__.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd/processor/builtin/__init__.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd/processor/builtin/dummy/__init__.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd/processor/helpers.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd/processor/ocrd_page_result.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd/resolver.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd/resource_list.yml +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd/task_sequence.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd/workspace_backup.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd/workspace_bagger.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd.egg-info/dependency_links.txt +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd.egg-info/top_level.txt +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_models/__init__.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_models/constants.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_models/mets-empty.xml +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_models/ocrd_agent.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_models/ocrd_file.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_models/ocrd_mets.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_models/ocrd_xml_base.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_models/report.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_models/utils.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_network/__init__.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_network/cli/__init__.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_network/cli/processing_server.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_network/cli/processing_worker.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_network/cli/processor_server.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_network/constants.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_network/database.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_network/logging_utils.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_network/models/__init__.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_network/models/job.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_network/models/messages.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_network/models/ocrd_tool.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_network/models/workflow.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_network/models/workspace.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_network/param_validators.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_network/process_helpers.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_network/rabbitmq_utils/__init__.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_network/rabbitmq_utils/constants.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_network/rabbitmq_utils/consumer.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_network/rabbitmq_utils/helpers.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_network/rabbitmq_utils/ocrd_messages.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_network/rabbitmq_utils/publisher.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_network/runtime_data/__init__.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_network/runtime_data/config_parser.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_network/runtime_data/connection_clients.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_network/runtime_data/hosts.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_network/runtime_data/network_agents.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_network/runtime_data/network_services.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_utils/__init__.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_utils/constants.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_utils/deprecate.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_utils/image.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_utils/introspect.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_utils/logging.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_utils/ocrd_logging.conf +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_utils/os.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_utils/str.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_validators/__init__.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_validators/bagit-profile.yml +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_validators/constants.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_validators/json_validator.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_validators/message_processing.schema.yml +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_validators/message_result.schema.yml +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_validators/mets.xsd +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_validators/ocrd_network_message_validator.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_validators/ocrd_tool.schema.yml +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_validators/ocrd_tool_validator.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_validators/ocrd_zip_validator.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_validators/page.xsd +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_validators/page_validator.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_validators/parameter_validator.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_validators/processing_server_config.schema.yml +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_validators/processing_server_config_validator.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_validators/resource_list_validator.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_validators/workspace_validator.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_validators/xlink.xsd +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_validators/xsd_mets_validator.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_validators/xsd_page_validator.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/src/ocrd_validators/xsd_validator.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/tests/test_decorators.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/tests/test_logging.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/tests/test_logging_conf.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/tests/test_mets_server.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/tests/test_model_factory.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/tests/test_resolver_oai.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/tests/test_task_sequence.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/tests/test_utils.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/tests/test_version.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/tests/test_workspace.py +0 -0
- {ocrd-3.0.0b7 → ocrd-3.0.2}/tests/test_workspace_remove.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
2
|
Name: ocrd
|
|
3
|
-
Version: 3.0.
|
|
3
|
+
Version: 3.0.2
|
|
4
4
|
Summary: OCR-D framework
|
|
5
5
|
Author-email: Konstantin Baierer <unixprog@gmail.com>
|
|
6
6
|
License: Apache License 2.0
|
|
@@ -17,6 +17,7 @@ Requires-Dist: click>=7
|
|
|
17
17
|
Requires-Dist: cryptography<43.0.0
|
|
18
18
|
Requires-Dist: Deprecated==1.2.0
|
|
19
19
|
Requires-Dist: docker
|
|
20
|
+
Requires-Dist: elementpath
|
|
20
21
|
Requires-Dist: fastapi>=0.78.0
|
|
21
22
|
Requires-Dist: filetype
|
|
22
23
|
Requires-Dist: Flask
|
ocrd-3.0.2/VERSION
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.0.2
|
|
@@ -32,6 +32,7 @@ Issues = "https://github.com/OCR-D/core/issues"
|
|
|
32
32
|
[project.scripts]
|
|
33
33
|
ocrd = "ocrd.cli:cli"
|
|
34
34
|
ocrd-dummy = "ocrd.processor.builtin.dummy_processor:cli"
|
|
35
|
+
ocrd-filter = "ocrd.processor.builtin.filter_processor:cli"
|
|
35
36
|
|
|
36
37
|
[tool.setuptools]
|
|
37
38
|
include-package-data = true
|
|
@@ -16,7 +16,7 @@ def command_with_replaced_help(*replacements):
|
|
|
16
16
|
|
|
17
17
|
class CommandWithReplacedHelp(click.Command):
|
|
18
18
|
def get_help(self, ctx):
|
|
19
|
-
newhelp = super().get_help(ctx)
|
|
19
|
+
newhelp : str = super().get_help(ctx)
|
|
20
20
|
for replacement in replacements:
|
|
21
21
|
newhelp = re.sub(*replacement, newhelp)
|
|
22
22
|
# print(newhelp)
|
|
@@ -83,6 +83,8 @@ Variables:
|
|
|
83
83
|
\b
|
|
84
84
|
{config.describe('OCRD_NETWORK_RABBITMQ_CLIENT_CONNECT_ATTEMPTS')}
|
|
85
85
|
\b
|
|
86
|
+
{config.describe('OCRD_NETWORK_RABBITMQ_HEARTBEAT')}
|
|
87
|
+
\b
|
|
86
88
|
{config.describe('OCRD_PROFILE_FILE')}
|
|
87
89
|
\b
|
|
88
90
|
{config.describe('OCRD_PROFILE', wrap_text=False)}
|
|
@@ -48,6 +48,9 @@ def ocrd_cli_wrap_processor(
|
|
|
48
48
|
# ocrd_network params end #
|
|
49
49
|
**kwargs
|
|
50
50
|
):
|
|
51
|
+
# init logging handlers so no imported libs can preempt ours
|
|
52
|
+
initLogging()
|
|
53
|
+
|
|
51
54
|
# FIXME: remove workspace arg entirely
|
|
52
55
|
processor = processorClass(None)
|
|
53
56
|
if not sys.argv[1:]:
|
|
@@ -89,8 +92,6 @@ def ocrd_cli_wrap_processor(
|
|
|
89
92
|
# Used for checking/starting network agents for the WebAPI architecture
|
|
90
93
|
check_and_run_network_agent(processorClass, subcommand, address, database, queue)
|
|
91
94
|
|
|
92
|
-
# from here: single-run processing context
|
|
93
|
-
initLogging()
|
|
94
95
|
if 'parameter' in kwargs:
|
|
95
96
|
# Disambiguate parameter file/literal, and resolve file
|
|
96
97
|
def resolve(name):
|
|
@@ -13,16 +13,16 @@ from ocrd_network import (
|
|
|
13
13
|
|
|
14
14
|
def ocrd_cli_options(f):
|
|
15
15
|
"""
|
|
16
|
-
Implement
|
|
16
|
+
Implement Processor CLI.
|
|
17
17
|
|
|
18
18
|
Usage::
|
|
19
19
|
|
|
20
|
-
|
|
20
|
+
from ocrd.decorators import ocrd_cli_options
|
|
21
21
|
|
|
22
22
|
@click.command()
|
|
23
|
-
@
|
|
24
|
-
def cli(
|
|
25
|
-
print(mets_url)
|
|
23
|
+
@ocrd_cli_options
|
|
24
|
+
def cli(**kwargs):
|
|
25
|
+
print(kwargs['mets_url'])
|
|
26
26
|
"""
|
|
27
27
|
# XXX Note that the `--help` output is statically generate_processor_help
|
|
28
28
|
params = [
|
|
@@ -1,8 +1,10 @@
|
|
|
1
1
|
"""
|
|
2
2
|
# METS server functionality
|
|
3
3
|
"""
|
|
4
|
+
import os
|
|
4
5
|
import re
|
|
5
6
|
from os import _exit, chmod
|
|
7
|
+
import signal
|
|
6
8
|
from typing import Dict, Optional, Union, List, Tuple
|
|
7
9
|
from time import sleep
|
|
8
10
|
from pathlib import Path
|
|
@@ -155,13 +157,13 @@ class ClientSideOcrdMets:
|
|
|
155
157
|
Request writing the changes to the file system
|
|
156
158
|
"""
|
|
157
159
|
if not self.multiplexing_mode:
|
|
158
|
-
self.session.request("PUT", url=self.url)
|
|
160
|
+
return self.session.request("PUT", url=self.url).text
|
|
159
161
|
else:
|
|
160
|
-
self.session.request(
|
|
162
|
+
return self.session.request(
|
|
161
163
|
"POST",
|
|
162
164
|
self.url,
|
|
163
165
|
json=MpxReq.save(self.ws_dir_path)
|
|
164
|
-
)
|
|
166
|
+
).json()["text"]
|
|
165
167
|
|
|
166
168
|
def stop(self):
|
|
167
169
|
"""
|
|
@@ -169,14 +171,13 @@ class ClientSideOcrdMets:
|
|
|
169
171
|
"""
|
|
170
172
|
try:
|
|
171
173
|
if not self.multiplexing_mode:
|
|
172
|
-
self.session.request("DELETE", self.url)
|
|
173
|
-
return
|
|
174
|
+
return self.session.request("DELETE", self.url).text
|
|
174
175
|
else:
|
|
175
|
-
self.session.request(
|
|
176
|
+
return self.session.request(
|
|
176
177
|
"POST",
|
|
177
178
|
self.url,
|
|
178
179
|
json=MpxReq.stop(self.ws_dir_path)
|
|
179
|
-
)
|
|
180
|
+
).json()["text"]
|
|
180
181
|
except ConnectionError:
|
|
181
182
|
# Expected because we exit the process without returning
|
|
182
183
|
pass
|
|
@@ -323,7 +324,7 @@ class ClientSideOcrdMets:
|
|
|
323
324
|
|
|
324
325
|
|
|
325
326
|
class MpxReq:
|
|
326
|
-
"""This class
|
|
327
|
+
"""This class wraps the request bodies needed for the tcp forwarding
|
|
327
328
|
|
|
328
329
|
For every mets-server-call like find_files or workspace_path a special request_body is
|
|
329
330
|
needed to call `MetsServerProxy.forward_tcp_request`. These are created by this functions.
|
|
@@ -346,12 +347,12 @@ class MpxReq:
|
|
|
346
347
|
@staticmethod
|
|
347
348
|
def save(ws_dir_path: str) -> Dict:
|
|
348
349
|
return MpxReq.__args_wrapper(
|
|
349
|
-
ws_dir_path, method_type="PUT", response_type="
|
|
350
|
+
ws_dir_path, method_type="PUT", response_type="text", request_url="", request_data={})
|
|
350
351
|
|
|
351
352
|
@staticmethod
|
|
352
353
|
def stop(ws_dir_path: str) -> Dict:
|
|
353
354
|
return MpxReq.__args_wrapper(
|
|
354
|
-
ws_dir_path, method_type="DELETE", response_type="
|
|
355
|
+
ws_dir_path, method_type="DELETE", response_type="text", request_url="", request_data={})
|
|
355
356
|
|
|
356
357
|
@staticmethod
|
|
357
358
|
def reload(ws_dir_path: str) -> Dict:
|
|
@@ -428,18 +429,24 @@ class OcrdMetsServer:
|
|
|
428
429
|
|
|
429
430
|
@staticmethod
|
|
430
431
|
def kill_process(mets_server_pid: int):
|
|
431
|
-
|
|
432
|
+
os.kill(mets_server_pid, signal.SIGINT)
|
|
433
|
+
sleep(3)
|
|
434
|
+
try:
|
|
435
|
+
os.kill(mets_server_pid, signal.SIGKILL)
|
|
436
|
+
except ProcessLookupError as e:
|
|
437
|
+
pass
|
|
432
438
|
|
|
433
439
|
def shutdown(self):
|
|
440
|
+
pid = os.getpid()
|
|
441
|
+
self.log.info(f"Shutdown method of mets server[{pid}] invoked, sending SIGTERM signal.")
|
|
442
|
+
os.kill(pid, signal.SIGTERM)
|
|
434
443
|
if self.is_uds:
|
|
435
444
|
if Path(self.url).exists():
|
|
436
|
-
self.log.
|
|
445
|
+
self.log.warning(f"Due to a server shutdown, removing the existing UDS socket file: {self.url}")
|
|
437
446
|
Path(self.url).unlink()
|
|
438
|
-
# os._exit because uvicorn catches SystemExit raised by sys.exit
|
|
439
|
-
_exit(0)
|
|
440
447
|
|
|
441
448
|
def startup(self):
|
|
442
|
-
self.log.info("
|
|
449
|
+
self.log.info(f"Configuring the Mets Server")
|
|
443
450
|
|
|
444
451
|
workspace = self.workspace
|
|
445
452
|
|
|
@@ -465,32 +472,49 @@ class OcrdMetsServer:
|
|
|
465
472
|
"""
|
|
466
473
|
Write current changes to the file system
|
|
467
474
|
"""
|
|
468
|
-
|
|
475
|
+
workspace.save_mets()
|
|
476
|
+
response = Response(content="The Mets Server is writing changes to disk.", media_type='text/plain')
|
|
477
|
+
self.log.info(f"PUT / -> {response.__dict__}")
|
|
478
|
+
return response
|
|
469
479
|
|
|
470
480
|
@app.delete(path='/')
|
|
471
|
-
|
|
481
|
+
def stop():
|
|
472
482
|
"""
|
|
473
483
|
Stop the mets server
|
|
474
484
|
"""
|
|
475
|
-
getLogger('ocrd.models.ocrd_mets').info(f'Shutting down METS Server {self.url}')
|
|
476
485
|
workspace.save_mets()
|
|
486
|
+
response = Response(content="The Mets Server will shut down soon...", media_type='text/plain')
|
|
477
487
|
self.shutdown()
|
|
488
|
+
self.log.info(f"DELETE / -> {response.__dict__}")
|
|
489
|
+
return response
|
|
478
490
|
|
|
479
491
|
@app.post(path='/reload')
|
|
480
|
-
|
|
492
|
+
def workspace_reload_mets():
|
|
481
493
|
"""
|
|
482
494
|
Reload mets file from the file system
|
|
483
495
|
"""
|
|
484
496
|
workspace.reload_mets()
|
|
485
|
-
|
|
497
|
+
response = Response(content=f"Reloaded from {workspace.directory}", media_type='text/plain')
|
|
498
|
+
self.log.info(f"POST /reload -> {response.__dict__}")
|
|
499
|
+
return response
|
|
486
500
|
|
|
487
501
|
@app.get(path='/unique_identifier', response_model=str)
|
|
488
502
|
async def unique_identifier():
|
|
489
|
-
|
|
503
|
+
response = Response(content=workspace.mets.unique_identifier, media_type='text/plain')
|
|
504
|
+
self.log.info(f"GET /unique_identifier -> {response.__dict__}")
|
|
505
|
+
return response
|
|
490
506
|
|
|
491
507
|
@app.get(path='/workspace_path', response_model=str)
|
|
492
508
|
async def workspace_path():
|
|
493
|
-
|
|
509
|
+
response = Response(content=workspace.directory, media_type="text/plain")
|
|
510
|
+
self.log.info(f"GET /workspace_path -> {response.__dict__}")
|
|
511
|
+
return response
|
|
512
|
+
|
|
513
|
+
@app.get(path='/physical_pages', response_model=OcrdPageListModel)
|
|
514
|
+
async def physical_pages():
|
|
515
|
+
response = {'physical_pages': workspace.mets.physical_pages}
|
|
516
|
+
self.log.info(f"GET /physical_pages -> {response}")
|
|
517
|
+
return response
|
|
494
518
|
|
|
495
519
|
@app.get(path='/physical_pages', response_model=OcrdPageListModel)
|
|
496
520
|
async def physical_pages():
|
|
@@ -498,18 +522,24 @@ class OcrdMetsServer:
|
|
|
498
522
|
|
|
499
523
|
@app.get(path='/file_groups', response_model=OcrdFileGroupListModel)
|
|
500
524
|
async def file_groups():
|
|
501
|
-
|
|
525
|
+
response = {'file_groups': workspace.mets.file_groups}
|
|
526
|
+
self.log.info(f"GET /file_groups -> {response}")
|
|
527
|
+
return response
|
|
502
528
|
|
|
503
529
|
@app.get(path='/agent', response_model=OcrdAgentListModel)
|
|
504
530
|
async def agents():
|
|
505
|
-
|
|
531
|
+
response = OcrdAgentListModel.create(workspace.mets.agents)
|
|
532
|
+
self.log.info(f"GET /agent -> {response.__dict__}")
|
|
533
|
+
return response
|
|
506
534
|
|
|
507
535
|
@app.post(path='/agent', response_model=OcrdAgentModel)
|
|
508
536
|
async def add_agent(agent: OcrdAgentModel):
|
|
509
537
|
kwargs = agent.dict()
|
|
510
538
|
kwargs['_type'] = kwargs.pop('type')
|
|
511
539
|
workspace.mets.add_agent(**kwargs)
|
|
512
|
-
|
|
540
|
+
response = agent
|
|
541
|
+
self.log.info(f"POST /agent -> {response.__dict__}")
|
|
542
|
+
return response
|
|
513
543
|
|
|
514
544
|
@app.get(path="/file", response_model=OcrdFileListModel)
|
|
515
545
|
async def find_files(
|
|
@@ -526,7 +556,9 @@ class OcrdMetsServer:
|
|
|
526
556
|
found = workspace.mets.find_all_files(
|
|
527
557
|
fileGrp=file_grp, ID=file_id, pageId=page_id, mimetype=mimetype, local_filename=local_filename, url=url
|
|
528
558
|
)
|
|
529
|
-
|
|
559
|
+
response = OcrdFileListModel.create(found)
|
|
560
|
+
self.log.info(f"GET /file -> {response.__dict__}")
|
|
561
|
+
return response
|
|
530
562
|
|
|
531
563
|
@app.post(path='/file', response_model=OcrdFileModel)
|
|
532
564
|
async def add_file(
|
|
@@ -549,7 +581,9 @@ class OcrdMetsServer:
|
|
|
549
581
|
# Add to workspace
|
|
550
582
|
kwargs = file_resource.dict()
|
|
551
583
|
workspace.add_file(**kwargs, force=force)
|
|
552
|
-
|
|
584
|
+
response = file_resource
|
|
585
|
+
self.log.info(f"POST /file -> {response.__dict__}")
|
|
586
|
+
return response
|
|
553
587
|
|
|
554
588
|
# ------------- #
|
|
555
589
|
|
|
@@ -557,9 +591,6 @@ class OcrdMetsServer:
|
|
|
557
591
|
# Create socket and change to world-readable and -writable to avoid permission errors
|
|
558
592
|
self.log.debug(f"chmod 0o677 {self.url}")
|
|
559
593
|
server = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
|
|
560
|
-
if Path(self.url).exists() and not is_socket_in_use(self.url):
|
|
561
|
-
# remove leftover unused socket which blocks startup
|
|
562
|
-
Path(self.url).unlink()
|
|
563
594
|
server.bind(self.url) # creates the socket file
|
|
564
595
|
atexit.register(self.shutdown)
|
|
565
596
|
server.close()
|
|
@@ -571,16 +602,5 @@ class OcrdMetsServer:
|
|
|
571
602
|
uvicorn_kwargs['log_config'] = None
|
|
572
603
|
uvicorn_kwargs['access_log'] = False
|
|
573
604
|
|
|
574
|
-
self.log.
|
|
605
|
+
self.log.info("Starting the uvicorn Mets Server")
|
|
575
606
|
uvicorn.run(app, **uvicorn_kwargs)
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
def is_socket_in_use(socket_path):
|
|
579
|
-
if Path(socket_path).exists():
|
|
580
|
-
client = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
|
|
581
|
-
try:
|
|
582
|
-
client.connect(socket_path)
|
|
583
|
-
except OSError:
|
|
584
|
-
return False
|
|
585
|
-
client.close()
|
|
586
|
-
return True
|
|
@@ -130,7 +130,8 @@ class DummyExecutor:
|
|
|
130
130
|
def __init__(self, initializer=None, initargs=(), **kwargs):
|
|
131
131
|
initializer(*initargs)
|
|
132
132
|
def shutdown(self, **kwargs):
|
|
133
|
-
|
|
133
|
+
# allow gc to catch processor instance (unless cached)
|
|
134
|
+
_page_worker_set_ctxt(None, None)
|
|
134
135
|
def submit(self, fn, *args, **kwargs) -> DummyFuture:
|
|
135
136
|
return DummyFuture(fn, *args, **kwargs)
|
|
136
137
|
|
|
@@ -372,7 +373,7 @@ class Processor():
|
|
|
372
373
|
deprecated(version='3.0', reason='process() should be replaced with process_page_pcgts() or process_page_file() or process_workspace()')(getattr(self, 'process')))
|
|
373
374
|
|
|
374
375
|
def __del__(self):
|
|
375
|
-
self._base_logger.debug("shutting down")
|
|
376
|
+
self._base_logger.debug("shutting down %s in %s", repr(self), mp.current_process().name)
|
|
376
377
|
self.shutdown()
|
|
377
378
|
|
|
378
379
|
def show_help(self, subcommand=None):
|
|
@@ -505,7 +506,7 @@ class Processor():
|
|
|
505
506
|
# set up multitasking
|
|
506
507
|
max_workers = max(0, config.OCRD_MAX_PARALLEL_PAGES)
|
|
507
508
|
if self.max_workers > 0 and self.max_workers < config.OCRD_MAX_PARALLEL_PAGES:
|
|
508
|
-
self._base_logger.info("limiting number of
|
|
509
|
+
self._base_logger.info("limiting number of workers from %d to %d", max_workers, self.max_workers)
|
|
509
510
|
max_workers = self.max_workers
|
|
510
511
|
if max_workers > 1:
|
|
511
512
|
assert isinstance(workspace.mets, ClientSideOcrdMets), \
|
|
@@ -517,13 +518,10 @@ class Processor():
|
|
|
517
518
|
|
|
518
519
|
if max_workers > 1:
|
|
519
520
|
executor_cls = ProcessPoolExecutor
|
|
520
|
-
log_queue = mp.Queue()
|
|
521
|
-
# forward messages from log queue (in subprocesses) to all root handlers
|
|
522
|
-
log_listener = logging.handlers.QueueListener(log_queue, *logging.root.handlers, respect_handler_level=True)
|
|
521
|
+
log_queue = mp.get_context('fork').Queue()
|
|
523
522
|
else:
|
|
524
523
|
executor_cls = DummyExecutor
|
|
525
524
|
log_queue = None
|
|
526
|
-
log_listener = None
|
|
527
525
|
executor = executor_cls(
|
|
528
526
|
max_workers=max_workers or 1,
|
|
529
527
|
# only forking method avoids pickling
|
|
@@ -533,6 +531,8 @@ class Processor():
|
|
|
533
531
|
initargs=(self, log_queue),
|
|
534
532
|
)
|
|
535
533
|
if max_workers > 1:
|
|
534
|
+
# forward messages from log queue (in subprocesses) to all root handlers
|
|
535
|
+
log_listener = logging.handlers.QueueListener(log_queue, *logging.root.handlers, respect_handler_level=True)
|
|
536
536
|
log_listener.start()
|
|
537
537
|
try:
|
|
538
538
|
self._base_logger.debug("started executor %s with %d workers", str(executor), max_workers or 1)
|
|
@@ -542,6 +542,7 @@ class Processor():
|
|
|
542
542
|
executor.shutdown(kill_workers=True, wait=False)
|
|
543
543
|
if max_workers > 1:
|
|
544
544
|
log_listener.stop()
|
|
545
|
+
del log_listener
|
|
545
546
|
|
|
546
547
|
except NotImplementedError:
|
|
547
548
|
# fall back to deprecated method
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
{
|
|
2
|
+
"version": "1.0.0",
|
|
3
|
+
"git_url": "https://github.com/OCR-D/core",
|
|
4
|
+
"tools": {
|
|
5
|
+
"ocrd-dummy": {
|
|
6
|
+
"executable": "ocrd-dummy",
|
|
7
|
+
"description": "Bare-bones processor creates PAGE-XML and optionally copies file from input group to output group",
|
|
8
|
+
"steps": ["preprocessing/optimization"],
|
|
9
|
+
"categories": ["Image preprocessing"],
|
|
10
|
+
"input_file_grp_cardinality": 1,
|
|
11
|
+
"output_file_grp_cardinality": 1,
|
|
12
|
+
"parameters": {
|
|
13
|
+
"copy_files": {
|
|
14
|
+
"type": "boolean",
|
|
15
|
+
"default": false,
|
|
16
|
+
"description": "Whether to actually copy files (true) or just create PAGE-XML as a side effect (false)"
|
|
17
|
+
}
|
|
18
|
+
}
|
|
19
|
+
},
|
|
20
|
+
"ocrd-filter": {
|
|
21
|
+
"executable": "ocrd-filter",
|
|
22
|
+
"description": "Bare-bones processor can be dynamically configured to remove segments based on XPath queries",
|
|
23
|
+
"steps": ["recognition/post-correction"],
|
|
24
|
+
"categories": ["Quality assurance"],
|
|
25
|
+
"input_file_grp_cardinality": 1,
|
|
26
|
+
"output_file_grp_cardinality": 1,
|
|
27
|
+
"parameters": {
|
|
28
|
+
"select": {
|
|
29
|
+
"type": "string",
|
|
30
|
+
"default": "//*[ends-with(local-name(),'Region')]",
|
|
31
|
+
"description": "Which segments to select for removal. An XPath 2.0 query expression (path and optional predicates), with 'pc' as namespace prefix for PAGE-XML and our extension functions (see help text). Only selection of segment hierarchy elements is allowed (so e.g. `*` would be equivalent to `pc:NoiseRegion|pc:LineDrawingRegion|pc:AdvertRegion|pc:ImageRegion|pc:ChartRegion|pc:MusicRegion|pc:GraphicRegion|pc:UnknownRegion|pc:CustomRegion|pc:SeparatorRegion|pc:MathsRegion|pc:TextRegion|pc:MapRegion|pc:ChemRegion|pc:TableRegion|pc:TextLine|pc:Word|pc:Glyph`, but `pc:MetadataItem` or `pc:Border` or `pc:Coords` would not match).\nFor example, to remove words or glyphs with low text confidence, select '(pc:Word|pc:Glyph)[pc:TextEquiv/@conf < 0.7]'. Or low layout confidence, '*[pc:Coords/@conf < 0.7]'.\nTo remove high pixel-to-character rate, select '*[pc:pixelarea(.) div string-length(pc:textequiv(.)) > 10000]'."
|
|
32
|
+
},
|
|
33
|
+
"plot": {
|
|
34
|
+
"type": "boolean",
|
|
35
|
+
"default": false,
|
|
36
|
+
"description": "Whether to extract an image for each filtered segment and write to the output fileGrp."
|
|
37
|
+
}
|
|
38
|
+
}
|
|
39
|
+
}
|
|
40
|
+
}
|
|
41
|
+
}
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
# pylint: disable=missing-module-docstring,invalid-name
|
|
2
|
+
from typing import Optional
|
|
3
|
+
|
|
4
|
+
from lxml import etree
|
|
5
|
+
import click
|
|
6
|
+
|
|
7
|
+
from ocrd import Processor, OcrdPageResult, OcrdPageResultImage
|
|
8
|
+
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
|
|
9
|
+
from ocrd_models import OcrdPage
|
|
10
|
+
|
|
11
|
+
_SEGTYPES = [
|
|
12
|
+
"NoiseRegion",
|
|
13
|
+
"LineDrawingRegion",
|
|
14
|
+
"AdvertRegion",
|
|
15
|
+
"ImageRegion",
|
|
16
|
+
"ChartRegion",
|
|
17
|
+
"MusicRegion",
|
|
18
|
+
"GraphicRegion",
|
|
19
|
+
"UnknownRegion",
|
|
20
|
+
"CustomRegion",
|
|
21
|
+
"SeparatorRegion",
|
|
22
|
+
"MathsRegion",
|
|
23
|
+
"TextRegion",
|
|
24
|
+
"MapRegion",
|
|
25
|
+
"ChemRegion",
|
|
26
|
+
"TableRegion",
|
|
27
|
+
"TextLine",
|
|
28
|
+
"Word",
|
|
29
|
+
"Glyph"
|
|
30
|
+
]
|
|
31
|
+
|
|
32
|
+
class FilterProcessor(Processor):
|
|
33
|
+
def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult:
|
|
34
|
+
"""
|
|
35
|
+
Remove PAGE segment hierarchy elements based on flexible selection criteria.
|
|
36
|
+
|
|
37
|
+
Open and deserialise PAGE input file, then iterate over the segment hierarchy
|
|
38
|
+
down to the level required for ``select`` (which could be multiple levels at once).
|
|
39
|
+
|
|
40
|
+
Remove any segments matching XPath query ``select`` from that hierarchy (and from
|
|
41
|
+
the `ReadingOrder` if it is a region type).
|
|
42
|
+
|
|
43
|
+
\b
|
|
44
|
+
Besides full XPath 2.0 syntax, this supports extra predicates:
|
|
45
|
+
- `pc:pixelarea()` for the number of pixels of the bounding box (or sum area on node sets),
|
|
46
|
+
- `pc:textequiv()` for the first TextEquiv unicode string (or concatenated string on node sets).
|
|
47
|
+
|
|
48
|
+
If ``plot`` is `true`, then extract and write an image file for all removed segments
|
|
49
|
+
to the output fileGrp (without reference to the PAGE).
|
|
50
|
+
|
|
51
|
+
Produce a new PAGE output file by serialising the resulting hierarchy.
|
|
52
|
+
"""
|
|
53
|
+
pcgts = input_pcgts[0]
|
|
54
|
+
result = OcrdPageResult(pcgts)
|
|
55
|
+
nodes = pcgts.xpath(self.parameter['select'])
|
|
56
|
+
# get PAGE objects from matching etree nodes
|
|
57
|
+
# but allow only hierarchy segments
|
|
58
|
+
segments = [segment for segment in map(pcgts.revmap.get, nodes)
|
|
59
|
+
if segment.__class__.__name__.replace('Type', '') in _SEGTYPES]
|
|
60
|
+
if not(len(segments)):
|
|
61
|
+
self.logger.info("no matches")
|
|
62
|
+
return result
|
|
63
|
+
rodict = pcgts.get_Page().get_ReadingOrderGroups()
|
|
64
|
+
if self.parameter['plot']:
|
|
65
|
+
page_image, page_coords, _ = self.workspace.image_from_page(pcgts.get_Page(), page_id)
|
|
66
|
+
for segment in segments:
|
|
67
|
+
segtype = segment.original_tagname_
|
|
68
|
+
self.logger.info("matched %s segment %s", segtype, segment.id)
|
|
69
|
+
parent = segment.parent_object_
|
|
70
|
+
partype = parent.__class__.__name__.replace('Type', '')
|
|
71
|
+
if partype == 'Page':
|
|
72
|
+
getattr(parent, 'get_' + segtype)().remove(segment)
|
|
73
|
+
elif partype.endswith('Region'):
|
|
74
|
+
if segtype.endswith('Region'):
|
|
75
|
+
getattr(parent, 'get_' + segtype)().remove(segment)
|
|
76
|
+
else:
|
|
77
|
+
parent.TextLine.remove(segment)
|
|
78
|
+
elif partype == 'TextLine':
|
|
79
|
+
parent.Word.remove(segment)
|
|
80
|
+
elif partype == 'Word':
|
|
81
|
+
parent.Glyph.remove(segment)
|
|
82
|
+
else:
|
|
83
|
+
raise Exception(f"unexpected type ({partype}) of parent for matched segment ({segtype})")
|
|
84
|
+
segment.parent_object_ = None
|
|
85
|
+
if segtype.endswith('Region') and segment.id in rodict:
|
|
86
|
+
# remove from ReadingOrder as well
|
|
87
|
+
roelem = rodict[segment.id]
|
|
88
|
+
rorefs = getattr(roelem.parent_object_, roelem.__class__.__name__.replace('Type', ''))
|
|
89
|
+
rorefs.remove(roelem)
|
|
90
|
+
roelem.parent_object_ = None
|
|
91
|
+
del rodict[segment.id]
|
|
92
|
+
if self.parameter['plot']:
|
|
93
|
+
segment_image, _ = self.workspace.image_from_segment(segment, page_image, page_coords)
|
|
94
|
+
result.images.append(OcrdPageResultImage(segment_image, segment.id + '.IMG', None))
|
|
95
|
+
return result
|
|
96
|
+
|
|
97
|
+
@property
|
|
98
|
+
def metadata_filename(self):
|
|
99
|
+
return 'processor/builtin/dummy/ocrd-tool.json'
|
|
100
|
+
|
|
101
|
+
@property
|
|
102
|
+
def executable(self):
|
|
103
|
+
return 'ocrd-filter'
|
|
104
|
+
|
|
105
|
+
@click.command()
|
|
106
|
+
@ocrd_cli_options
|
|
107
|
+
def cli(*args, **kwargs):
|
|
108
|
+
return ocrd_cli_wrap_processor(FilterProcessor, *args, **kwargs)
|
|
@@ -23,6 +23,10 @@ yaml.constructor.SafeConstructor.yaml_constructors['tag:yaml.org,2002:timestamp'
|
|
|
23
23
|
|
|
24
24
|
# pylint: enable=wrong-import-position
|
|
25
25
|
|
|
26
|
+
# pylint: enable=wrong-import-position
|
|
27
|
+
|
|
28
|
+
# pylint: enable=wrong-import-position
|
|
29
|
+
|
|
26
30
|
from ocrd_validators import OcrdResourceListValidator
|
|
27
31
|
from ocrd_utils import getLogger, directory_size, get_moduledir, guess_media_type, config
|
|
28
32
|
from ocrd_utils.os import get_processor_resource_types, list_all_resources, pushd_popd, get_ocrd_tool_json
|
|
@@ -798,7 +798,6 @@ class Workspace():
|
|
|
798
798
|
raise Exception('Found no AlternativeImage that satisfies all requirements ' +
|
|
799
799
|
'filter="%s" in page "%s"' % (
|
|
800
800
|
feature_filter, page_id))
|
|
801
|
-
page_image.format = 'PNG' # workaround for tesserocr#194
|
|
802
801
|
# ensure DPI will be set in image meta-data again
|
|
803
802
|
if 'DPI' in page_coords:
|
|
804
803
|
dpi = page_coords['DPI']
|
|
@@ -1060,7 +1059,6 @@ class Workspace():
|
|
|
1060
1059
|
raise Exception('Found no AlternativeImage that satisfies all requirements ' +
|
|
1061
1060
|
'filter="%s" in segment "%s"' % (
|
|
1062
1061
|
feature_filter, segment.id))
|
|
1063
|
-
segment_image.format = 'PNG' # workaround for tesserocr#194
|
|
1064
1062
|
# ensure DPI will be set in image meta-data again
|
|
1065
1063
|
if 'DPI' in segment_coords:
|
|
1066
1064
|
dpi = segment_coords['DPI']
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
2
|
Name: ocrd
|
|
3
|
-
Version: 3.0.
|
|
3
|
+
Version: 3.0.2
|
|
4
4
|
Summary: OCR-D framework
|
|
5
5
|
Author-email: Konstantin Baierer <unixprog@gmail.com>
|
|
6
6
|
License: Apache License 2.0
|
|
@@ -17,6 +17,7 @@ Requires-Dist: click>=7
|
|
|
17
17
|
Requires-Dist: cryptography<43.0.0
|
|
18
18
|
Requires-Dist: Deprecated==1.2.0
|
|
19
19
|
Requires-Dist: docker
|
|
20
|
+
Requires-Dist: elementpath
|
|
20
21
|
Requires-Dist: fastapi>=0.78.0
|
|
21
22
|
Requires-Dist: filetype
|
|
22
23
|
Requires-Dist: Flask
|
|
@@ -46,11 +46,11 @@ src/ocrd/decorators/ocrd_cli_options.py
|
|
|
46
46
|
src/ocrd/decorators/parameter_option.py
|
|
47
47
|
src/ocrd/processor/__init__.py
|
|
48
48
|
src/ocrd/processor/base.py
|
|
49
|
-
src/ocrd/processor/concurrent.py
|
|
50
49
|
src/ocrd/processor/helpers.py
|
|
51
50
|
src/ocrd/processor/ocrd_page_result.py
|
|
52
51
|
src/ocrd/processor/builtin/__init__.py
|
|
53
52
|
src/ocrd/processor/builtin/dummy_processor.py
|
|
53
|
+
src/ocrd/processor/builtin/filter_processor.py
|
|
54
54
|
src/ocrd/processor/builtin/dummy/__init__.py
|
|
55
55
|
src/ocrd/processor/builtin/dummy/ocrd-tool.json
|
|
56
56
|
src/ocrd_modelfactory/__init__.py
|
|
@@ -66,6 +66,7 @@ src/ocrd_models/ocrd_page_generateds.py
|
|
|
66
66
|
src/ocrd_models/ocrd_xml_base.py
|
|
67
67
|
src/ocrd_models/report.py
|
|
68
68
|
src/ocrd_models/utils.py
|
|
69
|
+
src/ocrd_models/xpath_functions.py
|
|
69
70
|
src/ocrd_network/__init__.py
|
|
70
71
|
src/ocrd_network/client.py
|
|
71
72
|
src/ocrd_network/client_utils.py
|
|
@@ -101,5 +101,11 @@ def page_from_file(input_file, **kwargs) -> OcrdPage:
|
|
|
101
101
|
if input_file.mimetype.startswith('image'):
|
|
102
102
|
return page_from_image(input_file)
|
|
103
103
|
if input_file.mimetype == MIMETYPE_PAGE:
|
|
104
|
-
|
|
104
|
+
revmap = {}
|
|
105
|
+
# the old/default gds.reverse_node_mapping is useless
|
|
106
|
+
# since 2.39.4, we can actually get the exact reverse mapping for perfect round-trip
|
|
107
|
+
# but awkwardly, we have to pass the dict in for that
|
|
108
|
+
page = OcrdPage(*parseEtree(input_file.local_filename, reverse_mapping=revmap, silence=True))
|
|
109
|
+
page.revmap = revmap
|
|
110
|
+
return page
|
|
105
111
|
raise ValueError("Unsupported mimetype '%s'" % input_file.mimetype)
|