ocrd 3.5.0__py3-none-any.whl → 3.6.0__py3-none-any.whl
This diff shows the content changes between two package versions that were publicly released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- ocrd/cli/__init__.py +6 -2
- ocrd/cli/bashlib.py +7 -2
- ocrd/cli/log.py +7 -2
- ocrd/cli/network.py +0 -2
- ocrd/cli/ocrd_tool.py +26 -4
- ocrd/cli/process.py +1 -0
- ocrd/cli/resmgr.py +0 -1
- ocrd/cli/validate.py +32 -13
- ocrd/cli/workspace.py +125 -52
- ocrd/cli/zip.py +13 -4
- ocrd/decorators/__init__.py +28 -52
- ocrd/decorators/loglevel_option.py +4 -0
- ocrd/decorators/mets_find_options.py +2 -1
- ocrd/decorators/ocrd_cli_options.py +3 -7
- ocrd/decorators/parameter_option.py +12 -11
- ocrd/lib.bash +6 -13
- ocrd/mets_server.py +6 -10
- ocrd/processor/base.py +88 -71
- ocrd/processor/builtin/dummy_processor.py +7 -4
- ocrd/processor/builtin/filter_processor.py +3 -2
- ocrd/processor/helpers.py +5 -6
- ocrd/processor/ocrd_page_result.py +7 -5
- ocrd/resolver.py +42 -32
- ocrd/task_sequence.py +11 -4
- ocrd/workspace.py +64 -54
- ocrd/workspace_backup.py +3 -0
- ocrd/workspace_bagger.py +15 -8
- {ocrd-3.5.0.dist-info → ocrd-3.6.0.dist-info}/METADATA +3 -2
- ocrd-3.6.0.dist-info/RECORD +125 -0
- ocrd_modelfactory/__init__.py +4 -2
- ocrd_models/constants.py +18 -1
- ocrd_models/ocrd_agent.py +1 -1
- ocrd_models/ocrd_exif.py +7 -3
- ocrd_models/ocrd_file.py +24 -19
- ocrd_models/ocrd_mets.py +90 -67
- ocrd_models/ocrd_page.py +17 -13
- ocrd_models/ocrd_xml_base.py +1 -0
- ocrd_models/report.py +2 -1
- ocrd_models/utils.py +4 -3
- ocrd_models/xpath_functions.py +3 -1
- ocrd_network/__init__.py +1 -2
- ocrd_network/cli/__init__.py +0 -2
- ocrd_network/cli/client.py +122 -50
- ocrd_network/cli/processing_server.py +1 -2
- ocrd_network/client.py +2 -2
- ocrd_network/client_utils.py +30 -13
- ocrd_network/constants.py +1 -6
- ocrd_network/database.py +3 -3
- ocrd_network/logging_utils.py +2 -7
- ocrd_network/models/__init__.py +0 -2
- ocrd_network/models/job.py +2 -5
- ocrd_network/models/workspace.py +1 -1
- ocrd_network/process_helpers.py +54 -17
- ocrd_network/processing_server.py +63 -114
- ocrd_network/processing_worker.py +6 -5
- ocrd_network/rabbitmq_utils/__init__.py +2 -0
- ocrd_network/rabbitmq_utils/helpers.py +24 -7
- ocrd_network/runtime_data/__init__.py +1 -2
- ocrd_network/runtime_data/deployer.py +12 -85
- ocrd_network/runtime_data/hosts.py +61 -130
- ocrd_network/runtime_data/network_agents.py +7 -31
- ocrd_network/runtime_data/network_services.py +1 -1
- ocrd_network/server_cache.py +1 -1
- ocrd_network/server_utils.py +13 -52
- ocrd_network/utils.py +1 -0
- ocrd_utils/__init__.py +4 -4
- ocrd_utils/config.py +86 -76
- ocrd_utils/deprecate.py +3 -0
- ocrd_utils/image.py +51 -23
- ocrd_utils/introspect.py +8 -3
- ocrd_utils/logging.py +12 -7
- ocrd_utils/os.py +16 -3
- ocrd_utils/str.py +32 -16
- ocrd_validators/json_validator.py +4 -1
- ocrd_validators/ocrd_tool_validator.py +2 -1
- ocrd_validators/ocrd_zip_validator.py +5 -4
- ocrd_validators/page_validator.py +21 -9
- ocrd_validators/parameter_validator.py +3 -2
- ocrd_validators/processing_server_config.schema.yml +1 -33
- ocrd_validators/resource_list_validator.py +3 -1
- ocrd_validators/workspace_validator.py +30 -20
- ocrd_validators/xsd_mets_validator.py +2 -1
- ocrd_validators/xsd_page_validator.py +2 -1
- ocrd_validators/xsd_validator.py +4 -2
- ocrd-3.5.0.dist-info/RECORD +0 -128
- ocrd_network/cli/processor_server.py +0 -31
- ocrd_network/models/ocrd_tool.py +0 -12
- ocrd_network/processor_server.py +0 -255
- {ocrd-3.5.0.dist-info → ocrd-3.6.0.dist-info}/LICENSE +0 -0
- {ocrd-3.5.0.dist-info → ocrd-3.6.0.dist-info}/WHEEL +0 -0
- {ocrd-3.5.0.dist-info → ocrd-3.6.0.dist-info}/entry_points.txt +0 -0
- {ocrd-3.5.0.dist-info → ocrd-3.6.0.dist-info}/top_level.txt +0 -0
ocrd_network/runtime_data/deployer.py
CHANGED
@@ -9,11 +9,10 @@ Each Processing Worker is an instance of an OCR-D processor.
 from __future__ import annotations
 from pathlib import Path
 import psutil
-from
-from typing import Dict, List, Union
+from typing import Dict, List

 from ocrd import OcrdMetsServer
-from ocrd_utils import
+from ocrd_utils import getLogger
 from ..logging_utils import get_mets_server_logging_file_path
 from ..utils import get_uds_path, is_mets_server_running, stop_mets_server
 from .config_parser import parse_hosts_data, parse_mongodb_data, parse_rabbitmq_data, validate_and_load_config
@@ -34,89 +33,15 @@ class Deployer:
         self.mets_servers_paths: Dict = {}  # {"ws_dir_path": "mets_server_url"}
         self.use_tcp_mets = ps_config.get("use_tcp_mets", False)

-
-
-        self, worker_only: bool = False, server_only: bool = False, docker_only: bool = False,
-        native_only: bool = False, str_names_only: bool = False, unique_only: bool = False, sort: bool = False
-    ) -> Union[List[str], List[object]]:
-        """Finds and returns a list of matching data objects of type:
-        `DataProcessingWorker` and `DataProcessorServer`.
-
-        :py:attr:`worker_only` match only worker network agents (DataProcessingWorker)
-        :py:attr:`server_only` match only server network agents (DataProcessorServer)
-        :py:attr:`docker_only` match only docker network agents (DataProcessingWorker and DataProcessorServer)
-        :py:attr:`native_only` match only native network agents (DataProcessingWorker and DataProcessorServer)
-        :py:attr:`str_names_only` returns the processor_name filed instead of the Data* object
-        :py:attr:`unique_only` remove duplicate names from the matches
-        :py:attr:`sort` sort the result
-
-        `worker_only` and `server_only` are mutually exclusive to each other
-        `docker_only` and `native_only` are mutually exclusive to each other
-        `unique_only` is allowed only together with `str_names_only`
-        """
-
-        if worker_only and server_only:
-            msg = f"Only 'worker_only' or 'server_only' is allowed, not both."
-            self.log.exception(msg)
-            raise ValueError(msg)
-        if docker_only and native_only:
-            msg = f"Only 'docker_only' or 'native_only' is allowed, not both."
-            self.log.exception(msg)
-            raise ValueError(msg)
-        if not str_names_only and unique_only:
-            msg = f"Value 'unique_only' is allowed only together with 'str_names_only'"
-            self.log.exception(msg)
-            raise ValueError(msg)
-        if sort and not str_names_only:
-            msg = f"Value 'sort' is allowed only together with 'str_names_only'"
-            self.log.exception(msg)
-            raise ValueError(msg)
-
-        # Find all matching objects of type DataProcessingWorker or DataProcessorServer
-        matched_objects = []
-        for data_host in self.data_hosts:
-            if not server_only:
-                if not docker_only:
-                    for data_worker in data_host.network_agents_worker_native:
-                        matched_objects.append(data_worker)
-                if not native_only:
-                    for data_worker in data_host.network_agents_worker_docker:
-                        matched_objects.append(data_worker)
-            if not worker_only:
-                if not docker_only:
-                    for data_server in data_host.network_agents_server_native:
-                        matched_objects.append(data_server)
-                if not native_only:
-                    for data_server in data_host.network_agents_server_docker:
-                        matched_objects.append(data_server)
-        if not str_names_only:
-            return matched_objects
-        # Gets only the processor names of the matched objects
-        matched_names = [match.processor_name for match in matched_objects]
-        if not unique_only:
-            return matched_names
-        list_matched = list(dict.fromkeys(matched_names))
-        if not sort:
-            # Removes any duplicate entries from matched names
-            return list_matched
-        list_matched.sort()
-        return list_matched
-
-    def resolve_processor_server_url(self, processor_name) -> str:
-        processor_server_url = ''
-        for data_host in self.data_hosts:
-            processor_server_url = data_host.resolve_processor_server_url(processor_name=processor_name)
-        return processor_server_url
-
-    def deploy_network_agents(self, mongodb_url: str, rabbitmq_url: str) -> None:
-        self.log.debug("Deploying processing workers/processor servers...")
+    def deploy_workers(self, mongodb_url: str, rabbitmq_url: str) -> None:
+        self.log.debug("Deploying processing workers...")
         for host_data in self.data_hosts:
-            host_data.
+            host_data.deploy_workers(logger=self.log, mongodb_url=mongodb_url, rabbitmq_url=rabbitmq_url)

-    def
-        self.log.debug("Stopping processing workers
+    def stop_workers(self) -> None:
+        self.log.debug("Stopping processing workers...")
         for host_data in self.data_hosts:
-            host_data.
+            host_data.stop_workers(logger=self.log)

     def deploy_rabbitmq(self) -> str:
         self.data_queue.deploy_rabbitmq(self.log)
@@ -138,7 +63,7 @@ class Deployer:
         If RabbitMQ server is stopped before stopping Processing Workers that may have
         a bad outcome and leave Processing Workers in an unpredictable state.
         """
-        self.
+        self.stop_workers()
         self.stop_mongodb()
         self.stop_rabbitmq()

@@ -154,7 +79,9 @@ class Deployer:
                           "Removing to avoid any weird behavior before starting the server.")
             Path(mets_server_url).unlink()
         self.log.info(f"Starting UDS mets server: {mets_server_url}")
-        pid = OcrdMetsServer.create_process(mets_server_url=str(mets_server_url),
+        pid = OcrdMetsServer.create_process(mets_server_url=str(mets_server_url),
+                                            ws_dir_path=str(ws_dir_path),
+                                            log_file=str(log_file))
         self.mets_servers[str(mets_server_url)] = pid
         self.mets_servers_paths[str(ws_dir_path)] = str(mets_server_url)
         return mets_server_url
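The hunks above strip the Deployer down to a worker-only life cycle: the agent-matching helper, resolve_processor_server_url, and deploy_network_agents/stop_network_agents are gone, leaving deploy_workers and stop_workers. A minimal usage sketch under the 3.6.0 method names shown above; the constructor argument, deploy_mongodb and stop_all are not part of this hunk and are only assumed here:

    from ocrd_network.runtime_data import Deployer

    deployer = Deployer("processing_server_config.yml")   # hypothetical config path
    rabbitmq_url = deployer.deploy_rabbitmq()              # queue first
    mongodb_url = deployer.deploy_mongodb()                # then the database (assumed helper)
    # only Processing Workers are deployed now; there is no Processor Server branch anymore
    deployer.deploy_workers(mongodb_url=mongodb_url, rabbitmq_url=rabbitmq_url)
    # ... serve requests ...
    deployer.stop_all()  # per the hunk above, workers are stopped before MongoDB and RabbitMQ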
ocrd_network/runtime_data/hosts.py
CHANGED
@@ -1,9 +1,9 @@
 from logging import Logger
 from time import sleep
-from typing import Dict, List
+from typing import Dict, List

 from .connection_clients import create_docker_client, create_ssh_client
-from .network_agents import
+from .network_agents import DataProcessingWorker, DeployType


 class DataHost:
@@ -24,68 +24,39 @@ class DataHost:
         self.ssh_client = None
         self.docker_client = None

-        # Time to wait between deploying
-        self.
+        # Time to wait between deploying single workers
+        self.wait_between_deploys: float = 0.3

-        # Lists of
-        self.
-        self.
-        self.network_agents_server_native = []
-        self.network_agents_server_docker = []
+        # Lists of Processing Workers based on their deployment type
+        self.workers_native = []
+        self.workers_docker = []

         if not workers:
             workers = []
         if not servers:
             servers = []

-        self.
-        self.__parse_network_agents_servers(processor_servers=servers)
+        self.__parse_workers(processing_workers=workers)

-
-
-
+    def __append_workers_to_lists(self, worker_data: DataProcessingWorker) -> None:
+        if worker_data.deploy_type != DeployType.DOCKER and worker_data.deploy_type != DeployType.NATIVE:
+            raise ValueError(f"Processing Worker deploy type is unknown: {worker_data.deploy_type}")

-
-        if processor_name not in self.processor_servers_ports:
-            self.processor_servers_ports[processor_name] = [port]
-            return
-        self.processor_servers_ports[processor_name] = self.processor_servers_ports[processor_name].append(port)
-
-    def __append_network_agent_to_lists(self, agent_data: DataNetworkAgent) -> None:
-        if agent_data.deploy_type != DeployType.DOCKER and agent_data.deploy_type != DeployType.NATIVE:
-            raise ValueError(f"Network agent deploy type is unknown: {agent_data.deploy_type}")
-        if agent_data.agent_type != AgentType.PROCESSING_WORKER and agent_data.agent_type != AgentType.PROCESSOR_SERVER:
-            raise ValueError(f"Network agent type is unknown: {agent_data.agent_type}")
-
-        if agent_data.deploy_type == DeployType.NATIVE:
+        if worker_data.deploy_type == DeployType.NATIVE:
             self.needs_ssh_connector = True
-
-
-            if agent_data.agent_type == AgentType.PROCESSOR_SERVER:
-                self.network_agents_server_native.append(agent_data)
-        if agent_data.deploy_type == DeployType.DOCKER:
+            self.workers_native.append(worker_data)
+        if worker_data.deploy_type == DeployType.DOCKER:
             self.needs_docker_connector = True
-
-            self.network_agents_worker_docker.append(agent_data)
-            if agent_data.agent_type == AgentType.PROCESSOR_SERVER:
-                self.network_agents_server_docker.append(agent_data)
-
-    def __parse_network_agents_servers(self, processor_servers: List[Dict]):
-        for server in processor_servers:
-            server_data = DataProcessorServer(
-                processor_name=server["name"], deploy_type=server["deploy_type"], host=self.host,
-                port=int(server["port"]), init_by_config=True, pid=None
-            )
-            self.__append_network_agent_to_lists(agent_data=server_data)
+            self.workers_docker.append(worker_data)

-    def
+    def __parse_workers(self, processing_workers: List[Dict]):
         for worker in processing_workers:
             worker_data = DataProcessingWorker(
-                processor_name=worker["name"], deploy_type=worker
-                init_by_config=True, pid=None
+                processor_name=worker["name"], deploy_type=worker.get("deploy_type", "native"),
+                host=self.host, init_by_config=True, pid=None
             )
             for _ in range(int(worker["number_of_instance"])):
-                self.
+                self.__append_workers_to_lists(worker_data=worker_data)

     def create_connection_client(self, client_type: str):
         if client_type not in ["docker", "ssh"]:
@@ -97,62 +68,46 @@ class DataHost:
         self.docker_client = create_docker_client(self.host, self.username, self.password, self.keypath)
         return self.docker_client

-    def
-        self, logger: Logger,
+    def __deploy_single_worker(
+        self, logger: Logger, worker_data: DataProcessingWorker,
         mongodb_url: str, rabbitmq_url: str
     ) -> None:
-        deploy_type =
-
-
-
-        logger.info(f"Deploying {agent_info}")
+        deploy_type = worker_data.deploy_type
+        name = worker_data.processor_name
+        worker_info = f"Processing Worker, deploy: {deploy_type}, name: {name}, host: {self.host}"
+        logger.info(f"Deploying {worker_info}")

         connection_client = None
         if deploy_type == DeployType.NATIVE:
-            assert self.ssh_client,
+            assert self.ssh_client, "SSH client connection missing."
             connection_client = self.ssh_client
         if deploy_type == DeployType.DOCKER:
-            assert self.docker_client,
+            assert self.docker_client, "Docker client connection missing."
             connection_client = self.docker_client

-
-
-        if agent_type == AgentType.PROCESSOR_SERVER:
-            agent_data.deploy_network_agent(logger, connection_client, mongodb_url)
+        worker_data.deploy_network_agent(logger, connection_client, mongodb_url, rabbitmq_url)
+        sleep(self.wait_between_deploys)

-
-
-    def __deploy_network_agents_workers(self, logger: Logger, mongodb_url: str, rabbitmq_url: str):
+    def __deploy_all_workers(self, logger: Logger, mongodb_url: str, rabbitmq_url: str):
         logger.info(f"Deploying processing workers on host: {self.host}")
-        amount_workers = len(self.
+        amount_workers = len(self.workers_native) + len(self.workers_docker)
         if not amount_workers:
-            logger.info(
-        for data_worker in self.
-            self.
-        for data_worker in self.
-            self.
-
-    def
-        logger.info(f"Deploying processor servers on host: {self.host}")
-        amount_servers = len(self.network_agents_server_native) + len(self.network_agents_server_docker)
-        if not amount_servers:
-            logger.info(f"No processor servers found to be deployed")
-        for data_server in self.network_agents_server_native:
-            self.__deploy_network_agent(logger, data_server, mongodb_url, rabbitmq_url)
-            self.__add_deployed_agent_server_port_to_cache(data_server.processor_name, data_server.port)
-        for data_server in self.network_agents_server_docker:
-            self.__deploy_network_agent(logger, data_server, mongodb_url, rabbitmq_url)
-            self.__add_deployed_agent_server_port_to_cache(data_server.processor_name, data_server.port)
-
-    def deploy_network_agents(self, logger: Logger, mongodb_url: str, rabbitmq_url: str) -> None:
+            logger.info("No processing workers found to be deployed")
+        for data_worker in self.workers_native:
+            self.__deploy_single_worker(logger, data_worker, mongodb_url, rabbitmq_url)
+        for data_worker in self.workers_docker:
+            self.__deploy_single_worker(logger, data_worker, mongodb_url, rabbitmq_url)
+
+    def deploy_workers(self, logger: Logger, mongodb_url: str, rabbitmq_url: str) -> None:
         if self.needs_ssh_connector and not self.ssh_client:
             logger.debug("Creating missing ssh connector before deploying")
             self.ssh_client = self.create_connection_client(client_type="ssh")
         if self.needs_docker_connector:
             logger.debug("Creating missing docker connector before deploying")
             self.docker_client = self.create_connection_client(client_type="docker")
-
-        self.
+
+        self.__deploy_all_workers(logger=logger, mongodb_url=mongodb_url, rabbitmq_url=rabbitmq_url)
+
         if self.ssh_client:
             self.ssh_client.close()
             self.ssh_client = None
@@ -160,66 +115,42 @@ class DataHost:
             self.docker_client.close()
             self.docker_client = None

-    def
-
+    def __stop_worker(self, logger: Logger, name: str, deploy_type: DeployType, pid: str):
+        worker_info = f"Processing Worker: deploy: {deploy_type}, name: {name}"
         if not pid:
-            logger.warning(f"No pid was passed for {
+            logger.warning(f"No pid was passed for {worker_info}")
             return
-
-        logger.info(f"Stopping {
+        worker_info += f", pid: {pid}"
+        logger.info(f"Stopping {worker_info}")
         if deploy_type == DeployType.NATIVE:
-            assert self.ssh_client,
+            assert self.ssh_client, "SSH client connection missing"
             self.ssh_client.exec_command(f"kill {pid}")
         if deploy_type == DeployType.DOCKER:
-            assert self.docker_client,
+            assert self.docker_client, "Docker client connection missing"
             self.docker_client.containers.get(pid).stop()

-    def
-        logger.info(f"Stopping processing workers on host: {self.host}")
-        amount_workers = len(self.network_agents_worker_native) + len(self.network_agents_worker_docker)
-        if not amount_workers:
-            logger.warning(f"No active processing workers to be stopped.")
-        for worker in self.network_agents_worker_native:
-            self.__stop_network_agent(logger, worker.processor_name, worker.deploy_type, worker.agent_type, worker.pid)
-        self.network_agents_worker_native = []
-        for worker in self.network_agents_worker_docker:
-            self.__stop_network_agent(logger, worker.processor_name, worker.deploy_type, worker.agent_type, worker.pid)
-        self.network_agents_worker_docker = []
-
-    def __stop_network_agents_servers(self, logger: Logger):
-        logger.info(f"Stopping processor servers on host: {self.host}")
-        amount_servers = len(self.network_agents_server_native) + len(self.network_agents_server_docker)
-        if not amount_servers:
-            logger.warning(f"No active processor servers to be stopped.")
-        for server in self.network_agents_server_native:
-            self.__stop_network_agent(logger, server.processor_name, server.deploy_type, server.agent_type, server.pid)
-        self.network_agents_server_native = []
-        for server in self.network_agents_server_docker:
-            self.__stop_network_agent(logger, server.processor_name, server.deploy_type, server.agent_type, server.pid)
-        self.network_agents_server_docker = []
-
-    def stop_network_agents(self, logger: Logger):
+    def stop_workers(self, logger: Logger):
         if self.needs_ssh_connector and not self.ssh_client:
             logger.debug("Creating missing ssh connector before stopping")
             self.ssh_client = self.create_connection_client(client_type="ssh")
         if self.needs_docker_connector and not self.docker_client:
             logger.debug("Creating missing docker connector before stopping")
             self.docker_client = self.create_connection_client(client_type="docker")
-
-        self.
+
+        logger.info(f"Stopping processing workers on host: {self.host}")
+        amount_workers = len(self.workers_native) + len(self.workers_docker)
+        if not amount_workers:
+            logger.warning("No active processing workers to be stopped.")
+        for worker in self.workers_native:
+            self.__stop_worker(logger, worker.processor_name, worker.deploy_type, worker.pid)
+        self.workers_native = []
+        for worker in self.workers_docker:
+            self.__stop_worker(logger, worker.processor_name, worker.deploy_type, worker.pid)
+        self.workers_docker = []
+
         if self.ssh_client:
             self.ssh_client.close()
             self.ssh_client = None
         if self.docker_client:
             self.docker_client.close()
             self.docker_client = None
-
-    def resolve_processor_server_url(self, processor_name: str) -> str:
-        processor_server_url = ''
-        for data_server in self.network_agents_server_docker:
-            if data_server.processor_name == processor_name:
-                processor_server_url = f"http://{self.host}:{data_server.port}/"
-        for data_server in self.network_agents_server_native:
-            if data_server.processor_name == processor_name:
-                processor_server_url = f"http://{self.host}:{data_server.port}/"
-        return processor_server_url
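The rewritten __parse_workers above consumes plain dictionaries from the Processing Server configuration. An illustrative shape of such entries, derived only from the keys the hunk accesses (name, deploy_type with a "native" default, number_of_instance); the processor names are just examples:

    workers = [
        {"name": "ocrd-dummy", "deploy_type": "native", "number_of_instance": 2},
        # deploy_type falls back to "native" via worker.get("deploy_type", "native")
        {"name": "ocrd-cis-ocropy-binarize", "number_of_instance": 1},
    ]
    # DataHost would then register 2 + 1 DataProcessingWorker entries, split between
    # workers_native and workers_docker according to each entry's deploy_type.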
ocrd_network/runtime_data/network_agents.py
CHANGED
@@ -2,14 +2,15 @@ from logging import Logger
 from typing import Any

 from re import search as re_search
-from ..constants import
+from ..constants import DeployType


 # TODO: Find appropriate replacement for the hack
 def deploy_agent_native_get_pid_hack(logger: Logger, ssh_client, start_cmd: str):
     channel = ssh_client.invoke_shell()
     stdin, stdout = channel.makefile("wb"), channel.makefile("rb")
-
+    # TODO: set back to debug
+    logger.info(f"Executing command: {start_cmd}")

     # TODO: This hack should still be fixed
     # Note left from @joschrew
@@ -40,14 +41,13 @@ def deploy_agent_docker_template(logger: Logger, docker_client, start_cmd: str):

 class DataNetworkAgent:
     def __init__(
-        self, processor_name: str, deploy_type: DeployType,
+        self, processor_name: str, deploy_type: DeployType,
         host: str, init_by_config: bool, pid: Any = None
     ) -> None:
         self.processor_name = processor_name
         self.deploy_type = deploy_type
         self.host = host
         self.deployed_by_config = init_by_config
-        self.agent_type = agent_type
         # The id is assigned when the agent is deployed
         self.pid = pid

@@ -69,42 +69,18 @@ class DataProcessingWorker(DataNetworkAgent):
         self, processor_name: str, deploy_type: DeployType, host: str, init_by_config: bool, pid: Any = None
     ) -> None:
         super().__init__(
-            processor_name=processor_name, host=host, deploy_type=deploy_type,
+            processor_name=processor_name, host=host, deploy_type=deploy_type,
             init_by_config=init_by_config, pid=pid
         )

     def deploy_network_agent(self, logger: Logger, connector_client, database_url: str, queue_url: str):
         if self.deploy_type == DeployType.NATIVE:
-            start_cmd = f"{self.processor_name}
+            start_cmd = f"{self.processor_name} --database {database_url} --queue {queue_url} &"
             self.pid = self._start_native_instance(logger, connector_client, start_cmd)
             return self.pid
         if self.deploy_type == DeployType.DOCKER:
             # TODO: add real command to start processing worker in docker here
-            start_cmd =
-            self.pid = self._start_docker_instance(logger, connector_client, start_cmd)
-            return self.pid
-        raise RuntimeError(f"Unknown deploy type of {self.__dict__}")
-
-
-class DataProcessorServer(DataNetworkAgent):
-    def __init__(
-        self, processor_name: str, deploy_type: DeployType, host: str, port: int, init_by_config: bool, pid: Any = None
-    ) -> None:
-        super().__init__(
-            processor_name=processor_name, host=host, deploy_type=deploy_type, agent_type=AgentType.PROCESSOR_SERVER,
-            init_by_config=init_by_config, pid=pid
-        )
-        self.port = port
-
-    def deploy_network_agent(self, logger: Logger, connector_client, database_url: str):
-        agent_address = f"{self.host}:{self.port}"
-        if self.deploy_type == DeployType.NATIVE:
-            start_cmd = f"{self.processor_name} {self.agent_type} --address {agent_address} --database {database_url} &"
-            self.pid = self._start_native_instance(logger, connector_client, start_cmd)
-            return self.pid
-        if self.deploy_type == DeployType.DOCKER:
-            # TODO: add real command to start processor server in docker here
-            start_cmd = f""
+            start_cmd = ""
             self.pid = self._start_docker_instance(logger, connector_client, start_cmd)
             return self.pid
         raise RuntimeError(f"Unknown deploy type of {self.__dict__}")
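With DataProcessorServer removed, DataProcessingWorker.deploy_network_agent is the only remaining start-command builder. A small sketch of what the native branch shown above produces; the host name, URLs and PID handling are illustrative only:

    worker = DataProcessingWorker(
        processor_name="ocrd-dummy", deploy_type=DeployType.NATIVE,
        host="ocr-host-1", init_by_config=True, pid=None)
    # deploy_network_agent(logger, ssh_client, database_url, queue_url) then runs, over SSH:
    #   ocrd-dummy --database mongodb://db-host:27018 --queue amqp://admin:admin@rmq-host:5672 &
    # and stores the resulting PID on worker.pid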
ocrd_network/runtime_data/network_services.py
CHANGED
@@ -129,7 +129,7 @@ class DataRabbitMQ(DataNetworkService):
         rmq_host, rmq_port, rmq_vhost = self.host, int(self.port), self.vhost
         rmq_user, rmq_password = self.cred_username, self.cred_password
         if self.skip_deployment:
-            logger.debug(
+            logger.debug("RabbitMQ is managed externally. Skipping deployment.")
             verify_rabbitmq_available(logger=logger, rabbitmq_address=self.service_url)
             return self.service_url
         if not env:
ocrd_network/server_cache.py
CHANGED
@@ -33,7 +33,7 @@ class CacheLockedPages:
         if not self.locked_pages.get(workspace_key, None):
             self.log.info(f"No entry found in the locked pages cache for workspace key: {workspace_key}")
             return False
-        debug_message =
+        debug_message = "Caching the received request due to locked output file grp pages."
         for file_group in output_file_grps:
             if file_group in self.locked_pages[workspace_key]:
                 if self.placeholder_all_pages in self.locked_pages[workspace_key][file_group]:
ocrd_network/server_utils.py
CHANGED
@@ -55,7 +55,7 @@ def create_processing_message(logger: Logger, job: DBProcessorJob) -> OcrdProces
         )
         return processing_message
     except ValueError as error:
-        message =
+        message = "Failed to create OcrdProcessingMessage from DBProcessorJob"
         raise_http_exception(logger, status.HTTP_422_UNPROCESSABLE_ENTITY, message, error)


@@ -124,50 +124,7 @@ async def _get_processor_job_log(logger: Logger, job_id: str) -> FileResponse:
     return FileResponse(path=log_file_path, filename=log_file_path.name)


-def
-    # Request the ocrd tool json from the Processor Server
-    try:
-        response = requests_get(
-            urljoin(base=processor_server_base_url, url="info"),
-            headers={"Content-Type": "application/json"}
-        )
-    except Exception as error:
-        message = f"Failed to retrieve ocrd tool json from: {processor_server_base_url}"
-        raise_http_exception(logger, status.HTTP_404_NOT_FOUND, message, error)
-    if response.status_code != 200:
-        message = f"Failed to retrieve tool json from: {processor_server_base_url}, code: {response.status_code}"
-        raise_http_exception(logger, status.HTTP_404_NOT_FOUND, message)
-    return response.json()
-
-async def forward_job_to_processor_server(
-    logger: Logger, job_input: PYJobInput, processor_server_base_url: str
-) -> PYJobOutput:
-    try:
-        json_data = dumps(job_input.dict(exclude_unset=True, exclude_none=True))
-    except Exception as error:
-        message = f"Failed to json dump the PYJobInput: {job_input}"
-        raise_http_exception(logger, status.HTTP_500_INTERNAL_SERVER_ERROR, message, error)
-
-    # TODO: The amount of pages should come as a request input
-    # TODO: cf https://github.com/OCR-D/core/pull/1030/files#r1152551161
-    # currently, use 200 as a default
-    request_timeout = calculate_processing_request_timeout(amount_pages=200, timeout_per_page=20.0)
-
-    # Post a processing job to the Processor Server asynchronously
-    async with AsyncClient(timeout=Timeout(timeout=request_timeout, connect=30.0)) as client:
-        response = await client.post(
-            urljoin(base=processor_server_base_url, url="run"),
-            headers={"Content-Type": "application/json"},
-            json=loads(json_data)
-        )
-    if response.status_code != 202:
-        message = f"Failed to post '{job_input.processor_name}' job to: {processor_server_base_url}"
-        raise_http_exception(logger, status.HTTP_500_INTERNAL_SERVER_ERROR, message)
-    job_output = response.json()
-    return job_output
-
-
-async def get_workflow_content(logger: Logger, workflow_id: str, workflow: Union[UploadFile, None]) -> str:
+async def get_workflow_content(logger: Logger, workflow_id: str, workflow: Union[UploadFile, str, None]) -> str:
     if not workflow and not workflow_id:
         message = "Either 'workflow' must be uploaded as a file or 'workflow_id' must be provided. Both are missing."
         raise_http_exception(logger, status.HTTP_422_UNPROCESSABLE_ENTITY, message)
@@ -178,6 +135,9 @@ async def get_workflow_content(logger: Logger, workflow_id: str, workflow: Union
     except ValueError as error:
         message = f"Workflow with id '{workflow_id}' not found"
         raise_http_exception(logger, status.HTTP_404_NOT_FOUND, message, error)
+    if isinstance(workflow, str):
+        with open(workflow) as wf_file:
+            return wf_file.read()
     return await generate_workflow_content(workflow)


@@ -193,14 +153,14 @@ def parse_workflow_tasks(logger: Logger, workflow_content: str) -> List[Processo
         tasks_list = workflow_content.splitlines()
         return [ProcessorTask.parse(task_str) for task_str in tasks_list if task_str.strip()]
     except ValueError as error:
-        message =
+        message = "Failed parsing processing tasks from a workflow."
         raise_http_exception(logger, status.HTTP_422_UNPROCESSABLE_ENTITY, message, error)


 def raise_http_exception(logger: Logger, status_code: int, message: str, error: Exception = None) -> None:
     if error:
         message = f"{message} {error}"
-    logger.exception(
+    logger.exception(message)
     raise HTTPException(status_code=status_code, detail=message)


@@ -214,7 +174,7 @@ def validate_job_input(logger: Logger, processor_name: str, ocrd_tool: dict, job
         )
         raise_http_exception(logger, status.HTTP_422_UNPROCESSABLE_ENTITY, message)
     if not ocrd_tool:
-        message =
+        message = "Failed parsing processing tasks from a workflow."
         raise_http_exception(logger, status.HTTP_404_NOT_FOUND, message)
     try:
         report = ParameterValidator(ocrd_tool).validate(dict(job_input.parameters))
@@ -249,10 +209,10 @@ def validate_first_task_input_file_groups_existence(logger: Logger, mets_path: s
         raise_http_exception(logger, status.HTTP_422_UNPROCESSABLE_ENTITY, message)


-def kill_mets_server_zombies(minutes_ago
-    if minutes_ago
+def kill_mets_server_zombies(minutes_ago: Optional[int], dry_run: Optional[bool]) -> List[int]:
+    if minutes_ago is None:
         minutes_ago = 90
-    if dry_run
+    if dry_run is None:
         dry_run = False

     now = time()
@@ -271,7 +231,8 @@ def kill_mets_server_zombies(minutes_ago : Optional[int], dry_run : Optional[boo
             if re.match(cmdline_pat, cmdline):
                 pid = int(procdir.name)
                 ret.append(pid)
-                print(f'METS Server with PID {pid} was created {ctime_ago} minutes ago, more than {minutes_ago},
+                print(f'METS Server with PID {pid} was created {ctime_ago} minutes ago, more than {minutes_ago}, '
+                      f'so killing (cmdline="{cmdline})', file=sys.stderr)
                 if dry_run:
                     print(f'[dry_run is active] kill {pid}')
                 else:
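kill_mets_server_zombies above now takes Optional arguments and normalizes None to the defaults of 90 minutes and dry_run=False. A minimal call sketch; the values are illustrative:

    from ocrd_network.server_utils import kill_mets_server_zombies

    # only report METS Server processes older than the default 90 minutes, do not kill
    stale_pids = kill_mets_server_zombies(minutes_ago=None, dry_run=True)
    # actually kill anything older than 30 minutes (dry_run=None falls back to False)
    kill_mets_server_zombies(minutes_ago=30, dry_run=None)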
ocrd_network/utils.py
CHANGED
@@ -172,5 +172,6 @@ def stop_mets_server(logger: Logger, mets_server_url: str, ws_dir_path: str) ->
     else:
         ValueError(f"Unexpected protocol type: {protocol}")

+
 def get_uds_path(ws_dir_path: str) -> Path:
     return Path(config.OCRD_NETWORK_SOCKETS_ROOT_DIR, f"{safe_filename(ws_dir_path)}.sock")
ocrd_utils/__init__.py
CHANGED
@@ -8,11 +8,11 @@ Utility functions and constants usable in various circumstances.
 levels below page (i.e. region, line, word, glyph) between relative coordinates
 w.r.t. a corresponding image and absolute coordinates w.r.t. the top-level image.
 This includes rotation and offset correction, based on affine transformations.
-(Used by :py:class:`ocrd.workspace.Workspace` methods
-:py:meth:`ocrd.workspace.Workspace.image_from_page` and
+(Used by :py:class:`ocrd.workspace.Workspace` methods
+:py:meth:`ocrd.workspace.Workspace.image_from_page` and
 :py:meth:`ocrd.workspace.Workspace.image_from_segment`.)

-* :py:func:`rotate_coordinates`,
+* :py:func:`rotate_coordinates`,
 :py:func:`scale_coordinates`,
 :py:func:`shift_coordinates`,
 :py:func:`transpose_coordinates`,
@@ -23,7 +23,7 @@ Utility functions and constants usable in various circumstances.
 used to pass down the coordinate system along with images (both invariably sharing
 the same operations context) when traversing the element hierarchy top to bottom.
 (Used by :py:class:`ocrd.workspace.Workspace` methods
-:py:meth:`ocrd.workspace.Workspace.image_from_page` and
+:py:meth:`ocrd.workspace.Workspace.image_from_page` and
 :py:meth:`ocrd.workspace.Workspace.image_from_segment`.)

 * :py:func:`rotate_image`,
|