ocrd 3.0.0b6__py3-none-any.whl → 3.0.1__py3-none-any.whl
This diff shows the contents of two publicly released versions of the package as published to their registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
- ocrd/cli/__init__.py +3 -1
- ocrd/decorators/__init__.py +3 -2
- ocrd/mets_server.py +62 -42
- ocrd/processor/base.py +25 -9
- ocrd/processor/builtin/dummy/ocrd-tool.json +20 -0
- ocrd/processor/builtin/dummy_processor.py +0 -3
- ocrd/processor/builtin/filter_processor.py +108 -0
- ocrd/resource_manager.py +4 -0
- {ocrd-3.0.0b6.dist-info → ocrd-3.0.1.dist-info}/METADATA +2 -1
- {ocrd-3.0.0b6.dist-info → ocrd-3.0.1.dist-info}/RECORD +34 -32
- {ocrd-3.0.0b6.dist-info → ocrd-3.0.1.dist-info}/entry_points.txt +1 -0
- ocrd_modelfactory/__init__.py +7 -1
- ocrd_models/ocrd_exif.py +2 -2
- ocrd_models/ocrd_page.py +22 -3
- ocrd_models/ocrd_page_generateds.py +2813 -1438
- ocrd_models/xpath_functions.py +51 -0
- ocrd_network/cli/client.py +27 -8
- ocrd_network/client.py +9 -6
- ocrd_network/client_utils.py +25 -14
- ocrd_network/processing_server.py +27 -15
- ocrd_network/processing_worker.py +7 -4
- ocrd_network/processor_server.py +2 -1
- ocrd_network/rabbitmq_utils/connector.py +2 -2
- ocrd_network/runtime_data/deployer.py +28 -18
- ocrd_network/server_cache.py +26 -23
- ocrd_network/server_utils.py +40 -4
- ocrd_network/tcp_to_uds_mets_proxy.py +8 -5
- ocrd_network/utils.py +19 -15
- ocrd_utils/config.py +38 -16
- ocrd_utils/logging.py +27 -56
- ocrd_utils/ocrd_logging.conf +14 -16
- {ocrd-3.0.0b6.dist-info → ocrd-3.0.1.dist-info}/LICENSE +0 -0
- {ocrd-3.0.0b6.dist-info → ocrd-3.0.1.dist-info}/WHEEL +0 -0
- {ocrd-3.0.0b6.dist-info → ocrd-3.0.1.dist-info}/top_level.txt +0 -0
ocrd_models/xpath_functions.py
ADDED

@@ -0,0 +1,51 @@
+from ocrd_utils import xywh_from_points
+
+pc_functions = []
+
+def _export(func):
+    pc_functions.append(func)
+    return func
+
+@_export
+def pc_pixelarea(nodes):
+    """
+    Extract Coords/@points from all nodes, calculate the bounding
+    box, and accumulate areas.
+    """
+    area = 0
+    for node in nodes:
+        # FIXME: find out why we need to go to the parent here
+        node = node.parent.value
+        coords = node.find(f'{node.prefix}:Coords', node.nsmap)
+        if coords is None:
+            continue
+        points = coords.attrib['points']
+        xywh = xywh_from_points(points)
+        area += xywh['w'] * xywh['h']
+    return area
+
+@_export
+def pc_textequiv(nodes):
+    """
+    Extract TextEquiv/Unicode from all nodes, then concatenate
+    (interspersed with spaces or newlines).
+    """
+    text = ''
+    for node in nodes:
+        # FIXME: find out why we need to go to the parent here
+        node = node.parent.value
+        if text and node.tag.endswith('Region'):
+            text += '\n'
+        if text and node.tag.endswith('Line'):
+            text += '\n'
+        if text and node.tag.endswith('Word'):
+            text += ' '
+        equiv = node.find(f'{node.prefix}:TextEquiv', node.nsmap)
+        if equiv is None:
+            continue
+        string = equiv.find(f'{node.prefix}:Unicode', node.nsmap)
+        if string is None:
+            continue
+        text += str(string.text)
+    return text
+
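Both helpers above lean on `xywh_from_points` from `ocrd_utils` to turn a PAGE-XML `Coords/@points` polygon string into a bounding box. A minimal sketch of the per-node area accumulation that `pc_pixelarea` performs; the points string is a made-up example, not taken from the diff:

```python
from ocrd_utils import xywh_from_points

# Illustrative Coords/@points value describing a 100x50 px rectangle
points = "100,100 200,100 200,150 100,150"
xywh = xywh_from_points(points)   # {'x': 100, 'y': 100, 'w': 100, 'h': 50}
area = xywh['w'] * xywh['h']      # 5000, this node's contribution to pc_pixelarea
```
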
ocrd_network/cli/client.py
CHANGED

@@ -2,6 +2,7 @@ import click
 from json import dumps
 from typing import List, Optional, Tuple
 from ocrd.decorators.parameter_option import parameter_option, parameter_override_option
+from ocrd_network.constants import JobState
 from ocrd_utils import DEFAULT_METS_BASENAME
 from ocrd_utils.introspect import set_json_key_value_overrides
 from ocrd_utils.str import parse_json_string_or_file
@@ -104,8 +105,10 @@ def check_processing_job_status(address: Optional[str], processing_job_id: str):
 @click.option('--result-queue-name')
 @click.option('--callback-url')
 @click.option('--agent-type', default='worker')
-@click.option('-b', '--block', default=False,
+@click.option('-b', '--block', default=False, is_flag=True,
               help='If set, the client will block till job timeout, fail or success.')
+@click.option('-p', '--print-state', default=False, is_flag=True,
+              help='If set, the client will print job states by each iteration.')
 def send_processing_job_request(
     address: Optional[str],
     processor_name: str,
@@ -120,7 +123,8 @@ def send_processing_job_request(
     # TODO: This is temporally available to toggle
     # between the ProcessingWorker/ProcessorServer
     agent_type: Optional[str],
-    block: Optional[bool]
+    block: Optional[bool],
+    print_state: Optional[bool]
 ):
     """
     Submit a processing job to the processing server.
@@ -146,7 +150,7 @@ def send_processing_job_request(
     assert processing_job_id
     print(f"Processing job id: {processing_job_id}")
     if block:
-        client.poll_job_status(job_id=processing_job_id)
+        client.poll_job_status(job_id=processing_job_id, print_state=print_state)


 @client_cli.group('workflow')
@@ -176,24 +180,39 @@ def check_workflow_job_status(address: Optional[str], workflow_job_id: str):
     'the "OCRD_NETWORK_SERVER_ADDR_PROCESSING" env variable is used by default')
 @click.option('-m', '--path-to-mets', required=True)
 @click.option('-w', '--path-to-workflow', required=True)
-@click.option('-
+@click.option('--page-wise/--no-page-wise', is_flag=True, default=False, help="Whether to generate per-page jobs")
+@click.option('-b', '--block', default=False, is_flag=True,
               help='If set, the client will block till job timeout, fail or success.')
+@click.option('-p', '--print-state', default=False, is_flag=True,
+              help='If set, the client will print job states by each iteration.')
 def send_workflow_job_request(
     address: Optional[str],
     path_to_mets: str,
     path_to_workflow: str,
-
+    page_wise: bool,
+    block: bool,
+    print_state: bool
 ):
     """
     Submit a workflow job to the processing server.
     """
     client = Client(server_addr_processing=address)
-    workflow_job_id = client.send_workflow_job_request(
+    workflow_job_id = client.send_workflow_job_request(
+        path_to_wf=path_to_workflow,
+        path_to_mets=path_to_mets,
+        page_wise=page_wise,
+    )
     assert workflow_job_id
     print(f"Workflow job id: {workflow_job_id}")
     if block:
-
-
+        print(f"Polling state of workflow job {workflow_job_id}")
+        state = client.poll_workflow_status(job_id=workflow_job_id, print_state=print_state)
+        if state != JobState.success:
+            print(f"Workflow failed with {state}")
+            exit(1)
+        else:
+            print(f"Workflow succeeded")
+            exit(0)

 @client_cli.group('workspace')
 def workspace_cli():

ocrd_network/client.py
CHANGED

@@ -46,18 +46,21 @@ class Client:
     def check_workflow_status(self, workflow_job_id: str):
         return get_ps_workflow_job_status(self.server_addr_processing, workflow_job_id=workflow_job_id)

-    def poll_job_status(self, job_id: str) -> str:
+    def poll_job_status(self, job_id: str, print_state: bool = False) -> str:
         return poll_job_status_till_timeout_fail_or_success(
-            ps_server_host=self.server_addr_processing, job_id=job_id, tries=self.polling_tries, wait=self.polling_wait
+            ps_server_host=self.server_addr_processing, job_id=job_id, tries=self.polling_tries, wait=self.polling_wait,
+            print_state=print_state)

-    def poll_workflow_status(self, job_id: str) -> str:
+    def poll_workflow_status(self, job_id: str, print_state: bool = False) -> str:
         return poll_wf_status_till_timeout_fail_or_success(
-            ps_server_host=self.server_addr_processing, job_id=job_id, tries=self.polling_tries, wait=self.polling_wait
+            ps_server_host=self.server_addr_processing, job_id=job_id, tries=self.polling_tries, wait=self.polling_wait,
+            print_state=print_state)

     def send_processing_job_request(self, processor_name: str, req_params: dict) -> str:
         return post_ps_processing_request(
             ps_server_host=self.server_addr_processing, processor=processor_name, job_input=req_params)

-    def send_workflow_job_request(self, path_to_wf: str, path_to_mets: str):
+    def send_workflow_job_request(self, path_to_wf: str, path_to_mets: str, page_wise: bool = False):
         return post_ps_workflow_request(
-            ps_server_host=self.server_addr_processing, path_to_wf=path_to_wf, path_to_mets=path_to_mets
+            ps_server_host=self.server_addr_processing, path_to_wf=path_to_wf, path_to_mets=path_to_mets,
+            page_wise=page_wise)

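As a usage illustration (not part of the diff), the new keyword arguments surface like this on the `Client` API; the server address and job id below are placeholders:

```python
from ocrd_network.client import Client

client = Client(server_addr_processing="http://localhost:8000")  # placeholder address
# Poll a previously submitted processing job, echoing its state on every polling iteration
state = client.poll_job_status(job_id="<processing-job-id>", print_state=True)
print(state)  # a JobState member such as JobState.success or JobState.failed
```
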
ocrd_network/client_utils.py
CHANGED

@@ -1,9 +1,10 @@
+import json
 from requests import get as request_get, post as request_post
 from time import sleep
 from .constants import JobState, NETWORK_PROTOCOLS


-def _poll_endpoint_status(ps_server_host: str, job_id: str, job_type: str, tries: int, wait: int):
+def _poll_endpoint_status(ps_server_host: str, job_id: str, job_type: str, tries: int, wait: int, print_state: bool = False) -> JobState:
     if job_type not in ["workflow", "processor"]:
         raise ValueError(f"Unknown job type '{job_type}', expected 'workflow' or 'processor'")
     job_state = JobState.unset
@@ -13,18 +14,22 @@ def _poll_endpoint_status(ps_server_host: str, job_id: str, job_type: str, tries
             job_state = get_ps_processing_job_status(ps_server_host, job_id)
         if job_type == "workflow":
             job_state = get_ps_workflow_job_status(ps_server_host, job_id)
+        if print_state:
+            print(f"State of the {job_type} job {job_id}: {job_state}")
         if job_state == JobState.success or job_state == JobState.failed:
             break
         tries -= 1
     return job_state


-def poll_job_status_till_timeout_fail_or_success(
-
+def poll_job_status_till_timeout_fail_or_success(
+        ps_server_host: str, job_id: str, tries: int, wait: int, print_state: bool = False) -> JobState:
+    return _poll_endpoint_status(ps_server_host, job_id, "processor", tries, wait, print_state)


-def poll_wf_status_till_timeout_fail_or_success(
-
+def poll_wf_status_till_timeout_fail_or_success(
+        ps_server_host: str, job_id: str, tries: int, wait: int, print_state: bool = False) -> JobState:
+    return _poll_endpoint_status(ps_server_host, job_id, "workflow", tries, wait, print_state)


 def get_ps_deployed_processors(ps_server_host: str):
@@ -47,22 +52,21 @@ def get_ps_processing_job_log(ps_server_host: str, processing_job_id: str):
     return response


-def get_ps_processing_job_status(ps_server_host: str, processing_job_id: str) ->
+def get_ps_processing_job_status(ps_server_host: str, processing_job_id: str) -> JobState:
     request_url = f"{ps_server_host}/processor/job/{processing_job_id}"
     response = request_get(url=request_url, headers={"accept": "application/json; charset=utf-8"})
     assert response.status_code == 200, f"Processing server: {request_url}, {response.status_code}"
     job_state = response.json()["state"]
     assert job_state
-    return job_state
-
+    return getattr(JobState, job_state.lower())

-def get_ps_workflow_job_status(ps_server_host: str, workflow_job_id: str) ->
+def get_ps_workflow_job_status(ps_server_host: str, workflow_job_id: str) -> JobState:
     request_url = f"{ps_server_host}/workflow/job-simple/{workflow_job_id}"
     response = request_get(url=request_url, headers={"accept": "application/json; charset=utf-8"})
     assert response.status_code == 200, f"Processing server: {request_url}, {response.status_code}"
     job_state = response.json()["state"]
     assert job_state
-    return job_state
+    return getattr(JobState, job_state.lower())


 def post_ps_processing_request(ps_server_host: str, processor: str, job_input: dict) -> str:
@@ -78,9 +82,13 @@ def post_ps_processing_request(ps_server_host: str, processor: str, job_input: d
     return processing_job_id


-
-
-
+def post_ps_workflow_request(
+        ps_server_host: str,
+        path_to_wf: str,
+        path_to_mets: str,
+        page_wise: bool = False,
+) -> str:
+    request_url = f"{ps_server_host}/workflow/run?mets_path={path_to_mets}&page_wise={'True' if page_wise else 'False'}"
     response = request_post(
         url=request_url,
         headers={"accept": "application/json; charset=utf-8"},
@@ -88,8 +96,11 @@ def post_ps_workflow_request(ps_server_host: str, path_to_wf: str, path_to_mets:
     )
     # print(response.json())
     # print(response.__dict__)
+    json_resp_raw = response.text
+    # print(f'post_ps_workflow_request >> {response.status_code}')
+    # print(f'post_ps_workflow_request >> {json_resp_raw}')
     assert response.status_code == 200, f"Processing server: {request_url}, {response.status_code}"
-    wf_job_id =
+    wf_job_id = json.loads(json_resp_raw)["job_id"]
     assert wf_job_id
     return wf_job_id

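Note that the two status getters now return a `JobState` enum member rather than the raw string from the server. A small sketch of the `getattr(JobState, job_state.lower())` conversion, assuming the members visible in this diff (`success`, `failed`, `unset`) and a placeholder server value:

```python
from ocrd_network.constants import JobState

raw_state = "SUCCESS"                         # placeholder for the string reported by the server
state = getattr(JobState, raw_state.lower())  # -> JobState.success
assert state == JobState.success
```
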
ocrd_network/processing_server.py
CHANGED

@@ -1,7 +1,7 @@
 from datetime import datetime
 from os import getpid
 from pathlib import Path
-from typing import Dict, List, Union
+from typing import Dict, List, Optional, Union
 from uvicorn import run as uvicorn_run

 from fastapi import APIRouter, FastAPI, File, HTTPException, Request, status, UploadFile
@@ -48,6 +48,7 @@ from .server_utils import (
     get_workflow_content,
     get_from_database_workspace,
     get_from_database_workflow_job,
+    kill_mets_server_zombies,
     parse_workflow_tasks,
     raise_http_exception,
     request_processor_server_tool_json,
@@ -78,7 +79,6 @@ class ProcessingServer(FastAPI):
     """

     def __init__(self, config_path: str, host: str, port: int) -> None:
-        initLogging()
         self.title = "OCR-D Processing Server"
         super().__init__(
             title=self.title,
@@ -86,6 +86,7 @@
             on_shutdown=[self.on_shutdown],
             description="OCR-D Processing Server"
         )
+        initLogging()
         self.log = getLogger("ocrd_network.processing_server")
         log_file = get_processing_server_logging_file_path(pid=getpid())
         configure_file_handler_with_formatter(self.log, log_file=log_file, mode="a")
@@ -155,7 +156,7 @@
         queue_names = self.deployer.find_matching_network_agents(
             worker_only=True, str_names_only=True, unique_only=True
         )
-        self.log.
+        self.log.info(f"Creating message queues on RabbitMQ instance url: {self.rabbitmq_url}")
         create_message_queues(logger=self.log, rmq_publisher=self.rmq_publisher, queue_names=queue_names)

         self.deployer.deploy_network_agents(mongodb_url=self.mongodb_url, rabbitmq_url=self.rabbitmq_url)
@@ -167,6 +168,7 @@
         uvicorn_run(self, host=self.hostname, port=int(self.port))

     async def on_startup(self):
+        self.log.info(f"Initializing the Database on: {self.mongodb_url}")
         await initiate_database(db_url=self.mongodb_url)

     async def on_shutdown(self) -> None:
@@ -200,6 +202,14 @@
             tags=[ServerApiTags.WORKSPACE],
             summary="Forward a TCP request to UDS mets server"
         )
+        others_router.add_api_route(
+            path="/kill_mets_server_zombies",
+            endpoint=self.kill_mets_server_zombies,
+            methods=["DELETE"],
+            tags=[ServerApiTags.WORKFLOW, ServerApiTags.PROCESSING],
+            status_code=status.HTTP_200_OK,
+            summary="!! Workaround Do Not Use Unless You Have A Reason !! Kill all METS servers on this machine that have been created more than 60 minutes ago."
+        )
         self.include_router(others_router)

     def add_api_routes_processing(self):
@@ -320,7 +330,7 @@
         """Forward mets-server-request

         A processor calls a mets related method like add_file with ClientSideOcrdMets. This sends
-        a request to this endpoint. This request contains all
+        a request to this endpoint. This request contains all information necessary to make a call
         to the uds-mets-server. This information is used by `MetsServerProxy` to make a the call
         to the local (local for the processing-server) reachable the uds-mets-server.
         """
@@ -574,26 +584,20 @@
         )

     async def _consume_cached_jobs_of_workspace(
-        self, workspace_key: str, mets_server_url: str
+        self, workspace_key: str, mets_server_url: str, path_to_mets: str
     ) -> List[PYJobInput]:
-
-        # Check whether the internal queue for the workspace key still exists
-        if workspace_key not in self.cache_processing_requests.processing_requests:
-            self.log.debug(f"No internal queue available for workspace with key: {workspace_key}")
-            return []
-
         # decrease the internal cache counter by 1
         request_counter = self.cache_processing_requests.update_request_counter(
             workspace_key=workspace_key, by_value=-1
         )
         self.log.debug(f"Internal processing job cache counter value: {request_counter}")
-        if not
+        if (workspace_key not in self.cache_processing_requests.processing_requests or
+                not len(self.cache_processing_requests.processing_requests[workspace_key])):
             if request_counter <= 0:
                 # Shut down the Mets Server for the workspace_key since no
                 # more internal callbacks are expected for that workspace
                 self.log.debug(f"Stopping the mets server: {mets_server_url}")
-
-                self.deployer.stop_uds_mets_server(mets_server_url=mets_server_url)
+                self.deployer.stop_uds_mets_server(mets_server_url=mets_server_url, path_to_mets=path_to_mets)

             try:
                 # The queue is empty - delete it
@@ -609,6 +613,10 @@
             else:
                 self.log.debug(f"Internal request cache is empty but waiting for {request_counter} result callbacks.")
                 return []
+        # Check whether the internal queue for the workspace key still exists
+        if workspace_key not in self.cache_processing_requests.processing_requests:
+            self.log.debug(f"No internal queue available for workspace with key: {workspace_key}")
+            return []
         consumed_requests = await self.cache_processing_requests.consume_cached_requests(workspace_key=workspace_key)
         return consumed_requests
@@ -643,7 +651,7 @@
             raise_http_exception(self.log, status.HTTP_404_NOT_FOUND, message, error)

         consumed_cached_jobs = await self._consume_cached_jobs_of_workspace(
-            workspace_key=workspace_key, mets_server_url=mets_server_url
+            workspace_key=workspace_key, mets_server_url=mets_server_url, path_to_mets=path_to_mets
         )
         await self.push_cached_jobs_to_agents(processing_jobs=consumed_cached_jobs)
@@ -817,6 +825,10 @@
         response = self._produce_workflow_status_response(processing_jobs=jobs)
         return response

+    async def kill_mets_server_zombies(self, minutes_ago : Optional[int] = None, dry_run : Optional[bool] = None) -> List[int]:
+        pids_killed = kill_mets_server_zombies(minutes_ago=minutes_ago, dry_run=dry_run)
+        return pids_killed
+
     async def get_workflow_info_simple(self, workflow_job_id) -> Dict[str, JobState]:
         """
         Simplified version of the `get_workflow_info` that returns a single state for the entire workflow.

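The new zombie-killer route can be exercised directly over HTTP. A hedged sketch, assuming FastAPI's default binding of the optional handler arguments to query parameters; the server address is a placeholder:

```python
import requests

# Dry run first: list the METS-server PIDs that would be killed, without touching them.
resp = requests.delete(
    "http://localhost:8000/kill_mets_server_zombies",  # placeholder Processing Server address
    params={"minutes_ago": 60, "dry_run": True},
)
resp.raise_for_status()
print(resp.json())  # List[int] of affected METS-server PIDs
```
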
ocrd_network/processing_worker.py
CHANGED

@@ -9,12 +9,12 @@ is a single OCR-D Processor instance.
 """

 from datetime import datetime
-from os import getpid
+from os import getpid, getppid
 from pika import BasicProperties
 from pika.adapters.blocking_connection import BlockingChannel
 from pika.spec import Basic

-from ocrd_utils import getLogger
+from ocrd_utils import getLogger, initLogging
 from .constants import JobState
 from .database import sync_initiate_database, sync_db_get_workspace, sync_db_update_processing_job, verify_database_uri
 from .logging_utils import (
@@ -35,14 +35,16 @@ from .utils import calculate_execution_time, post_to_callback_url

 class ProcessingWorker:
     def __init__(self, rabbitmq_addr, mongodb_addr, processor_name, ocrd_tool: dict, processor_class=None) -> None:
+        initLogging()
         self.log = getLogger(f'ocrd_network.processing_worker')
         log_file = get_processing_worker_logging_file_path(processor_name=processor_name, pid=getpid())
         configure_file_handler_with_formatter(self.log, log_file=log_file, mode="a")

         try:
             verify_database_uri(mongodb_addr)
-            self.log.
+            self.log.info(f'Verified MongoDB URL: {mongodb_addr}')
             self.rmq_data = verify_and_parse_mq_uri(rabbitmq_addr)
+            self.log.info(f'Verified RabbitMQ URL: {rabbitmq_addr}')
         except ValueError as error:
             msg = f"Failed to parse data, error: {error}"
             self.log.exception(msg)
@@ -61,6 +63,7 @@
         # Gets assigned when the `connect_publisher` is called on the worker object
         # Used to publish OcrdResultMessage type message to the queue with name {processor_name}-result
         self.rmq_publisher = None
+        self.log.info(f"Initialized processing worker: {processor_name}")

     def connect_consumer(self):
         self.rmq_consumer = connect_rabbitmq_consumer(self.log, self.rmq_data)
@@ -240,7 +243,7 @@
             # post the result message (callback to a user defined endpoint)
             post_to_callback_url(self.log, callback_url, result_message)
         if internal_callback_url:
-            self.log.info(f"Publishing result to internal callback url (Processing Server): {
+            self.log.info(f"Publishing result to internal callback url (Processing Server): {internal_callback_url}")
             # If the internal callback_url field is set,
             # post the result message (callback to Processing Server endpoint)
             post_to_callback_url(self.log, internal_callback_url, result_message)

ocrd_network/processor_server.py
CHANGED

@@ -42,13 +42,13 @@ class ProcessorServer(FastAPI):
     def __init__(self, mongodb_addr: str, processor_name: str = "", processor_class=None):
         if not (processor_name or processor_class):
             raise ValueError("Either 'processor_name' or 'processor_class' must be provided")
-        initLogging()
         super().__init__(
             on_startup=[self.on_startup],
             on_shutdown=[self.on_shutdown],
             title=f"Network agent - Processor Server",
             description="Network agent - Processor Server"
         )
+        initLogging()
         self.log = getLogger("ocrd_network.processor_server")
         log_file = get_processor_server_logging_file_path(processor_name=processor_name, pid=getpid())
         configure_file_handler_with_formatter(self.log, log_file=log_file, mode="a")
@@ -69,6 +69,7 @@
         self.processor_name = self.ocrd_tool["executable"]

         self.add_api_routes_processing()
+        self.log.info(f"Initialized processor server: {processor_name}")

     async def on_startup(self):
         await initiate_database(db_url=self.db_url)

ocrd_network/rabbitmq_utils/connector.py
CHANGED

@@ -6,6 +6,7 @@ RabbitMQ documentation.
 from typing import Any, Optional, Union
 from pika import BasicProperties, BlockingConnection, ConnectionParameters, PlainCredentials
 from pika.adapters.blocking_connection import BlockingChannel
+from ocrd_utils import config
 from .constants import (
     DEFAULT_EXCHANGER_NAME,
     DEFAULT_EXCHANGER_TYPE,
@@ -69,8 +70,7 @@ class RMQConnector:
                 port=port,
                 virtual_host=vhost,
                 credentials=credentials,
-
-                heartbeat=0
+                heartbeat=config.OCRD_NETWORK_RABBITMQ_HEARTBEAT
             ),
         )
         return blocking_connection

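Previously the connection was opened with `heartbeat=0`, which disables RabbitMQ heartbeats entirely; the interval now comes from `config.OCRD_NETWORK_RABBITMQ_HEARTBEAT` (presumably introduced by the `ocrd_utils/config.py` changes in this release). A hedged sketch of inspecting it; like other OCR-D configuration variables it should be overridable through an environment variable of the same name, though that convention is assumed here rather than shown in the diff:

```python
from ocrd_utils import config

# Heartbeat interval (seconds) handed to pika's ConnectionParameters;
# a value of 0 would restore the old behaviour of disabling heartbeats.
print(config.OCRD_NETWORK_RABBITMQ_HEARTBEAT)
```
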
ocrd_network/runtime_data/deployer.py
CHANGED

@@ -8,7 +8,7 @@ Each Processing Worker is an instance of an OCR-D processor.
 """
 from __future__ import annotations
 from pathlib import Path
-
+import psutil
 from time import sleep
 from typing import Dict, List, Union

@@ -30,6 +30,8 @@ class Deployer:
         self.data_hosts: List[DataHost] = parse_hosts_data(ps_config["hosts"])
         self.internal_callback_url = ps_config.get("internal_callback_url", None)
         self.mets_servers: Dict = {}  # {"mets_server_url": "mets_server_pid"}
+        # This is required to store UDS urls that are multiplexed through the TCP proxy and are not preserved anywhere
+        self.mets_servers_paths: Dict = {}  # {"ws_dir_path": "mets_server_url"}
         self.use_tcp_mets = ps_config.get("use_tcp_mets", False)

         # TODO: Reconsider this.
@@ -146,25 +148,33 @@
         if is_mets_server_running(mets_server_url=str(mets_server_url)):
             self.log.debug(f"The UDS mets server for {ws_dir_path} is already started: {mets_server_url}")
             return mets_server_url
+        elif Path(mets_server_url).is_socket():
+            self.log.warning(
+                f"The UDS mets server for {ws_dir_path} is not running but the socket file exists: {mets_server_url}."
+                "Removing to avoid any weird behavior before starting the server.")
+            Path(mets_server_url).unlink()
         self.log.info(f"Starting UDS mets server: {mets_server_url}")
-        pid = OcrdMetsServer.create_process(mets_server_url=mets_server_url, ws_dir_path=ws_dir_path, log_file=log_file)
-        self.mets_servers[mets_server_url] = pid
+        pid = OcrdMetsServer.create_process(mets_server_url=str(mets_server_url), ws_dir_path=str(ws_dir_path), log_file=str(log_file))
+        self.mets_servers[str(mets_server_url)] = pid
+        self.mets_servers_paths[str(ws_dir_path)] = str(mets_server_url)
         return mets_server_url

-    def stop_uds_mets_server(self, mets_server_url: str,
+    def stop_uds_mets_server(self, mets_server_url: str, path_to_mets: str) -> None:
         self.log.info(f"Stopping UDS mets server: {mets_server_url}")
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        self.log.info(f"Path to the mets file: {path_to_mets}")
+        self.log.debug(f"mets_server: {self.mets_servers}")
+        self.log.debug(f"mets_server_paths: {self.mets_servers_paths}")
+        workspace_path = str(Path(path_to_mets).parent)
+        mets_server_url_uds = self.mets_servers_paths[workspace_path]
+        mets_server_pid = self.mets_servers[mets_server_url_uds]
+        self.log.info(f"Terminating mets server with pid: {mets_server_pid}")
+        p = psutil.Process(mets_server_pid)
+        stop_mets_server(self.log, mets_server_url=mets_server_url, ws_dir_path=workspace_path)
+        if p.is_running():
+            p.wait()
+            self.log.info(f"Terminated mets server with pid: {mets_server_pid}")
+        else:
+            self.log.info(f"Mets server with pid: {mets_server_pid} has already terminated.")
+        del self.mets_servers_paths[workspace_path]
+        del self.mets_servers[mets_server_url_uds]
         return

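The shutdown path above grabs a `psutil.Process` handle before asking the METS server to stop, then waits on it so the child is reaped instead of lingering as a zombie (the condition the new `/kill_mets_server_zombies` route works around). A standalone toy sketch of that stop-and-reap pattern, not OCR-D code:

```python
import subprocess
import psutil

child = subprocess.Popen(["sleep", "60"])  # stand-in for a spawned METS server process
proc = psutil.Process(child.pid)
proc.terminate()                           # the Deployer instead asks the server to stop via stop_mets_server()
if proc.is_running():
    proc.wait()                            # reap the process so it does not remain a zombie
print("reaped", child.pid)
```
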