ocrd 3.0.0a2__py3-none-any.whl → 3.0.0b2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ocrd/cli/__init__.py +34 -26
- ocrd/cli/bashlib.py +32 -18
- ocrd/cli/ocrd_tool.py +7 -5
- ocrd/cli/workspace.py +10 -8
- ocrd/decorators/__init__.py +13 -7
- ocrd/decorators/ocrd_cli_options.py +1 -1
- ocrd/lib.bash +3 -0
- ocrd/mets_server.py +3 -4
- ocrd/processor/__init__.py +1 -1
- ocrd/processor/base.py +421 -98
- ocrd/processor/builtin/dummy_processor.py +4 -11
- ocrd/processor/helpers.py +24 -161
- ocrd/processor/ocrd_page_result.py +3 -3
- ocrd/resolver.py +0 -3
- ocrd/resource_manager.py +9 -5
- ocrd/workspace.py +10 -11
- ocrd/workspace_backup.py +1 -1
- {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/METADATA +32 -10
- {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/RECORD +49 -48
- {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/WHEEL +1 -1
- ocrd_modelfactory/__init__.py +1 -1
- ocrd_models/constants.py +0 -1
- ocrd_models/ocrd_exif.py +2 -2
- ocrd_models/ocrd_file.py +2 -2
- ocrd_models/ocrd_mets.py +22 -22
- ocrd_models/ocrd_page.py +0 -1
- ocrd_models/ocrd_xml_base.py +2 -2
- ocrd_network/cli/client.py +134 -30
- ocrd_network/client.py +53 -27
- ocrd_network/client_utils.py +101 -0
- ocrd_network/processing_server.py +1 -1
- ocrd_network/runtime_data/deployer.py +12 -3
- ocrd_network/server_utils.py +12 -10
- ocrd_utils/__init__.py +2 -0
- ocrd_utils/config.py +31 -2
- ocrd_utils/image.py +25 -25
- ocrd_utils/logging.py +20 -20
- ocrd_utils/os.py +4 -5
- ocrd_utils/str.py +10 -3
- ocrd_validators/json_validator.py +1 -3
- ocrd_validators/ocrd_tool_validator.py +2 -2
- ocrd_validators/page_validator.py +56 -56
- ocrd_validators/parameter_validator.py +2 -2
- ocrd_validators/resource_list_validator.py +4 -3
- ocrd_validators/workspace_validator.py +21 -21
- ocrd_validators/xsd_validator.py +1 -1
- {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/LICENSE +0 -0
- {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/entry_points.txt +0 -0
- {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/top_level.txt +0 -0
ocrd_network/client.py
CHANGED
@@ -1,37 +1,63 @@
-from
-from requests import post as requests_post
+from typing import Optional
 from ocrd_utils import config, getLogger, LOG_FORMAT
+from .client_utils import (
+    get_ps_deployed_processors,
+    get_ps_deployed_processor_ocrd_tool,
+    get_ps_processing_job_log,
+    get_ps_processing_job_status,
+    get_ps_workflow_job_status,
+    poll_job_status_till_timeout_fail_or_success,
+    poll_wf_status_till_timeout_fail_or_success,
+    post_ps_processing_request,
+    post_ps_workflow_request,
+    verify_server_protocol
+)

-from .constants import NETWORK_PROTOCOLS

-
-# TODO: This is just a conceptual implementation and first try to
-# trigger further discussions on how this should look like.
 class Client:
     def __init__(
         self,
-        server_addr_processing: str
-
-
+        server_addr_processing: Optional[str],
+        timeout: int = config.OCRD_NETWORK_CLIENT_POLLING_TIMEOUT,
+        wait: int = config.OCRD_NETWORK_CLIENT_POLLING_SLEEP
     ):
         self.log = getLogger(f"ocrd_network.client")
+        if not server_addr_processing:
+            server_addr_processing = config.OCRD_NETWORK_SERVER_ADDR_PROCESSING
         self.server_addr_processing = server_addr_processing
-        self.server_addr_workflow = server_addr_workflow
-        self.server_addr_workspace = server_addr_workspace
-
-    def send_processing_request(self, processor_name: str, req_params: dict):
         verify_server_protocol(self.server_addr_processing)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        self.polling_timeout = timeout
+        self.polling_wait = wait
+        self.polling_tries = int(timeout / wait)
+
+    def check_deployed_processors(self):
+        return get_ps_deployed_processors(ps_server_host=self.server_addr_processing)
+
+    def check_deployed_processor_ocrd_tool(self, processor_name: str):
+        return get_ps_deployed_processor_ocrd_tool(
+            ps_server_host=self.server_addr_processing, processor_name=processor_name)
+
+    def check_job_log(self, job_id: str):
+        return get_ps_processing_job_log(self.server_addr_processing, processing_job_id=job_id)
+
+    def check_job_status(self, job_id: str):
+        return get_ps_processing_job_status(self.server_addr_processing, processing_job_id=job_id)
+
+    def check_workflow_status(self, workflow_job_id: str):
+        return get_ps_workflow_job_status(self.server_addr_processing, workflow_job_id=workflow_job_id)
+
+    def poll_job_status(self, job_id: str) -> str:
+        return poll_job_status_till_timeout_fail_or_success(
+            ps_server_host=self.server_addr_processing, job_id=job_id, tries=self.polling_tries, wait=self.polling_wait)
+
+    def poll_workflow_status(self, job_id: str) -> str:
+        return poll_wf_status_till_timeout_fail_or_success(
+            ps_server_host=self.server_addr_processing, job_id=job_id, tries=self.polling_tries, wait=self.polling_wait)
+
+    def send_processing_job_request(self, processor_name: str, req_params: dict) -> str:
+        return post_ps_processing_request(
+            ps_server_host=self.server_addr_processing, processor=processor_name, job_input=req_params)
+
+    def send_workflow_job_request(self, path_to_wf: str, path_to_mets: str):
+        return post_ps_workflow_request(
+            ps_server_host=self.server_addr_processing, path_to_wf=path_to_wf, path_to_mets=path_to_mets)
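Note: with this change the Client only talks to the Processing Server and delegates all HTTP handling to the new client_utils helpers. A minimal usage sketch follows; the server address, processor name and job-input dict are made up for illustration, and the exact job-input schema is defined by the Processing Server API rather than by this diff:

    from ocrd_network.client import Client

    # an empty/None address falls back to OCRD_NETWORK_SERVER_ADDR_PROCESSING
    client = Client("http://localhost:8080", timeout=600, wait=10)

    # submit a processing job, then block until success, failure or timeout
    job_id = client.send_processing_job_request(
        "ocrd-dummy",                          # hypothetical processor name
        {"path_to_mets": "/data/mets.xml"})    # hypothetical job input
    print(client.poll_job_status(job_id))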
ocrd_network/client_utils.py
ADDED
@@ -0,0 +1,101 @@
+from requests import get as request_get, post as request_post
+from time import sleep
+from .constants import JobState, NETWORK_PROTOCOLS
+
+
+def _poll_endpoint_status(ps_server_host: str, job_id: str, job_type: str, tries: int, wait: int):
+    if job_type not in ["workflow", "processor"]:
+        raise ValueError(f"Unknown job type '{job_type}', expected 'workflow' or 'processor'")
+    job_state = JobState.unset
+    while tries > 0:
+        sleep(wait)
+        if job_type == "processor":
+            job_state = get_ps_processing_job_status(ps_server_host, job_id)
+        if job_type == "workflow":
+            job_state = get_ps_workflow_job_status(ps_server_host, job_id)
+        if job_state == JobState.success or job_state == JobState.failed:
+            break
+        tries -= 1
+    return job_state
+
+
+def poll_job_status_till_timeout_fail_or_success(ps_server_host: str, job_id: str, tries: int, wait: int) -> JobState:
+    return _poll_endpoint_status(ps_server_host, job_id, "processor", tries, wait)
+
+
+def poll_wf_status_till_timeout_fail_or_success(ps_server_host: str, job_id: str, tries: int, wait: int) -> JobState:
+    return _poll_endpoint_status(ps_server_host, job_id, "workflow", tries, wait)
+
+
+def get_ps_deployed_processors(ps_server_host: str):
+    request_url = f"{ps_server_host}/processor"
+    response = request_get(url=request_url, headers={"accept": "application/json; charset=utf-8"})
+    assert response.status_code == 200, f"Processing server: {request_url}, {response.status_code}"
+    return response.json()
+
+
+def get_ps_deployed_processor_ocrd_tool(ps_server_host: str, processor_name: str):
+    request_url = f"{ps_server_host}/processor/info/{processor_name}"
+    response = request_get(url=request_url, headers={"accept": "application/json; charset=utf-8"})
+    assert response.status_code == 200, f"Processing server: {request_url}, {response.status_code}"
+    return response.json()
+
+
+def get_ps_processing_job_log(ps_server_host: str, processing_job_id: str):
+    request_url = f"{ps_server_host}/processor/log/{processing_job_id}"
+    response = request_get(url=request_url, headers={"accept": "application/json; charset=utf-8"})
+    return response
+
+
+def get_ps_processing_job_status(ps_server_host: str, processing_job_id: str) -> str:
+    request_url = f"{ps_server_host}/processor/job/{processing_job_id}"
+    response = request_get(url=request_url, headers={"accept": "application/json; charset=utf-8"})
+    assert response.status_code == 200, f"Processing server: {request_url}, {response.status_code}"
+    job_state = response.json()["state"]
+    assert job_state
+    return job_state
+
+
+def get_ps_workflow_job_status(ps_server_host: str, workflow_job_id: str) -> str:
+    request_url = f"{ps_server_host}/workflow/job-simple/{workflow_job_id}"
+    response = request_get(url=request_url, headers={"accept": "application/json; charset=utf-8"})
+    assert response.status_code == 200, f"Processing server: {request_url}, {response.status_code}"
+    job_state = response.json()["state"]
+    assert job_state
+    return job_state
+
+
+def post_ps_processing_request(ps_server_host: str, processor: str, job_input: dict) -> str:
+    request_url = f"{ps_server_host}/processor/run/{processor}"
+    response = request_post(
+        url=request_url,
+        headers={"accept": "application/json; charset=utf-8"},
+        json=job_input
+    )
+    assert response.status_code == 200, f"Processing server: {request_url}, {response.status_code}"
+    processing_job_id = response.json()["job_id"]
+    assert processing_job_id
+    return processing_job_id
+
+
+# TODO: Can be extended to include other parameters such as page_wise
+def post_ps_workflow_request(ps_server_host: str, path_to_wf: str, path_to_mets: str) -> str:
+    request_url = f"{ps_server_host}/workflow/run?mets_path={path_to_mets}&page_wise=True"
+    response = request_post(
+        url=request_url,
+        headers={"accept": "application/json; charset=utf-8"},
+        files={"workflow": open(path_to_wf, "rb")}
+    )
+    # print(response.json())
+    # print(response.__dict__)
+    assert response.status_code == 200, f"Processing server: {request_url}, {response.status_code}"
+    wf_job_id = response.json()["job_id"]
+    assert wf_job_id
+    return wf_job_id
+
+
+def verify_server_protocol(address: str):
+    for protocol in NETWORK_PROTOCOLS:
+        if address.startswith(protocol):
+            return
+    raise ValueError(f"Wrong/Missing protocol in the server address: {address}, must be one of: {NETWORK_PROTOCOLS}")
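The helpers above can also be used directly, without the Client wrapper. A rough sketch of the intended call sequence against a Processing Server (the address and file paths are hypothetical):

    from ocrd_network.client_utils import (
        poll_wf_status_till_timeout_fail_or_success, post_ps_workflow_request, verify_server_protocol)

    ps = "http://localhost:8080"   # hypothetical Processing Server address
    verify_server_protocol(ps)     # raises ValueError unless prefixed by one of NETWORK_PROTOCOLS
    wf_job_id = post_ps_workflow_request(ps, "/data/workflow.txt", "/data/mets.xml")
    # 120 tries at 30 s each corresponds to the default one-hour client timeout
    state = poll_wf_status_till_timeout_fail_or_success(ps, wf_job_id, tries=120, wait=30)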
ocrd_network/processing_server.py
CHANGED
@@ -651,7 +651,7 @@ class ProcessingServer(FastAPI):
         # There is no caching on the Processing Server side
         processor_names_list = self.deployer.find_matching_network_agents(
             docker_only=False, native_only=False, worker_only=False, server_only=False,
-            str_names_only=True, unique_only=True
+            str_names_only=True, unique_only=True, sort=True
         )
         return processor_names_list

ocrd_network/runtime_data/deployer.py
CHANGED
@@ -35,7 +35,7 @@ class Deployer:
     # TODO: Reconsider this.
     def find_matching_network_agents(
         self, worker_only: bool = False, server_only: bool = False, docker_only: bool = False,
-        native_only: bool = False, str_names_only: bool = False, unique_only: bool = False
+        native_only: bool = False, str_names_only: bool = False, unique_only: bool = False, sort: bool = False
     ) -> Union[List[str], List[object]]:
         """Finds and returns a list of matching data objects of type:
         `DataProcessingWorker` and `DataProcessorServer`.
@@ -46,6 +46,7 @@ class Deployer:
         :py:attr:`native_only` match only native network agents (DataProcessingWorker and DataProcessorServer)
         :py:attr:`str_names_only` returns the processor_name filed instead of the Data* object
         :py:attr:`unique_only` remove duplicate names from the matches
+        :py:attr:`sort` sort the result

         `worker_only` and `server_only` are mutually exclusive to each other
         `docker_only` and `native_only` are mutually exclusive to each other
@@ -64,6 +65,10 @@ class Deployer:
             msg = f"Value 'unique_only' is allowed only together with 'str_names_only'"
             self.log.exception(msg)
             raise ValueError(msg)
+        if sort and not str_names_only:
+            msg = f"Value 'sort' is allowed only together with 'str_names_only'"
+            self.log.exception(msg)
+            raise ValueError(msg)

         # Find all matching objects of type DataProcessingWorker or DataProcessorServer
         matched_objects = []
@@ -88,8 +93,12 @@ class Deployer:
         matched_names = [match.processor_name for match in matched_objects]
         if not unique_only:
             return matched_names
-
-
+        list_matched = list(dict.fromkeys(matched_names))
+        if not sort:
+            # Removes any duplicate entries from matched names
+            return list_matched
+        list_matched.sort()
+        return list_matched

     def resolve_processor_server_url(self, processor_name) -> str:
         processor_server_url = ''
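The new sort flag builds on the dict.fromkeys idiom already used for unique_only, which removes duplicates while keeping first-occurrence order (unlike set). An illustration of what the two branches return, using made-up processor names:

    matched_names = ["ocrd-b", "ocrd-a", "ocrd-b", "ocrd-c"]

    # unique_only=True, sort=False: first-occurrence order is preserved
    list(dict.fromkeys(matched_names))    # ['ocrd-b', 'ocrd-a', 'ocrd-c']

    # unique_only=True, sort=True: deduplicated, then sorted alphabetically
    sorted(dict.fromkeys(matched_names))  # ['ocrd-a', 'ocrd-b', 'ocrd-c']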
ocrd_network/server_utils.py
CHANGED
@@ -125,14 +125,13 @@ def request_processor_server_tool_json(logger: Logger, processor_server_base_url
             urljoin(base=processor_server_base_url, url="info"),
             headers={"Content-Type": "application/json"}
         )
-        if response.status_code != 200:
-            message = f"Failed to retrieve tool json from: {processor_server_base_url}, code: {response.status_code}"
-            raise_http_exception(logger, status.HTTP_404_NOT_FOUND, message)
-        return response.json()
     except Exception as error:
         message = f"Failed to retrieve ocrd tool json from: {processor_server_base_url}"
         raise_http_exception(logger, status.HTTP_404_NOT_FOUND, message, error)
-
+    if response.status_code != 200:
+        message = f"Failed to retrieve tool json from: {processor_server_base_url}, code: {response.status_code}"
+        raise_http_exception(logger, status.HTTP_404_NOT_FOUND, message)
+    return response.json()

 async def forward_job_to_processor_server(
     logger: Logger, job_input: PYJobInput, processor_server_base_url: str
@@ -193,11 +192,14 @@ def parse_workflow_tasks(logger: Logger, workflow_content: str) -> List[Processo


 def raise_http_exception(logger: Logger, status_code: int, message: str, error: Exception = None) -> None:
-
+    if error:
+        message = f"{message} {error}"
+    logger.exception(f"{message}")
     raise HTTPException(status_code=status_code, detail=message)


 def validate_job_input(logger: Logger, processor_name: str, ocrd_tool: dict, job_input: PYJobInput) -> None:
+    # logger.warning(f"Job input: {job_input}")
     if bool(job_input.path_to_mets) == bool(job_input.workspace_id):
         message = (
             "Wrong processing job input format. "
@@ -210,12 +212,12 @@ def validate_job_input(logger: Logger, processor_name: str, ocrd_tool: dict, job
         raise_http_exception(logger, status.HTTP_404_NOT_FOUND, message)
     try:
         report = ParameterValidator(ocrd_tool).validate(dict(job_input.parameters))
-        if not report.is_valid:
-            message = f"Failed to validate processing job input against the tool json of processor: {processor_name}\n"
-            raise_http_exception(logger, status.HTTP_404_BAD_REQUEST, message + report.errors)
     except Exception as error:
         message = f"Failed to validate processing job input against the ocrd tool json of processor: {processor_name}"
-        raise_http_exception(logger, status.
+        raise_http_exception(logger, status.HTTP_400_BAD_REQUEST, message, error)
+    if report and not report.is_valid:
+        message = f"Failed to validate processing job input against the tool json of processor: {processor_name}\n"
+        raise_http_exception(logger, status.HTTP_400_BAD_REQUEST, f"{message}{report.errors}")


 def validate_workflow(logger: Logger, workflow: str) -> None:
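The main fix here is control flow: the status-code check and the return now sit outside the try block, so the HTTPException raised by raise_http_exception is no longer swallowed and re-wrapped by the surrounding except. A minimal sketch of the corrected pattern, not the actual server code (do_request, url and fail are hypothetical placeholders):

    def fetch_json(do_request, url, fail):
        try:
            response = do_request(url)       # only the network call can raise here
        except Exception as error:
            fail(f"request to {url} failed", error)
        if response.status_code != 200:      # checked outside the try block,
            fail(f"unexpected status {response.status_code} from {url}")
        return response.json()               # so fail()'s own exception propagates unchanged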
ocrd_utils/__init__.py
CHANGED
@@ -75,6 +75,7 @@ Utility functions and constants usable in various circumstances.
 :py:func:`concat_padded`,
 :py:func:`nth_url_segment`,
 :py:func:`remove_non_path_from_url`,
+:py:func:`parse_json_file_with_comments`,
 :py:func:`parse_json_string_with_comments`,
 :py:func:`parse_json_string_or_file`,
 :py:func:`set_json_key_value_overrides`,
@@ -204,6 +205,7 @@ from .str import (
     make_xml_id,
     nth_url_segment,
     partition_list,
+    parse_json_file_with_comments,
     parse_json_string_or_file,
     parse_json_string_with_comments,
     sparkline,
ocrd_utils/config.py
CHANGED
@@ -12,8 +12,12 @@ from pathlib import Path
 from tempfile import gettempdir
 from textwrap import fill, indent

-
-
+
+def _validator_boolean(val):
+    return isinstance(val, bool) or str.lower(val) in ('true', 'false', '0', '1')
+
+def _parser_boolean(val):
+    return bool(val) if isinstance(val, (int, bool)) else str.lower(val) in ('true', '1')

 class OcrdEnvVariable():

@@ -116,6 +120,16 @@ config.add('OCRD_MAX_PROCESSOR_CACHE',
     parser=int,
     default=(True, 128))

+config.add('OCRD_MAX_PARALLEL_PAGES',
+    description="Maximum number of processor threads for page-parallel processing (within each Processor's selected page range, independent of the number of Processing Workers or Processor Servers). If set >1, then a METS Server must be used for METS synchronisation.",
+    parser=int,
+    default=(True, 1))
+
+config.add('OCRD_PROCESSING_PAGE_TIMEOUT',
+    description="Timeout in seconds for processing a single page. If set >0, when exceeded, the same as OCRD_MISSING_OUTPUT applies.",
+    parser=int,
+    default=(True, 0))
+
 config.add("OCRD_PROFILE",
     description="""\
 Whether to enable gathering runtime statistics
@@ -180,6 +194,11 @@ How to deal with missing output files (for some fileGrp/pageId) during processin
     validator=lambda val: val in ['SKIP', 'COPY', 'ABORT'],
     parser=str)

+config.add("OCRD_MAX_MISSING_OUTPUTS",
+    description="Maximal rate of skipped/fallback pages among all processed pages before aborting (decimal fraction, ignored if negative).",
+    default=(True, 0.1),
+    parser=float)
+
 config.add("OCRD_EXISTING_OUTPUT",
     description="""\
 How to deal with already existing output files (for some fileGrp/pageId) during processing:
@@ -197,6 +216,16 @@ config.add("OCRD_NETWORK_SERVER_ADDR_PROCESSING",
     description="Default address of Processing Server to connect to (for `ocrd network client processing`).",
     default=(True, ''))

+config.add("OCRD_NETWORK_CLIENT_POLLING_SLEEP",
+    description="How many seconds to sleep before trying again.",
+    parser=int,
+    default=(True, 30))
+
+config.add("OCRD_NETWORK_CLIENT_POLLING_TIMEOUT",
+    description="Timeout for a blocking ocrd network client (in seconds).",
+    parser=int,
+    default=(True, 3600))
+
 config.add("OCRD_NETWORK_SERVER_ADDR_WORKFLOW",
     description="Default address of Workflow Server to connect to (for `ocrd network client workflow`).",
     default=(True, ''))
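The new variables are registered through config.add like the existing ones, so they can be overridden via the environment and read as attributes of ocrd_utils.config. A short sketch, assuming the defaults declared above:

    from ocrd_utils import config

    print(config.OCRD_MAX_PARALLEL_PAGES)              # 1 (no page-parallelism)
    print(config.OCRD_PROCESSING_PAGE_TIMEOUT)         # 0 (no per-page timeout)
    print(config.OCRD_MAX_MISSING_OUTPUTS)             # 0.1
    print(config.OCRD_NETWORK_CLIENT_POLLING_SLEEP)    # 30
    print(config.OCRD_NETWORK_CLIENT_POLLING_TIMEOUT)  # 3600
    # e.g. `export OCRD_MAX_PARALLEL_PAGES=4` in the shell enables page-parallel
    # processing (which then requires a METS Server for METS synchronisation)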
ocrd_utils/image.py
CHANGED
@@ -65,10 +65,10 @@ def adjust_canvas_to_transposition(size, method):

     Return a numpy array of the enlarged width and height.
     """
-    if method in [Image.ROTATE_90,
-                  Image.ROTATE_270,
-                  Image.TRANSPOSE,
-                  Image.TRANSVERSE]:
+    if method in [Image.Transpose.ROTATE_90,
+                  Image.Transpose.ROTATE_270,
+                  Image.Transpose.TRANSPOSE,
+                  Image.Transpose.TRANSVERSE]:
         size = size[::-1]
     return size

@@ -348,26 +348,26 @@ def transpose_coordinates(transform, method, orig=np.array([0, 0])):
     calculate the affine coordinate transform corresponding to the composition
     of both transformations, which is respectively:

-    - ``PIL.Image.FLIP_LEFT_RIGHT``:
+    - ``PIL.Image.Transpose.FLIP_LEFT_RIGHT``:
       entails translation to the center, followed by pure reflection
       about the y-axis, and subsequent translation back
-    - ``PIL.Image.FLIP_TOP_BOTTOM``:
+    - ``PIL.Image.Transpose.FLIP_TOP_BOTTOM``:
       entails translation to the center, followed by pure reflection
       about the x-axis, and subsequent translation back
-    - ``PIL.Image.ROTATE_180``:
+    - ``PIL.Image.Transpose.ROTATE_180``:
       entails translation to the center, followed by pure reflection
       about the origin, and subsequent translation back
-    - ``PIL.Image.ROTATE_90``:
+    - ``PIL.Image.Transpose.ROTATE_90``:
       entails translation to the center, followed by pure rotation
       by 90° counter-clockwise, and subsequent translation back
-    - ``PIL.Image.ROTATE_270``:
+    - ``PIL.Image.Transpose.ROTATE_270``:
      entails translation to the center, followed by pure rotation
       by 270° counter-clockwise, and subsequent translation back
-    - ``PIL.Image.TRANSPOSE``:
+    - ``PIL.Image.Transpose.TRANSPOSE``:
       entails translation to the center, followed by pure rotation
       by 90° counter-clockwise and pure reflection about the x-axis,
       and subsequent translation back
-    - ``PIL.Image.TRANSVERSE``:
+    - ``PIL.Image.Transpose.TRANSVERSE``:
       entails translation to the center, followed by pure rotation
       by 90° counter-clockwise and pure reflection about the y-axis,
       and subsequent translation back
@@ -388,13 +388,13 @@ def transpose_coordinates(transform, method, orig=np.array([0, 0])):
                       [0, 0, 1]])
     transform = shift_coordinates(transform, -orig)
     operations = {
-        Image.FLIP_LEFT_RIGHT: [refly],
-        Image.FLIP_TOP_BOTTOM: [reflx],
-        Image.ROTATE_180: [reflx, refly],
-        Image.ROTATE_90: [rot90],
-        Image.ROTATE_270: [rot90, reflx, refly],
-        Image.TRANSPOSE: [rot90, reflx],
-        Image.TRANSVERSE: [rot90, refly]
+        Image.Transpose.FLIP_LEFT_RIGHT: [refly],
+        Image.Transpose.FLIP_TOP_BOTTOM: [reflx],
+        Image.Transpose.ROTATE_180: [reflx, refly],
+        Image.Transpose.ROTATE_90: [rot90],
+        Image.Transpose.ROTATE_270: [rot90, reflx, refly],
+        Image.Transpose.TRANSPOSE: [rot90, reflx],
+        Image.Transpose.TRANSVERSE: [rot90, refly]
     }.get(method) # no default
     for operation in operations:
         transform = np.dot(operation, transform)
@@ -411,29 +411,29 @@ def transpose_image(image, method):
     Given a PIL.Image ``image`` and a transposition mode ``method``,
     apply the respective operation:

-    - ``PIL.Image.FLIP_LEFT_RIGHT``:
+    - ``PIL.Image.Transpose.FLIP_LEFT_RIGHT``:
       all pixels get mirrored at half the width of the image
-    - ``PIL.Image.FLIP_TOP_BOTTOM``:
+    - ``PIL.Image.Transpose.FLIP_TOP_BOTTOM``:
       all pixels get mirrored at half the height of the image
-    - ``PIL.Image.ROTATE_180``:
+    - ``PIL.Image.Transpose.ROTATE_180``:
       all pixels get mirrored at both, the width and half the height
       of the image,
       i.e. the image gets rotated by 180° counter-clockwise
-    - ``PIL.Image.ROTATE_90``:
+    - ``PIL.Image.Transpose.ROTATE_90``:
       rows become columns (but counted from the right) and
       columns become rows,
       i.e. the image gets rotated by 90° counter-clockwise;
       width becomes height and vice versa
-    - ``PIL.Image.ROTATE_270``:
+    - ``PIL.Image.Transpose.ROTATE_270``:
       rows become columns and
       columns become rows (but counted from the bottom),
       i.e. the image gets rotated by 270° counter-clockwise;
       width becomes height and vice versa
-    - ``PIL.Image.TRANSPOSE``:
+    - ``PIL.Image.Transpose.TRANSPOSE``:
      rows become columns and vice versa,
       i.e. all pixels get mirrored at the main diagonal;
       width becomes height and vice versa
-    - ``PIL.Image.TRANSVERSE``:
+    - ``PIL.Image.Transpose.TRANSVERSE``:
       rows become columns (but counted from the right) and
       columns become rows (but counted from the bottom),
       i.e. all pixels get mirrored at the opposite diagonal;
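These edits track Pillow's move of the transposition constants into the Image.Transpose enum (the module-level names were deprecated in Pillow 9.1 and removed in Pillow 10), so behaviour is unchanged apart from the spelling. For example, assuming a Pillow version that provides the enum:

    from PIL import Image
    from ocrd_utils.image import transpose_image

    img = Image.new("L", (200, 100))
    # 90° counter-clockwise rotation: width and height are swapped
    rotated = transpose_image(img, Image.Transpose.ROTATE_90)
    assert rotated.size == (100, 200)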
ocrd_utils/logging.py
CHANGED
@@ -5,9 +5,9 @@ By default: Log with lastResort logger, usually STDERR.

 Logging can be overridden either programmatically in code using the library or by creating one or more of

--
--
--
+- ``/etc/ocrd_logging.py``
+- ``$HOME/ocrd_logging.py``
+- ``$PWD/ocrd_logging.py``

 These files will be executed in the context of ocrd/ocrd_logging.py, with `logging` global set.

@@ -16,20 +16,18 @@ Changes as of 2023-08-20:
 - Try to be less intrusive with OCR-D specific logging conventions to
   make it easier and less surprising to define logging behavior when
   using OCR-D/core as a library
-- Change setOverrideLogLevel to only override the log level of the ``ocrd``
+- Change :py:meth:`setOverrideLogLevel` to only override the log level of the ``ocrd``
   logger and its descendants
-- initLogging will set exactly one handler, for the root logger or for the
+- :py:meth:`initLogging` will set exactly one handler, for the root logger or for the
   ``ocrd`` logger.
 - Child loggers should propagate to the ancestor logging (default
-  behavior of the logging library - no more PropagationShyLogger)
-- disableLogging only removes any handlers from the ``ocrd`` logger
+  behavior of the logging library - no more ``PropagationShyLogger``)
+- :py:meth:`disableLogging` only removes any handlers from the ``ocrd`` logger
 """
 # pylint: disable=no-member

 from __future__ import absolute_import

-from traceback import format_stack
-
 import logging
 import logging.config
 from pathlib import Path
@@ -81,10 +79,10 @@ _ocrdLevel2pythonLevel = {

 def tf_disable_interactive_logs():
     try:
-        from os import environ
+        from os import environ  # pylint: disable=import-outside-toplevel
         # This env variable must be set before importing from Keras
         environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
-        from tensorflow.keras.utils import disable_interactive_logging
+        from tensorflow.keras.utils import disable_interactive_logging  # pylint: disable=import-outside-toplevel
         # Enabled interactive logging throws an exception
         # due to a call of sys.stdout.flush()
         disable_interactive_logging()
@@ -143,21 +141,21 @@ def get_logging_config_files():

 def initLogging(builtin_only=False, force_reinit=False, silent=not config.OCRD_LOGGING_DEBUG):
     """
-    Reset ``ocrd`` logger, read logging configuration if exists, otherwise use basicConfig
+    Reset ``ocrd`` logger, read logging configuration if exists, otherwise use :py:meth:`logging.basicConfig`

-
+    This is to be called by OCR-D/core only once, i.e.
     - for the ``ocrd`` CLI
     - for the processor wrapper methods

     Other processes that use OCR-D/core as a library can, but do not have to, use this functionality.

     Keyword Args:
-        - builtin_only (bool
-
-
-        - force_reinit (bool
-
-        - silent (bool
+        - builtin_only (bool): Whether to search for logging configuration
+          on-disk (``False``) or only use the hard-coded config (``True``).
+          For testing
+        - force_reinit (bool): Whether to ignore the module-level ``_initialized_flag``.
+          For testing only
+        - silent (bool): Whether to log logging behavior by printing to stderr
     """
     global _initialized_flag
     if _initialized_flag and not force_reinit:
@@ -212,11 +210,13 @@ def disableLogging(silent=not config.OCRD_LOGGING_DEBUG):
     # logging.basicConfig(level=logging.CRITICAL)
     # logging.disable(logging.ERROR)
     # remove all handlers for the ocrd logger
-    for logger_name in ROOT_OCRD_LOGGERS:
+    for logger_name in ROOT_OCRD_LOGGERS + ['']:
         for handler in logging.getLogger(logger_name).handlers[:]:
             logging.getLogger(logger_name).removeHandler(handler)
     for logger_name in LOGGING_DEFAULTS:
         logging.getLogger(logger_name).setLevel(logging.NOTSET)
+    # Python default log level is WARNING
+    logging.root.setLevel(logging.WARNING)


 # Initializing stream handlers at module level
 # would cause message output in all runtime contexts,
ocrd_utils/os.py
CHANGED
@@ -71,9 +71,8 @@ def unzip_file_to_dir(path_to_zip, output_directory):
     """
     Extract a ZIP archive to a directory
     """
-
-
-    z.close()
+    with ZipFile(path_to_zip, 'r') as z:
+        z.extractall(output_directory)

 @lru_cache()
 def get_ocrd_tool_json(executable):
@@ -87,7 +86,7 @@ def get_ocrd_tool_json(executable):
         ocrd_tool = ocrd_all_tool[executable]
     except (JSONDecodeError, OSError, KeyError):
         try:
-            ocrd_tool = loads(run([executable, '--dump-json'], stdout=PIPE).stdout)
+            ocrd_tool = loads(run([executable, '--dump-json'], stdout=PIPE, check=False).stdout)
         except (JSONDecodeError, OSError) as e:
             getLogger('ocrd.utils.get_ocrd_tool_json').error(f'{executable} --dump-json produced invalid JSON: {e}')
     if 'resource_locations' not in ocrd_tool:
@@ -102,7 +101,7 @@ def get_moduledir(executable):
         moduledir = ocrd_all_moduledir[executable]
     except (JSONDecodeError, OSError, KeyError):
         try:
-            moduledir = run([executable, '--dump-module-dir'], encoding='utf-8', stdout=PIPE).stdout.rstrip('\n')
+            moduledir = run([executable, '--dump-module-dir'], encoding='utf-8', stdout=PIPE, check=False).stdout.rstrip('\n')
         except (JSONDecodeError, OSError) as e:
             getLogger('ocrd.utils.get_moduledir').error(f'{executable} --dump-module-dir failed: {e}')
     return moduledir
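Two small robustness fixes: the ZipFile is now closed through a context manager even if extraction fails, and check=False documents that a non-zero exit code of the probed executable is tolerated (invalid output is already handled by the surrounding JSON/OSError branches). The same pattern in isolation, with hypothetical file and executable names:

    from subprocess import PIPE, run
    from zipfile import ZipFile

    # the archive gets closed even if extractall() raises
    with ZipFile("example.zip", "r") as z:
        z.extractall("out/")

    # tolerate a non-zero exit code; bad output surfaces as a JSON error later
    result = run(["ocrd-dummy", "--dump-json"], stdout=PIPE, check=False)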
ocrd_utils/str.py
CHANGED
@@ -4,9 +4,9 @@ Utility functions for strings, paths and URL.

 import re
 import json
-from typing import List
+from typing import List
 from .constants import REGEX_FILE_ID, SPARKLINE_CHARS
-from .deprecate import deprecation_warning
+#from .deprecate import deprecation_warning
 from deprecated import deprecated
 from warnings import warn
 from numpy import array_split
@@ -21,6 +21,7 @@ __all__ = [
     'make_file_id',
     'make_xml_id',
     'nth_url_segment',
+    'parse_json_file_with_comments',
     'parse_json_string_or_file',
     'parse_json_string_with_comments',
     'remove_non_path_from_url',
@@ -162,6 +163,13 @@ def is_string(val):
     return isinstance(val, str)


+def parse_json_file_with_comments(val):
+    """
+    Parse a file of JSON interspersed with #-prefixed full-line comments
+    """
+    with open(val, 'r', encoding='utf-8') as inputf:
+        return parse_json_string_with_comments(inputf.read())
+
 def parse_json_string_with_comments(val):
     """
     Parse a string of JSON interspersed with #-prefixed full-line comments
@@ -265,4 +273,3 @@ def sparkline(values : List[int]) -> str:
     # normalize to 0..1 and convert to index in SPARKLINE_CHARS
     mapped = [int(x / max_value * max_mapping) for x in values]
     return ''.join(SPARKLINE_CHARS[x] for x in mapped)
-
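parse_json_file_with_comments is a thin file-reading wrapper around parse_json_string_with_comments, so #-prefixed full-line comments are stripped before the remainder is parsed as JSON. A small usage sketch with a hypothetical parameter file:

    from ocrd_utils import parse_json_file_with_comments

    # params.json (hypothetical content):
    #   # resolution in DPI
    #   {"dpi": 300}
    params = parse_json_file_with_comments("params.json")
    assert params == {"dpi": 300}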