ocrd 3.0.0a2__py3-none-any.whl → 3.0.0b2__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (49)
  1. ocrd/cli/__init__.py +34 -26
  2. ocrd/cli/bashlib.py +32 -18
  3. ocrd/cli/ocrd_tool.py +7 -5
  4. ocrd/cli/workspace.py +10 -8
  5. ocrd/decorators/__init__.py +13 -7
  6. ocrd/decorators/ocrd_cli_options.py +1 -1
  7. ocrd/lib.bash +3 -0
  8. ocrd/mets_server.py +3 -4
  9. ocrd/processor/__init__.py +1 -1
  10. ocrd/processor/base.py +421 -98
  11. ocrd/processor/builtin/dummy_processor.py +4 -11
  12. ocrd/processor/helpers.py +24 -161
  13. ocrd/processor/ocrd_page_result.py +3 -3
  14. ocrd/resolver.py +0 -3
  15. ocrd/resource_manager.py +9 -5
  16. ocrd/workspace.py +10 -11
  17. ocrd/workspace_backup.py +1 -1
  18. {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/METADATA +32 -10
  19. {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/RECORD +49 -48
  20. {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/WHEEL +1 -1
  21. ocrd_modelfactory/__init__.py +1 -1
  22. ocrd_models/constants.py +0 -1
  23. ocrd_models/ocrd_exif.py +2 -2
  24. ocrd_models/ocrd_file.py +2 -2
  25. ocrd_models/ocrd_mets.py +22 -22
  26. ocrd_models/ocrd_page.py +0 -1
  27. ocrd_models/ocrd_xml_base.py +2 -2
  28. ocrd_network/cli/client.py +134 -30
  29. ocrd_network/client.py +53 -27
  30. ocrd_network/client_utils.py +101 -0
  31. ocrd_network/processing_server.py +1 -1
  32. ocrd_network/runtime_data/deployer.py +12 -3
  33. ocrd_network/server_utils.py +12 -10
  34. ocrd_utils/__init__.py +2 -0
  35. ocrd_utils/config.py +31 -2
  36. ocrd_utils/image.py +25 -25
  37. ocrd_utils/logging.py +20 -20
  38. ocrd_utils/os.py +4 -5
  39. ocrd_utils/str.py +10 -3
  40. ocrd_validators/json_validator.py +1 -3
  41. ocrd_validators/ocrd_tool_validator.py +2 -2
  42. ocrd_validators/page_validator.py +56 -56
  43. ocrd_validators/parameter_validator.py +2 -2
  44. ocrd_validators/resource_list_validator.py +4 -3
  45. ocrd_validators/workspace_validator.py +21 -21
  46. ocrd_validators/xsd_validator.py +1 -1
  47. {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/LICENSE +0 -0
  48. {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/entry_points.txt +0 -0
  49. {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/top_level.txt +0 -0
ocrd_network/client.py CHANGED
@@ -1,37 +1,63 @@
-from json import dumps, loads
-from requests import post as requests_post
+from typing import Optional
 
 from ocrd_utils import config, getLogger, LOG_FORMAT
+from .client_utils import (
+    get_ps_deployed_processors,
+    get_ps_deployed_processor_ocrd_tool,
+    get_ps_processing_job_log,
+    get_ps_processing_job_status,
+    get_ps_workflow_job_status,
+    poll_job_status_till_timeout_fail_or_success,
+    poll_wf_status_till_timeout_fail_or_success,
+    post_ps_processing_request,
+    post_ps_workflow_request,
+    verify_server_protocol
+)
 
-from .constants import NETWORK_PROTOCOLS
 
-
-# TODO: This is just a conceptual implementation and first try to
-# trigger further discussions on how this should look like.
 class Client:
     def __init__(
         self,
-        server_addr_processing: str = config.OCRD_NETWORK_SERVER_ADDR_PROCESSING,
-        server_addr_workflow: str = config.OCRD_NETWORK_SERVER_ADDR_WORKFLOW,
-        server_addr_workspace: str = config.OCRD_NETWORK_SERVER_ADDR_WORKSPACE
+        server_addr_processing: Optional[str],
+        timeout: int = config.OCRD_NETWORK_CLIENT_POLLING_TIMEOUT,
+        wait: int = config.OCRD_NETWORK_CLIENT_POLLING_SLEEP
     ):
         self.log = getLogger(f"ocrd_network.client")
+        if not server_addr_processing:
+            server_addr_processing = config.OCRD_NETWORK_SERVER_ADDR_PROCESSING
         self.server_addr_processing = server_addr_processing
-        self.server_addr_workflow = server_addr_workflow
-        self.server_addr_workspace = server_addr_workspace
-
-    def send_processing_request(self, processor_name: str, req_params: dict):
         verify_server_protocol(self.server_addr_processing)
-        req_url = f"{self.server_addr_processing}/processor/{processor_name}"
-        req_headers = {"Content-Type": "application/json; charset=utf-8"}
-        req_json = loads(dumps(req_params))
-        self.log.info(f"Sending processing request to: {req_url}")
-        self.log.debug(req_json)
-        response = requests_post(url=req_url, headers=req_headers, json=req_json)
-        return response.json()
-
-
-def verify_server_protocol(address: str):
-    for protocol in NETWORK_PROTOCOLS:
-        if address.startswith(protocol):
-            return
-    raise ValueError(f"Wrong/Missing protocol in the server address: {address}, must be one of: {NETWORK_PROTOCOLS}")
+        self.polling_timeout = timeout
+        self.polling_wait = wait
+        self.polling_tries = int(timeout / wait)
+
+    def check_deployed_processors(self):
+        return get_ps_deployed_processors(ps_server_host=self.server_addr_processing)
+
+    def check_deployed_processor_ocrd_tool(self, processor_name: str):
+        return get_ps_deployed_processor_ocrd_tool(
+            ps_server_host=self.server_addr_processing, processor_name=processor_name)
+
+    def check_job_log(self, job_id: str):
+        return get_ps_processing_job_log(self.server_addr_processing, processing_job_id=job_id)
+
+    def check_job_status(self, job_id: str):
+        return get_ps_processing_job_status(self.server_addr_processing, processing_job_id=job_id)
+
+    def check_workflow_status(self, workflow_job_id: str):
+        return get_ps_workflow_job_status(self.server_addr_processing, workflow_job_id=workflow_job_id)
+
+    def poll_job_status(self, job_id: str) -> str:
+        return poll_job_status_till_timeout_fail_or_success(
+            ps_server_host=self.server_addr_processing, job_id=job_id, tries=self.polling_tries, wait=self.polling_wait)
+
+    def poll_workflow_status(self, job_id: str) -> str:
+        return poll_wf_status_till_timeout_fail_or_success(
+            ps_server_host=self.server_addr_processing, job_id=job_id, tries=self.polling_tries, wait=self.polling_wait)
+
+    def send_processing_job_request(self, processor_name: str, req_params: dict) -> str:
+        return post_ps_processing_request(
+            ps_server_host=self.server_addr_processing, processor=processor_name, job_input=req_params)
+
+    def send_workflow_job_request(self, path_to_wf: str, path_to_mets: str):
+        return post_ps_workflow_request(
+            ps_server_host=self.server_addr_processing, path_to_wf=path_to_wf, path_to_mets=path_to_mets)
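The rewritten Client is now a thin blocking wrapper around the client_utils helpers added below. A minimal usage sketch (the server address, processor name and request body are placeholders, not part of the diff; the body keys follow the PYJobInput fields referenced later in this diff):

    from ocrd_network.client import Client

    # Placeholder address; an empty value would fall back to
    # config.OCRD_NETWORK_SERVER_ADDR_PROCESSING.
    client = Client(server_addr_processing="http://localhost:8000")

    # Submit a processing job, then block until it reaches success/failed
    # or polling_tries * polling_wait seconds have elapsed.
    job_id = client.send_processing_job_request(
        "ocrd-dummy", {"path_to_mets": "/data/mets.xml"})
    state = client.poll_job_status(job_id)
    print(f"Job {job_id} ended in state {state}")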
ocrd_network/client_utils.py ADDED
@@ -0,0 +1,101 @@
+from requests import get as request_get, post as request_post
+from time import sleep
+from .constants import JobState, NETWORK_PROTOCOLS
+
+
+def _poll_endpoint_status(ps_server_host: str, job_id: str, job_type: str, tries: int, wait: int):
+    if job_type not in ["workflow", "processor"]:
+        raise ValueError(f"Unknown job type '{job_type}', expected 'workflow' or 'processor'")
+    job_state = JobState.unset
+    while tries > 0:
+        sleep(wait)
+        if job_type == "processor":
+            job_state = get_ps_processing_job_status(ps_server_host, job_id)
+        if job_type == "workflow":
+            job_state = get_ps_workflow_job_status(ps_server_host, job_id)
+        if job_state == JobState.success or job_state == JobState.failed:
+            break
+        tries -= 1
+    return job_state
+
+
+def poll_job_status_till_timeout_fail_or_success(ps_server_host: str, job_id: str, tries: int, wait: int) -> JobState:
+    return _poll_endpoint_status(ps_server_host, job_id, "processor", tries, wait)
+
+
+def poll_wf_status_till_timeout_fail_or_success(ps_server_host: str, job_id: str, tries: int, wait: int) -> JobState:
+    return _poll_endpoint_status(ps_server_host, job_id, "workflow", tries, wait)
+
+
+def get_ps_deployed_processors(ps_server_host: str):
+    request_url = f"{ps_server_host}/processor"
+    response = request_get(url=request_url, headers={"accept": "application/json; charset=utf-8"})
+    assert response.status_code == 200, f"Processing server: {request_url}, {response.status_code}"
+    return response.json()
+
+
+def get_ps_deployed_processor_ocrd_tool(ps_server_host: str, processor_name: str):
+    request_url = f"{ps_server_host}/processor/info/{processor_name}"
+    response = request_get(url=request_url, headers={"accept": "application/json; charset=utf-8"})
+    assert response.status_code == 200, f"Processing server: {request_url}, {response.status_code}"
+    return response.json()
+
+
+def get_ps_processing_job_log(ps_server_host: str, processing_job_id: str):
+    request_url = f"{ps_server_host}/processor/log/{processing_job_id}"
+    response = request_get(url=request_url, headers={"accept": "application/json; charset=utf-8"})
+    return response
+
+
+def get_ps_processing_job_status(ps_server_host: str, processing_job_id: str) -> str:
+    request_url = f"{ps_server_host}/processor/job/{processing_job_id}"
+    response = request_get(url=request_url, headers={"accept": "application/json; charset=utf-8"})
+    assert response.status_code == 200, f"Processing server: {request_url}, {response.status_code}"
+    job_state = response.json()["state"]
+    assert job_state
+    return job_state
+
+
+def get_ps_workflow_job_status(ps_server_host: str, workflow_job_id: str) -> str:
+    request_url = f"{ps_server_host}/workflow/job-simple/{workflow_job_id}"
+    response = request_get(url=request_url, headers={"accept": "application/json; charset=utf-8"})
+    assert response.status_code == 200, f"Processing server: {request_url}, {response.status_code}"
+    job_state = response.json()["state"]
+    assert job_state
+    return job_state
+
+
+def post_ps_processing_request(ps_server_host: str, processor: str, job_input: dict) -> str:
+    request_url = f"{ps_server_host}/processor/run/{processor}"
+    response = request_post(
+        url=request_url,
+        headers={"accept": "application/json; charset=utf-8"},
+        json=job_input
+    )
+    assert response.status_code == 200, f"Processing server: {request_url}, {response.status_code}"
+    processing_job_id = response.json()["job_id"]
+    assert processing_job_id
+    return processing_job_id
+
+
+# TODO: Can be extended to include other parameters such as page_wise
+def post_ps_workflow_request(ps_server_host: str, path_to_wf: str, path_to_mets: str) -> str:
+    request_url = f"{ps_server_host}/workflow/run?mets_path={path_to_mets}&page_wise=True"
+    response = request_post(
+        url=request_url,
+        headers={"accept": "application/json; charset=utf-8"},
+        files={"workflow": open(path_to_wf, "rb")}
+    )
+    # print(response.json())
+    # print(response.__dict__)
+    assert response.status_code == 200, f"Processing server: {request_url}, {response.status_code}"
+    wf_job_id = response.json()["job_id"]
+    assert wf_job_id
+    return wf_job_id
+
+
+def verify_server_protocol(address: str):
+    for protocol in NETWORK_PROTOCOLS:
+        if address.startswith(protocol):
+            return
+    raise ValueError(f"Wrong/Missing protocol in the server address: {address}, must be one of: {NETWORK_PROTOCOLS}")
ocrd_network/processing_server.py CHANGED
@@ -651,7 +651,7 @@ class ProcessingServer(FastAPI):
         # There is no caching on the Processing Server side
         processor_names_list = self.deployer.find_matching_network_agents(
             docker_only=False, native_only=False, worker_only=False, server_only=False,
-            str_names_only=True, unique_only=True
+            str_names_only=True, unique_only=True, sort=True
         )
         return processor_names_list
 
ocrd_network/runtime_data/deployer.py CHANGED
@@ -35,7 +35,7 @@ class Deployer:
     # TODO: Reconsider this.
     def find_matching_network_agents(
         self, worker_only: bool = False, server_only: bool = False, docker_only: bool = False,
-        native_only: bool = False, str_names_only: bool = False, unique_only: bool = False
+        native_only: bool = False, str_names_only: bool = False, unique_only: bool = False, sort: bool = False
     ) -> Union[List[str], List[object]]:
         """Finds and returns a list of matching data objects of type:
         `DataProcessingWorker` and `DataProcessorServer`.
@@ -46,6 +46,7 @@ class Deployer:
         :py:attr:`native_only` match only native network agents (DataProcessingWorker and DataProcessorServer)
         :py:attr:`str_names_only` returns the processor_name field instead of the Data* object
         :py:attr:`unique_only` remove duplicate names from the matches
+        :py:attr:`sort` sort the result
 
         `worker_only` and `server_only` are mutually exclusive to each other
         `docker_only` and `native_only` are mutually exclusive to each other
@@ -64,6 +65,10 @@ class Deployer:
             msg = f"Value 'unique_only' is allowed only together with 'str_names_only'"
             self.log.exception(msg)
             raise ValueError(msg)
+        if sort and not str_names_only:
+            msg = f"Value 'sort' is allowed only together with 'str_names_only'"
+            self.log.exception(msg)
+            raise ValueError(msg)
 
         # Find all matching objects of type DataProcessingWorker or DataProcessorServer
         matched_objects = []
@@ -88,8 +93,12 @@ class Deployer:
         matched_names = [match.processor_name for match in matched_objects]
         if not unique_only:
             return matched_names
-        # Removes any duplicate entries from matched names
-        return list(dict.fromkeys(matched_names))
+        list_matched = list(dict.fromkeys(matched_names))
+        if not sort:
+            # Removes any duplicate entries from matched names
+            return list_matched
+        list_matched.sort()
+        return list_matched
 
     def resolve_processor_server_url(self, processor_name) -> str:
         processor_server_url = ''
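The new sort flag builds on the existing dict.fromkeys deduplication, which preserves first-seen order; sorting is therefore an explicit extra step. A standalone illustration of that logic:

    # dict.fromkeys keeps the first occurrence of each name, in insertion order
    names = ["ocrd-b", "ocrd-a", "ocrd-b", "ocrd-c", "ocrd-a"]
    deduped = list(dict.fromkeys(names))  # ['ocrd-b', 'ocrd-a', 'ocrd-c']
    deduped.sort()                        # what sort=True adds on top
    assert deduped == ["ocrd-a", "ocrd-b", "ocrd-c"]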
ocrd_network/server_utils.py CHANGED
@@ -125,14 +125,13 @@ def request_processor_server_tool_json(logger: Logger, processor_server_base_url
             urljoin(base=processor_server_base_url, url="info"),
             headers={"Content-Type": "application/json"}
         )
-        if response.status_code != 200:
-            message = f"Failed to retrieve tool json from: {processor_server_base_url}, code: {response.status_code}"
-            raise_http_exception(logger, status.HTTP_404_NOT_FOUND, message)
-        return response.json()
     except Exception as error:
         message = f"Failed to retrieve ocrd tool json from: {processor_server_base_url}"
         raise_http_exception(logger, status.HTTP_404_NOT_FOUND, message, error)
-
+    if response.status_code != 200:
+        message = f"Failed to retrieve tool json from: {processor_server_base_url}, code: {response.status_code}"
+        raise_http_exception(logger, status.HTTP_404_NOT_FOUND, message)
+    return response.json()
 
 async def forward_job_to_processor_server(
     logger: Logger, job_input: PYJobInput, processor_server_base_url: str
@@ -193,11 +192,14 @@ def parse_workflow_tasks(logger: Logger, workflow_content: str) -> List[Processo
 
 
 def raise_http_exception(logger: Logger, status_code: int, message: str, error: Exception = None) -> None:
-    logger.exception(f"{message} {error}")
+    if error:
+        message = f"{message} {error}"
+    logger.exception(f"{message}")
     raise HTTPException(status_code=status_code, detail=message)
 
 
 def validate_job_input(logger: Logger, processor_name: str, ocrd_tool: dict, job_input: PYJobInput) -> None:
+    # logger.warning(f"Job input: {job_input}")
     if bool(job_input.path_to_mets) == bool(job_input.workspace_id):
         message = (
             "Wrong processing job input format. "
@@ -210,12 +212,12 @@ def validate_job_input(logger: Logger, processor_name: str, ocrd_tool: dict, job
         raise_http_exception(logger, status.HTTP_404_NOT_FOUND, message)
     try:
         report = ParameterValidator(ocrd_tool).validate(dict(job_input.parameters))
-        if not report.is_valid:
-            message = f"Failed to validate processing job input against the tool json of processor: {processor_name}\n"
-            raise_http_exception(logger, status.HTTP_404_BAD_REQUEST, message + report.errors)
     except Exception as error:
         message = f"Failed to validate processing job input against the ocrd tool json of processor: {processor_name}"
-        raise_http_exception(logger, status.HTTP_404_BAD_REQUEST, message, error)
+        raise_http_exception(logger, status.HTTP_400_BAD_REQUEST, message, error)
+    if report and not report.is_valid:
+        message = f"Failed to validate processing job input against the tool json of processor: {processor_name}\n"
+        raise_http_exception(logger, status.HTTP_400_BAD_REQUEST, f"{message}{report.errors}")
 
 
 def validate_workflow(logger: Logger, workflow: str) -> None:
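Both server_utils changes move the raise_http_exception calls out of the try blocks (and fix the bogus HTTP_404_BAD_REQUEST to HTTP_400_BAD_REQUEST). The relocation matters because raise_http_exception raises an HTTPException, which a broad except Exception in the same try would catch and re-wrap, masking the intended status code. A self-contained sketch of that pitfall (stand-in types, not the actual FastAPI ones):

    class HTTPException(Exception):  # stand-in for the FastAPI type
        pass

    def raise_http_exception(message):
        raise HTTPException(message)

    try:
        response_ok = False  # stand-in for response.status_code == 200
        if not response_ok:
            raise_http_exception("404: no tool json")  # deliberate exception
    except Exception as error:
        # the deliberate HTTPException lands here and gets re-reported,
        # which is what moving the check out of the try block avoids
        print(f"re-wrapped: {error}")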
ocrd_utils/__init__.py CHANGED
@@ -75,6 +75,7 @@ Utility functions and constants usable in various circumstances.
     :py:func:`concat_padded`,
     :py:func:`nth_url_segment`,
     :py:func:`remove_non_path_from_url`,
+    :py:func:`parse_json_file_with_comments`,
     :py:func:`parse_json_string_with_comments`,
     :py:func:`parse_json_string_or_file`,
     :py:func:`set_json_key_value_overrides`,
@@ -204,6 +205,7 @@ from .str import (
     make_xml_id,
     nth_url_segment,
     partition_list,
+    parse_json_file_with_comments,
     parse_json_string_or_file,
     parse_json_string_with_comments,
     sparkline,
ocrd_utils/config.py CHANGED
@@ -12,8 +12,12 @@ from pathlib import Path
 from tempfile import gettempdir
 from textwrap import fill, indent
 
-_validator_boolean = lambda val: isinstance(val, bool) or str.lower(val) in ('true', 'false', '0', '1')
-_parser_boolean = lambda val: bool(val) if isinstance(val, (int, bool)) else str.lower(val) in ('true', '1')
+
+def _validator_boolean(val):
+    return isinstance(val, bool) or str.lower(val) in ('true', 'false', '0', '1')
+
+def _parser_boolean(val):
+    return bool(val) if isinstance(val, (int, bool)) else str.lower(val) in ('true', '1')
 
 class OcrdEnvVariable():
 
@@ -116,6 +120,16 @@ config.add('OCRD_MAX_PROCESSOR_CACHE',
     parser=int,
     default=(True, 128))
 
+config.add('OCRD_MAX_PARALLEL_PAGES',
+    description="Maximum number of processor threads for page-parallel processing (within each Processor's selected page range, independent of the number of Processing Workers or Processor Servers). If set >1, then a METS Server must be used for METS synchronisation.",
+    parser=int,
+    default=(True, 1))
+
+config.add('OCRD_PROCESSING_PAGE_TIMEOUT',
+    description="Timeout in seconds for processing a single page. If set >0, when exceeded, the same as OCRD_MISSING_OUTPUT applies.",
+    parser=int,
+    default=(True, 0))
+
 config.add("OCRD_PROFILE",
     description="""\
 Whether to enable gathering runtime statistics
@@ -180,6 +194,11 @@ How to deal with missing output files (for some fileGrp/pageId) during processin
     validator=lambda val: val in ['SKIP', 'COPY', 'ABORT'],
     parser=str)
 
+config.add("OCRD_MAX_MISSING_OUTPUTS",
+    description="Maximal rate of skipped/fallback pages among all processed pages before aborting (decimal fraction, ignored if negative).",
+    default=(True, 0.1),
+    parser=float)
+
 config.add("OCRD_EXISTING_OUTPUT",
     description="""\
 How to deal with already existing output files (for some fileGrp/pageId) during processing:
@@ -197,6 +216,16 @@ config.add("OCRD_NETWORK_SERVER_ADDR_PROCESSING",
     description="Default address of Processing Server to connect to (for `ocrd network client processing`).",
     default=(True, ''))
 
+config.add("OCRD_NETWORK_CLIENT_POLLING_SLEEP",
+    description="How many seconds to sleep before trying again.",
+    parser=int,
+    default=(True, 30))
+
+config.add("OCRD_NETWORK_CLIENT_POLLING_TIMEOUT",
+    description="Timeout for a blocking ocrd network client (in seconds).",
+    parser=int,
+    default=(True, 3600))
+
 config.add("OCRD_NETWORK_SERVER_ADDR_WORKFLOW",
     description="Default address of Workflow Server to connect to (for `ocrd network client workflow`).",
     default=(True, ''))
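All five new variables are read like the existing ones, through the ocrd_utils.config object, and (as with the other OCRD_* variables) can be overridden via identically named environment variables. A quick sketch of the defaults declared above:

    from ocrd_utils import config

    print(config.OCRD_MAX_PARALLEL_PAGES)              # 1 (no page parallelism)
    print(config.OCRD_PROCESSING_PAGE_TIMEOUT)         # 0 (no per-page timeout)
    print(config.OCRD_MAX_MISSING_OUTPUTS)             # 0.1 (abort above 10 %)
    print(config.OCRD_NETWORK_CLIENT_POLLING_SLEEP)    # 30 seconds
    print(config.OCRD_NETWORK_CLIENT_POLLING_TIMEOUT)  # 3600 seconds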
ocrd_utils/image.py CHANGED
@@ -65,10 +65,10 @@ def adjust_canvas_to_transposition(size, method):
 
     Return a numpy array of the enlarged width and height.
     """
-    if method in [Image.ROTATE_90,
-                  Image.ROTATE_270,
-                  Image.TRANSPOSE,
-                  Image.TRANSVERSE]:
+    if method in [Image.Transpose.ROTATE_90,
+                  Image.Transpose.ROTATE_270,
+                  Image.Transpose.TRANSPOSE,
+                  Image.Transpose.TRANSVERSE]:
         size = size[::-1]
     return size
 
@@ -348,26 +348,26 @@ def transpose_coordinates(transform, method, orig=np.array([0, 0])):
     calculate the affine coordinate transform corresponding to the composition
     of both transformations, which is respectively:
 
-    - ``PIL.Image.FLIP_LEFT_RIGHT``:
+    - ``PIL.Image.Transpose.FLIP_LEFT_RIGHT``:
       entails translation to the center, followed by pure reflection
       about the y-axis, and subsequent translation back
-    - ``PIL.Image.FLIP_TOP_BOTTOM``:
+    - ``PIL.Image.Transpose.FLIP_TOP_BOTTOM``:
      entails translation to the center, followed by pure reflection
      about the x-axis, and subsequent translation back
-    - ``PIL.Image.ROTATE_180``:
+    - ``PIL.Image.Transpose.ROTATE_180``:
      entails translation to the center, followed by pure reflection
      about the origin, and subsequent translation back
-    - ``PIL.Image.ROTATE_90``:
+    - ``PIL.Image.Transpose.ROTATE_90``:
      entails translation to the center, followed by pure rotation
      by 90° counter-clockwise, and subsequent translation back
-    - ``PIL.Image.ROTATE_270``:
+    - ``PIL.Image.Transpose.ROTATE_270``:
      entails translation to the center, followed by pure rotation
      by 270° counter-clockwise, and subsequent translation back
-    - ``PIL.Image.TRANSPOSE``:
+    - ``PIL.Image.Transpose.TRANSPOSE``:
      entails translation to the center, followed by pure rotation
      by 90° counter-clockwise and pure reflection about the x-axis,
      and subsequent translation back
-    - ``PIL.Image.TRANSVERSE``:
+    - ``PIL.Image.Transpose.TRANSVERSE``:
      entails translation to the center, followed by pure rotation
      by 90° counter-clockwise and pure reflection about the y-axis,
      and subsequent translation back
@@ -388,13 +388,13 @@ def transpose_coordinates(transform, method, orig=np.array([0, 0])):
                       [0, 0, 1]])
     transform = shift_coordinates(transform, -orig)
     operations = {
-        Image.FLIP_LEFT_RIGHT: [refly],
-        Image.FLIP_TOP_BOTTOM: [reflx],
-        Image.ROTATE_180: [reflx, refly],
-        Image.ROTATE_90: [rot90],
-        Image.ROTATE_270: [rot90, reflx, refly],
-        Image.TRANSPOSE: [rot90, reflx],
-        Image.TRANSVERSE: [rot90, refly]
+        Image.Transpose.FLIP_LEFT_RIGHT: [refly],
+        Image.Transpose.FLIP_TOP_BOTTOM: [reflx],
+        Image.Transpose.ROTATE_180: [reflx, refly],
+        Image.Transpose.ROTATE_90: [rot90],
+        Image.Transpose.ROTATE_270: [rot90, reflx, refly],
+        Image.Transpose.TRANSPOSE: [rot90, reflx],
+        Image.Transpose.TRANSVERSE: [rot90, refly]
     }.get(method) # no default
     for operation in operations:
         transform = np.dot(operation, transform)
@@ -411,29 +411,29 @@ def transpose_image(image, method):
     Given a PIL.Image ``image`` and a transposition mode ``method``,
     apply the respective operation:
 
-    - ``PIL.Image.FLIP_LEFT_RIGHT``:
+    - ``PIL.Image.Transpose.FLIP_LEFT_RIGHT``:
      all pixels get mirrored at half the width of the image
-    - ``PIL.Image.FLIP_TOP_BOTTOM``:
+    - ``PIL.Image.Transpose.FLIP_TOP_BOTTOM``:
      all pixels get mirrored at half the height of the image
-    - ``PIL.Image.ROTATE_180``:
+    - ``PIL.Image.Transpose.ROTATE_180``:
      all pixels get mirrored at both, the width and half the height
      of the image,
      i.e. the image gets rotated by 180° counter-clockwise
-    - ``PIL.Image.ROTATE_90``:
+    - ``PIL.Image.Transpose.ROTATE_90``:
      rows become columns (but counted from the right) and
      columns become rows,
      i.e. the image gets rotated by 90° counter-clockwise;
      width becomes height and vice versa
-    - ``PIL.Image.ROTATE_270``:
+    - ``PIL.Image.Transpose.ROTATE_270``:
      rows become columns and
      columns become rows (but counted from the bottom),
      i.e. the image gets rotated by 270° counter-clockwise;
      width becomes height and vice versa
-    - ``PIL.Image.TRANSPOSE``:
+    - ``PIL.Image.Transpose.TRANSPOSE``:
      rows become columns and vice versa,
      i.e. all pixels get mirrored at the main diagonal;
      width becomes height and vice versa
-    - ``PIL.Image.TRANSVERSE``:
+    - ``PIL.Image.Transpose.TRANSVERSE``:
      rows become columns (but counted from the right) and
      columns become rows (but counted from the bottom),
      i.e. all pixels get mirrored at the opposite diagonal;
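These are mechanical renames: Pillow 9.1 moved the transposition constants into the Image.Transpose enum and deprecated the module-level names, which Pillow 10 then removed; the enum members carry the same integer values, so behavior is unchanged. For example:

    from PIL import Image

    im = Image.new("RGB", (200, 100))
    rotated = im.transpose(Image.Transpose.ROTATE_90)  # was Image.ROTATE_90
    assert rotated.size == (100, 200)  # width and height swap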
ocrd_utils/logging.py CHANGED
@@ -5,9 +5,9 @@ By default: Log with lastResort logger, usually STDERR.
 
 Logging can be overridden either programmatically in code using the library or by creating one or more of
 
-- /etc/ocrd_logging.py
-- $HOME/ocrd_logging.py
-- $PWD/ocrd_logging.py
+- ``/etc/ocrd_logging.py``
+- ``$HOME/ocrd_logging.py``
+- ``$PWD/ocrd_logging.py``
 
 These files will be executed in the context of ocrd/ocrd_logging.py, with `logging` global set.
 
@@ -16,20 +16,18 @@ Changes as of 2023-08-20:
   - Try to be less intrusive with OCR-D specific logging conventions to
     make it easier and less surprising to define logging behavior when
     using OCR-D/core as a library
-  - Change setOverrideLogLevel to only override the log level of the ``ocrd``
+  - Change :py:meth:`setOverrideLogLevel` to only override the log level of the ``ocrd``
     logger and its descendants
-  - initLogging will set exactly one handler, for the root logger or for the
+  - :py:meth:`initLogging` will set exactly one handler, for the root logger or for the
     ``ocrd`` logger.
   - Child loggers should propagate to the ancestor logging (default
-    behavior of the logging library - no more PropagationShyLogger)
-  - disableLogging only removes any handlers from the ``ocrd`` logger
+    behavior of the logging library - no more ``PropagationShyLogger``)
+  - :py:meth:`disableLogging` only removes any handlers from the ``ocrd`` logger
 """
 # pylint: disable=no-member
 
 from __future__ import absolute_import
 
-from traceback import format_stack
-
 import logging
 import logging.config
 from pathlib import Path
@@ -81,10 +79,10 @@ _ocrdLevel2pythonLevel = {
 
 def tf_disable_interactive_logs():
     try:
-        from os import environ
+        from os import environ  # pylint: disable=import-outside-toplevel
         # This env variable must be set before importing from Keras
         environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
-        from tensorflow.keras.utils import disable_interactive_logging
+        from tensorflow.keras.utils import disable_interactive_logging  # pylint: disable=import-outside-toplevel
         # Enabled interactive logging throws an exception
         # due to a call of sys.stdout.flush()
         disable_interactive_logging()
@@ -143,21 +141,21 @@ def get_logging_config_files():
 
 def initLogging(builtin_only=False, force_reinit=False, silent=not config.OCRD_LOGGING_DEBUG):
     """
-    Reset ``ocrd`` logger, read logging configuration if exists, otherwise use basicConfig
+    Reset ``ocrd`` logger, read logging configuration if exists, otherwise use :py:meth:`logging.basicConfig`
 
-    initLogging is to be called by OCR-D/core once, i.e.
+    This is to be called by OCR-D/core only once, i.e.
     - for the ``ocrd`` CLI
     - for the processor wrapper methods
 
     Other processes that use OCR-D/core as a library can, but do not have to, use this functionality.
 
     Keyword Args:
-        - builtin_only (bool, False): Whether to search for logging configuration
-              on-disk (``False``) or only use the
-              hard-coded config (``True``). For testing
-        - force_reinit (bool, False): Whether to ignore the module-level
-              ``_initialized_flag``. For testing only.
-        - silent (bool, True): Whether to log logging behavior by printing to stderr
+        - builtin_only (bool): Whether to search for logging configuration
+              on-disk (``False``) or only use the hard-coded config (``True``).
+              For testing
+        - force_reinit (bool): Whether to ignore the module-level ``_initialized_flag``.
+              For testing only
+        - silent (bool): Whether to log logging behavior by printing to stderr
     """
     global _initialized_flag
     if _initialized_flag and not force_reinit:
@@ -212,11 +210,13 @@ def disableLogging(silent=not config.OCRD_LOGGING_DEBUG):
     # logging.basicConfig(level=logging.CRITICAL)
     # logging.disable(logging.ERROR)
     # remove all handlers for the ocrd logger
-    for logger_name in ROOT_OCRD_LOGGERS:
+    for logger_name in ROOT_OCRD_LOGGERS + ['']:
        for handler in logging.getLogger(logger_name).handlers[:]:
            logging.getLogger(logger_name).removeHandler(handler)
    for logger_name in LOGGING_DEFAULTS:
        logging.getLogger(logger_name).setLevel(logging.NOTSET)
+    # Python default log level is WARNING
+    logging.root.setLevel(logging.WARNING)
 
 # Initializing stream handlers at module level
 # would cause message output in all runtime contexts,
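With the root logger ('' is its name) appended to the handler-stripping loop and the root level reset to WARNING, an initLogging/disableLogging cycle now restores Python's default logging state. A sketch:

    import logging
    from ocrd_utils import initLogging, disableLogging

    initLogging()     # installs exactly one handler (root or ``ocrd`` logger)
    logging.getLogger('ocrd.foo').info("formatted per OCR-D conventions")
    disableLogging()  # strips handlers, now including the root logger's
    assert logging.root.level == logging.WARNING  # Python default restored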
ocrd_utils/os.py CHANGED
@@ -71,9 +71,8 @@ def unzip_file_to_dir(path_to_zip, output_directory):
     """
     Extract a ZIP archive to a directory
     """
-    z = ZipFile(path_to_zip, 'r')
-    z.extractall(output_directory)
-    z.close()
+    with ZipFile(path_to_zip, 'r') as z:
+        z.extractall(output_directory)
 
 @lru_cache()
 def get_ocrd_tool_json(executable):
@@ -87,7 +86,7 @@ def get_ocrd_tool_json(executable):
         ocrd_tool = ocrd_all_tool[executable]
     except (JSONDecodeError, OSError, KeyError):
         try:
-            ocrd_tool = loads(run([executable, '--dump-json'], stdout=PIPE).stdout)
+            ocrd_tool = loads(run([executable, '--dump-json'], stdout=PIPE, check=False).stdout)
         except (JSONDecodeError, OSError) as e:
             getLogger('ocrd.utils.get_ocrd_tool_json').error(f'{executable} --dump-json produced invalid JSON: {e}')
     if 'resource_locations' not in ocrd_tool:
@@ -102,7 +101,7 @@ def get_moduledir(executable):
         moduledir = ocrd_all_moduledir[executable]
     except (JSONDecodeError, OSError, KeyError):
         try:
-            moduledir = run([executable, '--dump-module-dir'], encoding='utf-8', stdout=PIPE).stdout.rstrip('\n')
+            moduledir = run([executable, '--dump-module-dir'], encoding='utf-8', stdout=PIPE, check=False).stdout.rstrip('\n')
         except (JSONDecodeError, OSError) as e:
             getLogger('ocrd.utils.get_moduledir').error(f'{executable} --dump-module-dir failed: {e}')
     return moduledir
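Both run() calls now pass check=False explicitly. That is already the default (a non-zero exit status does not raise CalledProcessError; the caller inspects stdout itself), so this silences pylint's subprocess-run-check warning without changing semantics. For illustration (on a POSIX system):

    from subprocess import run, PIPE

    result = run(["false"], stdout=PIPE, check=False)
    print(result.returncode)  # 1 -- no CalledProcessError raised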
ocrd_utils/str.py CHANGED
@@ -4,9 +4,9 @@ Utility functions for strings, paths and URL.
 
 import re
 import json
-from typing import List, Union
+from typing import List
 from .constants import REGEX_FILE_ID, SPARKLINE_CHARS
-from .deprecate import deprecation_warning
+#from .deprecate import deprecation_warning
 from deprecated import deprecated
 from warnings import warn
 from numpy import array_split
@@ -21,6 +21,7 @@ __all__ = [
     'make_file_id',
     'make_xml_id',
     'nth_url_segment',
+    'parse_json_file_with_comments',
     'parse_json_string_or_file',
     'parse_json_string_with_comments',
     'remove_non_path_from_url',
@@ -162,6 +163,13 @@ def is_string(val):
     return isinstance(val, str)
 
 
+def parse_json_file_with_comments(val):
+    """
+    Parse a file of JSON interspersed with #-prefixed full-line comments
+    """
+    with open(val, 'r', encoding='utf-8') as inputf:
+        return parse_json_string_with_comments(inputf.read())
+
 def parse_json_string_with_comments(val):
     """
     Parse a string of JSON interspersed with #-prefixed full-line comments
@@ -265,4 +273,3 @@ def sparkline(values : List[int]) -> str:
     # normalize to 0..1 and convert to index in SPARKLINE_CHARS
     mapped = [int(x / max_value * max_mapping) for x in values]
     return ''.join(SPARKLINE_CHARS[x] for x in mapped)
-
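The new helper simply reads a file and defers to the existing parse_json_string_with_comments. A usage sketch with a hypothetical params.json:

    from ocrd_utils import parse_json_file_with_comments

    # Contents of the hypothetical params.json:
    #     # resolution in DPI
    #     {"dpi": 300}
    params = parse_json_file_with_comments("params.json")
    assert params == {"dpi": 300}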