ocrd 3.0.0a2__py3-none-any.whl → 3.0.0b2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ocrd/cli/__init__.py +34 -26
- ocrd/cli/bashlib.py +32 -18
- ocrd/cli/ocrd_tool.py +7 -5
- ocrd/cli/workspace.py +10 -8
- ocrd/decorators/__init__.py +13 -7
- ocrd/decorators/ocrd_cli_options.py +1 -1
- ocrd/lib.bash +3 -0
- ocrd/mets_server.py +3 -4
- ocrd/processor/__init__.py +1 -1
- ocrd/processor/base.py +421 -98
- ocrd/processor/builtin/dummy_processor.py +4 -11
- ocrd/processor/helpers.py +24 -161
- ocrd/processor/ocrd_page_result.py +3 -3
- ocrd/resolver.py +0 -3
- ocrd/resource_manager.py +9 -5
- ocrd/workspace.py +10 -11
- ocrd/workspace_backup.py +1 -1
- {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/METADATA +32 -10
- {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/RECORD +49 -48
- {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/WHEEL +1 -1
- ocrd_modelfactory/__init__.py +1 -1
- ocrd_models/constants.py +0 -1
- ocrd_models/ocrd_exif.py +2 -2
- ocrd_models/ocrd_file.py +2 -2
- ocrd_models/ocrd_mets.py +22 -22
- ocrd_models/ocrd_page.py +0 -1
- ocrd_models/ocrd_xml_base.py +2 -2
- ocrd_network/cli/client.py +134 -30
- ocrd_network/client.py +53 -27
- ocrd_network/client_utils.py +101 -0
- ocrd_network/processing_server.py +1 -1
- ocrd_network/runtime_data/deployer.py +12 -3
- ocrd_network/server_utils.py +12 -10
- ocrd_utils/__init__.py +2 -0
- ocrd_utils/config.py +31 -2
- ocrd_utils/image.py +25 -25
- ocrd_utils/logging.py +20 -20
- ocrd_utils/os.py +4 -5
- ocrd_utils/str.py +10 -3
- ocrd_validators/json_validator.py +1 -3
- ocrd_validators/ocrd_tool_validator.py +2 -2
- ocrd_validators/page_validator.py +56 -56
- ocrd_validators/parameter_validator.py +2 -2
- ocrd_validators/resource_list_validator.py +4 -3
- ocrd_validators/workspace_validator.py +21 -21
- ocrd_validators/xsd_validator.py +1 -1
- {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/LICENSE +0 -0
- {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/entry_points.txt +0 -0
- {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/top_level.txt +0 -0
ocrd_network/client.py
CHANGED
@@ -1,37 +1,63 @@
-from
-from requests import post as requests_post
+from typing import Optional
 from ocrd_utils import config, getLogger, LOG_FORMAT
+from .client_utils import (
+    get_ps_deployed_processors,
+    get_ps_deployed_processor_ocrd_tool,
+    get_ps_processing_job_log,
+    get_ps_processing_job_status,
+    get_ps_workflow_job_status,
+    poll_job_status_till_timeout_fail_or_success,
+    poll_wf_status_till_timeout_fail_or_success,
+    post_ps_processing_request,
+    post_ps_workflow_request,
+    verify_server_protocol
+)

-from .constants import NETWORK_PROTOCOLS

-
-# TODO: This is just a conceptual implementation and first try to
-# trigger further discussions on how this should look like.
 class Client:
     def __init__(
         self,
-        server_addr_processing: str
-
-
+        server_addr_processing: Optional[str],
+        timeout: int = config.OCRD_NETWORK_CLIENT_POLLING_TIMEOUT,
+        wait: int = config.OCRD_NETWORK_CLIENT_POLLING_SLEEP
     ):
         self.log = getLogger(f"ocrd_network.client")
+        if not server_addr_processing:
+            server_addr_processing = config.OCRD_NETWORK_SERVER_ADDR_PROCESSING
         self.server_addr_processing = server_addr_processing
-        self.server_addr_workflow = server_addr_workflow
-        self.server_addr_workspace = server_addr_workspace
-
-    def send_processing_request(self, processor_name: str, req_params: dict):
         verify_server_protocol(self.server_addr_processing)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        self.polling_timeout = timeout
+        self.polling_wait = wait
+        self.polling_tries = int(timeout / wait)
+
+    def check_deployed_processors(self):
+        return get_ps_deployed_processors(ps_server_host=self.server_addr_processing)
+
+    def check_deployed_processor_ocrd_tool(self, processor_name: str):
+        return get_ps_deployed_processor_ocrd_tool(
+            ps_server_host=self.server_addr_processing, processor_name=processor_name)
+
+    def check_job_log(self, job_id: str):
+        return get_ps_processing_job_log(self.server_addr_processing, processing_job_id=job_id)
+
+    def check_job_status(self, job_id: str):
+        return get_ps_processing_job_status(self.server_addr_processing, processing_job_id=job_id)
+
+    def check_workflow_status(self, workflow_job_id: str):
+        return get_ps_workflow_job_status(self.server_addr_processing, workflow_job_id=workflow_job_id)
+
+    def poll_job_status(self, job_id: str) -> str:
+        return poll_job_status_till_timeout_fail_or_success(
+            ps_server_host=self.server_addr_processing, job_id=job_id, tries=self.polling_tries, wait=self.polling_wait)
+
+    def poll_workflow_status(self, job_id: str) -> str:
+        return poll_wf_status_till_timeout_fail_or_success(
+            ps_server_host=self.server_addr_processing, job_id=job_id, tries=self.polling_tries, wait=self.polling_wait)
+
+    def send_processing_job_request(self, processor_name: str, req_params: dict) -> str:
+        return post_ps_processing_request(
+            ps_server_host=self.server_addr_processing, processor=processor_name, job_input=req_params)
+
+    def send_workflow_job_request(self, path_to_wf: str, path_to_mets: str):
+        return post_ps_workflow_request(
+            ps_server_host=self.server_addr_processing, path_to_wf=path_to_wf, path_to_mets=path_to_mets)
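Note: with this change the Client only talks to the Processing Server and delegates all HTTP handling to the new client_utils helpers. A minimal usage sketch follows; the server address, processor name and job-input dict are made up for illustration, and the exact job-input schema is defined by the Processing Server API rather than by this diff:

    from ocrd_network.client import Client

    # an empty/None address falls back to OCRD_NETWORK_SERVER_ADDR_PROCESSING
    client = Client("http://localhost:8080", timeout=600, wait=10)

    # submit a processing job, then block until success, failure or timeout
    job_id = client.send_processing_job_request(
        "ocrd-dummy",                          # hypothetical processor name
        {"path_to_mets": "/data/mets.xml"})    # hypothetical job input
    print(client.poll_job_status(job_id))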
ocrd_network/client_utils.py
ADDED
@@ -0,0 +1,101 @@
+from requests import get as request_get, post as request_post
+from time import sleep
+from .constants import JobState, NETWORK_PROTOCOLS
+
+
+def _poll_endpoint_status(ps_server_host: str, job_id: str, job_type: str, tries: int, wait: int):
+    if job_type not in ["workflow", "processor"]:
+        raise ValueError(f"Unknown job type '{job_type}', expected 'workflow' or 'processor'")
+    job_state = JobState.unset
+    while tries > 0:
+        sleep(wait)
+        if job_type == "processor":
+            job_state = get_ps_processing_job_status(ps_server_host, job_id)
+        if job_type == "workflow":
+            job_state = get_ps_workflow_job_status(ps_server_host, job_id)
+        if job_state == JobState.success or job_state == JobState.failed:
+            break
+        tries -= 1
+    return job_state
+
+
+def poll_job_status_till_timeout_fail_or_success(ps_server_host: str, job_id: str, tries: int, wait: int) -> JobState:
+    return _poll_endpoint_status(ps_server_host, job_id, "processor", tries, wait)
+
+
+def poll_wf_status_till_timeout_fail_or_success(ps_server_host: str, job_id: str, tries: int, wait: int) -> JobState:
+    return _poll_endpoint_status(ps_server_host, job_id, "workflow", tries, wait)
+
+
+def get_ps_deployed_processors(ps_server_host: str):
+    request_url = f"{ps_server_host}/processor"
+    response = request_get(url=request_url, headers={"accept": "application/json; charset=utf-8"})
+    assert response.status_code == 200, f"Processing server: {request_url}, {response.status_code}"
+    return response.json()
+
+
+def get_ps_deployed_processor_ocrd_tool(ps_server_host: str, processor_name: str):
+    request_url = f"{ps_server_host}/processor/info/{processor_name}"
+    response = request_get(url=request_url, headers={"accept": "application/json; charset=utf-8"})
+    assert response.status_code == 200, f"Processing server: {request_url}, {response.status_code}"
+    return response.json()
+
+
+def get_ps_processing_job_log(ps_server_host: str, processing_job_id: str):
+    request_url = f"{ps_server_host}/processor/log/{processing_job_id}"
+    response = request_get(url=request_url, headers={"accept": "application/json; charset=utf-8"})
+    return response
+
+
+def get_ps_processing_job_status(ps_server_host: str, processing_job_id: str) -> str:
+    request_url = f"{ps_server_host}/processor/job/{processing_job_id}"
+    response = request_get(url=request_url, headers={"accept": "application/json; charset=utf-8"})
+    assert response.status_code == 200, f"Processing server: {request_url}, {response.status_code}"
+    job_state = response.json()["state"]
+    assert job_state
+    return job_state
+
+
+def get_ps_workflow_job_status(ps_server_host: str, workflow_job_id: str) -> str:
+    request_url = f"{ps_server_host}/workflow/job-simple/{workflow_job_id}"
+    response = request_get(url=request_url, headers={"accept": "application/json; charset=utf-8"})
+    assert response.status_code == 200, f"Processing server: {request_url}, {response.status_code}"
+    job_state = response.json()["state"]
+    assert job_state
+    return job_state
+
+
+def post_ps_processing_request(ps_server_host: str, processor: str, job_input: dict) -> str:
+    request_url = f"{ps_server_host}/processor/run/{processor}"
+    response = request_post(
+        url=request_url,
+        headers={"accept": "application/json; charset=utf-8"},
+        json=job_input
+    )
+    assert response.status_code == 200, f"Processing server: {request_url}, {response.status_code}"
+    processing_job_id = response.json()["job_id"]
+    assert processing_job_id
+    return processing_job_id
+
+
+# TODO: Can be extended to include other parameters such as page_wise
+def post_ps_workflow_request(ps_server_host: str, path_to_wf: str, path_to_mets: str) -> str:
+    request_url = f"{ps_server_host}/workflow/run?mets_path={path_to_mets}&page_wise=True"
+    response = request_post(
+        url=request_url,
+        headers={"accept": "application/json; charset=utf-8"},
+        files={"workflow": open(path_to_wf, "rb")}
+    )
+    # print(response.json())
+    # print(response.__dict__)
+    assert response.status_code == 200, f"Processing server: {request_url}, {response.status_code}"
+    wf_job_id = response.json()["job_id"]
+    assert wf_job_id
+    return wf_job_id
+
+
+def verify_server_protocol(address: str):
+    for protocol in NETWORK_PROTOCOLS:
+        if address.startswith(protocol):
+            return
+    raise ValueError(f"Wrong/Missing protocol in the server address: {address}, must be one of: {NETWORK_PROTOCOLS}")
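The helpers above can also be used directly, without the Client wrapper. A rough sketch of the intended call sequence against a Processing Server (the address and file paths are hypothetical):

    from ocrd_network.client_utils import (
        poll_wf_status_till_timeout_fail_or_success, post_ps_workflow_request, verify_server_protocol)

    ps = "http://localhost:8080"   # hypothetical Processing Server address
    verify_server_protocol(ps)     # raises ValueError unless prefixed by one of NETWORK_PROTOCOLS
    wf_job_id = post_ps_workflow_request(ps, "/data/workflow.txt", "/data/mets.xml")
    # 120 tries at 30 s each corresponds to the default one-hour client timeout
    state = poll_wf_status_till_timeout_fail_or_success(ps, wf_job_id, tries=120, wait=30)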
ocrd_network/processing_server.py
CHANGED
@@ -651,7 +651,7 @@ class ProcessingServer(FastAPI):
         # There is no caching on the Processing Server side
         processor_names_list = self.deployer.find_matching_network_agents(
             docker_only=False, native_only=False, worker_only=False, server_only=False,
-            str_names_only=True, unique_only=True
+            str_names_only=True, unique_only=True, sort=True
         )
         return processor_names_list

ocrd_network/runtime_data/deployer.py
CHANGED
@@ -35,7 +35,7 @@ class Deployer:
     # TODO: Reconsider this.
     def find_matching_network_agents(
         self, worker_only: bool = False, server_only: bool = False, docker_only: bool = False,
-        native_only: bool = False, str_names_only: bool = False, unique_only: bool = False
+        native_only: bool = False, str_names_only: bool = False, unique_only: bool = False, sort: bool = False
     ) -> Union[List[str], List[object]]:
         """Finds and returns a list of matching data objects of type:
         `DataProcessingWorker` and `DataProcessorServer`.
@@ -46,6 +46,7 @@ class Deployer:
         :py:attr:`native_only` match only native network agents (DataProcessingWorker and DataProcessorServer)
         :py:attr:`str_names_only` returns the processor_name filed instead of the Data* object
         :py:attr:`unique_only` remove duplicate names from the matches
+        :py:attr:`sort` sort the result

         `worker_only` and `server_only` are mutually exclusive to each other
         `docker_only` and `native_only` are mutually exclusive to each other
@@ -64,6 +65,10 @@ class Deployer:
             msg = f"Value 'unique_only' is allowed only together with 'str_names_only'"
             self.log.exception(msg)
             raise ValueError(msg)
+        if sort and not str_names_only:
+            msg = f"Value 'sort' is allowed only together with 'str_names_only'"
+            self.log.exception(msg)
+            raise ValueError(msg)

         # Find all matching objects of type DataProcessingWorker or DataProcessorServer
         matched_objects = []
@@ -88,8 +93,12 @@ class Deployer:
         matched_names = [match.processor_name for match in matched_objects]
         if not unique_only:
             return matched_names
-
-
+        list_matched = list(dict.fromkeys(matched_names))
+        if not sort:
+            # Removes any duplicate entries from matched names
+            return list_matched
+        list_matched.sort()
+        return list_matched

     def resolve_processor_server_url(self, processor_name) -> str:
         processor_server_url = ''
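The new sort flag builds on the dict.fromkeys idiom already used for unique_only, which removes duplicates while keeping first-occurrence order (unlike set). An illustration of what the two branches return, using made-up processor names:

    matched_names = ["ocrd-b", "ocrd-a", "ocrd-b", "ocrd-c"]

    # unique_only=True, sort=False: first-occurrence order is preserved
    list(dict.fromkeys(matched_names))    # ['ocrd-b', 'ocrd-a', 'ocrd-c']

    # unique_only=True, sort=True: deduplicated, then sorted alphabetically
    sorted(dict.fromkeys(matched_names))  # ['ocrd-a', 'ocrd-b', 'ocrd-c']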
ocrd_network/server_utils.py
CHANGED
@@ -125,14 +125,13 @@ def request_processor_server_tool_json(logger: Logger, processor_server_base_url
             urljoin(base=processor_server_base_url, url="info"),
             headers={"Content-Type": "application/json"}
         )
-        if response.status_code != 200:
-            message = f"Failed to retrieve tool json from: {processor_server_base_url}, code: {response.status_code}"
-            raise_http_exception(logger, status.HTTP_404_NOT_FOUND, message)
-        return response.json()
     except Exception as error:
         message = f"Failed to retrieve ocrd tool json from: {processor_server_base_url}"
         raise_http_exception(logger, status.HTTP_404_NOT_FOUND, message, error)
-
+    if response.status_code != 200:
+        message = f"Failed to retrieve tool json from: {processor_server_base_url}, code: {response.status_code}"
+        raise_http_exception(logger, status.HTTP_404_NOT_FOUND, message)
+    return response.json()

 async def forward_job_to_processor_server(
     logger: Logger, job_input: PYJobInput, processor_server_base_url: str
@@ -193,11 +192,14 @@ def parse_workflow_tasks(logger: Logger, workflow_content: str) -> List[Processo


 def raise_http_exception(logger: Logger, status_code: int, message: str, error: Exception = None) -> None:
-
+    if error:
+        message = f"{message} {error}"
+    logger.exception(f"{message}")
     raise HTTPException(status_code=status_code, detail=message)


 def validate_job_input(logger: Logger, processor_name: str, ocrd_tool: dict, job_input: PYJobInput) -> None:
+    # logger.warning(f"Job input: {job_input}")
     if bool(job_input.path_to_mets) == bool(job_input.workspace_id):
         message = (
             "Wrong processing job input format. "
@@ -210,12 +212,12 @@ def validate_job_input(logger: Logger, processor_name: str, ocrd_tool: dict, job
         raise_http_exception(logger, status.HTTP_404_NOT_FOUND, message)
     try:
         report = ParameterValidator(ocrd_tool).validate(dict(job_input.parameters))
-        if not report.is_valid:
-            message = f"Failed to validate processing job input against the tool json of processor: {processor_name}\n"
-            raise_http_exception(logger, status.HTTP_404_BAD_REQUEST, message + report.errors)
     except Exception as error:
         message = f"Failed to validate processing job input against the ocrd tool json of processor: {processor_name}"
-        raise_http_exception(logger, status.
+        raise_http_exception(logger, status.HTTP_400_BAD_REQUEST, message, error)
+    if report and not report.is_valid:
+        message = f"Failed to validate processing job input against the tool json of processor: {processor_name}\n"
+        raise_http_exception(logger, status.HTTP_400_BAD_REQUEST, f"{message}{report.errors}")


 def validate_workflow(logger: Logger, workflow: str) -> None:
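The main fix here is control flow: the status-code check and the return now sit outside the try block, so the HTTPException raised by raise_http_exception is no longer swallowed and re-wrapped by the surrounding except. A minimal sketch of the corrected pattern, not the actual server code (do_request, url and fail are hypothetical placeholders):

    def fetch_json(do_request, url, fail):
        try:
            response = do_request(url)       # only the network call can raise here
        except Exception as error:
            fail(f"request to {url} failed", error)
        if response.status_code != 200:      # checked outside the try block,
            fail(f"unexpected status {response.status_code} from {url}")
        return response.json()               # so fail()'s own exception propagates unchanged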
ocrd_utils/__init__.py
CHANGED
@@ -75,6 +75,7 @@ Utility functions and constants usable in various circumstances.
 :py:func:`concat_padded`,
 :py:func:`nth_url_segment`,
 :py:func:`remove_non_path_from_url`,
+:py:func:`parse_json_file_with_comments`,
 :py:func:`parse_json_string_with_comments`,
 :py:func:`parse_json_string_or_file`,
 :py:func:`set_json_key_value_overrides`,
@@ -204,6 +205,7 @@ from .str import (
     make_xml_id,
     nth_url_segment,
     partition_list,
+    parse_json_file_with_comments,
     parse_json_string_or_file,
     parse_json_string_with_comments,
     sparkline,
ocrd_utils/config.py
CHANGED
@@ -12,8 +12,12 @@ from pathlib import Path
 from tempfile import gettempdir
 from textwrap import fill, indent

-
-
+
+def _validator_boolean(val):
+    return isinstance(val, bool) or str.lower(val) in ('true', 'false', '0', '1')
+
+def _parser_boolean(val):
+    return bool(val) if isinstance(val, (int, bool)) else str.lower(val) in ('true', '1')

 class OcrdEnvVariable():

@@ -116,6 +120,16 @@ config.add('OCRD_MAX_PROCESSOR_CACHE',
     parser=int,
     default=(True, 128))

+config.add('OCRD_MAX_PARALLEL_PAGES',
+    description="Maximum number of processor threads for page-parallel processing (within each Processor's selected page range, independent of the number of Processing Workers or Processor Servers). If set >1, then a METS Server must be used for METS synchronisation.",
+    parser=int,
+    default=(True, 1))
+
+config.add('OCRD_PROCESSING_PAGE_TIMEOUT',
+    description="Timeout in seconds for processing a single page. If set >0, when exceeded, the same as OCRD_MISSING_OUTPUT applies.",
+    parser=int,
+    default=(True, 0))
+
 config.add("OCRD_PROFILE",
     description="""\
 Whether to enable gathering runtime statistics
@@ -180,6 +194,11 @@ How to deal with missing output files (for some fileGrp/pageId) during processin
     validator=lambda val: val in ['SKIP', 'COPY', 'ABORT'],
     parser=str)

+config.add("OCRD_MAX_MISSING_OUTPUTS",
+    description="Maximal rate of skipped/fallback pages among all processed pages before aborting (decimal fraction, ignored if negative).",
+    default=(True, 0.1),
+    parser=float)
+
 config.add("OCRD_EXISTING_OUTPUT",
     description="""\
 How to deal with already existing output files (for some fileGrp/pageId) during processing:
@@ -197,6 +216,16 @@ config.add("OCRD_NETWORK_SERVER_ADDR_PROCESSING",
     description="Default address of Processing Server to connect to (for `ocrd network client processing`).",
     default=(True, ''))

+config.add("OCRD_NETWORK_CLIENT_POLLING_SLEEP",
+    description="How many seconds to sleep before trying again.",
+    parser=int,
+    default=(True, 30))
+
+config.add("OCRD_NETWORK_CLIENT_POLLING_TIMEOUT",
+    description="Timeout for a blocking ocrd network client (in seconds).",
+    parser=int,
+    default=(True, 3600))
+
 config.add("OCRD_NETWORK_SERVER_ADDR_WORKFLOW",
     description="Default address of Workflow Server to connect to (for `ocrd network client workflow`).",
     default=(True, ''))
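The new variables are registered through config.add like the existing ones, so they can be overridden via the environment and read as attributes of ocrd_utils.config. A short sketch, assuming the defaults declared above:

    from ocrd_utils import config

    print(config.OCRD_MAX_PARALLEL_PAGES)              # 1 (no page-parallelism)
    print(config.OCRD_PROCESSING_PAGE_TIMEOUT)         # 0 (no per-page timeout)
    print(config.OCRD_MAX_MISSING_OUTPUTS)             # 0.1
    print(config.OCRD_NETWORK_CLIENT_POLLING_SLEEP)    # 30
    print(config.OCRD_NETWORK_CLIENT_POLLING_TIMEOUT)  # 3600
    # e.g. `export OCRD_MAX_PARALLEL_PAGES=4` in the shell enables page-parallel
    # processing (which then requires a METS Server for METS synchronisation)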
ocrd_utils/image.py
CHANGED
@@ -65,10 +65,10 @@ def adjust_canvas_to_transposition(size, method):

     Return a numpy array of the enlarged width and height.
     """
-    if method in [Image.ROTATE_90,
-                  Image.ROTATE_270,
-                  Image.TRANSPOSE,
-                  Image.TRANSVERSE]:
+    if method in [Image.Transpose.ROTATE_90,
+                  Image.Transpose.ROTATE_270,
+                  Image.Transpose.TRANSPOSE,
+                  Image.Transpose.TRANSVERSE]:
         size = size[::-1]
     return size

@@ -348,26 +348,26 @@ def transpose_coordinates(transform, method, orig=np.array([0, 0])):
     calculate the affine coordinate transform corresponding to the composition
     of both transformations, which is respectively:

-    - ``PIL.Image.FLIP_LEFT_RIGHT``:
+    - ``PIL.Image.Transpose.FLIP_LEFT_RIGHT``:
       entails translation to the center, followed by pure reflection
       about the y-axis, and subsequent translation back
-    - ``PIL.Image.FLIP_TOP_BOTTOM``:
+    - ``PIL.Image.Transpose.FLIP_TOP_BOTTOM``:
       entails translation to the center, followed by pure reflection
       about the x-axis, and subsequent translation back
-    - ``PIL.Image.ROTATE_180``:
+    - ``PIL.Image.Transpose.ROTATE_180``:
       entails translation to the center, followed by pure reflection
       about the origin, and subsequent translation back
-    - ``PIL.Image.ROTATE_90``:
+    - ``PIL.Image.Transpose.ROTATE_90``:
       entails translation to the center, followed by pure rotation
       by 90° counter-clockwise, and subsequent translation back
-    - ``PIL.Image.ROTATE_270``:
+    - ``PIL.Image.Transpose.ROTATE_270``:
      entails translation to the center, followed by pure rotation
       by 270° counter-clockwise, and subsequent translation back
-    - ``PIL.Image.TRANSPOSE``:
+    - ``PIL.Image.Transpose.TRANSPOSE``:
       entails translation to the center, followed by pure rotation
       by 90° counter-clockwise and pure reflection about the x-axis,
       and subsequent translation back
-    - ``PIL.Image.TRANSVERSE``:
+    - ``PIL.Image.Transpose.TRANSVERSE``:
       entails translation to the center, followed by pure rotation
       by 90° counter-clockwise and pure reflection about the y-axis,
       and subsequent translation back
@@ -388,13 +388,13 @@ def transpose_coordinates(transform, method, orig=np.array([0, 0])):
                       [0, 0, 1]])
     transform = shift_coordinates(transform, -orig)
     operations = {
-        Image.FLIP_LEFT_RIGHT: [refly],
-        Image.FLIP_TOP_BOTTOM: [reflx],
-        Image.ROTATE_180: [reflx, refly],
-        Image.ROTATE_90: [rot90],
-        Image.ROTATE_270: [rot90, reflx, refly],
-        Image.TRANSPOSE: [rot90, reflx],
-        Image.TRANSVERSE: [rot90, refly]
+        Image.Transpose.FLIP_LEFT_RIGHT: [refly],
+        Image.Transpose.FLIP_TOP_BOTTOM: [reflx],
+        Image.Transpose.ROTATE_180: [reflx, refly],
+        Image.Transpose.ROTATE_90: [rot90],
+        Image.Transpose.ROTATE_270: [rot90, reflx, refly],
+        Image.Transpose.TRANSPOSE: [rot90, reflx],
+        Image.Transpose.TRANSVERSE: [rot90, refly]
     }.get(method) # no default
     for operation in operations:
         transform = np.dot(operation, transform)
@@ -411,29 +411,29 @@ def transpose_image(image, method):
     Given a PIL.Image ``image`` and a transposition mode ``method``,
     apply the respective operation:

-    - ``PIL.Image.FLIP_LEFT_RIGHT``:
+    - ``PIL.Image.Transpose.FLIP_LEFT_RIGHT``:
       all pixels get mirrored at half the width of the image
-    - ``PIL.Image.FLIP_TOP_BOTTOM``:
+    - ``PIL.Image.Transpose.FLIP_TOP_BOTTOM``:
       all pixels get mirrored at half the height of the image
-    - ``PIL.Image.ROTATE_180``:
+    - ``PIL.Image.Transpose.ROTATE_180``:
       all pixels get mirrored at both, the width and half the height
       of the image,
       i.e. the image gets rotated by 180° counter-clockwise
-    - ``PIL.Image.ROTATE_90``:
+    - ``PIL.Image.Transpose.ROTATE_90``:
       rows become columns (but counted from the right) and
       columns become rows,
       i.e. the image gets rotated by 90° counter-clockwise;
       width becomes height and vice versa
-    - ``PIL.Image.ROTATE_270``:
+    - ``PIL.Image.Transpose.ROTATE_270``:
       rows become columns and
       columns become rows (but counted from the bottom),
       i.e. the image gets rotated by 270° counter-clockwise;
       width becomes height and vice versa
-    - ``PIL.Image.TRANSPOSE``:
+    - ``PIL.Image.Transpose.TRANSPOSE``:
      rows become columns and vice versa,
       i.e. all pixels get mirrored at the main diagonal;
       width becomes height and vice versa
-    - ``PIL.Image.TRANSVERSE``:
+    - ``PIL.Image.Transpose.TRANSVERSE``:
       rows become columns (but counted from the right) and
       columns become rows (but counted from the bottom),
       i.e. all pixels get mirrored at the opposite diagonal;
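These edits track Pillow's move of the transposition constants into the Image.Transpose enum (the module-level names were deprecated in Pillow 9.1 and removed in Pillow 10), so behaviour is unchanged apart from the spelling. For example, assuming a Pillow version that provides the enum:

    from PIL import Image
    from ocrd_utils.image import transpose_image

    img = Image.new("L", (200, 100))
    # 90° counter-clockwise rotation: width and height are swapped
    rotated = transpose_image(img, Image.Transpose.ROTATE_90)
    assert rotated.size == (100, 200)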
ocrd_utils/logging.py
CHANGED
@@ -5,9 +5,9 @@ By default: Log with lastResort logger, usually STDERR.

 Logging can be overridden either programmatically in code using the library or by creating one or more of

--
--
--
+- ``/etc/ocrd_logging.py``
+- ``$HOME/ocrd_logging.py``
+- ``$PWD/ocrd_logging.py``

 These files will be executed in the context of ocrd/ocrd_logging.py, with `logging` global set.

@@ -16,20 +16,18 @@ Changes as of 2023-08-20:
 - Try to be less intrusive with OCR-D specific logging conventions to
   make it easier and less surprising to define logging behavior when
   using OCR-D/core as a library
-- Change setOverrideLogLevel to only override the log level of the ``ocrd``
+- Change :py:meth:`setOverrideLogLevel` to only override the log level of the ``ocrd``
   logger and its descendants
-- initLogging will set exactly one handler, for the root logger or for the
+- :py:meth:`initLogging` will set exactly one handler, for the root logger or for the
   ``ocrd`` logger.
 - Child loggers should propagate to the ancestor logging (default
-  behavior of the logging library - no more PropagationShyLogger)
-- disableLogging only removes any handlers from the ``ocrd`` logger
+  behavior of the logging library - no more ``PropagationShyLogger``)
+- :py:meth:`disableLogging` only removes any handlers from the ``ocrd`` logger
 """
 # pylint: disable=no-member

 from __future__ import absolute_import

-from traceback import format_stack
-
 import logging
 import logging.config
 from pathlib import Path
@@ -81,10 +79,10 @@ _ocrdLevel2pythonLevel = {

 def tf_disable_interactive_logs():
     try:
-        from os import environ
+        from os import environ  # pylint: disable=import-outside-toplevel
         # This env variable must be set before importing from Keras
         environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
-        from tensorflow.keras.utils import disable_interactive_logging
+        from tensorflow.keras.utils import disable_interactive_logging  # pylint: disable=import-outside-toplevel
         # Enabled interactive logging throws an exception
         # due to a call of sys.stdout.flush()
         disable_interactive_logging()
@@ -143,21 +141,21 @@ def get_logging_config_files():

 def initLogging(builtin_only=False, force_reinit=False, silent=not config.OCRD_LOGGING_DEBUG):
     """
-    Reset ``ocrd`` logger, read logging configuration if exists, otherwise use basicConfig
+    Reset ``ocrd`` logger, read logging configuration if exists, otherwise use :py:meth:`logging.basicConfig`

-
+    This is to be called by OCR-D/core only once, i.e.
     - for the ``ocrd`` CLI
     - for the processor wrapper methods

     Other processes that use OCR-D/core as a library can, but do not have to, use this functionality.

     Keyword Args:
-        - builtin_only (bool
-
-
-        - force_reinit (bool
-
-        - silent (bool
+        - builtin_only (bool): Whether to search for logging configuration
+          on-disk (``False``) or only use the hard-coded config (``True``).
+          For testing
+        - force_reinit (bool): Whether to ignore the module-level ``_initialized_flag``.
+          For testing only
+        - silent (bool): Whether to log logging behavior by printing to stderr
     """
     global _initialized_flag
     if _initialized_flag and not force_reinit:
@@ -212,11 +210,13 @@ def disableLogging(silent=not config.OCRD_LOGGING_DEBUG):
     # logging.basicConfig(level=logging.CRITICAL)
     # logging.disable(logging.ERROR)
     # remove all handlers for the ocrd logger
-    for logger_name in ROOT_OCRD_LOGGERS:
+    for logger_name in ROOT_OCRD_LOGGERS + ['']:
         for handler in logging.getLogger(logger_name).handlers[:]:
             logging.getLogger(logger_name).removeHandler(handler)
     for logger_name in LOGGING_DEFAULTS:
         logging.getLogger(logger_name).setLevel(logging.NOTSET)
+    # Python default log level is WARNING
+    logging.root.setLevel(logging.WARNING)


 # Initializing stream handlers at module level
 # would cause message output in all runtime contexts,
ocrd_utils/os.py
CHANGED
@@ -71,9 +71,8 @@ def unzip_file_to_dir(path_to_zip, output_directory):
     """
     Extract a ZIP archive to a directory
     """
-
-
-    z.close()
+    with ZipFile(path_to_zip, 'r') as z:
+        z.extractall(output_directory)

 @lru_cache()
 def get_ocrd_tool_json(executable):
@@ -87,7 +86,7 @@ def get_ocrd_tool_json(executable):
         ocrd_tool = ocrd_all_tool[executable]
     except (JSONDecodeError, OSError, KeyError):
         try:
-            ocrd_tool = loads(run([executable, '--dump-json'], stdout=PIPE).stdout)
+            ocrd_tool = loads(run([executable, '--dump-json'], stdout=PIPE, check=False).stdout)
         except (JSONDecodeError, OSError) as e:
             getLogger('ocrd.utils.get_ocrd_tool_json').error(f'{executable} --dump-json produced invalid JSON: {e}')
     if 'resource_locations' not in ocrd_tool:
@@ -102,7 +101,7 @@ def get_moduledir(executable):
         moduledir = ocrd_all_moduledir[executable]
     except (JSONDecodeError, OSError, KeyError):
         try:
-            moduledir = run([executable, '--dump-module-dir'], encoding='utf-8', stdout=PIPE).stdout.rstrip('\n')
+            moduledir = run([executable, '--dump-module-dir'], encoding='utf-8', stdout=PIPE, check=False).stdout.rstrip('\n')
         except (JSONDecodeError, OSError) as e:
             getLogger('ocrd.utils.get_moduledir').error(f'{executable} --dump-module-dir failed: {e}')
     return moduledir
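Two small robustness fixes: the ZipFile is now closed through a context manager even if extraction fails, and check=False documents that a non-zero exit code of the probed executable is tolerated (invalid output is already handled by the surrounding JSON/OSError branches). The same pattern in isolation, with hypothetical file and executable names:

    from subprocess import PIPE, run
    from zipfile import ZipFile

    # the archive gets closed even if extractall() raises
    with ZipFile("example.zip", "r") as z:
        z.extractall("out/")

    # tolerate a non-zero exit code; bad output surfaces as a JSON error later
    result = run(["ocrd-dummy", "--dump-json"], stdout=PIPE, check=False)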
ocrd_utils/str.py
CHANGED
@@ -4,9 +4,9 @@ Utility functions for strings, paths and URL.

 import re
 import json
-from typing import List
+from typing import List
 from .constants import REGEX_FILE_ID, SPARKLINE_CHARS
-from .deprecate import deprecation_warning
+#from .deprecate import deprecation_warning
 from deprecated import deprecated
 from warnings import warn
 from numpy import array_split
@@ -21,6 +21,7 @@ __all__ = [
     'make_file_id',
     'make_xml_id',
     'nth_url_segment',
+    'parse_json_file_with_comments',
     'parse_json_string_or_file',
     'parse_json_string_with_comments',
     'remove_non_path_from_url',
@@ -162,6 +163,13 @@ def is_string(val):
     return isinstance(val, str)


+def parse_json_file_with_comments(val):
+    """
+    Parse a file of JSON interspersed with #-prefixed full-line comments
+    """
+    with open(val, 'r', encoding='utf-8') as inputf:
+        return parse_json_string_with_comments(inputf.read())
+
 def parse_json_string_with_comments(val):
     """
     Parse a string of JSON interspersed with #-prefixed full-line comments
@@ -265,4 +273,3 @@ def sparkline(values : List[int]) -> str:
     # normalize to 0..1 and convert to index in SPARKLINE_CHARS
     mapped = [int(x / max_value * max_mapping) for x in values]
     return ''.join(SPARKLINE_CHARS[x] for x in mapped)
-
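parse_json_file_with_comments is a thin file-reading wrapper around parse_json_string_with_comments, so #-prefixed full-line comments are stripped before the remainder is parsed as JSON. A small usage sketch with a hypothetical parameter file:

    from ocrd_utils import parse_json_file_with_comments

    # params.json (hypothetical content):
    #   # resolution in DPI
    #   {"dpi": 300}
    params = parse_json_file_with_comments("params.json")
    assert params == {"dpi": 300}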