ocrd 3.0.0b7-py3-none-any.whl → 3.0.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ocrd/cli/__init__.py +3 -1
- ocrd/decorators/__init__.py +3 -2
- ocrd/mets_server.py +62 -42
- ocrd/processor/base.py +7 -6
- ocrd/processor/builtin/dummy/ocrd-tool.json +20 -0
- ocrd/processor/builtin/dummy_processor.py +0 -3
- ocrd/processor/builtin/filter_processor.py +108 -0
- ocrd/resource_manager.py +4 -0
- {ocrd-3.0.0b7.dist-info → ocrd-3.0.1.dist-info}/METADATA +2 -1
- {ocrd-3.0.0b7.dist-info → ocrd-3.0.1.dist-info}/RECORD +32 -31
- {ocrd-3.0.0b7.dist-info → ocrd-3.0.1.dist-info}/entry_points.txt +1 -0
- ocrd_modelfactory/__init__.py +7 -1
- ocrd_models/ocrd_exif.py +2 -2
- ocrd_models/ocrd_page.py +22 -3
- ocrd_models/ocrd_page_generateds.py +2813 -1438
- ocrd_models/xpath_functions.py +51 -0
- ocrd_network/cli/client.py +27 -8
- ocrd_network/client.py +9 -6
- ocrd_network/client_utils.py +25 -14
- ocrd_network/processing_server.py +27 -15
- ocrd_network/processing_worker.py +7 -4
- ocrd_network/processor_server.py +2 -1
- ocrd_network/rabbitmq_utils/connector.py +2 -2
- ocrd_network/runtime_data/deployer.py +28 -18
- ocrd_network/server_cache.py +26 -23
- ocrd_network/server_utils.py +40 -4
- ocrd_network/tcp_to_uds_mets_proxy.py +8 -5
- ocrd_network/utils.py +19 -15
- ocrd_utils/config.py +38 -16
- ocrd/processor/concurrent.py +0 -909
- {ocrd-3.0.0b7.dist-info → ocrd-3.0.1.dist-info}/LICENSE +0 -0
- {ocrd-3.0.0b7.dist-info → ocrd-3.0.1.dist-info}/WHEEL +0 -0
- {ocrd-3.0.0b7.dist-info → ocrd-3.0.1.dist-info}/top_level.txt +0 -0
ocrd_network/server_cache.py
CHANGED
@@ -31,7 +31,7 @@ class CacheLockedPages:
         self, workspace_key: str, output_file_grps: List[str], page_ids: List[str]
     ) -> bool:
         if not self.locked_pages.get(workspace_key, None):
-            self.log.
+            self.log.info(f"No entry found in the locked pages cache for workspace key: {workspace_key}")
             return False
         debug_message = f"Caching the received request due to locked output file grp pages."
         for file_group in output_file_grps:
@@ -46,46 +46,45 @@ class CacheLockedPages:

     def get_locked_pages(self, workspace_key: str) -> Dict[str, List[str]]:
         if not self.locked_pages.get(workspace_key, None):
-            self.log.
+            self.log.info(f"No locked pages available for workspace key: {workspace_key}")
             return {}
         return self.locked_pages[workspace_key]

     def lock_pages(self, workspace_key: str, output_file_grps: List[str], page_ids: List[str]) -> None:
         if not self.locked_pages.get(workspace_key, None):
-            self.log.
-            self.log.
+            self.log.info(f"No entry found in the locked pages cache for workspace key: {workspace_key}")
+            self.log.info(f"Creating an entry in the locked pages cache for workspace key: {workspace_key}")
             self.locked_pages[workspace_key] = {}
         for file_group in output_file_grps:
             if file_group not in self.locked_pages[workspace_key]:
-                self.log.
+                self.log.info(f"Creating an empty list for output file grp: {file_group}")
                 self.locked_pages[workspace_key][file_group] = []
             # The page id list is not empty - only some pages are in the request
             if page_ids:
-                self.log.
+                self.log.info(f"Locking pages for '{file_group}': {page_ids}")
                 self.locked_pages[workspace_key][file_group].extend(page_ids)
-                self.log.
-                    f"{self.locked_pages[workspace_key][file_group]}")
+                self.log.info(f"Locked pages of '{file_group}': {self.locked_pages[workspace_key][file_group]}")
             else:
                 # Lock all pages with a single value
-                self.log.
+                self.log.info(f"Locking pages for '{file_group}': {self.placeholder_all_pages}")
                 self.locked_pages[workspace_key][file_group].append(self.placeholder_all_pages)

     def unlock_pages(self, workspace_key: str, output_file_grps: List[str], page_ids: List[str]) -> None:
         if not self.locked_pages.get(workspace_key, None):
-            self.log.
+            self.log.info(f"No entry found in the locked pages cache for workspace key: {workspace_key}")
             return
         for file_group in output_file_grps:
             if file_group in self.locked_pages[workspace_key]:
                 if page_ids:
                     # Unlock the previously locked pages
-                    self.log.
+                    self.log.info(f"Unlocking pages of '{file_group}': {page_ids}")
                     self.locked_pages[workspace_key][file_group] = \
                         [x for x in self.locked_pages[workspace_key][file_group] if x not in page_ids]
-                    self.log.
-
+                    self.log.info(f"Remaining locked pages of '{file_group}': "
+                                  f"{self.locked_pages[workspace_key][file_group]}")
                 else:
                     # Remove the single variable used to indicate all pages are locked
-                    self.log.
+                    self.log.info(f"Unlocking all pages for: {file_group}")
                     self.locked_pages[workspace_key][file_group].remove(self.placeholder_all_pages)


@@ -127,11 +126,11 @@ class CacheProcessingRequests:
         debug_message += f", page ids: {job_input.page_id}"
         debug_message += f", job id: {job_input.job_id}"
         debug_message += f", job depends on: {job_input.depends_on}"
-        self.log.
+        self.log.info(debug_message)

     async def consume_cached_requests(self, workspace_key: str) -> List[PYJobInput]:
         if not self.has_workspace_cached_requests(workspace_key=workspace_key):
-            self.log.
+            self.log.info(f"No jobs to be consumed for workspace key: {workspace_key}")
             return []
         found_consume_requests = []
         for current_element in self.processing_requests[workspace_key]:
@@ -165,25 +164,27 @@ class CacheProcessingRequests:
         # If a record counter of this workspace key does not exist
         # in the requests counter cache yet, create one and assign 0
         if not self.processing_counter.get(workspace_key, None):
-            self.log.
+            self.log.info(f"Creating an internal request counter for workspace key: {workspace_key}")
             self.processing_counter[workspace_key] = 0
         self.processing_counter[workspace_key] = self.processing_counter[workspace_key] + by_value
+        self.log.info(f"The new request counter of {workspace_key}: {self.processing_counter[workspace_key]}")
         return self.processing_counter[workspace_key]

     def cache_request(self, workspace_key: str, data: PYJobInput):
         # If a record queue of this workspace key does not exist in the requests cache
         if not self.processing_requests.get(workspace_key, None):
-            self.log.
+            self.log.info(f"Creating an internal request queue for workspace_key: {workspace_key}")
             self.processing_requests[workspace_key] = []
         self.__print_job_input_debug_message(job_input=data)
         # Add the processing request to the end of the internal queue
+        self.log.info(f"Caching a processing request of {workspace_key}: {data.job_id}")
         self.processing_requests[workspace_key].append(data)

     async def cancel_dependent_jobs(self, workspace_key: str, processing_job_id: str) -> List[PYJobInput]:
         if not self.has_workspace_cached_requests(workspace_key=workspace_key):
-            self.log.
+            self.log.info(f"No jobs to be cancelled for workspace key: {workspace_key}")
             return []
-        self.log.
+        self.log.info(f"Cancelling jobs dependent on job id: {processing_job_id}")
         found_cancel_requests = []
         for i, current_element in enumerate(self.processing_requests[workspace_key]):
             if processing_job_id in current_element.depends_on:
@@ -192,7 +193,7 @@ class CacheProcessingRequests:
         for cancel_element in found_cancel_requests:
             try:
                 self.processing_requests[workspace_key].remove(cancel_element)
-                self.log.
+                self.log.info(f"For job id: '{processing_job_id}', cancelling job id: '{cancel_element.job_id}'")
                 cancelled_jobs.append(cancel_element)
                 await db_update_processing_job(job_id=cancel_element.job_id, state=JobState.cancelled)
                 # Recursively cancel dependent jobs for the cancelled job
@@ -225,9 +226,11 @@ class CacheProcessingRequests:

     def has_workspace_cached_requests(self, workspace_key: str) -> bool:
         if not self.processing_requests.get(workspace_key, None):
-            self.log.
+            self.log.info(f"In processing requests cache, no workspace key found: {workspace_key}")
             return False
         if not len(self.processing_requests[workspace_key]):
-            self.log.
+            self.log.info(f"The processing requests cache is empty for workspace key: {workspace_key}")
             return False
+        self.log.info(f"The processing requests cache has {len(self.processing_requests[workspace_key])} "
+                      f"entries for workspace key: {workspace_key} ")
         return True
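The cache logging in CacheLockedPages is now done at INFO level throughout. A minimal sketch of the locking flow these methods implement (illustrative only, not part of the diff; it assumes an already constructed CacheLockedPages instance, and workspace key and fileGrp names are made up):

    from typing import List

    def lock_report_unlock(cache, workspace_key: str, file_grps: List[str], page_ids: List[str]) -> None:
        # lock_pages() records the given page ids per output fileGrp; with an
        # empty page_ids list it appends a single "all pages" placeholder instead.
        cache.lock_pages(workspace_key, file_grps, page_ids)
        # get_locked_pages() returns the {fileGrp: [page_id, ...]} mapping,
        # or {} if nothing is locked for this workspace.
        print(cache.get_locked_pages(workspace_key))
        # unlock_pages() removes exactly those page ids (or the placeholder) again.
        cache.unlock_pages(workspace_key, file_grps, page_ids)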
ocrd_network/server_utils.py
CHANGED
@@ -1,12 +1,18 @@
+import os
+import re
+import signal
+from pathlib import Path
+from json import dumps, loads
+from urllib.parse import urljoin
+from typing import Dict, List, Optional, Union
+from time import time
+
 from fastapi import HTTPException, status, UploadFile
 from fastapi.responses import FileResponse
 from httpx import AsyncClient, Timeout
-from json import dumps, loads
 from logging import Logger
-from pathlib import Path
 from requests import get as requests_get
-from
-from urllib.parse import urljoin
+from requests_unixsocket import sys

 from ocrd.resolver import Resolver
 from ocrd.task_sequence import ProcessorTask
@@ -241,3 +247,33 @@ def validate_first_task_input_file_groups_existence(logger: Logger, mets_path: s
         if group not in available_groups:
             message = f"Input file group '{group}' of the first processor not found: {input_file_grps}"
             raise_http_exception(logger, status.HTTP_422_UNPROCESSABLE_ENTITY, message)
+
+
+def kill_mets_server_zombies(minutes_ago : Optional[int], dry_run : Optional[bool]) -> List[int]:
+    if minutes_ago == None:
+        minutes_ago = 90
+    if dry_run == None:
+        dry_run = False
+
+    now = time()
+    cmdline_pat = r'.*ocrd workspace -U.*server start $'
+    ret = []
+    for procdir in sorted(Path('/proc').glob('*'), key=os.path.getctime):
+        if not procdir.is_dir():
+            continue
+        cmdline_file = procdir.joinpath('cmdline')
+        if not cmdline_file.is_file():
+            continue
+        ctime_ago = int((now - procdir.stat().st_ctime) / 60)
+        if ctime_ago < minutes_ago:
+            continue
+        cmdline = cmdline_file.read_text().replace('\x00', ' ')
+        if re.match(cmdline_pat, cmdline):
+            pid = int(procdir.name)
+            ret.append(pid)
+            print(f'METS Server with PID {pid} was created {ctime_ago} minutes ago, more than {minutes_ago}, so killing (cmdline="{cmdline})', file=sys.stderr)
+            if dry_run:
+                print(f'[dry_run is active] kill {pid}')
+            else:
+                os.kill(pid, signal.SIGTERM)
+    return ret
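The new kill_mets_server_zombies() helper scans /proc for stale `ocrd workspace -U ... server start` processes and sends them SIGTERM. A short usage sketch, assuming the function is imported from ocrd_network.server_utils (passing None for either argument falls back to the defaults of 90 minutes and dry_run=False):

    from ocrd_network.server_utils import kill_mets_server_zombies

    # First only report METS Server processes older than two hours ...
    zombie_pids = kill_mets_server_zombies(minutes_ago=120, dry_run=True)
    # ... then actually terminate them once the reported PIDs look plausible.
    if zombie_pids:
        kill_mets_server_zombies(minutes_ago=120, dry_run=False)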
ocrd_network/tcp_to_uds_mets_proxy.py
CHANGED
@@ -1,5 +1,5 @@
 from requests_unixsocket import Session as requests_unixsocket_session
-from .utils import get_uds_path
+from .utils import get_uds_path, convert_url_to_uds_format
 from typing import Dict
 from ocrd_utils import getLogger

@@ -31,9 +31,13 @@ class MetsServerProxy:
         if method_type not in SUPPORTED_METHOD_TYPES:
             raise NotImplementedError(f"Method type: {method_type} not recognized")
         ws_socket_file = str(get_uds_path(ws_dir_path=ws_dir_path))
-        ws_unix_socket_url =
+        ws_unix_socket_url = convert_url_to_uds_format(ws_socket_file)
         uds_request_url = f"{ws_unix_socket_url}/{request_url}"

+        self.log.info(f"Forwarding TCP mets server request to UDS url: {uds_request_url}")
+        self.log.info(f"Forwarding method type {method_type}, request data: {request_data}, "
+                      f"expected response type: {response_type}")
+
         if not request_data:
             response = self.session.request(method_type, uds_request_url)
         elif "params" in request_data:
@@ -45,12 +49,11 @@ class MetsServerProxy:
         else:
             raise ValueError("Expecting request_data to be empty or containing single key: params,"
                              f"form, or class but not {request_data.keys}")
-
+        if response_type == "empty":
+            return {}
         if not response:
             self.log.error(f"Uds-Mets-Server gives unexpected error. Response: {response.__dict__}")
             return {"error": response.text}
-        elif response_type == "empty":
-            return {}
         elif response_type == "text":
             return {"text": response.text}
         elif response_type == "class" or response_type == "dict":
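MetsServerProxy now builds the UDS request URL via convert_url_to_uds_format() instead of inline string formatting. A minimal sketch of that mapping, assuming both helpers are imported from ocrd_network.utils (the workspace path is illustrative, and the printed URL scheme depends entirely on convert_url_to_uds_format):

    from ocrd_network.utils import convert_url_to_uds_format, get_uds_path

    # Workspace directory -> socket file under OCRD_NETWORK_SOCKETS_ROOT_DIR
    # -> URL in the scheme that requests_unixsocket understands.
    ws_dir_path = "/data/workspace-1"
    ws_socket_file = str(get_uds_path(ws_dir_path=ws_dir_path))
    ws_unix_socket_url = convert_url_to_uds_format(ws_socket_file)
    print(ws_socket_file, ws_unix_socket_url)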
ocrd_network/utils.py
CHANGED
@@ -4,6 +4,7 @@ from fastapi import UploadFile
 from functools import wraps
 from hashlib import md5
 from json import loads
+from logging import Logger
 from pathlib import Path
 from re import compile as re_compile, split as re_split
 from requests import get as requests_get, Session as Session_TCP
@@ -151,22 +152,25 @@ def is_mets_server_running(mets_server_url: str, ws_dir_path: str = None) -> boo
     return False


-def stop_mets_server(mets_server_url: str, ws_dir_path: str
+def stop_mets_server(logger: Logger, mets_server_url: str, ws_dir_path: str) -> bool:
     protocol = "tcp" if (mets_server_url.startswith("http://") or mets_server_url.startswith("https://")) else "uds"
-
-    if protocol == "
-
-
-
-
-
-
-
-
-
-
-
+    # If the mets server URL is the proxy endpoint
+    if protocol == "tcp" and "tcp_mets" in mets_server_url:
+        # Convert the mets server url to UDS format
+        ws_socket_file = str(get_uds_path(ws_dir_path))
+        mets_server_url = convert_url_to_uds_format(ws_socket_file)
+        protocol = "uds"
+    if protocol == "tcp":
+        request_json = MpxReq.stop(ws_dir_path)
+        logger.info(f"Sending POST request to: {mets_server_url}, request_json: {request_json}")
+        response = Session_TCP().post(url=f"{mets_server_url}", json=request_json)
+        return response.status_code == 200
+    elif protocol == "uds":
+        logger.info(f"Sending DELETE request to: {mets_server_url}/")
+        response = Session_UDS().delete(url=f"{mets_server_url}/")
+        return response.status_code == 200
+    else:
+        ValueError(f"Unexpected protocol type: {protocol}")

 def get_uds_path(ws_dir_path: str) -> Path:
     return Path(config.OCRD_NETWORK_SOCKETS_ROOT_DIR, f"{safe_filename(ws_dir_path)}.sock")
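stop_mets_server() now takes a logger as its first argument and, when given the Processing Server's /tcp_mets proxy endpoint, converts it to the workspace's UDS socket before sending the stop request. A usage sketch with illustrative URL and workspace path:

    from ocrd_utils import getLogger
    from ocrd_network.utils import stop_mets_server

    logger = getLogger("ocrd_network.client")  # any configured logger works here
    stopped = stop_mets_server(
        logger,
        mets_server_url="http://localhost:8000/tcp_mets",  # illustrative proxy URL
        ws_dir_path="/data/workspace-1",
    )
    print("METS Server stopped:", stopped)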
ocrd_utils/config.py
CHANGED
@@ -21,7 +21,7 @@ def _parser_boolean(val):

 class OcrdEnvVariable():

-    def __init__(self, name, description, parser=str, validator=lambda
+    def __init__(self, name, description, parser=str, validator=lambda _: True, default=[False, None]):
         """
         An environment variable for use in OCR-D.

@@ -47,10 +47,19 @@ class OcrdEnvVariable():
         return f'{self.name}: {self.description}'

     def describe(self, wrap_text=True, indent_text=True):
+        """
+        Output help information on a config option.
+
+        If ``option.description`` is a multiline string with complex formatting
+        (e.g. markdown lists), replace empty lines with ``\b`` and set
+        ``wrap_text`` to ``False``.
+        """
         desc = self.description
         if self.has_default:
             default = self.default() if callable(self.default) else self.default
-
+            if not desc.endswith('\n'):
+                desc += ' '
+            desc += f'(Default: "{default}")'
         ret = ''
         ret = f'{self.name}\n'
         if wrap_text:
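The new describe() docstring spells out the convention used by the option descriptions below: multiline descriptions containing markdown lists separate their paragraphs with \b lines and are rendered with wrap_text=False. A sketch with a purely hypothetical option name, only to illustrate that pattern (it assumes OcrdEnvVariable stores the default tuple the way __init__ and describe() above suggest):

    from ocrd_utils.config import OcrdEnvVariable

    # OCRD_EXAMPLE_OPTION is hypothetical; it only demonstrates the
    # \b / wrap_text=False convention documented in describe().
    desc = "What to do on example events:\n\b\n- `SKIP`: ignore the event\n- `ABORT`: raise an error\n\b\n"
    opt = OcrdEnvVariable("OCRD_EXAMPLE_OPTION", desc,
                          validator=lambda val: val in ["SKIP", "ABORT"],
                          default=(True, "SKIP"))
    print(opt.describe(wrap_text=False))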
@@ -146,11 +155,11 @@ config.add("OCRD_PROFILE",
     description="""\
 Whether to enable gathering runtime statistics
 on the `ocrd.profile` logger (comma-separated):
-
+\b
 - `CPU`: yields CPU and wall-time,
 - `RSS`: also yields peak memory (resident set size)
 - `PSS`: also yields peak memory (proportional set size)
-
+\b
 """,
     validator=lambda val : all(t in ('', 'CPU', 'RSS', 'PSS') for t in val.split(',')),
     default=(True, ''))
@@ -183,11 +192,12 @@ config.add("OCRD_DOWNLOAD_INPUT",

 config.add("OCRD_MISSING_INPUT",
     description="""\
-How to deal with missing input files
-
+How to deal with missing input files
+(for some fileGrp/pageId) during processing:
+\b
 - `SKIP`: ignore and proceed with next page's input
 - `ABORT`: throw :py:class:`.MissingInputFile`
-
+\b
 """,
     default=(True, 'SKIP'),
     validator=lambda val: val in ['SKIP', 'ABORT'],
@@ -195,12 +205,13 @@ How to deal with missing input files (for some fileGrp/pageId) during processing

 config.add("OCRD_MISSING_OUTPUT",
     description="""\
-How to deal with missing output files
-
+How to deal with missing output files
+(for some fileGrp/pageId) during processing:
+\b
 - `SKIP`: ignore and proceed processing next page
 - `COPY`: fall back to copying input PAGE to output fileGrp for page
 - `ABORT`: re-throw whatever caused processing to fail
-
+\b
 """,
     default=(True, 'SKIP'),
     validator=lambda val: val in ['SKIP', 'COPY', 'ABORT'],
@@ -213,12 +224,13 @@ config.add("OCRD_MAX_MISSING_OUTPUTS",

 config.add("OCRD_EXISTING_OUTPUT",
     description="""\
-How to deal with already existing output files
-
+How to deal with already existing output files
+(for some fileGrp/pageId) during processing:
+\b
 - `SKIP`: ignore and proceed processing next page
 - `OVERWRITE`: force writing result to output fileGrp for page
 - `ABORT`: re-throw :py:class:`FileExistsError`
-
+\b
 """,
     default=(True, 'SKIP'),
     validator=lambda val: val in ['SKIP', 'OVERWRITE', 'ABORT'],
@@ -231,7 +243,7 @@ config.add("OCRD_NETWORK_SERVER_ADDR_PROCESSING",
 config.add("OCRD_NETWORK_CLIENT_POLLING_SLEEP",
     description="How many seconds to sleep before trying again.",
     parser=int,
-    default=(True,
+    default=(True, 10))

 config.add("OCRD_NETWORK_CLIENT_POLLING_TIMEOUT",
     description="Timeout for a blocking ocrd network client (in seconds).",
@@ -247,9 +259,19 @@ config.add("OCRD_NETWORK_SERVER_ADDR_WORKSPACE",
     default=(True, ''))

 config.add("OCRD_NETWORK_RABBITMQ_CLIENT_CONNECT_ATTEMPTS",
-
+    description="Number of attempts for a RabbitMQ client to connect before failing.",
+    parser=int,
+    default=(True, 3))
+
+config.add(
+    name="OCRD_NETWORK_RABBITMQ_HEARTBEAT",
+    description="""
+Controls AMQP heartbeat timeout (in seconds) negotiation during connection tuning. An integer value always overrides the value
+proposed by broker. Use 0 to deactivate heartbeat.
+    """,
     parser=int,
-    default=(True,
+    default=(True, 0)
+)

 config.add(name="OCRD_NETWORK_SOCKETS_ROOT_DIR",
     description="The root directory where all mets server related socket files are created",