ocrd 3.0.0b4__py3-none-any.whl → 3.0.0b6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ocrd/cli/bashlib.py +6 -4
- ocrd/cli/ocrd_tool.py +1 -1
- ocrd/cli/validate.py +6 -3
- ocrd/cli/workspace.py +71 -56
- ocrd/decorators/__init__.py +6 -6
- ocrd/decorators/ocrd_cli_options.py +1 -0
- ocrd/lib.bash +24 -21
- ocrd/mets_server.py +39 -8
- ocrd/processor/base.py +307 -89
- ocrd/processor/builtin/dummy_processor.py +0 -2
- ocrd/processor/helpers.py +16 -7
- ocrd/processor/ocrd_page_result.py +2 -2
- ocrd/workspace.py +3 -0
- {ocrd-3.0.0b4.dist-info → ocrd-3.0.0b6.dist-info}/METADATA +2 -1
- {ocrd-3.0.0b4.dist-info → ocrd-3.0.0b6.dist-info}/RECORD +23 -23
- {ocrd-3.0.0b4.dist-info → ocrd-3.0.0b6.dist-info}/WHEEL +1 -1
- ocrd_models/ocrd_mets.py +9 -0
- ocrd_models/ocrd_page_generateds.py +44 -11
- ocrd_utils/logging.py +6 -2
- ocrd_utils/str.py +2 -1
- {ocrd-3.0.0b4.dist-info → ocrd-3.0.0b6.dist-info}/LICENSE +0 -0
- {ocrd-3.0.0b4.dist-info → ocrd-3.0.0b6.dist-info}/entry_points.txt +0 -0
- {ocrd-3.0.0b4.dist-info → ocrd-3.0.0b6.dist-info}/top_level.txt +0 -0
ocrd/processor/base.py
CHANGED
```diff
@@ -16,14 +16,20 @@ import json
 import os
 from os import getcwd
 from pathlib import Path
-from typing import List, Optional, Union, get_args
+from typing import Any, Dict, List, Optional, Tuple, Union, get_args
 import sys
 import inspect
 import tarfile
 import io
 import weakref
+from collections import defaultdict
 from frozendict import frozendict
-
+# concurrent.futures is buggy in py38,
+# this is where the fixes came from:
+from loky import Future, ProcessPoolExecutor
+import multiprocessing as mp
+from threading import Timer
+from _thread import interrupt_main
 
 from click import wrap_text
 from deprecated import deprecated
@@ -105,6 +111,31 @@ class MissingInputFile(ValueError):
                     f"and pageId {pageId} under mimetype {mimetype or 'PAGE+image(s)'}")
         super().__init__(self.message)
 
+class DummyFuture:
+    """
+    Mimics some of `concurrent.futures.Future` but runs immediately.
+    """
+    def __init__(self, fn, *args, **kwargs):
+        self.fn = fn
+        self.args = args
+        self.kwargs = kwargs
+    def result(self):
+        return self.fn(*self.args, **self.kwargs)
+class DummyExecutor:
+    """
+    Mimics some of `concurrent.futures.ProcessPoolExecutor` but runs
+    everything immediately in this process.
+    """
+    def __init__(self, initializer=None, initargs=(), **kwargs):
+        initializer(*initargs)
+    def shutdown(self, **kwargs):
+        pass
+    def submit(self, fn, *args, **kwargs) -> DummyFuture:
+        return DummyFuture(fn, *args, **kwargs)
+
+TFuture = Union[DummyFuture, Future]
+TExecutor = Union[DummyExecutor, ProcessPoolExecutor]
+
 class Processor():
     """
     A processor is a tool that implements the uniform OCR-D
```
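The new DummyFuture/DummyExecutor shim keeps one submit/result control flow whether pages run in forked loky workers or sequentially in-process. A minimal standalone sketch of the pattern (the `make_executor` helper and demo names are ours, not part of the ocrd API; loky is assumed installed, as it is the new dependency added to METADATA below):

```python
from loky import ProcessPoolExecutor  # the pool actually used by the diff above

class DummyFuture:
    def __init__(self, fn, *args, **kwargs):
        self.fn, self.args, self.kwargs = fn, args, kwargs
    def result(self):
        # runs only when the result is requested, in this very process
        return self.fn(*self.args, **self.kwargs)

class DummyExecutor:
    def __init__(self, initializer=None, initargs=(), **kwargs):
        if initializer:
            initializer(*initargs)
    def shutdown(self, **kwargs):
        pass
    def submit(self, fn, *args, **kwargs):
        return DummyFuture(fn, *args, **kwargs)

def make_executor(max_workers: int):
    # same selection logic as process_workspace: real pool only when parallel
    executor_cls = ProcessPoolExecutor if max_workers > 1 else DummyExecutor
    return executor_cls(max_workers=max_workers or 1)

if __name__ == '__main__':
    executor = make_executor(1)          # sequential: no subprocess is forked
    task = executor.submit(pow, 2, 10)
    print(task.result())                 # 1024, computed lazily in-process
    executor.shutdown()
```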
```diff
@@ -339,7 +370,7 @@ class Processor():
         self._finalizer = weakref.finalize(self, self.shutdown)
         # workaround for deprecated#72 (@deprecated decorator does not work for subclasses):
         setattr(self, 'process',
-                deprecated(version='3.0', reason='process() should be replaced with
+                deprecated(version='3.0', reason='process() should be replaced with process_page_pcgts() or process_page_file() or process_workspace()')(getattr(self, 'process')))
 
     def show_help(self, subcommand=None):
         """
@@ -358,6 +389,7 @@ class Processor():
         """
         Verify that :py:attr:`input_file_grp` and :py:attr:`output_file_grp` fulfill the processor's requirements.
         """
+        # verify input and output file groups in parameters
         assert self.input_file_grp is not None
         assert self.output_file_grp is not None
         input_file_grps = self.input_file_grp.split(',')
@@ -374,12 +406,23 @@ class Processor():
             assert len(grps) >= minimum, msg % (len(grps), str(spec))
             if maximum > 0:
                 assert len(grps) <= maximum, msg % (len(grps), str(spec))
-
-
-
-
+        # FIXME: enforce unconditionally as soon as grace period for deprecation is over
+        if 'input_file_grp_cardinality' in self.ocrd_tool:
+            assert_file_grp_cardinality(input_file_grps, self.ocrd_tool['input_file_grp_cardinality'],
+                                        "Unexpected number of input file groups %d vs %s")
+        if 'output_file_grp_cardinality' in self.ocrd_tool:
+            assert_file_grp_cardinality(output_file_grps, self.ocrd_tool['output_file_grp_cardinality'],
+                                        "Unexpected number of output file groups %d vs %s")
+        # verify input and output file groups in METS
         for input_file_grp in input_file_grps:
-            assert input_file_grp in self.workspace.mets.file_groups
+            assert input_file_grp in self.workspace.mets.file_groups, \
+                f"input fileGrp {input_file_grp} does not exist in workspace {self.workspace}"
+        for output_file_grp in output_file_grps:
+            assert output_file_grp not in self.workspace.mets.file_groups \
+                or config.OCRD_EXISTING_OUTPUT in ['OVERWRITE', 'SKIP'] \
+                or not any(self.workspace.mets.find_files(
+                    pageId=self.page_id, fileGrp=output_file_grp)), \
+                f"output fileGrp {output_file_grp} already exists in workspace {self.workspace}"
         # keep this for backwards compatibility:
         return True
 
```
```diff
@@ -444,6 +487,9 @@ class Processor():
         for the given :py:data:`page_id` (or all pages)
         under the given :py:data:`parameter`.
 
+        Delegates to :py:meth:`.process_workspace_submit_tasks`
+        and :py:meth:`.process_workspace_handle_tasks`.
+
         (This will iterate over pages and files, calling
         :py:meth:`.process_page_file` and handling exceptions.
         It should be overridden by subclasses to handle cases
@@ -453,11 +499,7 @@ class Processor():
         self.workspace = workspace
         self.verify()
         try:
-
-            nr_skipped = 0
-            nr_copied = 0
-
-            # set up multithreading
+            # set up multitasking
             max_workers = max(0, config.OCRD_MAX_PARALLEL_PAGES)
             if self.max_workers > 0 and self.max_workers < config.OCRD_MAX_PARALLEL_PAGES:
                 self._base_logger.info("limiting number of threads from %d to %d", max_workers, self.max_workers)
@@ -469,84 +511,217 @@ class Processor():
             if self.max_page_seconds > 0 and self.max_page_seconds < config.OCRD_PROCESSING_PAGE_TIMEOUT:
                 self._base_logger.info("limiting page timeout from %d to %d sec", max_seconds, self.max_page_seconds)
                 max_seconds = self.max_page_seconds
-
+
+            if max_workers > 1:
+                executor_cls = ProcessPoolExecutor
+            else:
+                executor_cls = DummyExecutor
+            executor = executor_cls(
                 max_workers=max_workers or 1,
-
+                # only forking method avoids pickling
+                context=mp.get_context('fork'),
+                # share processor instance as global to avoid pickling
+                initializer=_page_worker_set_ctxt,
+                initargs=(self,),
             )
-
-
-
-
-
-
-                                for input_file in input_file_tuple
-                                if input_file)
-            self._base_logger.info(f"preparing page {page_id}")
-            for i, input_file in enumerate(input_file_tuple):
-                if input_file is None:
-                    # file/page not found in this file grp
-                    continue
-                input_files[i] = input_file
-                if not self.download:
-                    continue
-                try:
-                    input_files[i] = self.workspace.download_file(input_file)
-                except (ValueError, FileNotFoundError, HTTPError) as e:
-                    self._base_logger.error(repr(e))
-                    self._base_logger.warning(f"failed downloading file {input_file} for page {page_id}")
-                # process page
-                tasks[executor.submit(self.process_page_file, *input_files)] = (page_id, input_files)
-            self._base_logger.debug("submitted %d processing tasks", len(tasks))
-
-            for task in tasks:
-                # wait for results, handle errors
-                page_id, input_files = tasks[task]
-                # FIXME: differentiate error cases in various ways:
-                # - ResourceNotFoundError → use ResourceManager to download (once), then retry
-                # - transient (I/O or OOM) error → maybe sleep, retry
-                # - persistent (data) error → skip / dummy / raise
-                try:
-                    self._base_logger.debug("waiting for output of task %s (page %s) max_seconds=%d", task, page_id, max_seconds)
-                    task.result(timeout=max_seconds or None)
-                    nr_succeeded += 1
-                # exclude NotImplementedError, so we can try process() below
-                except NotImplementedError:
-                    raise
-                # handle input failures separately
-                except FileExistsError as err:
-                    if config.OCRD_EXISTING_OUTPUT == 'ABORT':
-                        raise err
-                    if config.OCRD_EXISTING_OUTPUT == 'SKIP':
-                        continue
-                    if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE':
-                        # too late here, must not happen
-                        raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE")
-                # broad coverage of output failures (including TimeoutError)
-                except (Exception, TimeoutError) as err:
-                    # FIXME: add re-usable/actionable logging
-                    if config.OCRD_MISSING_OUTPUT == 'ABORT':
-                        self._base_logger.error(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}")
-                        raise err
-                    self._base_logger.exception(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}")
-                    if config.OCRD_MISSING_OUTPUT == 'SKIP':
-                        nr_skipped += 1
-                        continue
-                    if config.OCRD_MISSING_OUTPUT == 'COPY':
-                        self._copy_page_file(input_files[0])
-                        nr_copied += 1
-                    else:
-                        desc = config.describe('OCRD_MISSING_OUTPUT', wrap_text=False, indent_text=False)
-                        raise ValueError(f"unknown configuration value {config.OCRD_MISSING_OUTPUT} - {desc}")
-
-            if nr_skipped > 0 and nr_succeeded / nr_skipped < config.OCRD_MAX_MISSING_OUTPUTS:
-                raise Exception(f"too many failures with skipped output ({nr_skipped})")
-            if nr_copied > 0 and nr_succeeded / nr_copied < config.OCRD_MAX_MISSING_OUTPUTS:
-                raise Exception(f"too many failures with fallback output ({nr_skipped})")
-            executor.shutdown()
+            try:
+                self._base_logger.debug("started executor %s with %d workers", str(executor), max_workers or 1)
+                tasks = self.process_workspace_submit_tasks(executor, max_seconds)
+                stats = self.process_workspace_handle_tasks(tasks)
+            finally:
+                executor.shutdown(kill_workers=True, wait=False)
 
         except NotImplementedError:
             # fall back to deprecated method
-
+            try:
+                self.process()
+            except Exception as err:
+                # suppress the NotImplementedError context
+                raise err from None
+
+    def process_workspace_submit_tasks(self, executor : TExecutor, max_seconds : int) -> Dict[TFuture, Tuple[str, List[Optional[OcrdFileType]]]]:
+        """
+        Look up all input files of the given ``workspace``
+        from the given :py:data:`input_file_grp`
+        for the given :py:data:`page_id` (or all pages),
+        and schedules calling :py:meth:`.process_page_file`
+        on them for each page via `executor` (enforcing
+        a per-page time limit of `max_seconds`).
+
+        When running with `OCRD_MAX_PARALLEL_PAGES>1` and
+        the workspace via METS Server, the executor will fork
+        this many worker parallel subprocesses each processing
+        one page at a time. (Interprocess communication is
+        done via task and result queues.)
+
+        Otherwise, tasks are run sequentially in the
+        current process.
+
+        Delegates to :py:meth:`.zip_input_files` to get
+        the input files for each page, and then calls
+        :py:meth:`.process_workspace_submit_page_task`.
+
+        Returns a dict mapping the per-page tasks
+        (i.e. futures submitted to the executor)
+        to their corresponding pageId and input files.
+        """
+        tasks = {}
+        for input_file_tuple in self.zip_input_files(on_error='abort', require_first=False):
+            task, page_id, input_files = self.process_workspace_submit_page_task(executor, max_seconds, input_file_tuple)
+            tasks[task] = (page_id, input_files)
+        self._base_logger.debug("submitted %d processing tasks", len(tasks))
+        return tasks
+
+    def process_workspace_submit_page_task(self, executor : TExecutor, max_seconds : int, input_file_tuple : List[Optional[OcrdFileType]]) -> Tuple[TFuture, str, List[Optional[OcrdFileType]]]:
+        """
+        Ensure all input files for a single page are
+        downloaded to the workspace, then schedule
+        :py:meth:`.process_process_file` to be run on
+        them via `executor` (enforcing a per-page time
+        limit of `max_seconds`).
+
+        Delegates to :py:meth:`.process_page_file`
+        (wrapped in :py:func:`_page_worker` to share
+        the processor instance across forked processes).
+
+        \b
+        Returns a tuple of:
+        - the scheduled future object,
+        - the corresponding pageId,
+        - the corresponding input files.
+        """
+        input_files : List[Optional[OcrdFileType]] = [None] * len(input_file_tuple)
+        page_id = next(input_file.pageId
+                       for input_file in input_file_tuple
+                       if input_file)
+        self._base_logger.info(f"preparing page {page_id}")
+        for i, input_file in enumerate(input_file_tuple):
+            if input_file is None:
+                # file/page not found in this file grp
+                continue
+            input_files[i] = input_file
+            if not self.download:
+                continue
+            try:
+                input_files[i] = self.workspace.download_file(input_file)
+            except (ValueError, FileNotFoundError, HTTPError) as e:
+                self._base_logger.error(repr(e))
+                self._base_logger.warning(f"failed downloading file {input_file} for page {page_id}")
+        # process page
+        #executor.submit(self.process_page_file, *input_files)
+        return executor.submit(_page_worker, max_seconds, *input_files), page_id, input_files
+
+    def process_workspace_handle_tasks(self, tasks : Dict[TFuture, Tuple[str, List[Optional[OcrdFileType]]]]) -> Tuple[int, int, Dict[str, int], int]:
+        """
+        Look up scheduled per-page futures one by one,
+        handle errors (exceptions) and gather results.
+
+        \b
+        Enforces policies configured by the following
+        environment variables:
+        - `OCRD_EXISTING_OUTPUT` (abort/skip/overwrite)
+        - `OCRD_MISSING_OUTPUT` (abort/skip/fallback-copy)
+        - `OCRD_MAX_MISSING_OUTPUTS` (abort after all).
+
+        \b
+        Returns a tuple of:
+        - the number of successfully processed pages
+        - the number of failed (i.e. skipped or copied) pages
+        - a dict of the type and corresponding number of exceptions seen
+        - the number of total requested pages (i.e. success+fail+existing).
+
+        Delegates to :py:meth:`.process_workspace_handle_page_task`
+        for each page.
+        """
+        # aggregate info for logging:
+        nr_succeeded = 0
+        nr_failed = 0
+        nr_errors = defaultdict(int) # count causes
+        if config.OCRD_MISSING_OUTPUT == 'SKIP':
+            reason = "skipped"
+        elif config.OCRD_MISSING_OUTPUT == 'COPY':
+            reason = "fallback-copied"
+        for task in tasks:
+            # wait for results, handle errors
+            page_id, input_files = tasks[task]
+            result = self.process_workspace_handle_page_task(page_id, input_files, task)
+            if isinstance(result, Exception):
+                nr_errors[result.__class__.__name__] += 1
+                nr_failed += 1
+                # FIXME: this is just prospective, because len(tasks)==nr_failed+nr_succeeded is not guaranteed
+                if config.OCRD_MAX_MISSING_OUTPUTS > 0 and nr_failed / len(tasks) > config.OCRD_MAX_MISSING_OUTPUTS:
+                    # already irredeemably many failures, stop short
+                    nr_errors = dict(nr_errors)
+                    raise Exception(f"too many failures with {reason} output ({nr_failed} of {nr_failed+nr_succeeded}, {str(nr_errors)})")
+            elif result:
+                nr_succeeded += 1
+            # else skipped - already exists
+        nr_errors = dict(nr_errors)
+        if nr_failed > 0:
+            nr_all = nr_succeeded + nr_failed
+            if config.OCRD_MAX_MISSING_OUTPUTS > 0 and nr_failed / nr_all > config.OCRD_MAX_MISSING_OUTPUTS:
+                raise Exception(f"too many failures with {reason} output ({nr_failed} of {nr_all}, {str(nr_errors)})")
+            self._base_logger.warning("%s %d of %d pages due to %s", reason, nr_failed, nr_all, str(nr_errors))
+        return nr_succeeded, nr_failed, nr_errors, len(tasks)
+
+    def process_workspace_handle_page_task(self, page_id : str, input_files : List[Optional[OcrdFileType]], task : TFuture) -> Union[bool, Exception]:
+        """
+        \b
+        Await a single page result and handle errors (exceptions),
+        enforcing policies configured by the following
+        environment variables:
+        - `OCRD_EXISTING_OUTPUT` (abort/skip/overwrite)
+        - `OCRD_MISSING_OUTPUT` (abort/skip/fallback-copy)
+        - `OCRD_MAX_MISSING_OUTPUTS` (abort after all).
+
+        \b
+        Returns
+        - true in case of success
+        - false in case the output already exists
+        - the exception in case of failure
+        """
+        # FIXME: differentiate error cases in various ways:
+        # - ResourceNotFoundError → use ResourceManager to download (once), then retry
+        # - transient (I/O or OOM) error → maybe sleep, retry
+        # - persistent (data) error → skip / dummy / raise
+        try:
+            self._base_logger.debug("waiting for output of task %s (page %s)", task, page_id)
+            # timeout kwarg on future is useless: it only raises TimeoutError here,
+            # but does not stop the running process/thread, and executor itself
+            # offers nothing to that effect:
+            # task.result(timeout=max_seconds or None)
+            # so we instead applied the timeout within the worker function
+            task.result()
+            return True
+        except NotImplementedError:
+            # exclude NotImplementedError, so we can try process() below
+            raise
+        # handle input failures separately
+        except FileExistsError as err:
+            if config.OCRD_EXISTING_OUTPUT == 'ABORT':
+                raise err
+            if config.OCRD_EXISTING_OUTPUT == 'SKIP':
+                return False
+            if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE':
+                # too late here, must not happen
+                raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE")
+        except KeyboardInterrupt:
+            raise
+        # broad coverage of output failures (including TimeoutError)
+        except Exception as err:
+            # FIXME: add re-usable/actionable logging
+            if config.OCRD_MISSING_OUTPUT == 'ABORT':
+                self._base_logger.error(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}")
+                raise err
+            self._base_logger.exception(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}")
+            if config.OCRD_MISSING_OUTPUT == 'SKIP':
+                pass
+            elif config.OCRD_MISSING_OUTPUT == 'COPY':
+                self._copy_page_file(input_files[0])
+            else:
+                desc = config.describe('OCRD_MISSING_OUTPUT', wrap_text=False, indent_text=False)
+                raise ValueError(f"unknown configuration value {config.OCRD_MISSING_OUTPUT} - {desc}")
+            return err
 
     def _copy_page_file(self, input_file : OcrdFileType) -> None:
         """
```
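The refactoring above splits the former monolithic loop into a submit phase (building a dict from future to page metadata) and a handle phase (draining results and applying failure policies). A toy version of that orchestration, using a stand-in ThreadPoolExecutor and a hypothetical `max_failure_ratio` in place of `OCRD_MAX_MISSING_OUTPUTS`:

```python
from concurrent.futures import ThreadPoolExecutor  # stand-in executor for illustration

def run_pages(process_page, pages, max_failure_ratio=0.1):
    with ThreadPoolExecutor(max_workers=4) as executor:
        # submit phase: one future per page, remembering which page it belongs to
        tasks = {executor.submit(process_page, page): page for page in pages}
        failed = 0
        # handle phase: drain results, count failures, stop if the ratio is exceeded
        for task, page in tasks.items():
            try:
                task.result()
            except Exception as err:
                failed += 1
                print(f"page {page} failed: {err!r}")
                if failed / len(tasks) > max_failure_ratio:
                    raise Exception(f"too many failures ({failed} of {len(tasks)})")
        return len(tasks) - failed, failed

if __name__ == '__main__':
    ok, bad = run_pages(lambda page: None, [f"PHYS_{n:04d}" for n in range(10)])
    print(f"{ok} succeeded, {bad} failed")
```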
```diff
@@ -574,7 +749,6 @@ class Processor():
             local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'),
             mimetype=MIMETYPE_PAGE,
             content=to_xml(input_pcgts),
-            force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
         )
 
     def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None:
@@ -603,6 +777,12 @@ class Processor():
                 # not PAGE and not an image to generate PAGE for
                 self._base_logger.error(f"non-PAGE input for page {page_id}: {err}")
         output_file_id = make_file_id(input_files[0], self.output_file_grp)
+        output_file = next(self.workspace.mets.find_files(ID=output_file_id), None)
+        if output_file and config.OCRD_EXISTING_OUTPUT != 'OVERWRITE':
+            # short-cut avoiding useless computation:
+            raise FileExistsError(
+                f"A file with ID=={output_file_id} already exists {output_file} and neither force nor ignore are set"
+            )
         result = self.process_page_pcgts(*input_pcgts, page_id=page_id)
         for image_result in result.images:
             image_file_id = f'{output_file_id}_{image_result.file_id_suffix}'
```
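The new FileExistsError short-cut checks METS for the output ID before any page processing happens, so the existing-output policy above can be applied without paying for the computation first. A toy version of the same guard, with made-up names:

```python
import os

def write_result(path: str, compute, overwrite: bool = False):
    # check before you compute: let the caller's abort/skip/overwrite policy
    # handle FileExistsError without wasting the expensive step below
    if os.path.exists(path) and not overwrite:
        # short-cut avoiding useless computation
        raise FileExistsError(f"{path} already exists and overwrite is not set")
    data = compute()  # only reached when writing is actually possible
    with open(path, 'w', encoding='utf-8') as f:
        f.write(data)

if __name__ == '__main__':
    write_result('/tmp/demo-output.txt', lambda: "expensive result", overwrite=True)
```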
```diff
@@ -616,6 +796,8 @@ class Processor():
                 image_result.alternative_image.set_imageHeight(image_result.pil.height)
             elif isinstance(image_result.alternative_image, AlternativeImageType):
                 image_result.alternative_image.set_filename(image_file_path)
+            elif image_result.alternative_image is None:
+                pass # do not reference in PAGE result
             else:
                 raise ValueError(f"process_page_pcgts returned an OcrdPageResultImage of unknown type "
                                  f"{type(image_result.alternative_image)}")
@@ -625,7 +807,6 @@ class Processor():
                 self.output_file_grp,
                 page_id=page_id,
                 file_path=image_file_path,
-                force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
             )
         result.pcgts.set_pcGtsId(output_file_id)
         self.add_metadata(result.pcgts)
@@ -636,7 +817,6 @@ class Processor():
             local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'),
             mimetype=MIMETYPE_PAGE,
             content=to_xml(result.pcgts),
-            force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
         )
 
     def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult:
@@ -919,6 +1099,44 @@ class Processor():
             ifts.append(tuple(ifiles))
         return ifts
 
+_page_worker_processor = None
+"""
+This global binding for the processor is required to avoid
+squeezing the processor through a mp.Queue (which is impossible
+due to unpicklable attributes like .workspace.mets._tree anyway)
+when calling Processor.process_page_file as page worker processes
+in Processor.process_workspace. Forking allows inheriting global
+objects, and with the METS Server we do not mutate the local
+processor instance anyway.
+"""
+def _page_worker_set_ctxt(processor):
+    """
+    Overwrites `ocrd.processor.base._page_worker_processor` instance
+    for sharing with subprocesses in ProcessPoolExecutor initializer.
+    """
+    global _page_worker_processor
+    _page_worker_processor = processor
+
+def _page_worker(timeout, *input_files):
+    """
+    Wraps a `Processor.process_page_file` call as payload (call target)
+    of the ProcessPoolExecutor workers, but also enforces the given timeout.
+    """
+    page_id = next((file.pageId for file in input_files
+                    if hasattr(file, 'pageId')), "")
+    if timeout > 0:
+        timer = Timer(timeout, interrupt_main)
+        timer.start()
+    try:
+        _page_worker_processor.process_page_file(*input_files)
+        _page_worker_processor.logger.debug("page worker completed for page %s", page_id)
+    except KeyboardInterrupt:
+        _page_worker_processor.logger.debug("page worker timed out for page %s", page_id)
+        raise TimeoutError()
+    finally:
+        if timeout > 0:
+            timer.cancel()
+
 def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None):
     """Generate a string describing the full CLI of this processor including params.
 
```
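Note the comment in process_workspace_handle_page_task: `Future.result(timeout=...)` only abandons the wait, it cannot stop the worker, so the timeout is now enforced inside the worker itself via `threading.Timer` plus `_thread.interrupt_main` (see `_page_worker` above). A self-contained sketch of that technique, assuming a CPU-bound pure-Python payload (blocking C calls may not be interruptible this way):

```python
import time
from threading import Timer
from _thread import interrupt_main

def run_with_timeout(fn, timeout, *args, **kwargs):
    # the Timer thread raises KeyboardInterrupt in the main thread on expiry,
    # which we translate into TimeoutError for the caller
    timer = Timer(timeout, interrupt_main)
    timer.start()
    try:
        return fn(*args, **kwargs)
    except KeyboardInterrupt:
        raise TimeoutError(f"{fn.__name__} exceeded {timeout}s") from None
    finally:
        timer.cancel()  # no-op if the timer already fired

if __name__ == '__main__':
    print(run_with_timeout(sum, 5.0, [1, 2, 3]))  # finishes in time: 6
    def busy(seconds):
        end = time.time() + seconds
        while time.time() < end:
            pass  # pure-Python loop, interruptible at bytecode boundaries
    try:
        run_with_timeout(busy, 0.1, 10)
    except TimeoutError as err:
        print(err)
```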
ocrd/processor/builtin/dummy_processor.py
CHANGED

```diff
@@ -47,7 +47,6 @@ class DummyProcessor(Processor):
                     mimetype=input_file.mimetype,
                     local_filename=local_filename,
                     content=f.read(),
-                    force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
                 )
             file_id = file_id + '_PAGE'
             pcgts = page_from_file(output_file)
@@ -62,7 +61,6 @@ class DummyProcessor(Processor):
                 local_filename=join(self.output_file_grp, file_id + '.xml'),
                 mimetype=MIMETYPE_PAGE,
                 content=to_xml(pcgts),
-                force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
             )
         else:
             if self.parameter['copy_files']:
```
ocrd/processor/helpers.py
CHANGED
```diff
@@ -2,6 +2,7 @@
 Helper methods for running and documenting processors
 """
 from time import perf_counter, process_time
+from os import times
 from functools import lru_cache
 import json
 import inspect
@@ -89,11 +90,12 @@ def run_processor(
 
     ocrd_tool = processor.ocrd_tool
     name = '%s v%s' % (ocrd_tool['executable'], processor.version)
-    otherrole = ocrd_tool
+    otherrole = ocrd_tool.get('steps', [''])[0]
     logProfile = getLogger('ocrd.process.profile')
     log.debug("Processor instance %s (%s doing %s)", processor, name, otherrole)
     t0_wall = perf_counter()
     t0_cpu = process_time()
+    t0_os = times()
     if any(x in config.OCRD_PROFILE for x in ['RSS', 'PSS']):
         backend = 'psutil_pss' if 'PSS' in config.OCRD_PROFILE else 'psutil'
         from memory_profiler import memory_usage # pylint: disable=import-outside-toplevel
@@ -123,7 +125,13 @@ def run_processor(
 
     t1_wall = perf_counter() - t0_wall
     t1_cpu = process_time() - t0_cpu
-
+    t1_os = times()
+    # add CPU time from child processes (page worker etc)
+    t1_cpu += t1_os.children_user - t0_os.children_user
+    t1_cpu += t1_os.children_system - t0_os.children_system
+    logProfile.info(
+        "Executing processor '%s' took %fs (wall) %fs (CPU)( "
+        "[--input-file-grp='%s' --output-file-grp='%s' --parameter='%s' --page-id='%s']",
         ocrd_tool['executable'],
         t1_wall,
         t1_cpu,
```
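process_time() only measures the current process, so with pages now running in forked workers the profile would undercount. os.times() additionally reports user/system CPU of reaped children (POSIX semantics), which is what the hunk above adds in. A minimal demonstration:

```python
import os
from multiprocessing import Process
from time import process_time

def busy():
    sum(i * i for i in range(5_000_000))  # CPU work done in the child

if __name__ == '__main__':
    t0_cpu, t0_os = process_time(), os.times()
    worker = Process(target=busy)
    worker.start()
    worker.join()  # child must be waited on before its times are attributed
    t1_cpu, t1_os = process_time(), os.times()
    cpu = t1_cpu - t0_cpu
    # children_user/children_system cover reaped child processes (POSIX)
    cpu += t1_os.children_user - t0_os.children_user
    cpu += t1_os.children_system - t0_os.children_system
    print(f"own+children CPU: {cpu:.3f}s (own alone: {t1_cpu - t0_cpu:.3f}s)")
```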
```diff
@@ -131,7 +139,7 @@ def run_processor(
         processor.output_file_grp or '',
         json.dumps(processor.parameter) or '',
         processor.page_id or ''
-        )
+    )
     workspace.mets.add_agent(
         name=name,
         _type='OTHER',
@@ -234,10 +242,10 @@ def get_cached_processor(parameter: dict, processor_class):
 def get_processor(
         processor_class,
         parameter: Optional[dict] = None,
-        workspace: Workspace = None,
-        page_id: str = None,
-        input_file_grp: List[str] = None,
-        output_file_grp: List[str] = None,
+        workspace: Optional[Workspace] = None,
+        page_id: Optional[str] = None,
+        input_file_grp: Optional[List[str]] = None,
+        output_file_grp: Optional[List[str]] = None,
         instance_caching: bool = False,
 ):
     if processor_class:
@@ -258,6 +266,7 @@ def get_processor(
     else:
         # avoid passing workspace already (deprecated chdir behaviour)
         processor = processor_class(None, parameter=parameter)
+    assert processor
     # set current processing parameters
     processor.workspace = workspace
     processor.page_id = page_id
```
ocrd/processor/ocrd_page_result.py
CHANGED

```diff
@@ -1,5 +1,5 @@
 from dataclasses import dataclass, field
-from typing import List, Union
+from typing import List, Union, Optional
 from ocrd_models.ocrd_page import OcrdPage
 from PIL.Image import Image
 
@@ -9,7 +9,7 @@ from ocrd_models.ocrd_page_generateds import AlternativeImageType, PageType
 class OcrdPageResultImage():
     pil : Image
     file_id_suffix : str
-    alternative_image : Union[AlternativeImageType, PageType]
+    alternative_image : Optional[Union[AlternativeImageType, PageType]]
 
 @dataclass
 class OcrdPageResult():
```
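With alternative_image now Optional, process_page_pcgts can return a derived image that is saved to the output fileGrp but deliberately not referenced as an AlternativeImage in the PAGE result (matching the new `elif image_result.alternative_image is None` branch in base.py above). A hypothetical fragment, assuming the import path of this very module:

```python
from PIL import Image
from ocrd.processor.ocrd_page_result import OcrdPageResult, OcrdPageResultImage

def attach_debug_image(result: OcrdPageResult) -> OcrdPageResult:
    # hypothetical helper: emit a visualization alongside the real output
    debug = Image.new('RGB', (100, 100), 'white')  # stand-in visualization
    result.images.append(OcrdPageResultImage(
        pil=debug,
        file_id_suffix='DEBUG',
        alternative_image=None,  # save the file, skip the PAGE reference
    ))
    return result
```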
ocrd/workspace.py
CHANGED
```diff
@@ -19,6 +19,7 @@ from ocrd_models.ocrd_page import parse, BorderType, to_xml
 from ocrd_modelfactory import exif_from_filename, page_from_file
 from ocrd_utils import (
     atomic_write,
+    config,
     getLogger,
     image_from_polygon,
     coordinates_of_segment,
@@ -427,6 +428,8 @@ class Workspace():
             kwargs["pageId"] = kwargs.pop("page_id")
         if "file_id" in kwargs:
             kwargs["ID"] = kwargs.pop("file_id")
+        if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE':
+            kwargs["force"] = True
 
         ret = self.mets.add_file(file_grp, **kwargs)
 
```
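This is the other half of all the `force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE'` removals above: the flag is now injected once, in Workspace.add_file, instead of at every call site. Roughly, with simplified stand-in names:

```python
import os

class TinyWorkspace:
    # simplified stand-in: one choke point derives force= from the
    # environment-backed configuration, so callers never pass it themselves
    def add_file(self, file_grp, **kwargs):
        if os.environ.get('OCRD_EXISTING_OUTPUT', 'SKIP') == 'OVERWRITE':
            kwargs['force'] = True
        print(f"add_file({file_grp!r}, {kwargs})")

if __name__ == '__main__':
    os.environ['OCRD_EXISTING_OUTPUT'] = 'OVERWRITE'
    TinyWorkspace().add_file('OCR-D-OUT', ID='FILE_0001')  # force=True injected
```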
{ocrd-3.0.0b4.dist-info → ocrd-3.0.0b6.dist-info}/METADATA
CHANGED

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ocrd
-Version: 3.0.0b4
+Version: 3.0.0b6
 Summary: OCR-D framework
 Author-email: Konstantin Baierer <unixprog@gmail.com>
 License: Apache License 2.0
@@ -24,6 +24,7 @@ Requires-Dist: frozendict>=2.3.4
 Requires-Dist: gdown
 Requires-Dist: httpx>=0.22.0
 Requires-Dist: jsonschema>=4
+Requires-Dist: loky
 Requires-Dist: lxml
 Requires-Dist: memory-profiler>=0.58.0
 Requires-Dist: numpy
```