PyPI - ocrd - Versions diffs - 3.7.0__py3-none-any.whl → 3.8.1__py3-none-any.whl - Mend

ocrd 3.7.0-py3-none-any.whl → 3.8.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35)

ocrd/cli/network.py +2 -0
ocrd/cli/resmgr.py +29 -65
ocrd/constants.py +0 -2
ocrd/ocrd-all-tool.json +25 -0
ocrd/processor/base.py +6 -16
ocrd/processor/builtin/dummy/ocrd-tool.json +25 -0
ocrd/processor/builtin/merge_processor.py +131 -0
ocrd/processor/builtin/param_command_header2unordered.json +7 -0
ocrd/processor/builtin/param_command_heading2unordered.json +7 -0
ocrd/processor/builtin/param_command_lines2orientation.json +6 -0
ocrd/processor/builtin/param_command_page-update-version.json +5 -0
ocrd/processor/builtin/param_command_transkribus-to-prima.json +8 -0
ocrd/processor/builtin/shell_processor.py +128 -0
ocrd/resource_manager.py +213 -124
{ocrd-3.7.0.dist-info → ocrd-3.8.1.dist-info}/METADATA +22 -3
{ocrd-3.7.0.dist-info → ocrd-3.8.1.dist-info}/RECORD +34 -26
{ocrd-3.7.0.dist-info → ocrd-3.8.1.dist-info}/entry_points.txt +2 -0
ocrd_models/ocrd_agent.py +3 -3
ocrd_network/__init__.py +1 -0
ocrd_network/cli/__init__.py +2 -0
ocrd_network/cli/resmgr_server.py +23 -0
ocrd_network/constants.py +3 -0
ocrd_network/logging_utils.py +5 -0
ocrd_network/resource_manager_server.py +182 -0
ocrd_network/runtime_data/connection_clients.py +1 -1
ocrd_network/runtime_data/hosts.py +43 -16
ocrd_network/runtime_data/network_agents.py +15 -1
ocrd_utils/__init__.py +5 -1
ocrd_utils/constants.py +5 -0
ocrd_utils/os.py +141 -61
ocrd_validators/ocrd_tool.schema.yml +7 -4
ocrd/resource_list.yml +0 -61
{ocrd-3.7.0.dist-info → ocrd-3.8.1.dist-info}/LICENSE +0 -0
{ocrd-3.7.0.dist-info → ocrd-3.8.1.dist-info}/WHEEL +0 -0
{ocrd-3.7.0.dist-info → ocrd-3.8.1.dist-info}/top_level.txt +0 -0

ocrd/cli/network.py CHANGED Viewed

@@ -12,6 +12,7 @@ from ocrd_network.cli import (
     client_cli,
     processing_server_cli,
     processing_worker_cli,
+    resource_manager_server_cli
 )
@@ -26,3 +27,4 @@ def network_cli():
 network_cli.add_command(client_cli)
 network_cli.add_command(processing_server_cli)
 network_cli.add_command(processing_worker_cli)
+network_cli.add_command(resource_manager_server_cli)

ocrd/cli/resmgr.py CHANGED Viewed

@@ -20,6 +20,7 @@ from ocrd_utils import (
     get_ocrd_tool_json,
     initLogging,
     RESOURCE_LOCATIONS,
+    RESOURCE_TYPES
 )
 from ocrd.constants import RESOURCE_USER_LIST_COMMENT
@@ -70,16 +71,16 @@ def list_installed(executable=None):
 @resmgr_cli.command('download')
 @click.option('-n', '--any-url', default='', help='URL of unregistered resource to download/copy from')
 @click.option('-D', '--no-dynamic', default=False, is_flag=True,
-              help="Whether to skip looking into each processor's --dump-{json,module-dir} for module-level resources")
-@click.option('-t', '--resource-type', type=click.Choice(['file', 'directory', 'archive']), default='file',
-              help='Type of resource',)
-@click.option('-P', '--path-in-archive', default='.', help='Path to extract in case of archive type')
+              help="Skip looking into each processor's --dump-{json,module-dir} module-registered resources")
+@click.option('-t', '--resource-type', type=click.Choice(RESOURCE_TYPES), default='file',
+              help='Type of resource (when unregistered or incomplete)',)
+@click.option('-P', '--path-in-archive', default='.', help='Path to extract in case of archive type (when unregistered or incomplete)')
 @click.option('-a', '--allow-uninstalled', is_flag=True,
-              help="Allow installing resources for uninstalled processors",)
+              help="Allow installing resources for not installed processors",)
 @click.option('-o', '--overwrite', help='Overwrite existing resources', is_flag=True)
-@click.option('-l', '--location', type=click.Choice(RESOURCE_LOCATIONS),
+@click.option('-l', '--location', type=click.Choice(RESOURCE_LOCATIONS),
               help="Where to store resources - defaults to first location in processor's 'resource_locations' "
-                   "list or finally 'data'")
+                   "list, i.e. usually 'data'")
 @click.argument('executable', required=True)
 @click.argument('name', required=False)
 def download(any_url, no_dynamic, resource_type, path_in_archive, allow_uninstalled, overwrite, location, executable,
@@ -106,8 +107,6 @@ def download(any_url, no_dynamic, resource_type, path_in_archive, allow_uninstal
         executable = None
     if name == '*':
         name = None
-    is_url = (any_url.startswith('https://') or any_url.startswith('http://')) if any_url else False
-    is_filename = Path(any_url).exists() if any_url else False
     if executable and not which(executable):
         if not allow_uninstalled:
             log.error(f"Executable '{executable}' is not installed. "
@@ -126,65 +125,30 @@ def download(any_url, no_dynamic, resource_type, path_in_archive, allow_uninstal
                 'path_in_archive': path_in_archive}]
             )]
     for this_executable, this_reslist in reslist:
-        for resdict in this_reslist:
-            if 'size' in resdict:
-                registered = "registered"
-            else:
-                registered = "unregistered"
-            if any_url:
-                resdict['url'] = any_url
-            if resdict['url'] == '???':
-                log.warning(f"Cannot download user resource {resdict['name']}")
-                continue
-            if resdict['url'].startswith('https://') or resdict['url'].startswith('http://'):
-                log.info(f"Downloading {registered} resource '{resdict['name']}' ({resdict['url']})")
-                if 'size' not in resdict:
-                    with requests.head(resdict['url']) as r:
-                        resdict['size'] = int(r.headers.get('content-length', 0))
-            else:
-                log.info(f"Copying {registered} resource '{resdict['name']}' ({resdict['url']})")
-                urlpath = Path(resdict['url'])
-                resdict['url'] = str(urlpath.resolve())
-                if Path(urlpath).is_dir():
-                    resdict['size'] = directory_size(urlpath)
-                else:
-                    resdict['size'] = urlpath.stat().st_size
-            if not location:
-                location = get_ocrd_tool_json(this_executable)['resource_locations'][0]
-            elif location not in get_ocrd_tool_json(this_executable)['resource_locations']:
-                log.error(f"The selected --location {location} is not in the {this_executable}'s resource search path, "
-                          f"refusing to install to invalid location")
-                sys.exit(1)
-            if location != 'module':
-                basedir = resmgr.location_to_resource_dir(location)
-            else:
-                basedir = get_moduledir(this_executable)
-                if not basedir:
-                    basedir = resmgr.location_to_resource_dir('data')
+        resource_locations = get_ocrd_tool_json(this_executable)['resource_locations']
+        if not location:
+            location = resource_locations[0]
+        elif location not in resource_locations:
+            log.warning(f"The selected --location {location} is not in the {this_executable}'s resource search path, "
+                        f"refusing to install to invalid location. Instead installing to: {resource_locations[0]}")
+        res_dest_dir = resmgr.build_resource_dest_dir(location=location, executable=this_executable)
+        for res_dict in this_reslist:
             try:
-                with click.progressbar(length=resdict['size']) as bar:
-                    fpath = resmgr.download(
-                        this_executable,
-                        resdict['url'],
-                        basedir,
-                        name=resdict['name'],
-                        resource_type=resdict.get('type', resource_type),
-                        path_in_archive=resdict.get('path_in_archive', path_in_archive),
-                        overwrite=overwrite,
-                        no_subdir=location in ['cwd', 'module'],
-                        progress_cb=lambda delta: bar.update(delta)
-                    )
-                if registered == 'unregistered':
-                    log.info(f"{this_executable} resource '{name}' ({any_url}) not a known resource, creating stub "
-                             f"in {resmgr.user_list}'")
-                    resmgr.add_to_user_database(this_executable, fpath, url=any_url)
-                resmgr.save_user_list()
-                log.info(f"Installed resource {resdict['url']} under {fpath}")
+                fpath = resmgr.handle_resource(
+                    res_dict=res_dict,
+                    executable=this_executable,
+                    dest_dir=res_dest_dir,
+                    any_url=any_url,
+                    overwrite=overwrite,
+                    resource_type=resource_type,
+                    path_in_archive=path_in_archive
+                )
+                if not fpath:
+                    continue
             except FileExistsError as exc:
                 log.info(str(exc))
-            log.info(f"Use in parameters as "
-                     f"'{resmgr.parameter_usage(resdict['name'], usage=resdict.get('parameter_usage', 'as-is'))}'")
+            usage = res_dict.get('parameter_usage', 'as-is')
+            log.info(f"Use in parameters as '{resmgr.parameter_usage(res_dict['name'], usage)}'")
 @resmgr_cli.command('migrate')

ocrd/constants.py CHANGED Viewed

@@ -9,7 +9,6 @@ __all__ = [
     'DOWNLOAD_DIR',
     'DEFAULT_REPOSITORY_URL',
     'BASHLIB_FILENAME',
-    'RESOURCE_LIST_FILENAME',
     'BACKUP_DIR',
     'RESOURCE_USER_LIST_COMMENT',
 ]
@@ -19,6 +18,5 @@ DEFAULT_UPLOAD_FOLDER = '/tmp/uploads-ocrd-core'
 DOWNLOAD_DIR = '/tmp/ocrd-core-downloads'
 DEFAULT_REPOSITORY_URL = 'http://localhost:5000/'
 BASHLIB_FILENAME = resource_filename(__package__, 'lib.bash')
-RESOURCE_LIST_FILENAME = resource_filename(__package__, 'resource_list.yml')
 RESOURCE_USER_LIST_COMMENT = "# OCR-D private resource list (consider sending a PR with your own resources to OCR-D/core)"
 BACKUP_DIR = '.backup'

ocrd/ocrd-all-tool.json CHANGED Viewed

@@ -41,5 +41,30 @@
     "description": "Whether to extract an image for each filtered segment and write to the output fileGrp."
    }
   }
+ },
+ "ocrd-command": {
+  "executable": "ocrd-command",
+  "description": "Bare-bones processor runs shell commands to process PAGE files",
+  "steps": ["recognition/text-recognition", "recognition/font-identification", "recognition/post-correction", "layout/segmentation", "layout/analysis"],
+  "categories": [],
+  "input_file_grp_cardinality": [1, -1],
+  "output_file_grp_cardinality": 1,
+  "parameters": {
+   "command": {
+    "type": "string",
+    "default": "cat @INFILE > @OUTFILE",
+    "description": "Shell command to operate on PAGE files, with @INFILE as place-holder for the input file path(s), and @OUTFILE as place-holder for the output file path. If running on multiple input fileGrps, then @INFILE must be repeated as many times."
+   }
+  }
+ },
+ "ocrd-merge": {
+  "executable": "ocrd-merge",
+  "description": "Bare-bones processor merges annotations from multiple fileGrps",
+  "steps": ["layout/segmentation"],
+  "categories": [],
+  "input_file_grp_cardinality": [1, -1],
+  "output_file_grp_cardinality": 1,
+  "parameters": {
+  }
  }
 }

ocrd/processor/base.py CHANGED Viewed

@@ -42,15 +42,14 @@ from .ocrd_page_result import OcrdPageResult
 from ocrd_utils import (
     VERSION as OCRD_VERSION,
     MIMETYPE_PAGE,
-    MIME_TO_EXT,
     config,
     getLogger,
     list_resource_candidates,
-    pushd_popd,
     list_all_resources,
     get_processor_resource_types,
     resource_filename,
     parse_json_file_with_comments,
+    pushd_popd,
     make_file_id,
     deprecation_warning
 )
@@ -608,7 +607,7 @@ class Processor():
         """
         Ensure all input files for a single page are
         downloaded to the workspace, then schedule
-        :py:meth:`.process_process_file` to be run on
+        :py:meth:`.process_page_file` to be run on
         them via `executor` (enforcing a per-page time
         limit of `max_seconds`).
@@ -935,9 +934,8 @@ class Processor():
             cwd = self.old_pwd
         else:
             cwd = getcwd()
-        ret = [cand for cand in list_resource_candidates(executable, val,
-                                                         cwd=cwd, moduled=self.moduledir)
-               if exists(cand)]
+        ret = list(filter(exists, list_resource_candidates(executable, val,
+                                                           cwd=cwd, moduled=self.moduledir)))
         if ret:
             self._base_logger.debug("Resolved %s to absolute path %s" % (val, ret[0]))
             return ret[0]
@@ -968,17 +966,9 @@ class Processor():
         """
         List all resources found in the filesystem and matching content-type by filename suffix
         """
-        mimetypes = get_processor_resource_types(None, self.ocrd_tool)
-        for res in list_all_resources(self.ocrd_tool['executable'], moduled=self.moduledir):
+        for res in list_all_resources(self.executable, ocrd_tool=self.ocrd_tool, moduled=self.moduledir):
             res = Path(res)
-            if '*/*' not in mimetypes:
-                if res.is_dir() and 'text/directory' not in mimetypes:
-                    continue
-                # if we do not know all MIME types, then keep the file, otherwise require suffix match
-                if res.is_file() and not any(res.suffix == MIME_TO_EXT.get(mime, res.suffix)
-                                             for mime in mimetypes):
-                    continue
-            yield res
+            yield res.name
     @property
     def module(self):

ocrd/processor/builtin/dummy/ocrd-tool.json CHANGED Viewed

@@ -37,6 +37,31 @@
           "description": "Whether to extract an image for each filtered segment and write to the output fileGrp."
         }
       }
+    },
+    "ocrd-command": {
+      "executable": "ocrd-command",
+      "description": "Bare-bones processor runs shell commands to process PAGE files",
+        "steps": ["recognition/text-recognition", "recognition/font-identification", "recognition/post-correction", "layout/segmentation", "layout/analysis"],
+      "categories": [],
+      "input_file_grp_cardinality": [1, -1],
+      "output_file_grp_cardinality": 1,
+      "parameters": {
+        "command": {
+          "type": "string",
+          "default": "cat @INFILE > @OUTFILE",
+          "description": "Shell command to operate on PAGE files, with @INFILE as place-holder for the input file path(s), and @OUTFILE as place-holder for the output file path. If running on multiple input fileGrps, then @INFILE must be repeated as many times."
+        }
+      }
+    },
+    "ocrd-merge": {
+      "executable": "ocrd-merge",
+      "description": "Bare-bones processor merges annotations from multiple fileGrps",
+        "steps": ["layout/segmentation"],
+      "categories": [],
+      "input_file_grp_cardinality": [1, -1],
+      "output_file_grp_cardinality": 1,
+      "parameters": {
+      }
     }
   }
 }

ocrd/processor/builtin/merge_processor.py ADDED Viewed

@@ -0,0 +1,131 @@
+# pylint: disable=missing-module-docstring,invalid-name
+from typing import Optional
+from itertools import count
+from collections import OrderedDict as odict
+import click
+from ocrd import Processor, OcrdPageResult
+from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
+from ocrd_modelfactory import page_from_file
+from ocrd_models import OcrdPage
+from ocrd_models.ocrd_page import (
+    BorderType,
+    CoordsType,
+    ReadingOrderType,
+    UnorderedGroupType,
+)
+from ocrd_utils import bbox_from_points
+_SEGTYPES = [
+    "NoiseRegion",
+    "LineDrawingRegion",
+    "AdvertRegion",
+    "ImageRegion",
+    "ChartRegion",
+    "MusicRegion",
+    "GraphicRegion",
+    "UnknownRegion",
+    "CustomRegion",
+    "SeparatorRegion",
+    "MathsRegion",
+    "TextRegion",
+    "MapRegion",
+    "ChemRegion",
+    "TableRegion",
+    "TextLine",
+    "Word",
+    "Glyph"
+]
+def get_border_bbox(pcgts):
+    if pcgts.Page.Border is None:
+        return [0, 0, pcgts.Page.imageWidth, pcgts.Page.imageHeight]
+    return bbox_from_points(pcgts.Page.Border.Coords.points)
+def rename_segments(pcgts, start=1):
+    renamed = {}
+    rodict = pcgts.Page.get_ReadingOrderGroups()
+    # get everything that has an identifier
+    nodes = pcgts.xpath("//*[@id]")
+    # filter segments
+    segments = [segment for segment in map(pcgts.revmap.get, nodes)
+                # get PAGE objects from matching etree nodes
+                # but allow only hierarchy segments
+                if segment.__class__.__name__.replace('Type', '') in _SEGTYPES]
+    # count segments and rename them
+    # fixme: or perhaps better to have each segment type named and counted differently?
+    num = 0
+    regions = []
+    for num, segment in zip(count(start=start), segments):
+        segtype = segment.original_tagname_
+        #parent = segment.parent_object_
+        newname = "seg%011d" % num
+        assert not segment.id in renamed
+        if segtype.endswith('Region') and segment.id in rodict:
+            # update reading order
+            roelem = rodict[segment.id]
+            roelem.regionRef = newname
+        renamed[segment.id] = newname
+        segment.id = newname
+    return num
+class MergeProcessor(Processor):
+    def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult:
+        """
+        Merge PAGE segment hierarchy elements from all input file groups.
+        For each page, open and deserialise PAGE input files. Rename all elements
+        of the segment hierarchy to new (clash-free) identifers. Redefine the
+        `Border` coordinates as the convex hull of all input borders. Then add all
+        regions from all input files, concatenating them into a single `ReadingOrder`
+        in the order of input file groups.
+        Produce a new PAGE output file by serialising the resulting hierarchy.
+        """
+        actual_pcgts = list(filter(None, input_pcgts))
+        assert len(set(pcgts.Page.imageFilename for pcgts in actual_pcgts)) == 1, \
+            "input files must all reference the same @imageFilename"
+        # create new PAGE for image
+        result = OcrdPageResult(page_from_file(actual_pcgts[0].Page.imageFilename))
+        # unify Border
+        borders = [get_border_bbox(pcgts) for pcgts in actual_pcgts]
+        minx, miny, maxx, maxy = zip(*borders)
+        minx = min(minx)
+        miny = min(miny)
+        maxx = max(maxx)
+        maxy = max(maxy)
+        result.pcgts.Page.set_Border(
+            BorderType(CoordsType(
+                points=f"{minx},{miny} {maxx},{miny} {maxx},{maxy} {minx},{maxy}")))
+        # rename all segments
+        num = 1
+        for pcgts in actual_pcgts:
+            num = rename_segments(pcgts, num)
+        # concatenate all regions
+        ug = UnorderedGroupType(id="merged")
+        result.pcgts.Page.set_ReadingOrder(ReadingOrderType(UnorderedGroup=ug))
+        for pcgts in actual_pcgts:
+            for region in pcgts.Page.get_AllRegions():
+                adder = getattr(result.pcgts.Page, 'add_' + region.original_tagname_)
+                adder(region)
+            if pcgts.Page.ReadingOrder:
+                group = pcgts.Page.ReadingOrder.OrderedGroup or pcgts.Page.ReadingOrder.UnorderedGroup
+                adder = getattr(ug, 'add_' + group.original_tagname_)
+                adder(group)
+        return result
+    @property
+    def metadata_filename(self):
+        return 'processor/builtin/dummy/ocrd-tool.json'
+    @property
+    def executable(self):
+        return 'ocrd-merge'
+@click.command()
+@ocrd_cli_options
+def cli(*args, **kwargs):
+    return ocrd_cli_wrap_processor(MergeProcessor, *args, **kwargs)

ocrd/processor/builtin/param_command_header2unordered.json ADDED Viewed

@@ -0,0 +1,7 @@
+{
+    # requires https://github.com/bertsky/workflow-configuration installed
+    # partitions PAGE-XML ReadingOrder from single OrderedGroup to top
+    # UnorderedGroup divided into OrderedGroups starting at every @type=header
+    # text regions.
+    "command": "page-header2unordered @INFILE > @OUTFILE"
+}

ocrd/processor/builtin/param_command_heading2unordered.json ADDED Viewed

@@ -0,0 +1,7 @@
+{
+    # requires https://github.com/bertsky/workflow-configuration installed
+    # partitions PAGE-XML ReadingOrder from single OrderedGroup to top
+    # UnorderedGroup divided into OrderedGroups starting at every @type=heading
+    # text regions.
+    "command": "page-heading2unordered @INFILE > @OUTFILE"
+}

ocrd/processor/builtin/param_command_lines2orientation.json ADDED Viewed

@@ -0,0 +1,6 @@
+{
+    # requires https://github.com/bertsky/workflow-configuration installed
+    # retrieves lines from all paragraphs and geometrically determines
+    # their average skew, annotating the result under /Page/@orientation
+    "command": "page-lines2orientation @INFILE > @OUTFILE"
+}

ocrd/processor/builtin/param_command_page-update-version.json ADDED Viewed

@@ -0,0 +1,5 @@
+{
+    # requires https://github.com/PRImA-Research-Lab/prima-page-converter installed
+    # with the main JAR file copied to /usr/local/share/
+    "command": "java -jar /usr/local/share/PageConverter.jar -source-xml @INFILE -convert-to LATEST -target-xml @OUTFILE"
+}

ocrd/processor/builtin/param_command_transkribus-to-prima.json ADDED Viewed

@@ -0,0 +1,8 @@
+{
+    # requires https://github.com/kba/transkribus-to-prima installed
+    # converts PAGE from Transkribus dialect to PRImA standard
+    # also runs various fixes often necessary to make this work:
+    # - ensuring coordinates range within the image size
+    # - ensuring segment identifiers do not start with numbers
+    "command": "page-fix-coordinates @INFILE - | sed -e 's/ id=\"/ id=\"id/' -e 's/regionRef=\"/regionRef=\"id/' | transkribus-to-prima -V - @OUTFILE"
+}

ocrd/processor/builtin/shell_processor.py ADDED Viewed

@@ -0,0 +1,128 @@
+# pylint: disable=missing-module-docstring,invalid-name
+from typing import Optional, get_args
+import os
+import subprocess
+from tempfile import TemporaryDirectory
+import click
+from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
+from ocrd import Processor
+from ocrd_models import OcrdPage, OcrdFileType
+from ocrd_models.ocrd_page import to_xml
+from ocrd_modelfactory import page_from_file
+from ocrd_utils import config, make_file_id, MIMETYPE_PAGE
+class ShellProcessor(Processor):
+    def setup(self):
+        command = self.parameter['command']
+        if '@INFILE' not in command:
+            raise Exception("command parameter requires @INFILE pattern")
+        if '@OUTFILE' not in command:
+            raise Exception("command parameter requires @OUTFILE pattern")
+    def process_page_file(self, *input_files: Optional[OcrdFileType]) -> None:
+        """
+        Process PAGE files via arbitrary command line on the shell.
+        \b
+        For each selected physical page of the workspace, pass ``command``
+        to the shell, replacing:
+        - the string ``@INFILE`` with the PAGE input file path,
+        - the string ``@OUTFILE`` with the PAGE output file path.
+        Modify the resulting PAGE output file with our new `@pcGtsId` and
+        metadata.
+        """
+        input_paths: List[str] = [""] * len(input_files)
+        input_pos = next(i for i, input_file in enumerate(input_files)
+                         if input_file is not None)
+        page_id = input_files[input_pos].pageId
+        self._base_logger.info("processing page %s", page_id)
+        for i, input_file in enumerate(input_files):
+            grp = self.input_file_grp.split(',')[i]
+            if input_file is None:
+                self._base_logger.debug(f"ignoring missing file for input fileGrp {grp} for page {page_id}")
+                continue
+            assert isinstance(input_file, get_args(OcrdFileType))
+            if not input_file.local_filename:
+                self._base_logger.error(f'No local file exists for page {page_id} in file group {grp}')
+                if config.OCRD_MISSING_INPUT == 'ABORT':
+                    raise MissingInputFile(grp, page_id, input_file.mimetype)
+                continue
+            self._base_logger.debug(f"parsing file {input_file.ID} for page {page_id}")
+            if os.path.exists(input_file.local_filename):
+                input_paths[i] = input_file.local_filename
+            else:
+                self._base_logger.error(f"non-existing local file for input fileGrp {grp} for page {page_id}")
+        if not any(input_paths):
+            self._base_logger.warning(f'skipping page {page_id}')
+            return
+        output_file_id = make_file_id(input_files[input_pos], self.output_file_grp)
+        if input_files[input_pos].fileGrp == self.output_file_grp:
+            # input=output fileGrp: re-use ID exactly
+            output_file_id = input_files[input_pos].ID
+        output_file = next(self.workspace.mets.find_files(ID=output_file_id), None)
+        if output_file and config.OCRD_EXISTING_OUTPUT != 'OVERWRITE':
+            # short-cut avoiding useless computation:
+            raise FileExistsError(
+                f"A file with ID=={output_file_id} already exists {output_file} and neither force nor ignore are set"
+            )
+        command = self.parameter['command']
+        with TemporaryDirectory(suffix=page_id) as tmpdir:
+            out_path = os.path.join(tmpdir, output_file_id + ".xml")
+            # remove quotation around filename patterns, if any
+            command = command.replace('"@INFILE"', '@INFILE').replace('"@OUTFILE"', '@OUTFILE')
+            command = command.replace("'@INFILE'", '@INFILE').replace("'@OUTFILE'", '@OUTFILE')
+            # replace filename patterns with actual paths, quoted
+            for in_path in input_paths:
+                command = command.replace('@INFILE', '"' + in_path + '"', 1)
+            command = command.replace('@OUTFILE', '"' + out_path + '"')
+            # execute command pattern
+            self.logger.debug("Running command: '%s'", command)
+            # pylint: disable=subprocess-run-check
+            result = subprocess.run(command, shell=True,
+                                    universal_newlines=True,
+                                    stdout=subprocess.PIPE,
+                                    stderr=subprocess.PIPE)
+            self.logger.debug("Command for %s returned: %d", page_id, result.returncode)
+            if result.stdout:
+                self.logger.info("Command for %s stdout: %s", page_id, result.stdout)
+            if result.stderr:
+                self.logger.warning("Command for %s stderr: %s", page_id, result.stderr)
+            if result.returncode != 0:
+                self.logger.error("Command for %s failed", page_id)
+                return
+            try:
+                result = page_from_file(out_path)
+                assert isinstance(result, OcrdPage)
+            except ValueError as err:
+                # not PAGE and not an image to generate PAGE for
+                self._base_logger.error(f"non-PAGE output for page {page_id}: {err}")
+                return
+        result.set_pcGtsId(output_file_id)
+        self.add_metadata(result)
+        self.workspace.add_file(
+            file_id=output_file_id,
+            file_grp=self.output_file_grp,
+            page_id=page_id,
+            local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'),
+            mimetype=MIMETYPE_PAGE,
+            content=to_xml(result),
+        )
+    @property
+    def metadata_filename(self):
+        return 'processor/builtin/dummy/ocrd-tool.json'
+    @property
+    def executable(self):
+        return 'ocrd-command'
+@click.command()
+@ocrd_cli_options
+def cli(*args, **kwargs):
+    return ocrd_cli_wrap_processor(ShellProcessor, *args, **kwargs)

ocrd 3.7.0__py3-none-any.whl → 3.8.1__py3-none-any.whl

ocrd 3.7.0-py3-none-any.whl → 3.8.1-py3-none-any.whl