ocrd 3.0.0b4__py3-none-any.whl → 3.0.0b6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ocrd/cli/bashlib.py +6 -4
- ocrd/cli/ocrd_tool.py +1 -1
- ocrd/cli/validate.py +6 -3
- ocrd/cli/workspace.py +71 -56
- ocrd/decorators/__init__.py +6 -6
- ocrd/decorators/ocrd_cli_options.py +1 -0
- ocrd/lib.bash +24 -21
- ocrd/mets_server.py +39 -8
- ocrd/processor/base.py +307 -89
- ocrd/processor/builtin/dummy_processor.py +0 -2
- ocrd/processor/helpers.py +16 -7
- ocrd/processor/ocrd_page_result.py +2 -2
- ocrd/workspace.py +3 -0
- {ocrd-3.0.0b4.dist-info → ocrd-3.0.0b6.dist-info}/METADATA +2 -1
- {ocrd-3.0.0b4.dist-info → ocrd-3.0.0b6.dist-info}/RECORD +23 -23
- {ocrd-3.0.0b4.dist-info → ocrd-3.0.0b6.dist-info}/WHEEL +1 -1
- ocrd_models/ocrd_mets.py +9 -0
- ocrd_models/ocrd_page_generateds.py +44 -11
- ocrd_utils/logging.py +6 -2
- ocrd_utils/str.py +2 -1
- {ocrd-3.0.0b4.dist-info → ocrd-3.0.0b6.dist-info}/LICENSE +0 -0
- {ocrd-3.0.0b4.dist-info → ocrd-3.0.0b6.dist-info}/entry_points.txt +0 -0
- {ocrd-3.0.0b4.dist-info → ocrd-3.0.0b6.dist-info}/top_level.txt +0 -0
ocrd/processor/base.py
CHANGED
```diff
@@ -16,14 +16,20 @@ import json
 import os
 from os import getcwd
 from pathlib import Path
-from typing import List, Optional, Union, get_args
+from typing import Any, Dict, List, Optional, Tuple, Union, get_args
 import sys
 import inspect
 import tarfile
 import io
 import weakref
+from collections import defaultdict
 from frozendict import frozendict
-
+# concurrent.futures is buggy in py38,
+# this is where the fixes came from:
+from loky import Future, ProcessPoolExecutor
+import multiprocessing as mp
+from threading import Timer
+from _thread import interrupt_main
 
 from click import wrap_text
 from deprecated import deprecated
@@ -105,6 +111,31 @@ class MissingInputFile(ValueError):
                     f"and pageId {pageId} under mimetype {mimetype or 'PAGE+image(s)'}")
         super().__init__(self.message)
 
+class DummyFuture:
+    """
+    Mimics some of `concurrent.futures.Future` but runs immediately.
+    """
+    def __init__(self, fn, *args, **kwargs):
+        self.fn = fn
+        self.args = args
+        self.kwargs = kwargs
+    def result(self):
+        return self.fn(*self.args, **self.kwargs)
+class DummyExecutor:
+    """
+    Mimics some of `concurrent.futures.ProcessPoolExecutor` but runs
+    everything immediately in this process.
+    """
+    def __init__(self, initializer=None, initargs=(), **kwargs):
+        initializer(*initargs)
+    def shutdown(self, **kwargs):
+        pass
+    def submit(self, fn, *args, **kwargs) -> DummyFuture:
+        return DummyFuture(fn, *args, **kwargs)
+
+TFuture = Union[DummyFuture, Future]
+TExecutor = Union[DummyExecutor, ProcessPoolExecutor]
+
 class Processor():
     """
     A processor is a tool that implements the uniform OCR-D
```
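The new DummyFuture/DummyExecutor shim keeps one submit/result control flow whether pages run in forked loky workers or sequentially in-process. A minimal standalone sketch of the pattern (the `make_executor` helper and demo names are ours, not part of the ocrd API; loky is assumed installed, as it is the new dependency added to METADATA below):

```python
from loky import ProcessPoolExecutor  # the pool actually used by the diff above

class DummyFuture:
    def __init__(self, fn, *args, **kwargs):
        self.fn, self.args, self.kwargs = fn, args, kwargs
    def result(self):
        # runs only when the result is requested, in this very process
        return self.fn(*self.args, **self.kwargs)

class DummyExecutor:
    def __init__(self, initializer=None, initargs=(), **kwargs):
        if initializer:
            initializer(*initargs)
    def shutdown(self, **kwargs):
        pass
    def submit(self, fn, *args, **kwargs):
        return DummyFuture(fn, *args, **kwargs)

def make_executor(max_workers: int):
    # same selection logic as process_workspace: real pool only when parallel
    executor_cls = ProcessPoolExecutor if max_workers > 1 else DummyExecutor
    return executor_cls(max_workers=max_workers or 1)

if __name__ == '__main__':
    executor = make_executor(1)          # sequential: no subprocess is forked
    task = executor.submit(pow, 2, 10)
    print(task.result())                 # 1024, computed lazily in-process
    executor.shutdown()
```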
```diff
@@ -339,7 +370,7 @@ class Processor():
         self._finalizer = weakref.finalize(self, self.shutdown)
         # workaround for deprecated#72 (@deprecated decorator does not work for subclasses):
         setattr(self, 'process',
-                deprecated(version='3.0', reason='process() should be replaced with
+                deprecated(version='3.0', reason='process() should be replaced with process_page_pcgts() or process_page_file() or process_workspace()')(getattr(self, 'process')))
 
     def show_help(self, subcommand=None):
         """
@@ -358,6 +389,7 @@ class Processor():
         """
         Verify that :py:attr:`input_file_grp` and :py:attr:`output_file_grp` fulfill the processor's requirements.
         """
+        # verify input and output file groups in parameters
         assert self.input_file_grp is not None
         assert self.output_file_grp is not None
         input_file_grps = self.input_file_grp.split(',')
@@ -374,12 +406,23 @@ class Processor():
             assert len(grps) >= minimum, msg % (len(grps), str(spec))
             if maximum > 0:
                 assert len(grps) <= maximum, msg % (len(grps), str(spec))
-
-
-
-
+        # FIXME: enforce unconditionally as soon as grace period for deprecation is over
+        if 'input_file_grp_cardinality' in self.ocrd_tool:
+            assert_file_grp_cardinality(input_file_grps, self.ocrd_tool['input_file_grp_cardinality'],
+                                        "Unexpected number of input file groups %d vs %s")
+        if 'output_file_grp_cardinality' in self.ocrd_tool:
+            assert_file_grp_cardinality(output_file_grps, self.ocrd_tool['output_file_grp_cardinality'],
+                                        "Unexpected number of output file groups %d vs %s")
+        # verify input and output file groups in METS
         for input_file_grp in input_file_grps:
-            assert input_file_grp in self.workspace.mets.file_groups
+            assert input_file_grp in self.workspace.mets.file_groups, \
+                f"input fileGrp {input_file_grp} does not exist in workspace {self.workspace}"
+        for output_file_grp in output_file_grps:
+            assert output_file_grp not in self.workspace.mets.file_groups \
+                or config.OCRD_EXISTING_OUTPUT in ['OVERWRITE', 'SKIP'] \
+                or not any(self.workspace.mets.find_files(
+                    pageId=self.page_id, fileGrp=output_file_grp)), \
+                f"output fileGrp {output_file_grp} already exists in workspace {self.workspace}"
         # keep this for backwards compatibility:
         return True
 
```
```diff
@@ -444,6 +487,9 @@ class Processor():
         for the given :py:data:`page_id` (or all pages)
         under the given :py:data:`parameter`.
 
+        Delegates to :py:meth:`.process_workspace_submit_tasks`
+        and :py:meth:`.process_workspace_handle_tasks`.
+
         (This will iterate over pages and files, calling
         :py:meth:`.process_page_file` and handling exceptions.
         It should be overridden by subclasses to handle cases
@@ -453,11 +499,7 @@ class Processor():
         self.workspace = workspace
         self.verify()
         try:
-
-            nr_skipped = 0
-            nr_copied = 0
-
-            # set up multithreading
+            # set up multitasking
             max_workers = max(0, config.OCRD_MAX_PARALLEL_PAGES)
             if self.max_workers > 0 and self.max_workers < config.OCRD_MAX_PARALLEL_PAGES:
                 self._base_logger.info("limiting number of threads from %d to %d", max_workers, self.max_workers)
@@ -469,84 +511,217 @@ class Processor():
             if self.max_page_seconds > 0 and self.max_page_seconds < config.OCRD_PROCESSING_PAGE_TIMEOUT:
                 self._base_logger.info("limiting page timeout from %d to %d sec", max_seconds, self.max_page_seconds)
                 max_seconds = self.max_page_seconds
-
+
+            if max_workers > 1:
+                executor_cls = ProcessPoolExecutor
+            else:
+                executor_cls = DummyExecutor
+            executor = executor_cls(
                 max_workers=max_workers or 1,
-
+                # only forking method avoids pickling
+                context=mp.get_context('fork'),
+                # share processor instance as global to avoid pickling
+                initializer=_page_worker_set_ctxt,
+                initargs=(self,),
             )
-
-
-
-
-
-
-                                for input_file in input_file_tuple
-                                if input_file)
-            self._base_logger.info(f"preparing page {page_id}")
-            for i, input_file in enumerate(input_file_tuple):
-                if input_file is None:
-                    # file/page not found in this file grp
-                    continue
-                input_files[i] = input_file
-                if not self.download:
-                    continue
-                try:
-                    input_files[i] = self.workspace.download_file(input_file)
-                except (ValueError, FileNotFoundError, HTTPError) as e:
-                    self._base_logger.error(repr(e))
-                    self._base_logger.warning(f"failed downloading file {input_file} for page {page_id}")
-                # process page
-                tasks[executor.submit(self.process_page_file, *input_files)] = (page_id, input_files)
-            self._base_logger.debug("submitted %d processing tasks", len(tasks))
-
-            for task in tasks:
-                # wait for results, handle errors
-                page_id, input_files = tasks[task]
-                # FIXME: differentiate error cases in various ways:
-                # - ResourceNotFoundError → use ResourceManager to download (once), then retry
-                # - transient (I/O or OOM) error → maybe sleep, retry
-                # - persistent (data) error → skip / dummy / raise
-                try:
-                    self._base_logger.debug("waiting for output of task %s (page %s) max_seconds=%d", task, page_id, max_seconds)
-                    task.result(timeout=max_seconds or None)
-                    nr_succeeded += 1
-                # exclude NotImplementedError, so we can try process() below
-                except NotImplementedError:
-                    raise
-                # handle input failures separately
-                except FileExistsError as err:
-                    if config.OCRD_EXISTING_OUTPUT == 'ABORT':
-                        raise err
-                    if config.OCRD_EXISTING_OUTPUT == 'SKIP':
-                        continue
-                    if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE':
-                        # too late here, must not happen
-                        raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE")
-                # broad coverage of output failures (including TimeoutError)
-                except (Exception, TimeoutError) as err:
-                    # FIXME: add re-usable/actionable logging
-                    if config.OCRD_MISSING_OUTPUT == 'ABORT':
-                        self._base_logger.error(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}")
-                        raise err
-                    self._base_logger.exception(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}")
-                    if config.OCRD_MISSING_OUTPUT == 'SKIP':
-                        nr_skipped += 1
-                        continue
-                    if config.OCRD_MISSING_OUTPUT == 'COPY':
-                        self._copy_page_file(input_files[0])
-                        nr_copied += 1
-                    else:
-                        desc = config.describe('OCRD_MISSING_OUTPUT', wrap_text=False, indent_text=False)
-                        raise ValueError(f"unknown configuration value {config.OCRD_MISSING_OUTPUT} - {desc}")
-
-            if nr_skipped > 0 and nr_succeeded / nr_skipped < config.OCRD_MAX_MISSING_OUTPUTS:
-                raise Exception(f"too many failures with skipped output ({nr_skipped})")
-            if nr_copied > 0 and nr_succeeded / nr_copied < config.OCRD_MAX_MISSING_OUTPUTS:
-                raise Exception(f"too many failures with fallback output ({nr_skipped})")
-            executor.shutdown()
+            try:
+                self._base_logger.debug("started executor %s with %d workers", str(executor), max_workers or 1)
+                tasks = self.process_workspace_submit_tasks(executor, max_seconds)
+                stats = self.process_workspace_handle_tasks(tasks)
+            finally:
+                executor.shutdown(kill_workers=True, wait=False)
 
         except NotImplementedError:
             # fall back to deprecated method
-
+            try:
+                self.process()
+            except Exception as err:
+                # suppress the NotImplementedError context
+                raise err from None
+
+    def process_workspace_submit_tasks(self, executor : TExecutor, max_seconds : int) -> Dict[TFuture, Tuple[str, List[Optional[OcrdFileType]]]]:
+        """
+        Look up all input files of the given ``workspace``
+        from the given :py:data:`input_file_grp`
+        for the given :py:data:`page_id` (or all pages),
+        and schedules calling :py:meth:`.process_page_file`
+        on them for each page via `executor` (enforcing
+        a per-page time limit of `max_seconds`).
+
+        When running with `OCRD_MAX_PARALLEL_PAGES>1` and
+        the workspace via METS Server, the executor will fork
+        this many worker parallel subprocesses each processing
+        one page at a time. (Interprocess communication is
+        done via task and result queues.)
+
+        Otherwise, tasks are run sequentially in the
+        current process.
+
+        Delegates to :py:meth:`.zip_input_files` to get
+        the input files for each page, and then calls
+        :py:meth:`.process_workspace_submit_page_task`.
+
+        Returns a dict mapping the per-page tasks
+        (i.e. futures submitted to the executor)
+        to their corresponding pageId and input files.
+        """
+        tasks = {}
+        for input_file_tuple in self.zip_input_files(on_error='abort', require_first=False):
+            task, page_id, input_files = self.process_workspace_submit_page_task(executor, max_seconds, input_file_tuple)
+            tasks[task] = (page_id, input_files)
+        self._base_logger.debug("submitted %d processing tasks", len(tasks))
+        return tasks
+
+    def process_workspace_submit_page_task(self, executor : TExecutor, max_seconds : int, input_file_tuple : List[Optional[OcrdFileType]]) -> Tuple[TFuture, str, List[Optional[OcrdFileType]]]:
+        """
+        Ensure all input files for a single page are
+        downloaded to the workspace, then schedule
+        :py:meth:`.process_process_file` to be run on
+        them via `executor` (enforcing a per-page time
+        limit of `max_seconds`).
+
+        Delegates to :py:meth:`.process_page_file`
+        (wrapped in :py:func:`_page_worker` to share
+        the processor instance across forked processes).
+
+        \b
+        Returns a tuple of:
+        - the scheduled future object,
+        - the corresponding pageId,
+        - the corresponding input files.
+        """
+        input_files : List[Optional[OcrdFileType]] = [None] * len(input_file_tuple)
+        page_id = next(input_file.pageId
+                       for input_file in input_file_tuple
+                       if input_file)
+        self._base_logger.info(f"preparing page {page_id}")
+        for i, input_file in enumerate(input_file_tuple):
+            if input_file is None:
+                # file/page not found in this file grp
+                continue
+            input_files[i] = input_file
+            if not self.download:
+                continue
+            try:
+                input_files[i] = self.workspace.download_file(input_file)
+            except (ValueError, FileNotFoundError, HTTPError) as e:
+                self._base_logger.error(repr(e))
+                self._base_logger.warning(f"failed downloading file {input_file} for page {page_id}")
+        # process page
+        #executor.submit(self.process_page_file, *input_files)
+        return executor.submit(_page_worker, max_seconds, *input_files), page_id, input_files
+
+    def process_workspace_handle_tasks(self, tasks : Dict[TFuture, Tuple[str, List[Optional[OcrdFileType]]]]) -> Tuple[int, int, Dict[str, int], int]:
+        """
+        Look up scheduled per-page futures one by one,
+        handle errors (exceptions) and gather results.
+
+        \b
+        Enforces policies configured by the following
+        environment variables:
+        - `OCRD_EXISTING_OUTPUT` (abort/skip/overwrite)
+        - `OCRD_MISSING_OUTPUT` (abort/skip/fallback-copy)
+        - `OCRD_MAX_MISSING_OUTPUTS` (abort after all).
+
+        \b
+        Returns a tuple of:
+        - the number of successfully processed pages
+        - the number of failed (i.e. skipped or copied) pages
+        - a dict of the type and corresponding number of exceptions seen
+        - the number of total requested pages (i.e. success+fail+existing).
+
+        Delegates to :py:meth:`.process_workspace_handle_page_task`
+        for each page.
+        """
+        # aggregate info for logging:
+        nr_succeeded = 0
+        nr_failed = 0
+        nr_errors = defaultdict(int) # count causes
+        if config.OCRD_MISSING_OUTPUT == 'SKIP':
+            reason = "skipped"
+        elif config.OCRD_MISSING_OUTPUT == 'COPY':
+            reason = "fallback-copied"
+        for task in tasks:
+            # wait for results, handle errors
+            page_id, input_files = tasks[task]
+            result = self.process_workspace_handle_page_task(page_id, input_files, task)
+            if isinstance(result, Exception):
+                nr_errors[result.__class__.__name__] += 1
+                nr_failed += 1
+                # FIXME: this is just prospective, because len(tasks)==nr_failed+nr_succeeded is not guaranteed
+                if config.OCRD_MAX_MISSING_OUTPUTS > 0 and nr_failed / len(tasks) > config.OCRD_MAX_MISSING_OUTPUTS:
+                    # already irredeemably many failures, stop short
+                    nr_errors = dict(nr_errors)
+                    raise Exception(f"too many failures with {reason} output ({nr_failed} of {nr_failed+nr_succeeded}, {str(nr_errors)})")
+            elif result:
+                nr_succeeded += 1
+            # else skipped - already exists
+        nr_errors = dict(nr_errors)
+        if nr_failed > 0:
+            nr_all = nr_succeeded + nr_failed
+            if config.OCRD_MAX_MISSING_OUTPUTS > 0 and nr_failed / nr_all > config.OCRD_MAX_MISSING_OUTPUTS:
+                raise Exception(f"too many failures with {reason} output ({nr_failed} of {nr_all}, {str(nr_errors)})")
+            self._base_logger.warning("%s %d of %d pages due to %s", reason, nr_failed, nr_all, str(nr_errors))
+        return nr_succeeded, nr_failed, nr_errors, len(tasks)
+
+    def process_workspace_handle_page_task(self, page_id : str, input_files : List[Optional[OcrdFileType]], task : TFuture) -> Union[bool, Exception]:
+        """
+        \b
+        Await a single page result and handle errors (exceptions),
+        enforcing policies configured by the following
+        environment variables:
+        - `OCRD_EXISTING_OUTPUT` (abort/skip/overwrite)
+        - `OCRD_MISSING_OUTPUT` (abort/skip/fallback-copy)
+        - `OCRD_MAX_MISSING_OUTPUTS` (abort after all).
+
+        \b
+        Returns
+        - true in case of success
+        - false in case the output already exists
+        - the exception in case of failure
+        """
+        # FIXME: differentiate error cases in various ways:
+        # - ResourceNotFoundError → use ResourceManager to download (once), then retry
+        # - transient (I/O or OOM) error → maybe sleep, retry
+        # - persistent (data) error → skip / dummy / raise
+        try:
+            self._base_logger.debug("waiting for output of task %s (page %s)", task, page_id)
+            # timeout kwarg on future is useless: it only raises TimeoutError here,
+            # but does not stop the running process/thread, and executor itself
+            # offers nothing to that effect:
+            # task.result(timeout=max_seconds or None)
+            # so we instead applied the timeout within the worker function
+            task.result()
+            return True
+        except NotImplementedError:
+            # exclude NotImplementedError, so we can try process() below
+            raise
+        # handle input failures separately
+        except FileExistsError as err:
+            if config.OCRD_EXISTING_OUTPUT == 'ABORT':
+                raise err
+            if config.OCRD_EXISTING_OUTPUT == 'SKIP':
+                return False
+            if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE':
+                # too late here, must not happen
+                raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE")
+        except KeyboardInterrupt:
+            raise
+        # broad coverage of output failures (including TimeoutError)
+        except Exception as err:
+            # FIXME: add re-usable/actionable logging
+            if config.OCRD_MISSING_OUTPUT == 'ABORT':
+                self._base_logger.error(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}")
+                raise err
+            self._base_logger.exception(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}")
+            if config.OCRD_MISSING_OUTPUT == 'SKIP':
+                pass
+            elif config.OCRD_MISSING_OUTPUT == 'COPY':
+                self._copy_page_file(input_files[0])
+            else:
+                desc = config.describe('OCRD_MISSING_OUTPUT', wrap_text=False, indent_text=False)
+                raise ValueError(f"unknown configuration value {config.OCRD_MISSING_OUTPUT} - {desc}")
+            return err
 
     def _copy_page_file(self, input_file : OcrdFileType) -> None:
         """
```
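The refactoring above splits the former monolithic loop into a submit phase (building a dict from future to page metadata) and a handle phase (draining results and applying failure policies). A toy version of that orchestration, using a stand-in ThreadPoolExecutor and a hypothetical `max_failure_ratio` in place of `OCRD_MAX_MISSING_OUTPUTS`:

```python
from concurrent.futures import ThreadPoolExecutor  # stand-in executor for illustration

def run_pages(process_page, pages, max_failure_ratio=0.1):
    with ThreadPoolExecutor(max_workers=4) as executor:
        # submit phase: one future per page, remembering which page it belongs to
        tasks = {executor.submit(process_page, page): page for page in pages}
        failed = 0
        # handle phase: drain results, count failures, stop if the ratio is exceeded
        for task, page in tasks.items():
            try:
                task.result()
            except Exception as err:
                failed += 1
                print(f"page {page} failed: {err!r}")
                if failed / len(tasks) > max_failure_ratio:
                    raise Exception(f"too many failures ({failed} of {len(tasks)})")
        return len(tasks) - failed, failed

if __name__ == '__main__':
    ok, bad = run_pages(lambda page: None, [f"PHYS_{n:04d}" for n in range(10)])
    print(f"{ok} succeeded, {bad} failed")
```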
```diff
@@ -574,7 +749,6 @@ class Processor():
             local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'),
             mimetype=MIMETYPE_PAGE,
             content=to_xml(input_pcgts),
-            force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
         )
 
     def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None:
@@ -603,6 +777,12 @@ class Processor():
                 # not PAGE and not an image to generate PAGE for
                 self._base_logger.error(f"non-PAGE input for page {page_id}: {err}")
         output_file_id = make_file_id(input_files[0], self.output_file_grp)
+        output_file = next(self.workspace.mets.find_files(ID=output_file_id), None)
+        if output_file and config.OCRD_EXISTING_OUTPUT != 'OVERWRITE':
+            # short-cut avoiding useless computation:
+            raise FileExistsError(
+                f"A file with ID=={output_file_id} already exists {output_file} and neither force nor ignore are set"
+            )
         result = self.process_page_pcgts(*input_pcgts, page_id=page_id)
         for image_result in result.images:
             image_file_id = f'{output_file_id}_{image_result.file_id_suffix}'
```
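The new FileExistsError short-cut checks METS for the output ID before any page processing happens, so the existing-output policy above can be applied without paying for the computation first. A toy version of the same guard, with made-up names:

```python
import os

def write_result(path: str, compute, overwrite: bool = False):
    # check before you compute: let the caller's abort/skip/overwrite policy
    # handle FileExistsError without wasting the expensive step below
    if os.path.exists(path) and not overwrite:
        # short-cut avoiding useless computation
        raise FileExistsError(f"{path} already exists and overwrite is not set")
    data = compute()  # only reached when writing is actually possible
    with open(path, 'w', encoding='utf-8') as f:
        f.write(data)

if __name__ == '__main__':
    write_result('/tmp/demo-output.txt', lambda: "expensive result", overwrite=True)
```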
```diff
@@ -616,6 +796,8 @@ class Processor():
                 image_result.alternative_image.set_imageHeight(image_result.pil.height)
             elif isinstance(image_result.alternative_image, AlternativeImageType):
                 image_result.alternative_image.set_filename(image_file_path)
+            elif image_result.alternative_image is None:
+                pass # do not reference in PAGE result
             else:
                 raise ValueError(f"process_page_pcgts returned an OcrdPageResultImage of unknown type "
                                  f"{type(image_result.alternative_image)}")
@@ -625,7 +807,6 @@ class Processor():
                 self.output_file_grp,
                 page_id=page_id,
                 file_path=image_file_path,
-                force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
             )
         result.pcgts.set_pcGtsId(output_file_id)
         self.add_metadata(result.pcgts)
@@ -636,7 +817,6 @@ class Processor():
             local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'),
             mimetype=MIMETYPE_PAGE,
             content=to_xml(result.pcgts),
-            force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
         )
 
     def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult:
@@ -919,6 +1099,44 @@ class Processor():
             ifts.append(tuple(ifiles))
         return ifts
 
+_page_worker_processor = None
+"""
+This global binding for the processor is required to avoid
+squeezing the processor through a mp.Queue (which is impossible
+due to unpicklable attributes like .workspace.mets._tree anyway)
+when calling Processor.process_page_file as page worker processes
+in Processor.process_workspace. Forking allows inheriting global
+objects, and with the METS Server we do not mutate the local
+processor instance anyway.
+"""
+def _page_worker_set_ctxt(processor):
+    """
+    Overwrites `ocrd.processor.base._page_worker_processor` instance
+    for sharing with subprocesses in ProcessPoolExecutor initializer.
+    """
+    global _page_worker_processor
+    _page_worker_processor = processor
+
+def _page_worker(timeout, *input_files):
+    """
+    Wraps a `Processor.process_page_file` call as payload (call target)
+    of the ProcessPoolExecutor workers, but also enforces the given timeout.
+    """
+    page_id = next((file.pageId for file in input_files
+                    if hasattr(file, 'pageId')), "")
+    if timeout > 0:
+        timer = Timer(timeout, interrupt_main)
+        timer.start()
+    try:
+        _page_worker_processor.process_page_file(*input_files)
+        _page_worker_processor.logger.debug("page worker completed for page %s", page_id)
+    except KeyboardInterrupt:
+        _page_worker_processor.logger.debug("page worker timed out for page %s", page_id)
+        raise TimeoutError()
+    finally:
+        if timeout > 0:
+            timer.cancel()
+
 def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None):
     """Generate a string describing the full CLI of this processor including params.
 
```
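Note the comment in process_workspace_handle_page_task: `Future.result(timeout=...)` only abandons the wait, it cannot stop the worker, so the timeout is now enforced inside the worker itself via `threading.Timer` plus `_thread.interrupt_main` (see `_page_worker` above). A self-contained sketch of that technique, assuming a CPU-bound pure-Python payload (blocking C calls may not be interruptible this way):

```python
import time
from threading import Timer
from _thread import interrupt_main

def run_with_timeout(fn, timeout, *args, **kwargs):
    # the Timer thread raises KeyboardInterrupt in the main thread on expiry,
    # which we translate into TimeoutError for the caller
    timer = Timer(timeout, interrupt_main)
    timer.start()
    try:
        return fn(*args, **kwargs)
    except KeyboardInterrupt:
        raise TimeoutError(f"{fn.__name__} exceeded {timeout}s") from None
    finally:
        timer.cancel()  # no-op if the timer already fired

if __name__ == '__main__':
    print(run_with_timeout(sum, 5.0, [1, 2, 3]))  # finishes in time: 6
    def busy(seconds):
        end = time.time() + seconds
        while time.time() < end:
            pass  # pure-Python loop, interruptible at bytecode boundaries
    try:
        run_with_timeout(busy, 0.1, 10)
    except TimeoutError as err:
        print(err)
```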
ocrd/processor/builtin/dummy_processor.py
CHANGED

```diff
@@ -47,7 +47,6 @@ class DummyProcessor(Processor):
                     mimetype=input_file.mimetype,
                     local_filename=local_filename,
                     content=f.read(),
-                    force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
                 )
             file_id = file_id + '_PAGE'
             pcgts = page_from_file(output_file)
@@ -62,7 +61,6 @@ class DummyProcessor(Processor):
                 local_filename=join(self.output_file_grp, file_id + '.xml'),
                 mimetype=MIMETYPE_PAGE,
                 content=to_xml(pcgts),
-                force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
             )
         else:
             if self.parameter['copy_files']:
```
ocrd/processor/helpers.py
CHANGED
```diff
@@ -2,6 +2,7 @@
 Helper methods for running and documenting processors
 """
 from time import perf_counter, process_time
+from os import times
 from functools import lru_cache
 import json
 import inspect
@@ -89,11 +90,12 @@ def run_processor(
 
     ocrd_tool = processor.ocrd_tool
     name = '%s v%s' % (ocrd_tool['executable'], processor.version)
-    otherrole = ocrd_tool
+    otherrole = ocrd_tool.get('steps', [''])[0]
     logProfile = getLogger('ocrd.process.profile')
     log.debug("Processor instance %s (%s doing %s)", processor, name, otherrole)
     t0_wall = perf_counter()
     t0_cpu = process_time()
+    t0_os = times()
     if any(x in config.OCRD_PROFILE for x in ['RSS', 'PSS']):
         backend = 'psutil_pss' if 'PSS' in config.OCRD_PROFILE else 'psutil'
         from memory_profiler import memory_usage # pylint: disable=import-outside-toplevel
@@ -123,7 +125,13 @@ def run_processor(
 
     t1_wall = perf_counter() - t0_wall
     t1_cpu = process_time() - t0_cpu
-
+    t1_os = times()
+    # add CPU time from child processes (page worker etc)
+    t1_cpu += t1_os.children_user - t0_os.children_user
+    t1_cpu += t1_os.children_system - t0_os.children_system
+    logProfile.info(
+        "Executing processor '%s' took %fs (wall) %fs (CPU)( "
+        "[--input-file-grp='%s' --output-file-grp='%s' --parameter='%s' --page-id='%s']",
         ocrd_tool['executable'],
         t1_wall,
         t1_cpu,
```
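process_time() only measures the current process, so with pages now running in forked workers the profile would undercount. os.times() additionally reports user/system CPU of reaped children (POSIX semantics), which is what the hunk above adds in. A minimal demonstration:

```python
import os
from multiprocessing import Process
from time import process_time

def busy():
    sum(i * i for i in range(5_000_000))  # CPU work done in the child

if __name__ == '__main__':
    t0_cpu, t0_os = process_time(), os.times()
    worker = Process(target=busy)
    worker.start()
    worker.join()  # child must be waited on before its times are attributed
    t1_cpu, t1_os = process_time(), os.times()
    cpu = t1_cpu - t0_cpu
    # children_user/children_system cover reaped child processes (POSIX)
    cpu += t1_os.children_user - t0_os.children_user
    cpu += t1_os.children_system - t0_os.children_system
    print(f"own+children CPU: {cpu:.3f}s (own alone: {t1_cpu - t0_cpu:.3f}s)")
```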
```diff
@@ -131,7 +139,7 @@ def run_processor(
         processor.output_file_grp or '',
         json.dumps(processor.parameter) or '',
         processor.page_id or ''
-        )
+    )
     workspace.mets.add_agent(
         name=name,
         _type='OTHER',
@@ -234,10 +242,10 @@ def get_cached_processor(parameter: dict, processor_class):
 def get_processor(
         processor_class,
         parameter: Optional[dict] = None,
-        workspace: Workspace = None,
-        page_id: str = None,
-        input_file_grp: List[str] = None,
-        output_file_grp: List[str] = None,
+        workspace: Optional[Workspace] = None,
+        page_id: Optional[str] = None,
+        input_file_grp: Optional[List[str]] = None,
+        output_file_grp: Optional[List[str]] = None,
         instance_caching: bool = False,
 ):
     if processor_class:
@@ -258,6 +266,7 @@ def get_processor(
     else:
         # avoid passing workspace already (deprecated chdir behaviour)
         processor = processor_class(None, parameter=parameter)
+    assert processor
     # set current processing parameters
     processor.workspace = workspace
     processor.page_id = page_id
```
ocrd/processor/ocrd_page_result.py
CHANGED

```diff
@@ -1,5 +1,5 @@
 from dataclasses import dataclass, field
-from typing import List, Union
+from typing import List, Union, Optional
 from ocrd_models.ocrd_page import OcrdPage
 from PIL.Image import Image
 
@@ -9,7 +9,7 @@ from ocrd_models.ocrd_page_generateds import AlternativeImageType, PageType
 class OcrdPageResultImage():
     pil : Image
     file_id_suffix : str
-    alternative_image : Union[AlternativeImageType, PageType]
+    alternative_image : Optional[Union[AlternativeImageType, PageType]]
 
 @dataclass
 class OcrdPageResult():
```
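With alternative_image now Optional, process_page_pcgts can return a derived image that is saved to the output fileGrp but deliberately not referenced as an AlternativeImage in the PAGE result (matching the new `elif image_result.alternative_image is None` branch in base.py above). A hypothetical fragment, assuming the import path of this very module:

```python
from PIL import Image
from ocrd.processor.ocrd_page_result import OcrdPageResult, OcrdPageResultImage

def attach_debug_image(result: OcrdPageResult) -> OcrdPageResult:
    # hypothetical helper: emit a visualization alongside the real output
    debug = Image.new('RGB', (100, 100), 'white')  # stand-in visualization
    result.images.append(OcrdPageResultImage(
        pil=debug,
        file_id_suffix='DEBUG',
        alternative_image=None,  # save the file, skip the PAGE reference
    ))
    return result
```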
ocrd/workspace.py
CHANGED
```diff
@@ -19,6 +19,7 @@ from ocrd_models.ocrd_page import parse, BorderType, to_xml
 from ocrd_modelfactory import exif_from_filename, page_from_file
 from ocrd_utils import (
     atomic_write,
+    config,
     getLogger,
     image_from_polygon,
     coordinates_of_segment,
@@ -427,6 +428,8 @@ class Workspace():
             kwargs["pageId"] = kwargs.pop("page_id")
         if "file_id" in kwargs:
             kwargs["ID"] = kwargs.pop("file_id")
+        if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE':
+            kwargs["force"] = True
 
         ret = self.mets.add_file(file_grp, **kwargs)
 
```
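This is the other half of all the `force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE'` removals above: the flag is now injected once, in Workspace.add_file, instead of at every call site. Roughly, with simplified stand-in names:

```python
import os

class TinyWorkspace:
    # simplified stand-in: one choke point derives force= from the
    # environment-backed configuration, so callers never pass it themselves
    def add_file(self, file_grp, **kwargs):
        if os.environ.get('OCRD_EXISTING_OUTPUT', 'SKIP') == 'OVERWRITE':
            kwargs['force'] = True
        print(f"add_file({file_grp!r}, {kwargs})")

if __name__ == '__main__':
    os.environ['OCRD_EXISTING_OUTPUT'] = 'OVERWRITE'
    TinyWorkspace().add_file('OCR-D-OUT', ID='FILE_0001')  # force=True injected
```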
{ocrd-3.0.0b4.dist-info → ocrd-3.0.0b6.dist-info}/METADATA
CHANGED

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ocrd
-Version: 3.0.0b4
+Version: 3.0.0b6
 Summary: OCR-D framework
 Author-email: Konstantin Baierer <unixprog@gmail.com>
 License: Apache License 2.0
@@ -24,6 +24,7 @@ Requires-Dist: frozendict>=2.3.4
 Requires-Dist: gdown
 Requires-Dist: httpx>=0.22.0
 Requires-Dist: jsonschema>=4
+Requires-Dist: loky
 Requires-Dist: lxml
 Requires-Dist: memory-profiler>=0.58.0
 Requires-Dist: numpy
```