ocrd 3.0.0b4__py3-none-any.whl → 3.0.0b6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ocrd/processor/base.py CHANGED
@@ -16,14 +16,20 @@ import json
 import os
 from os import getcwd
 from pathlib import Path
-from typing import List, Optional, Union, get_args
+from typing import Any, Dict, List, Optional, Tuple, Union, get_args
 import sys
 import inspect
 import tarfile
 import io
 import weakref
+from collections import defaultdict
 from frozendict import frozendict
-from concurrent.futures import ThreadPoolExecutor, TimeoutError
+# concurrent.futures is buggy in py38,
+# this is where the fixes came from:
+from loky import Future, ProcessPoolExecutor
+import multiprocessing as mp
+from threading import Timer
+from _thread import interrupt_main
 
 from click import wrap_text
 from deprecated import deprecated
@@ -105,6 +111,31 @@ class MissingInputFile(ValueError):
                 f"and pageId {pageId} under mimetype {mimetype or 'PAGE+image(s)'}")
         super().__init__(self.message)
 
+class DummyFuture:
+    """
+    Mimics some of `concurrent.futures.Future` but runs immediately.
+    """
+    def __init__(self, fn, *args, **kwargs):
+        self.fn = fn
+        self.args = args
+        self.kwargs = kwargs
+    def result(self):
+        return self.fn(*self.args, **self.kwargs)
+class DummyExecutor:
+    """
+    Mimics some of `concurrent.futures.ProcessPoolExecutor` but runs
+    everything immediately in this process.
+    """
+    def __init__(self, initializer=None, initargs=(), **kwargs):
+        initializer(*initargs)
+    def shutdown(self, **kwargs):
+        pass
+    def submit(self, fn, *args, **kwargs) -> DummyFuture:
+        return DummyFuture(fn, *args, **kwargs)
+
+TFuture = Union[DummyFuture, Future]
+TExecutor = Union[DummyExecutor, ProcessPoolExecutor]
+
 class Processor():
     """
     A processor is a tool that implements the uniform OCR-D
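
Note: `DummyFuture`/`DummyExecutor` give the sequential code path (`OCRD_MAX_PARALLEL_PAGES <= 1`) the same interface as the loky pool, so `process_workspace` can drive both uniformly. A minimal sketch (not part of the diff) of that interchangeability, assuming both classes are importable from `ocrd.processor.base`:

```python
from multiprocessing import current_process
from loky import ProcessPoolExecutor
from ocrd.processor.base import DummyExecutor  # assumed import location

def init(name):
    # runs inline for DummyExecutor, inside the worker for ProcessPoolExecutor
    print(f"init in {current_process().name}: {name}")

def square(x):
    return x * x

if __name__ == '__main__':
    for cls in (DummyExecutor, ProcessPoolExecutor):
        executor = cls(max_workers=1, initializer=init, initargs=('demo',))
        future = executor.submit(square, 7)  # DummyFuture defers the call to .result()
        assert future.result() == 49
        executor.shutdown()
```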
@@ -339,7 +370,7 @@ class Processor():
         self._finalizer = weakref.finalize(self, self.shutdown)
         # workaround for deprecated#72 (@deprecated decorator does not work for subclasses):
         setattr(self, 'process',
-                deprecated(version='3.0', reason='process() should be replaced with process_page() and process_workspace()')(getattr(self, 'process')))
+                deprecated(version='3.0', reason='process() should be replaced with process_page_pcgts() or process_page_file() or process_workspace()')(getattr(self, 'process')))
 
     def show_help(self, subcommand=None):
         """
@@ -358,6 +389,7 @@ class Processor():
         """
         Verify that :py:attr:`input_file_grp` and :py:attr:`output_file_grp` fulfill the processor's requirements.
         """
+        # verify input and output file groups in parameters
         assert self.input_file_grp is not None
         assert self.output_file_grp is not None
         input_file_grps = self.input_file_grp.split(',')
@@ -374,12 +406,23 @@ class Processor():
             assert len(grps) >= minimum, msg % (len(grps), str(spec))
             if maximum > 0:
                 assert len(grps) <= maximum, msg % (len(grps), str(spec))
-        assert_file_grp_cardinality(input_file_grps, self.ocrd_tool['input_file_grp_cardinality'],
-                                    "Unexpected number of input file groups %d vs %s")
-        assert_file_grp_cardinality(output_file_grps, self.ocrd_tool['output_file_grp_cardinality'],
-                                    "Unexpected number of output file groups %d vs %s")
+        # FIXME: enforce unconditionally as soon as grace period for deprecation is over
+        if 'input_file_grp_cardinality' in self.ocrd_tool:
+            assert_file_grp_cardinality(input_file_grps, self.ocrd_tool['input_file_grp_cardinality'],
+                                        "Unexpected number of input file groups %d vs %s")
+        if 'output_file_grp_cardinality' in self.ocrd_tool:
+            assert_file_grp_cardinality(output_file_grps, self.ocrd_tool['output_file_grp_cardinality'],
+                                        "Unexpected number of output file groups %d vs %s")
+        # verify input and output file groups in METS
         for input_file_grp in input_file_grps:
-            assert input_file_grp in self.workspace.mets.file_groups
+            assert input_file_grp in self.workspace.mets.file_groups, \
+                f"input fileGrp {input_file_grp} does not exist in workspace {self.workspace}"
+        for output_file_grp in output_file_grps:
+            assert output_file_grp not in self.workspace.mets.file_groups \
+                or config.OCRD_EXISTING_OUTPUT in ['OVERWRITE', 'SKIP'] \
+                or not any(self.workspace.mets.find_files(
+                    pageId=self.page_id, fileGrp=output_file_grp)), \
+                f"output fileGrp {output_file_grp} already exists in workspace {self.workspace}"
         # keep this for backwards compatibility:
         return True
 
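Note: a rough sketch (not part of the diff) of the new METS-level checks in `verify()`. `MyProcessor` and `workspace` are hypothetical placeholders, and the `OCRD_*` policies are environment variables read through `ocrd_utils.config`:

```python
import os
os.environ['OCRD_EXISTING_OUTPUT'] = 'ABORT'  # default: existing output is an error

processor = MyProcessor(None)                 # any Processor subclass
processor.workspace = workspace               # pre-existing Workspace
processor.input_file_grp = 'OCR-D-IMG'        # must exist in workspace.mets.file_groups
processor.output_file_grp = 'OCR-D-SEG'       # assume files already exist here
try:
    processor.verify()
except AssertionError as err:
    print(err)  # "output fileGrp OCR-D-SEG already exists in workspace ..."

os.environ['OCRD_EXISTING_OUTPUT'] = 'SKIP'   # or 'OVERWRITE': the METS check is waived
assert processor.verify()
```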
@@ -444,6 +487,9 @@ class Processor():
         for the given :py:data:`page_id` (or all pages)
         under the given :py:data:`parameter`.
 
+        Delegates to :py:meth:`.process_workspace_submit_tasks`
+        and :py:meth:`.process_workspace_handle_tasks`.
+
         (This will iterate over pages and files, calling
         :py:meth:`.process_page_file` and handling exceptions.
         It should be overridden by subclasses to handle cases
@@ -453,11 +499,7 @@ class Processor():
         self.workspace = workspace
         self.verify()
         try:
-            nr_succeeded = 0
-            nr_skipped = 0
-            nr_copied = 0
-
-            # set up multithreading
+            # set up multitasking
             max_workers = max(0, config.OCRD_MAX_PARALLEL_PAGES)
             if self.max_workers > 0 and self.max_workers < config.OCRD_MAX_PARALLEL_PAGES:
                 self._base_logger.info("limiting number of threads from %d to %d", max_workers, self.max_workers)
@@ -469,84 +511,217 @@ class Processor():
             if self.max_page_seconds > 0 and self.max_page_seconds < config.OCRD_PROCESSING_PAGE_TIMEOUT:
                 self._base_logger.info("limiting page timeout from %d to %d sec", max_seconds, self.max_page_seconds)
                 max_seconds = self.max_page_seconds
-            executor = ThreadPoolExecutor(
+
+            if max_workers > 1:
+                executor_cls = ProcessPoolExecutor
+            else:
+                executor_cls = DummyExecutor
+            executor = executor_cls(
                 max_workers=max_workers or 1,
-                thread_name_prefix=f"pagetask.{workspace.mets.unique_identifier}"
+                # only forking method avoids pickling
+                context=mp.get_context('fork'),
+                # share processor instance as global to avoid pickling
+                initializer=_page_worker_set_ctxt,
+                initargs=(self,),
             )
-            self._base_logger.debug("started executor %s with %d workers", str(executor), max_workers or 1)
-            tasks = {}
-
-            for input_file_tuple in self.zip_input_files(on_error='abort', require_first=False):
-                input_files : List[Optional[OcrdFileType]] = [None] * len(input_file_tuple)
-                page_id = next(input_file.pageId
-                               for input_file in input_file_tuple
-                               if input_file)
-                self._base_logger.info(f"preparing page {page_id}")
-                for i, input_file in enumerate(input_file_tuple):
-                    if input_file is None:
-                        # file/page not found in this file grp
-                        continue
-                    input_files[i] = input_file
-                    if not self.download:
-                        continue
-                    try:
-                        input_files[i] = self.workspace.download_file(input_file)
-                    except (ValueError, FileNotFoundError, HTTPError) as e:
-                        self._base_logger.error(repr(e))
-                        self._base_logger.warning(f"failed downloading file {input_file} for page {page_id}")
-                # process page
-                tasks[executor.submit(self.process_page_file, *input_files)] = (page_id, input_files)
-            self._base_logger.debug("submitted %d processing tasks", len(tasks))
-
-            for task in tasks:
-                # wait for results, handle errors
-                page_id, input_files = tasks[task]
-                # FIXME: differentiate error cases in various ways:
-                # - ResourceNotFoundError → use ResourceManager to download (once), then retry
-                # - transient (I/O or OOM) error → maybe sleep, retry
-                # - persistent (data) error → skip / dummy / raise
-                try:
-                    self._base_logger.debug("waiting for output of task %s (page %s) max_seconds=%d", task, page_id, max_seconds)
-                    task.result(timeout=max_seconds or None)
-                    nr_succeeded += 1
-                # exclude NotImplementedError, so we can try process() below
-                except NotImplementedError:
-                    raise
-                # handle input failures separately
-                except FileExistsError as err:
-                    if config.OCRD_EXISTING_OUTPUT == 'ABORT':
-                        raise err
-                    if config.OCRD_EXISTING_OUTPUT == 'SKIP':
-                        continue
-                    if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE':
-                        # too late here, must not happen
-                        raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE")
-                # broad coverage of output failures (including TimeoutError)
-                except (Exception, TimeoutError) as err:
-                    # FIXME: add re-usable/actionable logging
-                    if config.OCRD_MISSING_OUTPUT == 'ABORT':
-                        self._base_logger.error(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}")
-                        raise err
-                    self._base_logger.exception(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}")
-                    if config.OCRD_MISSING_OUTPUT == 'SKIP':
-                        nr_skipped += 1
-                        continue
-                    if config.OCRD_MISSING_OUTPUT == 'COPY':
-                        self._copy_page_file(input_files[0])
-                        nr_copied += 1
-                    else:
-                        desc = config.describe('OCRD_MISSING_OUTPUT', wrap_text=False, indent_text=False)
-                        raise ValueError(f"unknown configuration value {config.OCRD_MISSING_OUTPUT} - {desc}")
-
-            if nr_skipped > 0 and nr_succeeded / nr_skipped < config.OCRD_MAX_MISSING_OUTPUTS:
-                raise Exception(f"too many failures with skipped output ({nr_skipped})")
-            if nr_copied > 0 and nr_succeeded / nr_copied < config.OCRD_MAX_MISSING_OUTPUTS:
-                raise Exception(f"too many failures with fallback output ({nr_skipped})")
-            executor.shutdown()
+            try:
+                self._base_logger.debug("started executor %s with %d workers", str(executor), max_workers or 1)
+                tasks = self.process_workspace_submit_tasks(executor, max_seconds)
+                stats = self.process_workspace_handle_tasks(tasks)
+            finally:
+                executor.shutdown(kill_workers=True, wait=False)
 
         except NotImplementedError:
             # fall back to deprecated method
-            self.process()
+            try:
+                self.process()
+            except Exception as err:
+                # suppress the NotImplementedError context
+                raise err from None
+
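Note: the pattern above — a fork context plus a pool initializer that stores its argument in a module-level global — lets workers use an otherwise unpicklable object without sending it through a queue. A self-contained sketch (not part of the diff):

```python
import multiprocessing as mp
from loky import ProcessPoolExecutor

_shared = None  # stand-in for the processor instance

def _set_ctxt(obj):
    # pool initializer: bind the shared object once per worker
    global _shared
    _shared = obj

def _work(x):
    # workers read the shared object instead of a pickled argument
    return _shared['name'], x * 2

if __name__ == '__main__':
    executor = ProcessPoolExecutor(
        max_workers=2,
        context=mp.get_context('fork'),  # POSIX only; fork inherits globals
        initializer=_set_ctxt,
        initargs=({'name': 'demo'},),
    )
    futures = [executor.submit(_work, i) for i in range(4)]
    print([f.result() for f in futures])  # [('demo', 0), ('demo', 2), ...]
    executor.shutdown()
```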
+    def process_workspace_submit_tasks(self, executor : TExecutor, max_seconds : int) -> Dict[TFuture, Tuple[str, List[Optional[OcrdFileType]]]]:
+        """
+        Look up all input files of the given ``workspace``
+        from the given :py:data:`input_file_grp`
+        for the given :py:data:`page_id` (or all pages),
+        and schedule calling :py:meth:`.process_page_file`
+        on them for each page via `executor` (enforcing
+        a per-page time limit of `max_seconds`).
+
+        When running with `OCRD_MAX_PARALLEL_PAGES>1` and
+        the workspace via METS Server, the executor will fork
+        that many parallel worker subprocesses, each processing
+        one page at a time. (Interprocess communication is
+        done via task and result queues.)
+
+        Otherwise, tasks are run sequentially in the
+        current process.
+
+        Delegates to :py:meth:`.zip_input_files` to get
+        the input files for each page, and then calls
+        :py:meth:`.process_workspace_submit_page_task`.
+
+        Returns a dict mapping the per-page tasks
+        (i.e. futures submitted to the executor)
+        to their corresponding pageId and input files.
+        """
+        tasks = {}
+        for input_file_tuple in self.zip_input_files(on_error='abort', require_first=False):
+            task, page_id, input_files = self.process_workspace_submit_page_task(executor, max_seconds, input_file_tuple)
+            tasks[task] = (page_id, input_files)
+        self._base_logger.debug("submitted %d processing tasks", len(tasks))
+        return tasks
+
+    def process_workspace_submit_page_task(self, executor : TExecutor, max_seconds : int, input_file_tuple : List[Optional[OcrdFileType]]) -> Tuple[TFuture, str, List[Optional[OcrdFileType]]]:
+        """
+        Ensure all input files for a single page are
+        downloaded to the workspace, then schedule
+        :py:meth:`.process_page_file` to be run on
+        them via `executor` (enforcing a per-page time
+        limit of `max_seconds`).
+
+        Delegates to :py:meth:`.process_page_file`
+        (wrapped in :py:func:`_page_worker` to share
+        the processor instance across forked processes).
+
+        \b
+        Returns a tuple of:
+        - the scheduled future object,
+        - the corresponding pageId,
+        - the corresponding input files.
+        """
+        input_files : List[Optional[OcrdFileType]] = [None] * len(input_file_tuple)
+        page_id = next(input_file.pageId
+                       for input_file in input_file_tuple
+                       if input_file)
+        self._base_logger.info(f"preparing page {page_id}")
+        for i, input_file in enumerate(input_file_tuple):
+            if input_file is None:
+                # file/page not found in this file grp
+                continue
+            input_files[i] = input_file
+            if not self.download:
+                continue
+            try:
+                input_files[i] = self.workspace.download_file(input_file)
+            except (ValueError, FileNotFoundError, HTTPError) as e:
+                self._base_logger.error(repr(e))
+                self._base_logger.warning(f"failed downloading file {input_file} for page {page_id}")
+        # process page
+        #executor.submit(self.process_page_file, *input_files)
+        return executor.submit(_page_worker, max_seconds, *input_files), page_id, input_files
+
+    def process_workspace_handle_tasks(self, tasks : Dict[TFuture, Tuple[str, List[Optional[OcrdFileType]]]]) -> Tuple[int, int, Dict[str, int], int]:
+        """
+        Look up scheduled per-page futures one by one,
+        handle errors (exceptions) and gather results.
+
+        \b
+        Enforces policies configured by the following
+        environment variables:
+        - `OCRD_EXISTING_OUTPUT` (abort/skip/overwrite)
+        - `OCRD_MISSING_OUTPUT` (abort/skip/fallback-copy)
+        - `OCRD_MAX_MISSING_OUTPUTS` (abort after all).
+
+        \b
+        Returns a tuple of:
+        - the number of successfully processed pages
+        - the number of failed (i.e. skipped or copied) pages
+        - a dict of the type and corresponding number of exceptions seen
+        - the number of total requested pages (i.e. success+fail+existing).
+
+        Delegates to :py:meth:`.process_workspace_handle_page_task`
+        for each page.
+        """
+        # aggregate info for logging:
+        nr_succeeded = 0
+        nr_failed = 0
+        nr_errors = defaultdict(int) # count causes
+        if config.OCRD_MISSING_OUTPUT == 'SKIP':
+            reason = "skipped"
+        elif config.OCRD_MISSING_OUTPUT == 'COPY':
+            reason = "fallback-copied"
+        for task in tasks:
+            # wait for results, handle errors
+            page_id, input_files = tasks[task]
+            result = self.process_workspace_handle_page_task(page_id, input_files, task)
+            if isinstance(result, Exception):
+                nr_errors[result.__class__.__name__] += 1
+                nr_failed += 1
+                # FIXME: this is just prospective, because len(tasks)==nr_failed+nr_succeeded is not guaranteed
+                if config.OCRD_MAX_MISSING_OUTPUTS > 0 and nr_failed / len(tasks) > config.OCRD_MAX_MISSING_OUTPUTS:
+                    # already irredeemably many failures, stop short
+                    nr_errors = dict(nr_errors)
+                    raise Exception(f"too many failures with {reason} output ({nr_failed} of {nr_failed+nr_succeeded}, {str(nr_errors)})")
+            elif result:
+                nr_succeeded += 1
+            # else skipped - already exists
+        nr_errors = dict(nr_errors)
+        if nr_failed > 0:
+            nr_all = nr_succeeded + nr_failed
+            if config.OCRD_MAX_MISSING_OUTPUTS > 0 and nr_failed / nr_all > config.OCRD_MAX_MISSING_OUTPUTS:
+                raise Exception(f"too many failures with {reason} output ({nr_failed} of {nr_all}, {str(nr_errors)})")
+            self._base_logger.warning("%s %d of %d pages due to %s", reason, nr_failed, nr_all, str(nr_errors))
+        return nr_succeeded, nr_failed, nr_errors, len(tasks)
+
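Note: a toy model (not part of the diff) of the `OCRD_MAX_MISSING_OUTPUTS` ratio policy implemented above — abort as soon as the failure share exceeds the threshold, otherwise tally per-exception counts:

```python
from collections import defaultdict

def handle(results, max_missing=0.2):
    nr_succeeded, nr_failed = 0, 0
    nr_errors = defaultdict(int)
    for page_id, outcome in results:
        if isinstance(outcome, Exception):
            nr_failed += 1
            nr_errors[outcome.__class__.__name__] += 1
            if max_missing > 0 and nr_failed / len(results) > max_missing:
                raise Exception(f"too many failures ({nr_failed}, {dict(nr_errors)})")
        elif outcome:
            nr_succeeded += 1
    return nr_succeeded, nr_failed, dict(nr_errors)

print(handle([("PHYS_0001", True), ("PHYS_0002", ValueError("bad input")),
              ("PHYS_0003", True), ("PHYS_0004", True), ("PHYS_0005", True)]))
# (4, 1, {'ValueError': 1})
```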
+    def process_workspace_handle_page_task(self, page_id : str, input_files : List[Optional[OcrdFileType]], task : TFuture) -> Union[bool, Exception]:
+        """
+        \b
+        Await a single page result and handle errors (exceptions),
+        enforcing policies configured by the following
+        environment variables:
+        - `OCRD_EXISTING_OUTPUT` (abort/skip/overwrite)
+        - `OCRD_MISSING_OUTPUT` (abort/skip/fallback-copy)
+        - `OCRD_MAX_MISSING_OUTPUTS` (abort after all).
+
+        \b
+        Returns
+        - true in case of success
+        - false in case the output already exists
+        - the exception in case of failure
+        """
+        # FIXME: differentiate error cases in various ways:
+        # - ResourceNotFoundError → use ResourceManager to download (once), then retry
+        # - transient (I/O or OOM) error → maybe sleep, retry
+        # - persistent (data) error → skip / dummy / raise
+        try:
+            self._base_logger.debug("waiting for output of task %s (page %s)", task, page_id)
+            # timeout kwarg on future is useless: it only raises TimeoutError here,
+            # but does not stop the running process/thread, and the executor itself
+            # offers nothing to that effect:
+            # task.result(timeout=max_seconds or None)
+            # so we instead apply the timeout within the worker function
+            task.result()
+            return True
+        except NotImplementedError:
+            # exclude NotImplementedError, so we can try process() below
+            raise
+        # handle input failures separately
+        except FileExistsError as err:
+            if config.OCRD_EXISTING_OUTPUT == 'ABORT':
+                raise err
+            if config.OCRD_EXISTING_OUTPUT == 'SKIP':
+                return False
+            if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE':
+                # too late here, must not happen
+                raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE")
+        except KeyboardInterrupt:
+            raise
+        # broad coverage of output failures (including TimeoutError)
+        except Exception as err:
+            # FIXME: add re-usable/actionable logging
+            if config.OCRD_MISSING_OUTPUT == 'ABORT':
+                self._base_logger.error(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}")
+                raise err
+            self._base_logger.exception(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}")
+            if config.OCRD_MISSING_OUTPUT == 'SKIP':
+                pass
+            elif config.OCRD_MISSING_OUTPUT == 'COPY':
+                self._copy_page_file(input_files[0])
+            else:
+                desc = config.describe('OCRD_MISSING_OUTPUT', wrap_text=False, indent_text=False)
+                raise ValueError(f"unknown configuration value {config.OCRD_MISSING_OUTPUT} - {desc}")
+            return err
 
     def _copy_page_file(self, input_file : OcrdFileType) -> None:
         """
@@ -574,7 +749,6 @@ class Processor():
             local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'),
             mimetype=MIMETYPE_PAGE,
             content=to_xml(input_pcgts),
-            force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
         )
 
     def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None:
@@ -603,6 +777,12 @@ class Processor():
             # not PAGE and not an image to generate PAGE for
             self._base_logger.error(f"non-PAGE input for page {page_id}: {err}")
         output_file_id = make_file_id(input_files[0], self.output_file_grp)
+        output_file = next(self.workspace.mets.find_files(ID=output_file_id), None)
+        if output_file and config.OCRD_EXISTING_OUTPUT != 'OVERWRITE':
+            # short-cut avoiding useless computation:
+            raise FileExistsError(
+                f"A file with ID=={output_file_id} already exists {output_file} and neither force nor ignore are set"
+            )
         result = self.process_page_pcgts(*input_pcgts, page_id=page_id)
         for image_result in result.images:
             image_file_id = f'{output_file_id}_{image_result.file_id_suffix}'
@@ -616,6 +796,8 @@ class Processor():
                 image_result.alternative_image.set_imageHeight(image_result.pil.height)
             elif isinstance(image_result.alternative_image, AlternativeImageType):
                 image_result.alternative_image.set_filename(image_file_path)
+            elif image_result.alternative_image is None:
+                pass # do not reference in PAGE result
             else:
                 raise ValueError(f"process_page_pcgts returned an OcrdPageResultImage of unknown type "
                                  f"{type(image_result.alternative_image)}")
@@ -625,7 +807,6 @@ class Processor():
                 self.output_file_grp,
                 page_id=page_id,
                 file_path=image_file_path,
-                force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
             )
         result.pcgts.set_pcGtsId(output_file_id)
         self.add_metadata(result.pcgts)
@@ -636,7 +817,6 @@ class Processor():
             local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'),
             mimetype=MIMETYPE_PAGE,
             content=to_xml(result.pcgts),
-            force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
         )
 
     def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult:
@@ -919,6 +1099,44 @@ class Processor():
             ifts.append(tuple(ifiles))
         return ifts
 
+_page_worker_processor = None
+"""
+This global binding for the processor is required to avoid
+squeezing the processor through a mp.Queue (which is impossible
+due to unpicklable attributes like .workspace.mets._tree anyway)
+when calling Processor.process_page_file as page worker processes
+in Processor.process_workspace. Forking allows inheriting global
+objects, and with the METS Server we do not mutate the local
+processor instance anyway.
+"""
+def _page_worker_set_ctxt(processor):
+    """
+    Overwrites `ocrd.processor.base._page_worker_processor` instance
+    for sharing with subprocesses in ProcessPoolExecutor initializer.
+    """
+    global _page_worker_processor
+    _page_worker_processor = processor
+
+def _page_worker(timeout, *input_files):
+    """
+    Wraps a `Processor.process_page_file` call as payload (call target)
+    of the ProcessPoolExecutor workers, but also enforces the given timeout.
+    """
+    page_id = next((file.pageId for file in input_files
+                    if hasattr(file, 'pageId')), "")
+    if timeout > 0:
+        timer = Timer(timeout, interrupt_main)
+        timer.start()
+    try:
+        _page_worker_processor.process_page_file(*input_files)
+        _page_worker_processor.logger.debug("page worker completed for page %s", page_id)
+    except KeyboardInterrupt:
+        _page_worker_processor.logger.debug("page worker timed out for page %s", page_id)
+        raise TimeoutError()
+    finally:
+        if timeout > 0:
+            timer.cancel()
+
 def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None):
     """Generate a string describing the full CLI of this processor including params.
 
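Note: the timeout mechanism in `_page_worker`, in isolation — a `threading.Timer` fires `_thread.interrupt_main`, which raises `KeyboardInterrupt` in the worker's main thread, and that gets converted into `TimeoutError`. A standalone sketch (not part of the diff):

```python
import time
from threading import Timer
from _thread import interrupt_main

def run_with_timeout(fn, timeout, *args):
    timer = Timer(timeout, interrupt_main)  # schedule KeyboardInterrupt in main thread
    timer.start()
    try:
        return fn(*args)
    except KeyboardInterrupt:
        raise TimeoutError(f"{getattr(fn, '__name__', fn)} exceeded {timeout}s")
    finally:
        timer.cancel()  # disarm if we finished in time

print(run_with_timeout(time.sleep, 1.0, 0.1))  # None: completed within 1s
# run_with_timeout(time.sleep, 0.5, 10)        # would raise TimeoutError
```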
ocrd/processor/builtin/dummy_processor.py CHANGED
@@ -47,7 +47,6 @@ class DummyProcessor(Processor):
             mimetype=input_file.mimetype,
             local_filename=local_filename,
             content=f.read(),
-            force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
         )
         file_id = file_id + '_PAGE'
         pcgts = page_from_file(output_file)
@@ -62,7 +61,6 @@ class DummyProcessor(Processor):
                 local_filename=join(self.output_file_grp, file_id + '.xml'),
                 mimetype=MIMETYPE_PAGE,
                 content=to_xml(pcgts),
-                force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
             )
         else:
             if self.parameter['copy_files']:
ocrd/processor/helpers.py CHANGED
@@ -2,6 +2,7 @@
 Helper methods for running and documenting processors
 """
 from time import perf_counter, process_time
+from os import times
 from functools import lru_cache
 import json
 import inspect
@@ -89,11 +90,12 @@ def run_processor(
 
     ocrd_tool = processor.ocrd_tool
     name = '%s v%s' % (ocrd_tool['executable'], processor.version)
-    otherrole = ocrd_tool['steps'][0]
+    otherrole = ocrd_tool.get('steps', [''])[0]
     logProfile = getLogger('ocrd.process.profile')
     log.debug("Processor instance %s (%s doing %s)", processor, name, otherrole)
     t0_wall = perf_counter()
     t0_cpu = process_time()
+    t0_os = times()
     if any(x in config.OCRD_PROFILE for x in ['RSS', 'PSS']):
         backend = 'psutil_pss' if 'PSS' in config.OCRD_PROFILE else 'psutil'
         from memory_profiler import memory_usage # pylint: disable=import-outside-toplevel
@@ -123,7 +125,13 @@ def run_processor(
 
     t1_wall = perf_counter() - t0_wall
     t1_cpu = process_time() - t0_cpu
-    logProfile.info("Executing processor '%s' took %fs (wall) %fs (CPU)( [--input-file-grp='%s' --output-file-grp='%s' --parameter='%s' --page-id='%s']" % (
+    t1_os = times()
+    # add CPU time from child processes (page worker etc)
+    t1_cpu += t1_os.children_user - t0_os.children_user
+    t1_cpu += t1_os.children_system - t0_os.children_system
+    logProfile.info(
+        "Executing processor '%s' took %fs (wall) %fs (CPU)( "
+        "[--input-file-grp='%s' --output-file-grp='%s' --parameter='%s' --page-id='%s']",
         ocrd_tool['executable'],
         t1_wall,
         t1_cpu,
@@ -131,7 +139,7 @@ def run_processor(
         processor.output_file_grp or '',
         json.dumps(processor.parameter) or '',
         processor.page_id or ''
-    ))
+    )
     workspace.mets.add_agent(
         name=name,
         _type='OTHER',
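Note: `os.times()` is what makes the child-CPU accounting above possible — unlike `time.process_time()`, it also reports user/system CPU seconds of reaped child processes. A small POSIX-only demonstration (not part of the diff):

```python
import os
import multiprocessing as mp

def burn():
    sum(i * i for i in range(10_000_000))  # CPU work in the child

if __name__ == '__main__':
    t0 = os.times()
    proc = mp.get_context('fork').Process(target=burn)
    proc.start()
    proc.join()  # child must be reaped before its times are attributed
    t1 = os.times()
    child_cpu = ((t1.children_user - t0.children_user)
                 + (t1.children_system - t0.children_system))
    print(f"children CPU: {child_cpu:.2f}s")
```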
@@ -234,10 +242,10 @@ def get_cached_processor(parameter: dict, processor_class):
 def get_processor(
         processor_class,
         parameter: Optional[dict] = None,
-        workspace: Workspace = None,
-        page_id: str = None,
-        input_file_grp: List[str] = None,
-        output_file_grp: List[str] = None,
+        workspace: Optional[Workspace] = None,
+        page_id: Optional[str] = None,
+        input_file_grp: Optional[List[str]] = None,
+        output_file_grp: Optional[List[str]] = None,
         instance_caching: bool = False,
 ):
     if processor_class:
@@ -258,6 +266,7 @@ def get_processor(
         else:
             # avoid passing workspace already (deprecated chdir behaviour)
             processor = processor_class(None, parameter=parameter)
+        assert processor
         # set current processing parameters
         processor.workspace = workspace
         processor.page_id = page_id
ocrd/processor/ocrd_page_result.py CHANGED
@@ -1,5 +1,5 @@
 from dataclasses import dataclass, field
-from typing import List, Union
+from typing import List, Union, Optional
 from ocrd_models.ocrd_page import OcrdPage
 from PIL.Image import Image
 
@@ -9,7 +9,7 @@ from ocrd_models.ocrd_page_generateds import AlternativeImageType, PageType
 class OcrdPageResultImage():
     pil : Image
     file_id_suffix : str
-    alternative_image : Union[AlternativeImageType, PageType]
+    alternative_image : Optional[Union[AlternativeImageType, PageType]]
 
 @dataclass
 class OcrdPageResult():
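
Note: with the `Optional` annotation, a processor can now return a derived image that gets saved to the output fileGrp without being referenced as an `AlternativeImage` in the PAGE result (cf. the new `is None` branch in `process_page_file` above). A sketch (not part of the diff), assuming this module path and an `OcrdPage` instance `pcgts` produced elsewhere:

```python
from PIL import Image
from ocrd.processor.ocrd_page_result import OcrdPageResult, OcrdPageResultImage

page_image = Image.new('L', (100, 100))  # some derived page image
result = OcrdPageResult(pcgts)           # pcgts: the PAGE document being produced
result.images.append(OcrdPageResultImage(
    pil=page_image,
    file_id_suffix='BIN',                # hypothetical file ID suffix
    alternative_image=None,              # saved to the fileGrp, not referenced in PAGE
))
```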
ocrd/workspace.py CHANGED
@@ -19,6 +19,7 @@ from ocrd_models.ocrd_page import parse, BorderType, to_xml
 from ocrd_modelfactory import exif_from_filename, page_from_file
 from ocrd_utils import (
     atomic_write,
+    config,
     getLogger,
     image_from_polygon,
     coordinates_of_segment,
@@ -427,6 +428,8 @@ class Workspace():
             kwargs["pageId"] = kwargs.pop("page_id")
         if "file_id" in kwargs:
             kwargs["ID"] = kwargs.pop("file_id")
+        if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE':
+            kwargs["force"] = True
 
         ret = self.mets.add_file(file_grp, **kwargs)
 
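Note: this centralizes the overwrite policy — instead of every caller passing `force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE'` (hence all the `force=` removals in base.py and the DummyProcessor above), `Workspace.add_file` now injects `force=True` itself. A sketch (not part of the diff) of the resulting behavior, with placeholder IDs:

```python
import os
os.environ['OCRD_EXISTING_OUTPUT'] = 'OVERWRITE'  # set before ocrd_utils.config is consulted

workspace.add_file('OCR-D-SEG',                   # assumed pre-existing Workspace
                   file_id='OCR-D-SEG_0001',      # mapped to ID= internally
                   page_id='PHYS_0001',           # mapped to pageId= internally
                   mimetype='application/vnd.prima.page+xml',
                   local_filename='OCR-D-SEG/OCR-D-SEG_0001.xml',
                   content='<PcGts/>')            # replaces any existing file with that ID
```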
ocrd-3.0.0b4.dist-info/METADATA → ocrd-3.0.0b6.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ocrd
-Version: 3.0.0b4
+Version: 3.0.0b6
 Summary: OCR-D framework
 Author-email: Konstantin Baierer <unixprog@gmail.com>
 License: Apache License 2.0
@@ -24,6 +24,7 @@ Requires-Dist: frozendict>=2.3.4
 Requires-Dist: gdown
 Requires-Dist: httpx>=0.22.0
 Requires-Dist: jsonschema>=4
+Requires-Dist: loky
 Requires-Dist: lxml
 Requires-Dist: memory-profiler>=0.58.0
 Requires-Dist: numpy