ocrd 3.0.0b1__py3-none-any.whl → 3.0.0b3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ocrd/cli/__init__.py CHANGED
@@ -61,11 +61,11 @@ Variables:
61
61
  \b
62
62
  {config.describe('OCRD_DOWNLOAD_INPUT')}
63
63
  \b
64
- {config.describe('OCRD_MISSING_INPUT')}
64
+ {config.describe('OCRD_MISSING_INPUT', wrap_text=False)}
65
65
  \b
66
- {config.describe('OCRD_MISSING_OUTPUT')}
66
+ {config.describe('OCRD_MISSING_OUTPUT', wrap_text=False)}
67
67
  \b
68
- {config.describe('OCRD_EXISTING_OUTPUT')}
68
+ {config.describe('OCRD_EXISTING_OUTPUT', wrap_text=False)}
69
69
  \b
70
70
  {config.describe('OCRD_METS_CACHING')}
71
71
  \b
@@ -39,6 +39,7 @@ def ocrd_cli_options(f):
39
39
  parameter_option,
40
40
  parameter_override_option,
41
41
  loglevel_option,
42
+ option('--log-filename', default=None),
42
43
  option('--address', type=ServerAddressParamType()),
43
44
  option('--queue', type=QueueServerParamType()),
44
45
  option('--database', type=DatabaseParamType()),
@@ -48,7 +49,6 @@ def ocrd_cli_options(f):
48
49
  option('-D', '--dump-module-dir', is_flag=True, default=False),
49
50
  option('-h', '--help', is_flag=True, default=False),
50
51
  option('-V', '--version', is_flag=True, default=False),
51
- option('--log-filename', default=None),
52
52
  # Subcommand, only used for 'worker'/'server'. Cannot be handled in
53
53
  # click because processors use the @command decorator and even if they
54
54
  # were using `group`, you cannot combine a command with
ocrd/lib.bash CHANGED
@@ -156,6 +156,7 @@ ocrd__parse_argv () {
156
156
  while [[ "${1:-}" = -* ]];do
157
157
  case "$1" in
158
158
  -l|--log-level) ocrd__argv[log_level]=$2 ; shift ;;
159
+ --log-filename) exec 2> "$2" ; shift ;;
159
160
  -h|--help|--usage) ocrd__usage; exit ;;
160
161
  -J|--dump-json) ocrd__dumpjson; exit ;;
161
162
  -D|--dump-module-dir) echo $(dirname "$OCRD_TOOL_JSON"); exit ;;
ocrd/mets_server.py CHANGED
@@ -120,7 +120,7 @@ class ClientSideOcrdMets:
120
120
 
121
121
  def __init__(self, url, workspace_path: Optional[str] = None):
122
122
  self.protocol = "tcp" if url.startswith("http://") else "uds"
123
- self.log = getLogger(f"ocrd.mets_client[{url}]")
123
+ self.log = getLogger(f"ocrd.models.ocrd_mets.client.{url}")
124
124
  self.url = url if self.protocol == "tcp" else f'http+unix://{url.replace("/", "%2F")}'
125
125
  self.ws_dir_path = workspace_path if workspace_path else None
126
126
 
@@ -3,6 +3,7 @@ from .base import (
3
3
  ResourceNotFoundError,
4
4
  NonUniqueInputFile,
5
5
  MissingInputFile,
6
+ generate_processor_help,
6
7
  )
7
8
  from .ocrd_page_result import (
8
9
  OcrdPageResult,
@@ -11,5 +12,4 @@ from .ocrd_page_result import (
11
12
  from .helpers import (
12
13
  run_cli,
13
14
  run_processor,
14
- generate_processor_help
15
15
  )
ocrd/processor/base.py CHANGED
@@ -23,12 +23,16 @@ import tarfile
23
23
  import io
24
24
  import weakref
25
25
  from frozendict import frozendict
26
+ from concurrent.futures import ThreadPoolExecutor, TimeoutError
27
+
28
+ from click import wrap_text
26
29
  from deprecated import deprecated
27
30
  from requests import HTTPError
28
31
 
29
- from ocrd.workspace import Workspace
32
+ from ..workspace import Workspace
33
+ from ..mets_server import ClientSideOcrdMets
30
34
  from ocrd_models.ocrd_file import OcrdFileType
31
- from ocrd.processor.ocrd_page_result import OcrdPageResult
35
+ from .ocrd_page_result import OcrdPageResult
32
36
  from ocrd_utils import (
33
37
  VERSION as OCRD_VERSION,
34
38
  MIMETYPE_PAGE,
@@ -58,7 +62,7 @@ from ocrd_modelfactory import page_from_file
58
62
  from ocrd_validators.ocrd_tool_validator import OcrdToolValidator
59
63
 
60
64
  # XXX imports must remain for backwards-compatibility
61
- from .helpers import run_cli, run_processor, generate_processor_help # pylint: disable=unused-import
65
+ from .helpers import run_cli, run_processor # pylint: disable=unused-import
62
66
 
63
67
 
64
68
  class ResourceNotFoundError(FileNotFoundError):
@@ -118,7 +122,27 @@ class Processor():
118
122
  maximum number of cached instances (ignored if negative), to be applied on top of
119
123
  :py:data:`~ocrd_utils.config.OCRD_MAX_PROCESSOR_CACHE` (i.e. whatever is smaller).
120
124
 
121
- (Override this if you know how many instances fit into memory at once.)
125
+ (Override this if you know how many instances fit into memory - GPU / CPU RAM - at once.)
126
+ """
127
+
128
+ max_workers : int = -1
129
+ """
130
+ maximum number of processor threads for page-parallel processing (ignored if negative),
131
+ to be applied on top of :py:data:`~ocrd_utils.config.OCRD_MAX_PARALLEL_PAGES` (i.e.
132
+ whatever is smaller).
133
+
134
+ (Override this if you know how many pages fit into processing units - GPU shaders / CPU cores
135
+ - at once, or if your class is not thread-safe.)
136
+ """
137
+
138
+ max_page_seconds : int = -1
139
+ """
140
+ maximum number of seconds that may be spent processing a single page (ignored if negative),
141
+ to be applied on top of :py:data:`~ocrd_utils.config.OCRD_PROCESSING_PAGE_TIMEOUT`
142
+ (i.e. whatever is smaller).
143
+
144
+ (Override this if you know how costly this processor may be, irrespective of image size
145
+ or complexity of the page.)
122
146
  """
123
147
 
124
148
  @property
@@ -142,7 +166,11 @@ class Processor():
142
166
 
143
167
  (Override if ``ocrd-tool.json`` is not distributed with the Python package.)
144
168
  """
145
- return resource_filename(self.__module__.split('.')[0], self.metadata_filename)
169
+ # XXX HACK
170
+ module_tokens = self.__module__.split('.')
171
+ if module_tokens[0] == 'src':
172
+ module_tokens.pop(0)
173
+ return resource_filename(module_tokens[0], self.metadata_filename)
146
174
 
147
175
  @cached_property
148
176
  def metadata_rawdict(self) -> dict:
@@ -273,12 +301,12 @@ class Processor():
273
301
  if ocrd_tool is not None:
274
302
  deprecation_warning("Passing 'ocrd_tool' as keyword argument to Processor is deprecated - "
275
303
  "use or override metadata/executable/ocrd-tool properties instead")
276
- self._ocrd_tool = ocrd_tool
277
- self._executable = ocrd_tool['executable']
304
+ self.ocrd_tool = ocrd_tool
305
+ self.executable = ocrd_tool['executable']
278
306
  if version is not None:
279
307
  deprecation_warning("Passing 'version' as keyword argument to Processor is deprecated - "
280
308
  "use or override metadata/version properties instead")
281
- self._version = version
309
+ self.version = version
282
310
  if workspace is not None:
283
311
  deprecation_warning("Passing a workspace argument other than 'None' to Processor "
284
312
  "is deprecated - pass as argument to process_workspace instead")
@@ -422,13 +450,35 @@ class Processor():
422
450
  self.workspace = workspace
423
451
  self.verify()
424
452
  try:
425
- # FIXME: add page parallelization by running multiprocessing.Pool (#322)
453
+ nr_succeeded = 0
454
+ nr_skipped = 0
455
+ nr_copied = 0
456
+
457
+ # set up multithreading
458
+ if self.max_workers <= 0:
459
+ max_workers = max(0, config.OCRD_MAX_PARALLEL_PAGES)
460
+ else:
461
+ max_workers = max(0, min(config.OCRD_MAX_PARALLEL_PAGES, self.max_workers))
462
+ if max_workers > 1:
463
+ assert isinstance(workspace.mets, ClientSideOcrdMets), \
464
+ "OCRD_MAX_PARALLEL_PAGES>1 requires also using --mets-server-url"
465
+ if self.max_page_seconds <= 0:
466
+ max_seconds = max(0, config.OCRD_PROCESSING_PAGE_TIMEOUT)
467
+ else:
468
+ max_seconds = max(0, min(config.OCRD_PROCESSING_PAGE_TIMEOUT, self.max_page_seconds))
469
+ executor = ThreadPoolExecutor(
470
+ max_workers=max_workers or 1,
471
+ thread_name_prefix=f"pagetask.{workspace.mets.unique_identifier}"
472
+ )
473
+ self._base_logger.debug("started executor %s with %d workers", str(executor), max_workers or 1)
474
+ tasks = {}
475
+
426
476
  for input_file_tuple in self.zip_input_files(on_error='abort', require_first=False):
427
477
  input_files : List[Optional[OcrdFileType]] = [None] * len(input_file_tuple)
428
478
  page_id = next(input_file.pageId
429
479
  for input_file in input_file_tuple
430
480
  if input_file)
431
- self._base_logger.info(f"processing page {page_id}")
481
+ self._base_logger.info(f"preparing page {page_id}")
432
482
  for i, input_file in enumerate(input_file_tuple):
433
483
  if input_file is None:
434
484
  # file/page not found in this file grp
@@ -441,35 +491,56 @@ class Processor():
441
491
  except (ValueError, FileNotFoundError, HTTPError) as e:
442
492
  self._base_logger.error(repr(e))
443
493
  self._base_logger.warning(f"failed downloading file {input_file} for page {page_id}")
494
+ # process page
495
+ tasks[executor.submit(self.process_page_file, *input_files)] = (page_id, input_files)
496
+ self._base_logger.debug("submitted %d processing tasks", len(tasks))
497
+
498
+ for task in tasks:
499
+ # wait for results, handle errors
500
+ page_id, input_files = tasks[task]
444
501
  # FIXME: differentiate error cases in various ways:
445
502
  # - ResourceNotFoundError → use ResourceManager to download (once), then retry
446
503
  # - transient (I/O or OOM) error → maybe sleep, retry
447
504
  # - persistent (data) error → skip / dummy / raise
448
505
  try:
449
- self.process_page_file(*input_files)
450
- except Exception as err:
451
- # we have to be broad here, but want to exclude NotImplementedError
452
- if isinstance(err, NotImplementedError):
506
+ self._base_logger.debug("waiting for output of task %s (page %s) max_seconds=%d", task, page_id, max_seconds)
507
+ task.result(timeout=max_seconds or None)
508
+ nr_succeeded += 1
509
+ # exclude NotImplementedError, so we can try process() below
510
+ except NotImplementedError:
511
+ raise
512
+ # handle input failures separately
513
+ except FileExistsError as err:
514
+ if config.OCRD_EXISTING_OUTPUT == 'ABORT':
453
515
  raise err
454
- if isinstance(err, FileExistsError):
455
- if config.OCRD_EXISTING_OUTPUT == 'ABORT':
456
- raise err
457
- if config.OCRD_EXISTING_OUTPUT == 'SKIP':
458
- continue
459
- if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE':
460
- # too late here, must not happen
461
- raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE")
462
- # FIXME: re-usable/actionable logging
463
- self._base_logger.exception(f"Failure on page {page_id}: {err}")
516
+ if config.OCRD_EXISTING_OUTPUT == 'SKIP':
517
+ continue
518
+ if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE':
519
+ # too late here, must not happen
520
+ raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE")
521
+ # broad coverage of output failures (including TimeoutError)
522
+ except (Exception, TimeoutError) as err:
523
+ # FIXME: add re-usable/actionable logging
464
524
  if config.OCRD_MISSING_OUTPUT == 'ABORT':
525
+ self._base_logger.error(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}")
465
526
  raise err
527
+ self._base_logger.exception(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}")
466
528
  if config.OCRD_MISSING_OUTPUT == 'SKIP':
529
+ nr_skipped += 1
467
530
  continue
468
531
  if config.OCRD_MISSING_OUTPUT == 'COPY':
469
532
  self._copy_page_file(input_files[0])
533
+ nr_copied += 1
470
534
  else:
471
535
  desc = config.describe('OCRD_MISSING_OUTPUT', wrap_text=False, indent_text=False)
472
536
  raise ValueError(f"unknown configuration value {config.OCRD_MISSING_OUTPUT} - {desc}")
537
+
538
+ if nr_skipped > 0 and nr_succeeded / nr_skipped < config.OCRD_MAX_MISSING_OUTPUTS:
539
+ raise Exception(f"too many failures with skipped output ({nr_skipped})")
540
+ if nr_copied > 0 and nr_succeeded / nr_copied < config.OCRD_MAX_MISSING_OUTPUTS:
541
+ raise Exception(f"too many failures with fallback output ({nr_skipped})")
542
+ executor.shutdown()
543
+
473
544
  except NotImplementedError:
474
545
  # fall back to deprecated method
475
546
  self.process()
@@ -493,13 +564,14 @@ class Processor():
493
564
  output_file_id = make_file_id(input_file, self.output_file_grp)
494
565
  input_pcgts.set_pcGtsId(output_file_id)
495
566
  self.add_metadata(input_pcgts)
496
- self.workspace.add_file(file_id=output_file_id,
497
- file_grp=self.output_file_grp,
498
- page_id=input_file.pageId,
499
- local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'),
500
- mimetype=MIMETYPE_PAGE,
501
- content=to_xml(input_pcgts),
502
- force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
567
+ self.workspace.add_file(
568
+ file_id=output_file_id,
569
+ file_grp=self.output_file_grp,
570
+ page_id=input_file.pageId,
571
+ local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'),
572
+ mimetype=MIMETYPE_PAGE,
573
+ content=to_xml(input_pcgts),
574
+ force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
503
575
  )
504
576
 
505
577
  def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None:
@@ -516,6 +588,7 @@ class Processor():
516
588
  input_pcgts : List[Optional[OcrdPage]] = [None] * len(input_files)
517
589
  assert isinstance(input_files[0], get_args(OcrdFileType))
518
590
  page_id = input_files[0].pageId
591
+ self._base_logger.info("processing page %s", page_id)
519
592
  for i, input_file in enumerate(input_files):
520
593
  assert isinstance(input_file, get_args(OcrdFileType))
521
594
  self._base_logger.debug(f"parsing file {input_file.ID} for page {page_id}")
@@ -532,6 +605,9 @@ class Processor():
532
605
  image_file_id = f'{output_file_id}_{image_result.file_id_suffix}'
533
606
  image_file_path = join(self.output_file_grp, f'{image_file_id}.png')
534
607
  if isinstance(image_result.alternative_image, PageType):
608
+ # special case: not an alternative image, but replacing the original image
609
+ # (this is needed by certain processors when the original's coordinate system
610
+ # cannot or must not be kept)
535
611
  image_result.alternative_image.set_imageFilename(image_file_path)
536
612
  image_result.alternative_image.set_imageWidth(image_result.pil.width)
537
613
  image_result.alternative_image.set_imageHeight(image_result.pil.height)
@@ -550,13 +626,14 @@ class Processor():
550
626
  )
551
627
  result.pcgts.set_pcGtsId(output_file_id)
552
628
  self.add_metadata(result.pcgts)
553
- self.workspace.add_file(file_id=output_file_id,
554
- file_grp=self.output_file_grp,
555
- page_id=page_id,
556
- local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'),
557
- mimetype=MIMETYPE_PAGE,
558
- content=to_xml(result.pcgts),
559
- force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
629
+ self.workspace.add_file(
630
+ file_id=output_file_id,
631
+ file_grp=self.output_file_grp,
632
+ page_id=page_id,
633
+ local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'),
634
+ mimetype=MIMETYPE_PAGE,
635
+ content=to_xml(result.pcgts),
636
+ force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
560
637
  )
561
638
 
562
639
  def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult:
@@ -838,3 +915,151 @@ class Processor():
838
915
  if ifiles[0] or not require_first:
839
916
  ifts.append(tuple(ifiles))
840
917
  return ifts
918
+
919
+ def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None):
920
+ """Generate a string describing the full CLI of this processor including params.
921
+
922
+ Args:
923
+ ocrd_tool (dict): this processor's ``tools`` section of the module's ``ocrd-tool.json``
924
+ processor_instance (object, optional): the processor implementation
925
+ (for adding any module/class/function docstrings)
926
+ subcommand (string): 'worker' or 'server'
927
+ """
928
+ doc_help = ''
929
+ if processor_instance:
930
+ module = inspect.getmodule(processor_instance)
931
+ if module and module.__doc__:
932
+ doc_help += '\n' + inspect.cleandoc(module.__doc__) + '\n'
933
+ if processor_instance.__doc__:
934
+ doc_help += '\n' + inspect.cleandoc(processor_instance.__doc__) + '\n'
935
+ # Try to find the most concrete docstring among the various methods that an implementation
936
+ # could overload, first serving.
937
+ # In doing so, compare with Processor to avoid a glitch in the way py>=3.5 inherits docstrings.
938
+ # (They are supposed to only repeat information inspect.getdoc, rather than inherit __doc__ itself.)
939
+ for method in ['process_page_pcgts', 'process_page_file', 'process_workspace', 'process']:
940
+ instance_method = getattr(processor_instance, method)
941
+ superclass_method = getattr(Processor, method)
942
+ if instance_method.__doc__ and instance_method.__doc__ != superclass_method.__doc__:
943
+ doc_help += '\n' + inspect.cleandoc(instance_method.__doc__) + '\n'
944
+ break
945
+ if doc_help:
946
+ doc_help = '\n\n' + wrap_text(doc_help, width=72,
947
+ initial_indent=' > ',
948
+ subsequent_indent=' > ',
949
+ preserve_paragraphs=True)
950
+ subcommands = '''\
951
+ worker Start a processing worker rather than do local processing
952
+ server Start a processor server rather than do local processing
953
+ '''
954
+
955
+ processing_worker_options = '''\
956
+ --queue The RabbitMQ server address in format
957
+ "amqp://{user}:{pass}@{host}:{port}/{vhost}"
958
+ [amqp://admin:admin@localhost:5672]
959
+ --database The MongoDB server address in format
960
+ "mongodb://{host}:{port}"
961
+ [mongodb://localhost:27018]
962
+ --log-filename Filename to redirect STDOUT/STDERR to,
963
+ if specified.
964
+ '''
965
+
966
+ processing_server_options = '''\
967
+ --address The Processor server address in format
968
+ "{host}:{port}"
969
+ --database The MongoDB server address in format
970
+ "mongodb://{host}:{port}"
971
+ [mongodb://localhost:27018]
972
+ '''
973
+
974
+ processing_options = '''\
975
+ -m, --mets URL-PATH URL or file path of METS to process [./mets.xml]
976
+ -w, --working-dir PATH Working directory of local workspace [dirname(URL-PATH)]
977
+ -I, --input-file-grp USE File group(s) used as input
978
+ -O, --output-file-grp USE File group(s) used as output
979
+ -g, --page-id ID Physical page ID(s) to process instead of full document []
980
+ --overwrite Remove existing output pages/images
981
+ (with "--page-id", remove only those).
982
+ Short-hand for OCRD_EXISTING_OUTPUT=OVERWRITE
983
+ --debug Abort on any errors with full stack trace.
984
+ Short-hand for OCRD_MISSING_OUTPUT=ABORT
985
+ --profile Enable profiling
986
+ --profile-file PROF-PATH Write cProfile stats to PROF-PATH. Implies "--profile"
987
+ -p, --parameter JSON-PATH Parameters, either verbatim JSON string
988
+ or JSON file path
989
+ -P, --param-override KEY VAL Override a single JSON object key-value pair,
990
+ taking precedence over --parameter
991
+ -U, --mets-server-url URL URL of a METS Server for parallel incremental access to METS
992
+ If URL starts with http:// start an HTTP server there,
993
+ otherwise URL is a path to an on-demand-created unix socket
994
+ -l, --log-level [OFF|ERROR|WARN|INFO|DEBUG|TRACE]
995
+ Override log level globally [INFO]
996
+ --log-filename LOG-PATH File to redirect stderr logging to (overriding ocrd_logging.conf).
997
+ '''
998
+
999
+ information_options = '''\
1000
+ -C, --show-resource RESNAME Dump the content of processor resource RESNAME
1001
+ -L, --list-resources List names of processor resources
1002
+ -J, --dump-json Dump tool description as JSON
1003
+ -D, --dump-module-dir Show the 'module' resource location path for this processor
1004
+ -h, --help Show this message
1005
+ -V, --version Show version
1006
+ '''
1007
+
1008
+ parameter_help = ''
1009
+ if 'parameters' not in ocrd_tool or not ocrd_tool['parameters']:
1010
+ parameter_help = ' NONE\n'
1011
+ else:
1012
+ def wrap(s):
1013
+ return wrap_text(s, initial_indent=' '*3,
1014
+ subsequent_indent=' '*4,
1015
+ width=72, preserve_paragraphs=True)
1016
+ for param_name, param in ocrd_tool['parameters'].items():
1017
+ parameter_help += wrap('"%s" [%s%s]' % (
1018
+ param_name,
1019
+ param['type'],
1020
+ ' - REQUIRED' if 'required' in param and param['required'] else
1021
+ ' - %s' % json.dumps(param['default']) if 'default' in param else ''))
1022
+ parameter_help += '\n ' + wrap(param['description'])
1023
+ if 'enum' in param:
1024
+ parameter_help += '\n ' + wrap('Possible values: %s' % json.dumps(param['enum']))
1025
+ parameter_help += "\n"
1026
+
1027
+ if not subcommand:
1028
+ return f'''\
1029
+ Usage: {ocrd_tool['executable']} [worker|server] [OPTIONS]
1030
+
1031
+ {ocrd_tool['description']}{doc_help}
1032
+
1033
+ Subcommands:
1034
+ {subcommands}
1035
+ Options for processing:
1036
+ {processing_options}
1037
+ Options for information:
1038
+ {information_options}
1039
+ Parameters:
1040
+ {parameter_help}
1041
+ '''
1042
+ elif subcommand == 'worker':
1043
+ return f'''\
1044
+ Usage: {ocrd_tool['executable']} worker [OPTIONS]
1045
+
1046
+ Run {ocrd_tool['executable']} as a processing worker.
1047
+
1048
+ {ocrd_tool['description']}{doc_help}
1049
+
1050
+ Options:
1051
+ {processing_worker_options}
1052
+ '''
1053
+ elif subcommand == 'server':
1054
+ return f'''\
1055
+ Usage: {ocrd_tool['executable']} server [OPTIONS]
1056
+
1057
+ Run {ocrd_tool['executable']} as a processor sever.
1058
+
1059
+ {ocrd_tool['description']}{doc_help}
1060
+
1061
+ Options:
1062
+ {processing_server_options}
1063
+ '''
1064
+ else:
1065
+ pass
ocrd/processor/helpers.py CHANGED
@@ -8,13 +8,11 @@ import inspect
8
8
  from subprocess import run
9
9
  from typing import List, Optional
10
10
 
11
- from click import wrap_text
12
- from ocrd.workspace import Workspace
11
+ from ..workspace import Workspace
13
12
  from ocrd_utils import freeze_args, getLogger, config, setOverrideLogLevel, getLevelName, sparkline
14
13
 
15
14
 
16
15
  __all__ = [
17
- 'generate_processor_help',
18
16
  'run_cli',
19
17
  'run_processor'
20
18
  ]
@@ -213,147 +211,6 @@ def run_cli(
213
211
  return result.returncode
214
212
 
215
213
 
216
- def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None):
217
- """Generate a string describing the full CLI of this processor including params.
218
-
219
- Args:
220
- ocrd_tool (dict): this processor's ``tools`` section of the module's ``ocrd-tool.json``
221
- processor_instance (object, optional): the processor implementation
222
- (for adding any module/class/function docstrings)
223
- subcommand (string): 'worker' or 'server'
224
- """
225
- doc_help = ''
226
- if processor_instance:
227
- module = inspect.getmodule(processor_instance)
228
- if module and module.__doc__:
229
- doc_help += '\n' + inspect.cleandoc(module.__doc__) + '\n'
230
- if processor_instance.__doc__:
231
- doc_help += '\n' + inspect.cleandoc(processor_instance.__doc__) + '\n'
232
- if processor_instance.process_workspace.__doc__:
233
- doc_help += '\n' + inspect.cleandoc(processor_instance.process_workspace.__doc__) + '\n'
234
- if processor_instance.process.__doc__:
235
- doc_help += '\n' + inspect.cleandoc(processor_instance.process.__doc__) + '\n'
236
- if doc_help:
237
- doc_help = '\n\n' + wrap_text(doc_help, width=72,
238
- initial_indent=' > ',
239
- subsequent_indent=' > ',
240
- preserve_paragraphs=True)
241
- subcommands = '''\
242
- worker Start a processing worker rather than do local processing
243
- server Start a processor server rather than do local processing
244
- '''
245
-
246
- processing_worker_options = '''\
247
- --queue The RabbitMQ server address in format
248
- "amqp://{user}:{pass}@{host}:{port}/{vhost}"
249
- [amqp://admin:admin@localhost:5672]
250
- --database The MongoDB server address in format
251
- "mongodb://{host}:{port}"
252
- [mongodb://localhost:27018]
253
- --log-filename Filename to redirect STDOUT/STDERR to,
254
- if specified.
255
- '''
256
-
257
- processing_server_options = '''\
258
- --address The Processor server address in format
259
- "{host}:{port}"
260
- --database The MongoDB server address in format
261
- "mongodb://{host}:{port}"
262
- [mongodb://localhost:27018]
263
- '''
264
-
265
- processing_options = '''\
266
- -m, --mets URL-PATH URL or file path of METS to process [./mets.xml]
267
- -w, --working-dir PATH Working directory of local workspace [dirname(URL-PATH)]
268
- -I, --input-file-grp USE File group(s) used as input
269
- -O, --output-file-grp USE File group(s) used as output
270
- -g, --page-id ID Physical page ID(s) to process instead of full document []
271
- --overwrite Remove existing output pages/images
272
- (with "--page-id", remove only those).
273
- Short-hand for OCRD_EXISTING_OUTPUT=OVERWRITE
274
- --debug Abort on any errors with full stack trace.
275
- Short-hand for OCRD_MISSING_OUTPUT=ABORT
276
- --profile Enable profiling
277
- --profile-file PROF-PATH Write cProfile stats to PROF-PATH. Implies "--profile"
278
- -p, --parameter JSON-PATH Parameters, either verbatim JSON string
279
- or JSON file path
280
- -P, --param-override KEY VAL Override a single JSON object key-value pair,
281
- taking precedence over --parameter
282
- -U, --mets-server-url URL URL of a METS Server for parallel incremental access to METS
283
- If URL starts with http:// start an HTTP server there,
284
- otherwise URL is a path to an on-demand-created unix socket
285
- -l, --log-level [OFF|ERROR|WARN|INFO|DEBUG|TRACE]
286
- Override log level globally [INFO]
287
- '''
288
-
289
- information_options = '''\
290
- -C, --show-resource RESNAME Dump the content of processor resource RESNAME
291
- -L, --list-resources List names of processor resources
292
- -J, --dump-json Dump tool description as JSON
293
- -D, --dump-module-dir Show the 'module' resource location path for this processor
294
- -h, --help Show this message
295
- -V, --version Show version
296
- '''
297
-
298
- parameter_help = ''
299
- if 'parameters' not in ocrd_tool or not ocrd_tool['parameters']:
300
- parameter_help = ' NONE\n'
301
- else:
302
- def wrap(s):
303
- return wrap_text(s, initial_indent=' '*3,
304
- subsequent_indent=' '*4,
305
- width=72, preserve_paragraphs=True)
306
- for param_name, param in ocrd_tool['parameters'].items():
307
- parameter_help += wrap('"%s" [%s%s]' % (
308
- param_name,
309
- param['type'],
310
- ' - REQUIRED' if 'required' in param and param['required'] else
311
- ' - %s' % json.dumps(param['default']) if 'default' in param else ''))
312
- parameter_help += '\n ' + wrap(param['description'])
313
- if 'enum' in param:
314
- parameter_help += '\n ' + wrap('Possible values: %s' % json.dumps(param['enum']))
315
- parameter_help += "\n"
316
-
317
- if not subcommand:
318
- return f'''\
319
- Usage: {ocrd_tool['executable']} [worker|server] [OPTIONS]
320
-
321
- {ocrd_tool['description']}{doc_help}
322
-
323
- Subcommands:
324
- {subcommands}
325
- Options for processing:
326
- {processing_options}
327
- Options for information:
328
- {information_options}
329
- Parameters:
330
- {parameter_help}
331
- '''
332
- elif subcommand == 'worker':
333
- return f'''\
334
- Usage: {ocrd_tool['executable']} worker [OPTIONS]
335
-
336
- Run {ocrd_tool['executable']} as a processing worker.
337
-
338
- {ocrd_tool['description']}{doc_help}
339
-
340
- Options:
341
- {processing_worker_options}
342
- '''
343
- elif subcommand == 'server':
344
- return f'''\
345
- Usage: {ocrd_tool['executable']} server [OPTIONS]
346
-
347
- Run {ocrd_tool['executable']} as a processor sever.
348
-
349
- {ocrd_tool['description']}{doc_help}
350
-
351
- Options:
352
- {processing_server_options}
353
- '''
354
- else:
355
- pass
356
-
357
214
 
358
215
  # not decorated here but at runtime (on first use)
359
216
  #@freeze_args
ocrd/workspace.py CHANGED
@@ -95,8 +95,8 @@ class Workspace():
95
95
  if self.is_remote:
96
96
  mets = ClientSideOcrdMets(mets_server_url, self.directory)
97
97
  if mets.workspace_path != self.directory:
98
- raise ValueError(f"METS server {mets_server_url} workspace directory {mets.workspace_path} differs "
99
- f"from local workspace directory {self.directory}. These are not the same workspaces.")
98
+ raise ValueError(f"METS server {mets_server_url} workspace directory '{mets.workspace_path}' differs "
99
+ f"from local workspace directory '{self.directory}'. These are not the same workspaces.")
100
100
  else:
101
101
  mets = OcrdMets(filename=self.mets_target)
102
102
  self.mets = mets
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ocrd
3
- Version: 3.0.0b1
3
+ Version: 3.0.0b3
4
4
  Summary: OCR-D framework
5
5
  Author-email: Konstantin Baierer <unixprog@gmail.com>
6
6
  License: Apache License 2.0
@@ -94,17 +94,12 @@ complete stack of OCR-D-related software.
94
94
 
95
95
  The easiest way to install is via `pip`:
96
96
 
97
- ```sh
98
- pip install ocrd
97
+ pip install ocrd
99
98
 
100
- # or just the functionality you need, e.g.
101
-
102
- pip install ocrd_modelfactory
103
- ```
104
99
 
105
100
  All Python software released by [OCR-D](https://github.com/OCR-D) requires Python 3.8 or higher.
106
101
 
107
- **NOTE** Some OCR-D-Tools (or even test cases) _might_ reveal an unintended behavior if you have specific environment modifications, like:
102
+ > **NOTE** Some OCR-D tools (or even test cases) _might_ exhibit unintended behavior if you have specific environment modifications, like:
108
103
  * using a custom build of [ImageMagick](https://github.com/ImageMagick/ImageMagick), whose format delegates are different from what OCR-D supposes
109
104
  * custom Python logging configurations in your personal account
110
105
 
@@ -129,7 +124,6 @@ Almost all behaviour of the OCR-D/core software is configured via CLI options an
129
124
 
130
125
  Some parts of the software are configured via environment variables:
131
126
 
132
- * `OCRD_METS_CACHING`: If set to `true`, access to the METS file is cached, speeding in-memory search and modification.
133
127
  * `OCRD_PROFILE`: This variable configures the built-in CPU and memory profiling. If empty, no profiling is done. Otherwise expected to contain any of the following tokens:
134
128
  * `CPU`: Enable CPU profiling of processor runs
135
129
  * `RSS`: Enable RSS memory profiling
@@ -142,18 +136,46 @@ Some parts of the software are configured via environment variables:
142
136
  * `XDG_CONFIG_HOME`: Directory to look for `./ocrd/resources.yml` (i.e. `ocrd resmgr` user database) – defaults to `$HOME/.config`.
143
137
  * `XDG_DATA_HOME`: Directory to look for `./ocrd-resources/*` (i.e. `ocrd resmgr` data location) – defaults to `$HOME/.local/share`.
144
138
 
145
- * `OCRD_DOWNLOAD_RETRIES`: Number of times to retry failed attempts for downloads of workspace files.
139
+ * `OCRD_DOWNLOAD_RETRIES`: Number of times to retry failed attempts for downloads of resources or workspace files.
146
140
  * `OCRD_DOWNLOAD_TIMEOUT`: Timeout in seconds for connecting or reading (comma-separated) when downloading.
147
141
 
142
+ * `OCRD_MISSING_INPUT`: How to deal with missing input files (for some fileGrp/pageId) during processing:
143
+ * `SKIP`: ignore and proceed with next page's input
144
+ * `ABORT`: throw `MissingInputFile` exception
145
+
146
+ * `OCRD_MISSING_OUTPUT`: How to deal with missing output files (for some fileGrp/pageId) during processing:
147
+ * `SKIP`: ignore and proceed processing next page
148
+ * `COPY`: fall back to copying input PAGE to output fileGrp for page
149
+ * `ABORT`: re-throw whatever caused processing to fail
150
+
151
+ * `OCRD_MAX_MISSING_OUTPUTS`: Maximum rate of skipped/fallback pages among all processed pages before aborting (decimal fraction, ignored if negative).
152
+
153
+ * `OCRD_EXISTING_OUTPUT`: How to deal with already existing output files (for some fileGrp/pageId) during processing:
154
+ * `SKIP`: ignore and proceed processing next page
155
+ * `OVERWRITE`: force writing result to output fileGrp for page
156
+ * `ABORT`: re-throw `FileExistsError` exception
157
+
158
+
148
159
  * `OCRD_METS_CACHING`: Whether to enable in-memory storage of OcrdMets data structures for speedup during processing or workspace operations.
149
160
 
150
161
  * `OCRD_MAX_PROCESSOR_CACHE`: Maximum number of processor instances (for each set of parameters) to be kept in memory (including loaded models) for processing workers or processor servers.
151
162
 
163
+ * `OCRD_MAX_PARALLEL_PAGES`: Maximum number of processor threads for page-parallel processing (within each Processor's selected page range, independent of the number of Processing Workers or Processor Servers). If set to a value `>1`, a METS Server must be used for METS synchronisation.
164
+
165
+ * `OCRD_PROCESSING_PAGE_TIMEOUT`: Timeout in seconds for processing a single page. If set to a value `>0` and the timeout is exceeded, the same policy as for `OCRD_MISSING_OUTPUT` applies.
166
+
152
167
  * `OCRD_NETWORK_SERVER_ADDR_PROCESSING`: Default address of Processing Server to connect to (for `ocrd network client processing`).
153
168
  * `OCRD_NETWORK_SERVER_ADDR_WORKFLOW`: Default address of Workflow Server to connect to (for `ocrd network client workflow`).
154
169
  * `OCRD_NETWORK_SERVER_ADDR_WORKSPACE`: Default address of Workspace Server to connect to (for `ocrd network client workspace`).
155
170
  * `OCRD_NETWORK_RABBITMQ_CLIENT_CONNECT_ATTEMPTS`: Number of attempts for a worker to create its queue. Helpful if the rabbitmq-server needs time to be fully started.
156
171
 
172
+ * `OCRD_NETWORK_CLIENT_POLLING_SLEEP`: How many seconds to sleep before trying `ocrd network client` again.
173
+ * `OCRD_NETWORK_CLIENT_POLLING_TIMEOUT`: Timeout for a blocking `ocrd network client` (in seconds).
174
+
175
+ * `OCRD_NETWORK_SOCKETS_ROOT_DIR`: The root directory where all METS Server-related socket files are created.
176
+ * `OCRD_NETWORK_LOGS_ROOT_DIR`: The root directory where all `ocrd_network`-related log files are stored.
177
+
178
+
157
179
 
158
180
  ## Packages
159
181
 
@@ -1,16 +1,16 @@
1
1
  ocrd/__init__.py,sha256=ZswMVmlqFhAEIzMR3my6IKPq9XLH21aDPC_m_8Jh4dA,1076
2
2
  ocrd/constants.py,sha256=6dn3mG54WqHsKInmLZp4kJjNqqPtBoFoSuLUuRbOps0,740
3
- ocrd/lib.bash,sha256=7amCf_0mwE3tkz-e_HxA30aQnL1x2O4BF_8dE9fbqJg,10692
4
- ocrd/mets_server.py,sha256=M7hZOvOEbCfx5jCjeZ5Uv4CgviVZjGV15N8SN1ATpNY,20149
3
+ ocrd/lib.bash,sha256=dlOQd36OVsibwEPXg7i-e_JmIc-3uH-SIwGcE8OJ3Cc,10744
4
+ ocrd/mets_server.py,sha256=SrCnCmWBuyEauHrNF0jLZO_Wi8WoPCMWxhec2wE5Y3w,20160
5
5
  ocrd/ocrd-all-tool.json,sha256=9bX2VYnUwhTAzAvKaoT77BFzbgBGgyIt7qBqARpwWNc,586
6
6
  ocrd/resolver.py,sha256=Ba9ALQbTXz6_mla4VqN9tAfHoj6aKuNJAU4tIDnjcHE,14952
7
7
  ocrd/resource_list.yml,sha256=82-PiqkZnka1kTj3MQqNn4wXWKHHtoFchsQuetWuqFs,2633
8
8
  ocrd/resource_manager.py,sha256=8BMVKJq8J56hugi8vtGn9Ffuk7oRkbs197aG74aKbCY,16733
9
9
  ocrd/task_sequence.py,sha256=spiaUQaMM7M8WdBDoQGmLuTPm7tOugYXD6rcJ2UXzxw,6991
10
- ocrd/workspace.py,sha256=t5xveWhSLj5cbsuVkOqT6VZEQ9DRCxucT7FUvTNFnDA,65604
10
+ ocrd/workspace.py,sha256=4s0qscEosS7rQ0jfn1qJeT9B3eC31YippAX-RUjXghA,65608
11
11
  ocrd/workspace_backup.py,sha256=iab_JjZ_mMP-G8NIUk4PZmfpNlQuGRoqc3NbTSSew1w,3621
12
12
  ocrd/workspace_bagger.py,sha256=yU8H3xR5WmQKvgQewac71ie-DUWcfLnMS01D55zsEHQ,11971
13
- ocrd/cli/__init__.py,sha256=XyYcbIuajaS2YM6HEWD4dfitdAzn111AWIaFPsTHoKQ,2621
13
+ ocrd/cli/__init__.py,sha256=lNR6wMf7JhQ8Jf33tUkowJr0mB3423OMY0_6dkMRLvU,2672
14
14
  ocrd/cli/bashlib.py,sha256=XGcO-MmYM3xJBRkSCLEZcGs0hqbw2GR8oyijJPtKnYM,5888
15
15
  ocrd/cli/log.py,sha256=6_FrVmTKIIVNUaNLkuOJx8pvPhensHMuayJ0PA7T-XA,1562
16
16
  ocrd/cli/network.py,sha256=oWBHFEURxfUdb_t-F4svP_ri7o5mqBoNQnLZLbsZLTA,602
@@ -23,11 +23,11 @@ ocrd/cli/zip.py,sha256=MMJLw3OXWiJVfVtrdJcBkbB8vA1IzSautluazZRuCQ0,5910
23
23
  ocrd/decorators/__init__.py,sha256=IJlA1XcdVBO6Hxm9rNDya7QYcqeWcaXXuLtGjfjcen8,7596
24
24
  ocrd/decorators/loglevel_option.py,sha256=tgipROEu3t4hkwWvFssd80k2SbTBwBIC4WNE6Gc-XAg,798
25
25
  ocrd/decorators/mets_find_options.py,sha256=d4oATKMP6bFQHNqOK6nLqgUiWF2FYdkPvzkTVRMYpKo,635
26
- ocrd/decorators/ocrd_cli_options.py,sha256=-bpYOyHod3pMmbooyw_dIOqRnhCoZbdmE8vrnWbQRNA,2621
26
+ ocrd/decorators/ocrd_cli_options.py,sha256=4pcBLAFPSpYZLj6r9Yj1GZOQl4r_RWU00pyA4mHwFQk,2621
27
27
  ocrd/decorators/parameter_option.py,sha256=n8hYw7XVTd3i3tvpK8F1Jx_CqRp6EGF9qJVH95yj92Q,1076
28
- ocrd/processor/__init__.py,sha256=m2lYRvJogmt2Xb4UgqjCXWkfOtqpKW3h8OEWDnblwTQ,273
29
- ocrd/processor/base.py,sha256=EvzR-uN5fDH1jKRWnm4CpCVpqge4Z7aprzW2PeXMqxU,38521
30
- ocrd/processor/helpers.py,sha256=7TmhKRrgi-T_q8dJX6DIVEVZ3qmNABJEcz3S9RXSUjI,16484
28
+ ocrd/processor/__init__.py,sha256=39ymNwYRdc-b_OJzzKmWCvo2ga3KdsGSYDHE1Hzkn_w,274
29
+ ocrd/processor/base.py,sha256=341APZGx6zCbuxgX-XTkKhPfeQkqblykmC9zSMPH3ss,48843
30
+ ocrd/processor/helpers.py,sha256=Lp9zbHYCLpT3GnPzl-p7UCSFU5Nx99gYEYXwW04v0RI,10157
31
31
  ocrd/processor/ocrd_page_result.py,sha256=AazEmnWyPEN47TxXVg0WUQpgFNV_mlIiExwwycUj0nQ,490
32
32
  ocrd/processor/builtin/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
33
33
  ocrd/processor/builtin/dummy_processor.py,sha256=6ORike_59wb_UUivhA6Iw_Ldg0AaMzX37F7qL9R8S_A,3704
@@ -87,12 +87,12 @@ ocrd_network/runtime_data/hosts.py,sha256=ml19ptzH4TFofyJR-Qp_Mn3sZUFbWoNe__rRXZ
87
87
  ocrd_network/runtime_data/network_agents.py,sha256=5p_zKLqECBIHLw-Ya6eKcKSZcUM4ESiipEIphVxHBEA,5192
88
88
  ocrd_network/runtime_data/network_services.py,sha256=xrPpFUU_Pa-XzGe2FEt5RmO17xqykIUmTr_9g6S7XSs,7892
89
89
  ocrd_utils/__init__.py,sha256=U_zAQJwxg_aJ4CR84CKMNAUP6Cob8Er8Ikj42JmnUKo,5977
90
- ocrd_utils/config.py,sha256=Zs7XXjfWa1oQ1tvnu6DR1WmfafohMPU9KR-Qx3qoC74,9818
90
+ ocrd_utils/config.py,sha256=Rkqv5wWEmlDDD0l1IWo9TPgn5ppPnHPRH9FfkMST29E,11117
91
91
  ocrd_utils/constants.py,sha256=ImbG1d8t2MW3uuFi-mN6aY90Zn74liAKZBKlfuKN86w,3278
92
92
  ocrd_utils/deprecate.py,sha256=4i50sZsA3Eevqn5D-SL5yGf9KEZfGCV4A5Anzn1GRMs,1026
93
93
  ocrd_utils/image.py,sha256=zNNX1cnRy6yvrxx8mnYQiqWraAh5-i4a1AOfCCg4SmI,24781
94
94
  ocrd_utils/introspect.py,sha256=gfBlmeEFuRmRUSgdSK0jOxRpYqDRXl2IAE6gv2MZ6as,1977
95
- ocrd_utils/logging.py,sha256=Mw49E_JX3lGNi2TEOK3qXInQf5E2zg-Vbse5uFgxO9M,8156
95
+ ocrd_utils/logging.py,sha256=5_-5T5OWSYicNk8SQyjVqdRj2bVl-gDK1Th-C7oW_HE,8248
96
96
  ocrd_utils/ocrd_logging.conf,sha256=kl9x9JS1d8h8F0QZabvrjZtW1iApIaChvkImYafKO5g,3623
97
97
  ocrd_utils/os.py,sha256=acRRdDBI8L6BK0Mf773yKEzwdpZSFRBJEKB2crL4EjU,9865
98
98
  ocrd_utils/str.py,sha256=JIhsyWphqJuxJAzhRQJUqlZ44AGOeObEPJMFhfWhfhQ,10084
@@ -118,9 +118,9 @@ ocrd_validators/xlink.xsd,sha256=8fW7YAMWXN2PbB_MMvj9H5ZeFoEBDzuYBtlGC8_6ijw,318
118
118
  ocrd_validators/xsd_mets_validator.py,sha256=4GWfLyqkmca0x7osDuXuExYuM0HWVrKoqn0S35sFhHU,467
119
119
  ocrd_validators/xsd_page_validator.py,sha256=BNz_9u-Ek4UCeyZu3KxSQoolfW9lvuaSR9nIu1XXxeE,467
120
120
  ocrd_validators/xsd_validator.py,sha256=6HrVAf6SzCvfUIuQdIzz9bOq4V-zhyii9yrUPoK2Uvo,2094
121
- ocrd-3.0.0b1.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
122
- ocrd-3.0.0b1.dist-info/METADATA,sha256=NKl_vrDFcwK3ayhp9_fe_2gAPESgGghDz3QQcdJRSRw,8785
123
- ocrd-3.0.0b1.dist-info/WHEEL,sha256=Mdi9PDNwEZptOjTlUcAth7XJDFtKrHYaQMPulZeBCiQ,91
124
- ocrd-3.0.0b1.dist-info/entry_points.txt,sha256=tV_gAdO8cbnOjS0GmKfJKbN60xBAV2DQRX6hEjleSjE,94
125
- ocrd-3.0.0b1.dist-info/top_level.txt,sha256=pUgiN42t4KXC5rvpi6V8atza31XP4SCznXpXlVlvomM,75
126
- ocrd-3.0.0b1.dist-info/RECORD,,
121
+ ocrd-3.0.0b3.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
122
+ ocrd-3.0.0b3.dist-info/METADATA,sha256=WZhPkJV0F8A5k-0IVK8HZ5zGWVWwYSa6FuDlpkuh4Xc,10397
123
+ ocrd-3.0.0b3.dist-info/WHEEL,sha256=UvcQYKBHoFqaQd6LKyqHw9fxEolWLQnlzP0h_LgJAfI,91
124
+ ocrd-3.0.0b3.dist-info/entry_points.txt,sha256=tV_gAdO8cbnOjS0GmKfJKbN60xBAV2DQRX6hEjleSjE,94
125
+ ocrd-3.0.0b3.dist-info/top_level.txt,sha256=pUgiN42t4KXC5rvpi6V8atza31XP4SCznXpXlVlvomM,75
126
+ ocrd-3.0.0b3.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (73.0.1)
2
+ Generator: setuptools (74.0.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
ocrd_utils/config.py CHANGED
@@ -78,14 +78,26 @@ class OcrdEnvConfig():
78
78
  raise ValueError(f"Unregistered env variable {name}")
79
79
  return self._variables[name].has_default
80
80
 
81
+ def reset_defaults(self):
82
+ for name in self._variables:
83
+ try:
84
+ # we cannot use hasattr, because that delegates to getattr,
85
+ # which we override and provide defaults for (which of course
86
+ # cannot be removed)
87
+ if self.__getattribute__(name):
88
+ delattr(self, name)
89
+ except AttributeError:
90
+ pass
91
+
81
92
  def describe(self, name, *args, **kwargs):
82
93
  if not name in self._variables:
83
94
  raise ValueError(f"Unregistered env variable {name}")
84
95
  return self._variables[name].describe(*args, **kwargs)
85
96
 
86
97
  def __getattr__(self, name):
98
+ # will be called if name is not accessible (has not been added directly yet)
87
99
  if not name in self._variables:
88
- raise ValueError(f"Unregistered env variable {name}")
100
+ raise AttributeError(f"Unregistered env variable {name}")
89
101
  var_obj = self._variables[name]
90
102
  try:
91
103
  raw_value = self.raw_value(name)
@@ -120,6 +132,16 @@ config.add('OCRD_MAX_PROCESSOR_CACHE',
120
132
  parser=int,
121
133
  default=(True, 128))
122
134
 
135
+ config.add('OCRD_MAX_PARALLEL_PAGES',
136
+ description="Maximum number of processor threads for page-parallel processing (within each Processor's selected page range, independent of the number of Processing Workers or Processor Servers). If set >1, then a METS Server must be used for METS synchronisation.",
137
+ parser=int,
138
+ default=(True, 1))
139
+
140
+ config.add('OCRD_PROCESSING_PAGE_TIMEOUT',
141
+ description="Timeout in seconds for processing a single page. If set >0, when exceeded, the same as OCRD_MISSING_OUTPUT applies.",
142
+ parser=int,
143
+ default=(True, 0))
144
+
123
145
  config.add("OCRD_PROFILE",
124
146
  description="""\
125
147
  Whether to enable gathering runtime statistics
@@ -184,6 +206,11 @@ How to deal with missing output files (for some fileGrp/pageId) during processin
184
206
  validator=lambda val: val in ['SKIP', 'COPY', 'ABORT'],
185
207
  parser=str)
186
208
 
209
+ config.add("OCRD_MAX_MISSING_OUTPUTS",
210
+ description="Maximal rate of skipped/fallback pages among all processed pages before aborting (decimal fraction, ignored if negative).",
211
+ default=(True, 0.1),
212
+ parser=float)
213
+
187
214
  config.add("OCRD_EXISTING_OUTPUT",
188
215
  description="""\
189
216
  How to deal with already existing output files (for some fileGrp/pageId) during processing:
ocrd_utils/logging.py CHANGED
@@ -210,11 +210,13 @@ def disableLogging(silent=not config.OCRD_LOGGING_DEBUG):
210
210
  # logging.basicConfig(level=logging.CRITICAL)
211
211
  # logging.disable(logging.ERROR)
212
212
  # remove all handlers for the ocrd logger
213
- for logger_name in ROOT_OCRD_LOGGERS:
213
+ for logger_name in ROOT_OCRD_LOGGERS + ['']:
214
214
  for handler in logging.getLogger(logger_name).handlers[:]:
215
215
  logging.getLogger(logger_name).removeHandler(handler)
216
216
  for logger_name in LOGGING_DEFAULTS:
217
217
  logging.getLogger(logger_name).setLevel(logging.NOTSET)
218
+ # Python default log level is WARNING
219
+ logging.root.setLevel(logging.WARNING)
218
220
 
219
221
  # Initializing stream handlers at module level
220
222
  # would cause message output in all runtime contexts,