ocrd 3.0.0a2__py3-none-any.whl → 3.0.0b2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. ocrd/cli/__init__.py +34 -26
  2. ocrd/cli/bashlib.py +32 -18
  3. ocrd/cli/ocrd_tool.py +7 -5
  4. ocrd/cli/workspace.py +10 -8
  5. ocrd/decorators/__init__.py +13 -7
  6. ocrd/decorators/ocrd_cli_options.py +1 -1
  7. ocrd/lib.bash +3 -0
  8. ocrd/mets_server.py +3 -4
  9. ocrd/processor/__init__.py +1 -1
  10. ocrd/processor/base.py +421 -98
  11. ocrd/processor/builtin/dummy_processor.py +4 -11
  12. ocrd/processor/helpers.py +24 -161
  13. ocrd/processor/ocrd_page_result.py +3 -3
  14. ocrd/resolver.py +0 -3
  15. ocrd/resource_manager.py +9 -5
  16. ocrd/workspace.py +10 -11
  17. ocrd/workspace_backup.py +1 -1
  18. {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/METADATA +32 -10
  19. {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/RECORD +49 -48
  20. {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/WHEEL +1 -1
  21. ocrd_modelfactory/__init__.py +1 -1
  22. ocrd_models/constants.py +0 -1
  23. ocrd_models/ocrd_exif.py +2 -2
  24. ocrd_models/ocrd_file.py +2 -2
  25. ocrd_models/ocrd_mets.py +22 -22
  26. ocrd_models/ocrd_page.py +0 -1
  27. ocrd_models/ocrd_xml_base.py +2 -2
  28. ocrd_network/cli/client.py +134 -30
  29. ocrd_network/client.py +53 -27
  30. ocrd_network/client_utils.py +101 -0
  31. ocrd_network/processing_server.py +1 -1
  32. ocrd_network/runtime_data/deployer.py +12 -3
  33. ocrd_network/server_utils.py +12 -10
  34. ocrd_utils/__init__.py +2 -0
  35. ocrd_utils/config.py +31 -2
  36. ocrd_utils/image.py +25 -25
  37. ocrd_utils/logging.py +20 -20
  38. ocrd_utils/os.py +4 -5
  39. ocrd_utils/str.py +10 -3
  40. ocrd_validators/json_validator.py +1 -3
  41. ocrd_validators/ocrd_tool_validator.py +2 -2
  42. ocrd_validators/page_validator.py +56 -56
  43. ocrd_validators/parameter_validator.py +2 -2
  44. ocrd_validators/resource_list_validator.py +4 -3
  45. ocrd_validators/workspace_validator.py +21 -21
  46. ocrd_validators/xsd_validator.py +1 -1
  47. {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/LICENSE +0 -0
  48. {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/entry_points.txt +0 -0
  49. {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/top_level.txt +0 -0
ocrd/processor/builtin/dummy_processor.py CHANGED
@@ -1,6 +1,6 @@
1
1
  # pylint: disable=missing-module-docstring,invalid-name
2
- from os.path import join, basename
3
- from typing import Optional, Union
2
+ from os.path import join
3
+ from typing import Optional
4
4
 
5
5
  import click
6
6
 
@@ -10,7 +10,6 @@ from ocrd.processor.ocrd_page_result import OcrdPageResult
10
10
  from ocrd_models.ocrd_file import OcrdFileType
11
11
  from ocrd_models.ocrd_page import OcrdPage, to_xml
12
12
  from ocrd_utils import (
13
- getLogger,
14
13
  make_file_id,
15
14
  MIME_TO_EXT,
16
15
  MIMETYPE_PAGE,
@@ -20,8 +19,6 @@ from ocrd_utils import (
20
19
  )
21
20
  from ocrd_modelfactory import page_from_file
22
21
 
23
- OCRD_TOOL = parse_json_string_with_comments(resource_string(__package__ + '.dummy', 'ocrd-tool.json'))
24
-
25
22
  class DummyProcessor(Processor):
26
23
  """
27
24
  Bare-bones processor creates PAGE-XML and optionally copies file from input group to output group
@@ -76,17 +73,13 @@ class DummyProcessor(Processor):
76
73
  super().process_page_file(input_file)
77
74
 
78
75
  @property
79
- def metadata(self):
80
- return OCRD_TOOL
76
+ def metadata_filename(self):
77
+ return 'processor/builtin/dummy/ocrd-tool.json'
81
78
 
82
79
  @property
83
80
  def executable(self):
84
81
  return 'ocrd-dummy'
85
82
 
86
- @property
87
- def version(self):
88
- return '0.0.3'
89
-
90
83
  @click.command()
91
84
  @ocrd_cli_options
92
85
  def cli(*args, **kwargs):
ocrd/processor/helpers.py CHANGED
@@ -1,21 +1,18 @@
1
1
  """
2
2
  Helper methods for running and documenting processors
3
3
  """
4
- from os import chdir, getcwd
5
4
  from time import perf_counter, process_time
6
5
  from functools import lru_cache
7
6
  import json
8
7
  import inspect
9
8
  from subprocess import run
10
- from typing import List
9
+ from typing import List, Optional
11
10
 
12
- from click import wrap_text
13
- from ocrd.workspace import Workspace
11
+ from ..workspace import Workspace
14
12
  from ocrd_utils import freeze_args, getLogger, config, setOverrideLogLevel, getLevelName, sparkline
15
13
 
16
14
 
17
15
  __all__ = [
18
- 'generate_processor_help',
19
16
  'run_cli',
20
17
  'run_processor'
21
18
  ]
@@ -39,10 +36,7 @@ def run_processor(
39
36
  log_level=None,
40
37
  input_file_grp=None,
41
38
  output_file_grp=None,
42
- show_resource=None,
43
- list_resources=False,
44
39
  parameter=None,
45
- parameter_override=None,
46
40
  working_dir=None,
47
41
  mets_server_url=None,
48
42
  instance_caching=False
@@ -84,7 +78,7 @@ def run_processor(
84
78
  log.debug("Running processor %s", processorClass)
85
79
 
86
80
  processor = get_processor(
87
- processor_class=processorClass,
81
+ processorClass,
88
82
  parameter=parameter,
89
83
  workspace=None,
90
84
  page_id=page_id,
@@ -102,7 +96,7 @@ def run_processor(
102
96
  t0_cpu = process_time()
103
97
  if any(x in config.OCRD_PROFILE for x in ['RSS', 'PSS']):
104
98
  backend = 'psutil_pss' if 'PSS' in config.OCRD_PROFILE else 'psutil'
105
- from memory_profiler import memory_usage
99
+ from memory_profiler import memory_usage # pylint: disable=import-outside-toplevel
106
100
  try:
107
101
  mem_usage = memory_usage(proc=(processor.process_workspace, [workspace], {}),
108
102
  # only run process once
@@ -212,156 +206,15 @@ def run_cli(
212
206
  if not log_filename:
213
207
  result = run(args, check=False)
214
208
  else:
215
- with open(log_filename, 'a') as file_desc:
209
+ with open(log_filename, 'a', encoding='utf-8') as file_desc:
216
210
  result = run(args, check=False, stdout=file_desc, stderr=file_desc)
217
211
  return result.returncode
218
212
 
219
213
 
220
- def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None):
221
- """Generate a string describing the full CLI of this processor including params.
222
-
223
- Args:
224
- ocrd_tool (dict): this processor's ``tools`` section of the module's ``ocrd-tool.json``
225
- processor_instance (object, optional): the processor implementation
226
- (for adding any module/class/function docstrings)
227
- subcommand (string): 'worker' or 'server'
228
- """
229
- doc_help = ''
230
- if processor_instance:
231
- module = inspect.getmodule(processor_instance)
232
- if module and module.__doc__:
233
- doc_help += '\n' + inspect.cleandoc(module.__doc__) + '\n'
234
- if processor_instance.__doc__:
235
- doc_help += '\n' + inspect.cleandoc(processor_instance.__doc__) + '\n'
236
- if processor_instance.process_workspace.__doc__:
237
- doc_help += '\n' + inspect.cleandoc(processor_instance.process_workspace.__doc__) + '\n'
238
- if processor_instance.process.__doc__:
239
- doc_help += '\n' + inspect.cleandoc(processor_instance.process.__doc__) + '\n'
240
- if doc_help:
241
- doc_help = '\n\n' + wrap_text(doc_help, width=72,
242
- initial_indent=' > ',
243
- subsequent_indent=' > ',
244
- preserve_paragraphs=True)
245
- subcommands = '''\
246
- worker Start a processing worker rather than do local processing
247
- server Start a processor server rather than do local processing
248
- '''
249
-
250
- processing_worker_options = '''\
251
- --queue The RabbitMQ server address in format
252
- "amqp://{user}:{pass}@{host}:{port}/{vhost}"
253
- [amqp://admin:admin@localhost:5672]
254
- --database The MongoDB server address in format
255
- "mongodb://{host}:{port}"
256
- [mongodb://localhost:27018]
257
- --log-filename Filename to redirect STDOUT/STDERR to,
258
- if specified.
259
- '''
260
-
261
- processing_server_options = '''\
262
- --address The Processor server address in format
263
- "{host}:{port}"
264
- --database The MongoDB server address in format
265
- "mongodb://{host}:{port}"
266
- [mongodb://localhost:27018]
267
- '''
268
-
269
- processing_options = '''\
270
- -m, --mets URL-PATH URL or file path of METS to process [./mets.xml]
271
- -w, --working-dir PATH Working directory of local workspace [dirname(URL-PATH)]
272
- -I, --input-file-grp USE File group(s) used as input
273
- -O, --output-file-grp USE File group(s) used as output
274
- -g, --page-id ID Physical page ID(s) to process instead of full document []
275
- --overwrite Remove existing output pages/images
276
- (with "--page-id", remove only those).
277
- Short-hand for OCRD_EXISTING_OUTPUT=OVERWRITE
278
- --debug Abort on any errors with full stack trace.
279
- Short-hand for OCRD_MISSING_OUTPUT=ABORT
280
- --profile Enable profiling
281
- --profile-file PROF-PATH Write cProfile stats to PROF-PATH. Implies "--profile"
282
- -p, --parameter JSON-PATH Parameters, either verbatim JSON string
283
- or JSON file path
284
- -P, --param-override KEY VAL Override a single JSON object key-value pair,
285
- taking precedence over --parameter
286
- -U, --mets-server-url URL URL of a METS Server for parallel incremental access to METS
287
- If URL starts with http:// start an HTTP server there,
288
- otherwise URL is a path to an on-demand-created unix socket
289
- -l, --log-level [OFF|ERROR|WARN|INFO|DEBUG|TRACE]
290
- Override log level globally [INFO]
291
- '''
292
-
293
- information_options = '''\
294
- -C, --show-resource RESNAME Dump the content of processor resource RESNAME
295
- -L, --list-resources List names of processor resources
296
- -J, --dump-json Dump tool description as JSON
297
- -D, --dump-module-dir Show the 'module' resource location path for this processor
298
- -h, --help Show this message
299
- -V, --version Show version
300
- '''
301
-
302
- parameter_help = ''
303
- if 'parameters' not in ocrd_tool or not ocrd_tool['parameters']:
304
- parameter_help = ' NONE\n'
305
- else:
306
- def wrap(s):
307
- return wrap_text(s, initial_indent=' '*3,
308
- subsequent_indent=' '*4,
309
- width=72, preserve_paragraphs=True)
310
- for param_name, param in ocrd_tool['parameters'].items():
311
- parameter_help += wrap('"%s" [%s%s]' % (
312
- param_name,
313
- param['type'],
314
- ' - REQUIRED' if 'required' in param and param['required'] else
315
- ' - %s' % json.dumps(param['default']) if 'default' in param else ''))
316
- parameter_help += '\n ' + wrap(param['description'])
317
- if 'enum' in param:
318
- parameter_help += '\n ' + wrap('Possible values: %s' % json.dumps(param['enum']))
319
- parameter_help += "\n"
320
-
321
- if not subcommand:
322
- return f'''\
323
- Usage: {ocrd_tool['executable']} [worker|server] [OPTIONS]
324
-
325
- {ocrd_tool['description']}{doc_help}
326
-
327
- Subcommands:
328
- {subcommands}
329
- Options for processing:
330
- {processing_options}
331
- Options for information:
332
- {information_options}
333
- Parameters:
334
- {parameter_help}
335
- '''
336
- elif subcommand == 'worker':
337
- return f'''\
338
- Usage: {ocrd_tool['executable']} worker [OPTIONS]
339
-
340
- Run {ocrd_tool['executable']} as a processing worker.
341
-
342
- {ocrd_tool['description']}{doc_help}
343
-
344
- Options:
345
- {processing_worker_options}
346
- '''
347
- elif subcommand == 'server':
348
- return f'''\
349
- Usage: {ocrd_tool['executable']} server [OPTIONS]
350
-
351
- Run {ocrd_tool['executable']} as a processor sever.
352
214
 
353
- {ocrd_tool['description']}{doc_help}
354
-
355
- Options:
356
- {processing_server_options}
357
- '''
358
- else:
359
- pass
360
-
361
-
362
- # Taken from https://github.com/OCR-D/core/pull/884
363
- @freeze_args
364
- @lru_cache(maxsize=config.OCRD_MAX_PROCESSOR_CACHE)
215
+ # not decorated here but at runtime (on first use)
216
+ #@freeze_args
217
+ #@lru_cache(maxsize=config.OCRD_MAX_PROCESSOR_CACHE)
365
218
  def get_cached_processor(parameter: dict, processor_class):
366
219
  """
367
220
  Call this function to get back an instance of a processor.
@@ -374,16 +227,13 @@ def get_cached_processor(parameter: dict, processor_class):
374
227
  Otherwise, an instance of the `:py:class:~ocrd.Processor` is returned.
375
228
  """
376
229
  if processor_class:
377
- dict_params = dict(parameter) if parameter else None
378
- processor = processor_class(None, parameter=dict_params)
379
- processor.setup()
230
+ processor = processor_class(None, parameter=dict(parameter))
380
231
  return processor
381
232
  return None
382
233
 
383
-
384
234
  def get_processor(
385
235
  processor_class,
386
- parameter: dict,
236
+ parameter: Optional[dict] = None,
387
237
  workspace: Workspace = None,
388
238
  page_id: str = None,
389
239
  input_file_grp: List[str] = None,
@@ -391,11 +241,24 @@ def get_processor(
391
241
  instance_caching: bool = False,
392
242
  ):
393
243
  if processor_class:
244
+ if parameter is None:
245
+ parameter = {}
394
246
  if instance_caching:
247
+ global get_cached_processor
248
+ if not hasattr(get_cached_processor, '__wrapped__'):
249
+ # first call: wrap
250
+ if processor_class.max_instances < 0:
251
+ maxsize = config.OCRD_MAX_PROCESSOR_CACHE
252
+ else:
253
+ maxsize = min(config.OCRD_MAX_PROCESSOR_CACHE, processor_class.max_instances)
254
+ # wrapping in call cache
255
+ # wrapping dict into frozendict (from https://github.com/OCR-D/core/pull/884)
256
+ get_cached_processor = freeze_args(lru_cache(maxsize=maxsize)(get_cached_processor))
395
257
  processor = get_cached_processor(parameter, processor_class)
396
258
  else:
259
+ # avoid passing workspace already (deprecated chdir behaviour)
397
260
  processor = processor_class(None, parameter=parameter)
398
- processor.setup()
261
+ # set current processing parameters
399
262
  processor.workspace = workspace
400
263
  processor.page_id = page_id
401
264
  processor.input_file_grp = input_file_grp
ocrd/processor/ocrd_page_result.py CHANGED
@@ -1,15 +1,15 @@
1
1
  from dataclasses import dataclass, field
2
- from typing import List
2
+ from typing import List, Union
3
3
  from ocrd_models.ocrd_page import OcrdPage
4
4
  from PIL.Image import Image
5
5
 
6
- from ocrd_models.ocrd_page_generateds import AlternativeImageType
6
+ from ocrd_models.ocrd_page_generateds import AlternativeImageType, PageType
7
7
 
8
8
  @dataclass
9
9
  class OcrdPageResultImage():
10
10
  pil : Image
11
11
  file_id_suffix : str
12
- alternative_image : AlternativeImageType
12
+ alternative_image : Union[AlternativeImageType, PageType]
13
13
 
14
14
  @dataclass
15
15
  class OcrdPageResult():
ocrd/resolver.py CHANGED
@@ -18,7 +18,6 @@ from ocrd_utils import (
18
18
  )
19
19
  from ocrd.workspace import Workspace
20
20
  from ocrd_models import OcrdMets
21
- from ocrd_models.constants import NAMESPACES as NS
22
21
  from ocrd_models.utils import handle_oai_response
23
22
 
24
23
  class Resolver():
@@ -310,5 +309,3 @@ class Resolver():
310
309
  raise ValueError("--mets '%s' has a directory part inconsistent with --directory '%s'" % (mets_url, directory))
311
310
 
312
311
  return str(Path(directory).resolve()), str(mets_url), str(mets_basename), mets_server_url
313
-
314
-
ocrd/resource_manager.py CHANGED
@@ -1,6 +1,6 @@
1
1
  from pathlib import Path
2
2
  from os.path import join
3
- from os import environ, listdir, makedirs, getcwd, path, unlink
3
+ from os import environ, listdir, getcwd, unlink
4
4
  from shutil import copytree, rmtree, copy
5
5
  from fnmatch import filter as apply_glob
6
6
  from datetime import datetime
@@ -13,14 +13,18 @@ from gdown.parse_url import parse_url as gparse_url
13
13
  from gdown.download import get_url_from_gdrive_confirmation
14
14
  from yaml import safe_load, safe_dump
15
15
 
16
+ # pylint: disable=wrong-import-position
17
+
16
18
  # https://github.com/OCR-D/core/issues/867
17
19
  # https://stackoverflow.com/questions/50900727/skip-converting-entities-while-loading-a-yaml-string-using-pyyaml
18
20
  import yaml.constructor
19
- yaml.constructor.SafeConstructor.yaml_constructors[u'tag:yaml.org,2002:timestamp'] = \
20
- yaml.constructor.SafeConstructor.yaml_constructors[u'tag:yaml.org,2002:str']
21
+ yaml.constructor.SafeConstructor.yaml_constructors['tag:yaml.org,2002:timestamp'] = \
22
+ yaml.constructor.SafeConstructor.yaml_constructors['tag:yaml.org,2002:str']
23
+
24
+ # pylint: enable=wrong-import-position
21
25
 
22
26
  from ocrd_validators import OcrdResourceListValidator
23
- from ocrd_utils import getLogger, directory_size, get_moduledir, EXT_TO_MIME, nth_url_segment, guess_media_type, config
27
+ from ocrd_utils import getLogger, directory_size, get_moduledir, guess_media_type, config
24
28
  from ocrd_utils.os import get_processor_resource_types, list_all_resources, pushd_popd, get_ocrd_tool_json
25
29
  from .constants import RESOURCE_LIST_FILENAME, RESOURCE_USER_LIST_COMMENT
26
30
 
@@ -248,7 +252,7 @@ class OcrdResourceManager:
248
252
  if "Content-Disposition" not in r.headers:
249
253
  url = get_url_from_gdrive_confirmation(r.text)
250
254
  except RuntimeError as e:
251
- log.warning("Cannot unwrap Google Drive URL: ", e)
255
+ log.warning("Cannot unwrap Google Drive URL: %s", e)
252
256
  with open(filename, 'wb') as f:
253
257
  with requests.get(url, stream=True) as r:
254
258
  r.raise_for_status()
ocrd/workspace.py CHANGED
@@ -1,7 +1,7 @@
1
1
  import io
2
2
  from os import makedirs, unlink, listdir, path
3
3
  from pathlib import Path
4
- from shutil import move, copyfileobj
4
+ from shutil import copyfileobj
5
5
  from re import sub
6
6
  from tempfile import NamedTemporaryFile
7
7
  from contextlib import contextmanager
@@ -43,7 +43,6 @@ from ocrd_utils import (
43
43
  MIME_TO_PIL,
44
44
  MIMETYPE_PAGE,
45
45
  REGEX_PREFIX,
46
- config
47
46
  )
48
47
 
49
48
  from .workspace_backup import WorkspaceBackupManager
@@ -96,8 +95,8 @@ class Workspace():
96
95
  if self.is_remote:
97
96
  mets = ClientSideOcrdMets(mets_server_url, self.directory)
98
97
  if mets.workspace_path != self.directory:
99
- raise ValueError(f"METS server {mets_server_url} workspace directory {mets.workspace_path} differs "
100
- f"from local workspace directory {self.directory}. These are not the same workspaces.")
98
+ raise ValueError(f"METS server {mets_server_url} workspace directory '{mets.workspace_path}' differs "
99
+ f"from local workspace directory '{self.directory}'. These are not the same workspaces.")
101
100
  else:
102
101
  mets = OcrdMets(filename=self.mets_target)
103
102
  self.mets = mets
@@ -111,7 +110,7 @@ class Workspace():
111
110
 
112
111
  def __repr__(self):
113
112
  return 'Workspace[remote=%s, directory=%s, baseurl=%s, file_groups=%s, files=%s]' % (
114
- not not self.is_remote,
113
+ self.is_remote,
115
114
  self.directory,
116
115
  self.baseurl,
117
116
  self.mets.file_groups,
@@ -648,7 +647,7 @@ class Workspace():
648
647
  log = getLogger('ocrd.workspace.image_from_page')
649
648
  page_image_info = self.resolve_image_exif(page.imageFilename)
650
649
  page_image = self._resolve_image_as_pil(page.imageFilename)
651
- page_coords = dict()
650
+ page_coords = {}
652
651
  # use identity as initial affine coordinate transform:
653
652
  page_coords['transform'] = np.eye(3)
654
653
  # interim bbox (updated with each change to the transform):
@@ -1091,7 +1090,7 @@ class Workspace():
1091
1090
  The (absolute) path of the created file.
1092
1091
  """
1093
1092
  log = getLogger('ocrd.workspace.save_image_file')
1094
- saveargs = dict()
1093
+ saveargs = {}
1095
1094
  if 'dpi' in image.info:
1096
1095
  saveargs['dpi'] = image.info['dpi']
1097
1096
  image_bytes = io.BytesIO()
@@ -1168,9 +1167,9 @@ def _reflect(log, name, orientation, segment_image, segment_coords, segment_xywh
1168
1167
  # Transpose in affine coordinate transform:
1169
1168
  # (consistent with image transposition or AlternativeImage below)
1170
1169
  transposition = {
1171
- 90: Image.ROTATE_90,
1172
- 180: Image.ROTATE_180,
1173
- 270: Image.ROTATE_270
1170
+ 90: Image.Transpose.ROTATE_90,
1171
+ 180: Image.Transpose.ROTATE_180,
1172
+ 270: Image.Transpose.ROTATE_270
1174
1173
  }.get(orientation) # no default
1175
1174
  segment_coords['transform'] = transpose_coordinates(
1176
1175
  segment_coords['transform'], transposition,
@@ -1238,5 +1237,5 @@ def _scale(log, name, factor, segment_image, segment_coords, segment_xywh, **kwa
1238
1237
  segment_image = segment_image.resize((int(segment_image.width * factor),
1239
1238
  int(segment_image.height * factor)),
1240
1239
  # slowest, but highest quality:
1241
- Image.BICUBIC)
1240
+ Image.Resampling.BICUBIC)
1242
1241
  return segment_image, segment_coords, segment_xywh
ocrd/workspace_backup.py CHANGED
@@ -1,6 +1,6 @@
1
1
  from datetime import datetime
2
2
  from os import makedirs
3
- from os.path import join, basename, getsize, abspath
3
+ from os.path import join, basename, getsize
4
4
  from glob import glob
5
5
  from shutil import copy
6
6
  import hashlib
{ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ocrd
3
- Version: 3.0.0a2
3
+ Version: 3.0.0b2
4
4
  Summary: OCR-D framework
5
5
  Author-email: Konstantin Baierer <unixprog@gmail.com>
6
6
  License: Apache License 2.0
@@ -94,17 +94,12 @@ complete stack of OCR-D-related software.
94
94
 
95
95
  The easiest way to install is via `pip`:
96
96
 
97
- ```sh
98
- pip install ocrd
97
+ pip install ocrd
99
98
 
100
- # or just the functionality you need, e.g.
101
-
102
- pip install ocrd_modelfactory
103
- ```
104
99
 
105
100
  All Python software released by [OCR-D](https://github.com/OCR-D) requires Python 3.8 or higher.
106
101
 
107
- **NOTE** Some OCR-D-Tools (or even test cases) _might_ reveal an unintended behavior if you have specific environment modifications, like:
102
+ > **NOTE** Some OCR-D tools (or even test cases) _might_ reveal an unintended behavior if you have specific environment modifications, like:
108
103
  * using a custom build of [ImageMagick](https://github.com/ImageMagick/ImageMagick), whose format delegates are different from what OCR-D supposes
109
104
  * custom Python logging configurations in your personal account
110
105
 
@@ -129,7 +124,6 @@ Almost all behaviour of the OCR-D/core software is configured via CLI options an
129
124
 
130
125
  Some parts of the software are configured via environment variables:
131
126
 
132
- * `OCRD_METS_CACHING`: If set to `true`, access to the METS file is cached, speeding in-memory search and modification.
133
127
  * `OCRD_PROFILE`: This variable configures the built-in CPU and memory profiling. If empty, no profiling is done. Otherwise expected to contain any of the following tokens:
134
128
  * `CPU`: Enable CPU profiling of processor runs
135
129
  * `RSS`: Enable RSS memory profiling
@@ -142,18 +136,46 @@ Some parts of the software are configured via environment variables:
142
136
  * `XDG_CONFIG_HOME`: Directory to look for `./ocrd/resources.yml` (i.e. `ocrd resmgr` user database) – defaults to `$HOME/.config`.
143
137
  * `XDG_DATA_HOME`: Directory to look for `./ocrd-resources/*` (i.e. `ocrd resmgr` data location) – defaults to `$HOME/.local/share`.
144
138
 
145
- * `OCRD_DOWNLOAD_RETRIES`: Number of times to retry failed attempts for downloads of workspace files.
139
+ * `OCRD_DOWNLOAD_RETRIES`: Number of times to retry failed attempts for downloads of resources or workspace files.
146
140
  * `OCRD_DOWNLOAD_TIMEOUT`: Timeout in seconds for connecting or reading (comma-separated) when downloading.
147
141
 
142
+ * `OCRD_MISSING_INPUT`: How to deal with missing input files (for some fileGrp/pageId) during processing:
143
+ * `SKIP`: ignore and proceed with next page's input
144
+ * `ABORT`: throw `MissingInputFile` exception
145
+
146
+ * `OCRD_MISSING_OUTPUT`: How to deal with missing output files (for some fileGrp/pageId) during processing:
147
+ * `SKIP`: ignore and proceed processing next page
148
+ * `COPY`: fall back to copying input PAGE to output fileGrp for page
149
+ * `ABORT`: re-throw whatever caused processing to fail
150
+
151
+ * `OCRD_MAX_MISSING_OUTPUTS`: Maximal rate of skipped/fallback pages among all processed pages before aborting (decimal fraction, ignored if negative).
152
+
153
+ * `OCRD_EXISTING_OUTPUT`: How to deal with already existing output files (for some fileGrp/pageId) during processing:
154
+ * `SKIP`: ignore and proceed processing next page
155
+ * `OVERWRITE`: force writing result to output fileGrp for page
156
+ * `ABORT`: re-throw `FileExistsError` exception
157
+
158
+
148
159
  * `OCRD_METS_CACHING`: Whether to enable in-memory storage of OcrdMets data structures for speedup during processing or workspace operations.
149
160
 
150
161
  * `OCRD_MAX_PROCESSOR_CACHE`: Maximum number of processor instances (for each set of parameters) to be kept in memory (including loaded models) for processing workers or processor servers.
151
162
 
163
+ * `OCRD_MAX_PARALLEL_PAGES`: Maximum number of processor threads for page-parallel processing (within each Processor's selected page range, independent of the number of Processing Workers or Processor Servers). If set `>1`, then a METS Server must be used for METS synchronisation.
164
+
165
+ * `OCRD_PROCESSING_PAGE_TIMEOUT`: Timeout in seconds for processing a single page. If set >0, when exceeded, the same as OCRD_MISSING_OUTPUT applies.
166
+
152
167
  * `OCRD_NETWORK_SERVER_ADDR_PROCESSING`: Default address of Processing Server to connect to (for `ocrd network client processing`).
153
168
  * `OCRD_NETWORK_SERVER_ADDR_WORKFLOW`: Default address of Workflow Server to connect to (for `ocrd network client workflow`).
154
169
  * `OCRD_NETWORK_SERVER_ADDR_WORKSPACE`: Default address of Workspace Server to connect to (for `ocrd network client workspace`).
155
170
  * `OCRD_NETWORK_RABBITMQ_CLIENT_CONNECT_ATTEMPTS`: Number of attempts for a worker to create its queue. Helpful if the rabbitmq-server needs time to be fully started.
156
171
 
172
+ * `OCRD_NETWORK_CLIENT_POLLING_SLEEP`: How many seconds to sleep before trying `ocrd network client` again.
173
+ * `OCRD_NETWORK_CLIENT_POLLING_TIMEOUT`: Timeout for a blocking `ocrd network client` (in seconds).
174
+
175
+ * `OCRD_NETWORK_SOCKETS_ROOT_DIR`: The root directory where all mets server related socket files are created.
176
+ * `OCRD_NETWORK_LOGS_ROOT_DIR`: The root directory where all ocrd_network related file logs are stored.
177
+
178
+
157
179
 
158
180
  ## Packages
159
181