ocrd 3.5.1__py3-none-any.whl → 3.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93) hide show
  1. ocrd/cli/__init__.py +8 -6
  2. ocrd/cli/bashlib.py +8 -114
  3. ocrd/cli/network.py +0 -2
  4. ocrd/cli/ocrd_tool.py +26 -4
  5. ocrd/cli/process.py +1 -0
  6. ocrd/cli/resmgr.py +0 -1
  7. ocrd/cli/validate.py +32 -13
  8. ocrd/cli/workspace.py +125 -52
  9. ocrd/cli/zip.py +13 -4
  10. ocrd/decorators/__init__.py +28 -52
  11. ocrd/decorators/loglevel_option.py +4 -0
  12. ocrd/decorators/mets_find_options.py +2 -1
  13. ocrd/decorators/ocrd_cli_options.py +3 -7
  14. ocrd/decorators/parameter_option.py +12 -11
  15. ocrd/mets_server.py +11 -15
  16. ocrd/processor/base.py +88 -71
  17. ocrd/processor/builtin/dummy_processor.py +7 -4
  18. ocrd/processor/builtin/filter_processor.py +3 -2
  19. ocrd/processor/helpers.py +5 -6
  20. ocrd/processor/ocrd_page_result.py +7 -5
  21. ocrd/resolver.py +42 -32
  22. ocrd/task_sequence.py +11 -4
  23. ocrd/workspace.py +64 -54
  24. ocrd/workspace_backup.py +3 -0
  25. ocrd/workspace_bagger.py +15 -8
  26. {ocrd-3.5.1.dist-info → ocrd-3.7.0.dist-info}/METADATA +2 -8
  27. ocrd-3.7.0.dist-info/RECORD +123 -0
  28. ocrd_modelfactory/__init__.py +4 -2
  29. ocrd_models/constants.py +18 -1
  30. ocrd_models/ocrd_agent.py +1 -1
  31. ocrd_models/ocrd_exif.py +7 -3
  32. ocrd_models/ocrd_file.py +24 -19
  33. ocrd_models/ocrd_mets.py +90 -67
  34. ocrd_models/ocrd_page.py +17 -13
  35. ocrd_models/ocrd_xml_base.py +1 -0
  36. ocrd_models/report.py +2 -1
  37. ocrd_models/utils.py +4 -3
  38. ocrd_models/xpath_functions.py +3 -1
  39. ocrd_network/__init__.py +1 -2
  40. ocrd_network/cli/__init__.py +0 -2
  41. ocrd_network/cli/client.py +122 -50
  42. ocrd_network/cli/processing_server.py +1 -2
  43. ocrd_network/client.py +2 -2
  44. ocrd_network/client_utils.py +30 -13
  45. ocrd_network/constants.py +1 -6
  46. ocrd_network/database.py +3 -3
  47. ocrd_network/logging_utils.py +2 -7
  48. ocrd_network/models/__init__.py +0 -2
  49. ocrd_network/models/job.py +31 -33
  50. ocrd_network/models/messages.py +3 -2
  51. ocrd_network/models/workspace.py +5 -5
  52. ocrd_network/process_helpers.py +54 -17
  53. ocrd_network/processing_server.py +63 -114
  54. ocrd_network/processing_worker.py +6 -5
  55. ocrd_network/rabbitmq_utils/__init__.py +2 -0
  56. ocrd_network/rabbitmq_utils/helpers.py +24 -7
  57. ocrd_network/runtime_data/__init__.py +1 -2
  58. ocrd_network/runtime_data/deployer.py +12 -85
  59. ocrd_network/runtime_data/hosts.py +61 -130
  60. ocrd_network/runtime_data/network_agents.py +7 -31
  61. ocrd_network/runtime_data/network_services.py +1 -1
  62. ocrd_network/server_cache.py +1 -1
  63. ocrd_network/server_utils.py +13 -52
  64. ocrd_network/utils.py +1 -0
  65. ocrd_utils/__init__.py +4 -4
  66. ocrd_utils/config.py +86 -76
  67. ocrd_utils/deprecate.py +3 -0
  68. ocrd_utils/image.py +51 -23
  69. ocrd_utils/introspect.py +8 -3
  70. ocrd_utils/logging.py +15 -7
  71. ocrd_utils/os.py +17 -4
  72. ocrd_utils/str.py +32 -16
  73. ocrd_validators/json_validator.py +4 -1
  74. ocrd_validators/ocrd_tool_validator.py +2 -1
  75. ocrd_validators/ocrd_zip_validator.py +5 -4
  76. ocrd_validators/page_validator.py +21 -9
  77. ocrd_validators/parameter_validator.py +3 -2
  78. ocrd_validators/processing_server_config.schema.yml +1 -33
  79. ocrd_validators/resource_list_validator.py +3 -1
  80. ocrd_validators/workspace_validator.py +30 -20
  81. ocrd_validators/xsd_mets_validator.py +2 -1
  82. ocrd_validators/xsd_page_validator.py +2 -1
  83. ocrd_validators/xsd_validator.py +4 -2
  84. ocrd/cli/log.py +0 -51
  85. ocrd/lib.bash +0 -317
  86. ocrd-3.5.1.dist-info/RECORD +0 -128
  87. ocrd_network/cli/processor_server.py +0 -31
  88. ocrd_network/models/ocrd_tool.py +0 -12
  89. ocrd_network/processor_server.py +0 -255
  90. {ocrd-3.5.1.dist-info → ocrd-3.7.0.dist-info}/LICENSE +0 -0
  91. {ocrd-3.5.1.dist-info → ocrd-3.7.0.dist-info}/WHEEL +0 -0
  92. {ocrd-3.5.1.dist-info → ocrd-3.7.0.dist-info}/entry_points.txt +0 -0
  93. {ocrd-3.5.1.dist-info → ocrd-3.7.0.dist-info}/top_level.txt +0 -0
ocrd/processor/base.py CHANGED
@@ -16,7 +16,7 @@ import json
16
16
  import os
17
17
  from os import getcwd
18
18
  from pathlib import Path
19
- from typing import Any, Dict, List, Optional, Tuple, Union, get_args
19
+ from typing import Dict, List, Optional, Tuple, Union, get_args
20
20
  import sys
21
21
  import logging
22
22
  import logging.handlers
@@ -68,7 +68,7 @@ from ocrd_modelfactory import page_from_file
68
68
  from ocrd_validators.ocrd_tool_validator import OcrdToolValidator
69
69
 
70
70
  # XXX imports must remain for backwards-compatibility
71
- from .helpers import run_cli, run_processor # pylint: disable=unused-import
71
+ from .helpers import run_cli, run_processor # pylint: disable=unused-import
72
72
 
73
73
 
74
74
  class ResourceNotFoundError(FileNotFoundError):
@@ -83,6 +83,7 @@ class ResourceNotFoundError(FileNotFoundError):
83
83
  f"Try 'ocrd resmgr download {executable} {name}' to download this resource.")
84
84
  super().__init__(self.message)
85
85
 
86
+
86
87
  class NonUniqueInputFile(ValueError):
87
88
  """
88
89
  An exception signifying the specified fileGrp / pageId / mimetype
@@ -97,6 +98,7 @@ class NonUniqueInputFile(ValueError):
97
98
  f"and pageId {pageId} under mimetype {mimetype or 'PAGE+image(s)'}")
98
99
  super().__init__(self.message)
99
100
 
101
+
100
102
  class MissingInputFile(ValueError):
101
103
  """
102
104
  An exception signifying the specified fileGrp / pageId / mimetype
@@ -111,6 +113,7 @@ class MissingInputFile(ValueError):
111
113
  f"and pageId {pageId} under mimetype {mimetype or 'PAGE+image(s)'}")
112
114
  super().__init__(self.message)
113
115
 
116
+
114
117
  class DummyFuture:
115
118
  """
116
119
  Mimics some of `concurrent.futures.Future` but runs immediately.
@@ -119,8 +122,11 @@ class DummyFuture:
119
122
  self.fn = fn
120
123
  self.args = args
121
124
  self.kwargs = kwargs
125
+
122
126
  def result(self):
123
127
  return self.fn(*self.args, **self.kwargs)
128
+
129
+
124
130
  class DummyExecutor:
125
131
  """
126
132
  Mimics some of `concurrent.futures.ProcessPoolExecutor` but runs
@@ -128,15 +134,19 @@ class DummyExecutor:
128
134
  """
129
135
  def __init__(self, initializer=None, initargs=(), **kwargs):
130
136
  initializer(*initargs)
137
+
131
138
  def shutdown(self, **kwargs):
132
139
  # allow gc to catch processor instance (unless cached)
133
140
  _page_worker_set_ctxt(None, None)
141
+
134
142
  def submit(self, fn, *args, **kwargs) -> DummyFuture:
135
143
  return DummyFuture(fn, *args, **kwargs)
136
144
 
145
+
137
146
  TFuture = Union[DummyFuture, Future]
138
147
  TExecutor = Union[DummyExecutor, ProcessPoolExecutor]
139
148
 
149
+
140
150
  class Processor():
141
151
  """
142
152
  A processor is a tool that implements the uniform OCR-D
@@ -149,7 +159,7 @@ class Processor():
149
159
  parameters.
150
160
  """
151
161
 
152
- max_instances : int = -1
162
+ max_instances: int = -1
153
163
  """
154
164
  maximum number of cached instances (ignored if negative), to be applied on top of
155
165
  :py:data:`~ocrd_utils.config.OCRD_MAX_PROCESSOR_CACHE` (i.e. whatever is smaller).
@@ -157,7 +167,7 @@ class Processor():
157
167
  (Override this if you know how many instances fit into memory - GPU / CPU RAM - at once.)
158
168
  """
159
169
 
160
- max_workers : int = -1
170
+ max_workers: int = -1
161
171
  """
162
172
  maximum number of processor forks for page-parallel processing (ignored if negative),
163
173
  to be applied on top of :py:data:`~ocrd_utils.config.OCRD_MAX_PARALLEL_PAGES` (i.e.
@@ -167,7 +177,7 @@ class Processor():
167
177
  - at once, or if your class already creates threads prior to forking, e.g. during ``setup``.)
168
178
  """
169
179
 
170
- max_page_seconds : int = -1
180
+ max_page_seconds: int = -1
171
181
  """
172
182
  maximum number of seconds may be spent processing a single page (ignored if negative),
173
183
  to be applied on top of :py:data:`~ocrd_utils.config.OCRD_PROCESSING_PAGE_TIMEOUT`
@@ -284,7 +294,7 @@ class Processor():
284
294
  return None
285
295
 
286
296
  @parameter.setter
287
- def parameter(self, parameter : dict) -> None:
297
+ def parameter(self, parameter: dict) -> None:
288
298
  if self.parameter is not None:
289
299
  self.shutdown()
290
300
  parameterValidator = ParameterValidator(self.ocrd_tool)
@@ -299,7 +309,7 @@ class Processor():
299
309
  def __init__(
300
310
  self,
301
311
  # FIXME: remove in favor of process_workspace(workspace)
302
- workspace : Optional[Workspace],
312
+ workspace: Optional[Workspace],
303
313
  ocrd_tool=None,
304
314
  parameter=None,
305
315
  input_file_grp=None,
@@ -365,8 +375,10 @@ class Processor():
365
375
  if parameter is not None:
366
376
  self.parameter = parameter
367
377
  # workaround for deprecated#72 (@deprecated decorator does not work for subclasses):
368
- setattr(self, 'process',
369
- deprecated(version='3.0', reason='process() should be replaced with process_page_pcgts() or process_page_file() or process_workspace()')(getattr(self, 'process')))
378
+ setattr(self, 'process', deprecated(
379
+ version='3.0', reason='process() should be replaced '
380
+ 'with process_page_pcgts() or process_page_file() or process_workspace()')(
381
+ getattr(self, 'process')))
370
382
 
371
383
  def __del__(self):
372
384
  self._base_logger.debug("shutting down %s in %s", repr(self), mp.current_process().name)
@@ -394,7 +406,8 @@ class Processor():
394
406
  assert self.output_file_grp is not None
395
407
  input_file_grps = self.input_file_grp.split(',')
396
408
  output_file_grps = self.output_file_grp.split(',')
397
- def assert_file_grp_cardinality(grps : List[str], spec : Union[int, List[int]], msg):
409
+
410
+ def assert_file_grp_cardinality(grps: List[str], spec: Union[int, List[int]], msg):
398
411
  if isinstance(spec, int):
399
412
  if spec > 0:
400
413
  assert len(grps) == spec, msg % (len(grps), str(spec))
@@ -418,10 +431,10 @@ class Processor():
418
431
  assert input_file_grp in self.workspace.mets.file_groups, \
419
432
  f"input fileGrp {input_file_grp} does not exist in workspace {self.workspace}"
420
433
  for output_file_grp in output_file_grps:
421
- assert output_file_grp not in self.workspace.mets.file_groups \
422
- or config.OCRD_EXISTING_OUTPUT in ['OVERWRITE', 'SKIP'] \
423
- or not any(self.workspace.mets.find_files(
424
- pageId=self.page_id, fileGrp=output_file_grp)), \
434
+ assert (output_file_grp not in self.workspace.mets.file_groups
435
+ or config.OCRD_EXISTING_OUTPUT in ['OVERWRITE', 'SKIP']
436
+ or not any(self.workspace.mets.find_files(
437
+ pageId=self.page_id, fileGrp=output_file_grp))), \
425
438
  f"output fileGrp {output_file_grp} already exists in workspace {self.workspace}"
426
439
  # keep this for backwards compatibility:
427
440
  return True
@@ -465,7 +478,8 @@ class Processor():
465
478
  """
466
479
  pass
467
480
 
468
- @deprecated(version='3.0', reason='process() should be replaced with process_page_pcgts() or process_page_file() or process_workspace()')
481
+ @deprecated(version='3.0', reason='process() should be replaced '
482
+ 'with process_page_pcgts() or process_page_file() or process_workspace()')
469
483
  def process(self) -> None:
470
484
  """
471
485
  Process all files of the :py:data:`workspace`
@@ -528,7 +542,8 @@ class Processor():
528
542
  )
529
543
  if max_workers > 1:
530
544
  # forward messages from log queue (in subprocesses) to all root handlers
531
- log_listener = logging.handlers.QueueListener(log_queue, *logging.root.handlers, respect_handler_level=True)
545
+ log_listener = logging.handlers.QueueListener(log_queue, *logging.root.handlers,
546
+ respect_handler_level=True)
532
547
  log_listener.start()
533
548
  tasks = None
534
549
  try:
@@ -553,7 +568,8 @@ class Processor():
553
568
  # suppress the NotImplementedError context
554
569
  raise err from None
555
570
 
556
- def process_workspace_submit_tasks(self, executor : TExecutor, max_seconds : int) -> Dict[TFuture, Tuple[str, List[Optional[OcrdFileType]]]]:
571
+ def process_workspace_submit_tasks(self, executor: TExecutor, max_seconds: int) -> Dict[
572
+ TFuture, Tuple[str, List[Optional[OcrdFileType]]]]:
557
573
  """
558
574
  Look up all input files of the given ``workspace``
559
575
  from the given :py:data:`input_file_grp`
@@ -571,7 +587,7 @@ class Processor():
571
587
  Otherwise, tasks are run sequentially in the
572
588
  current process.
573
589
 
574
- Delegates to :py:meth:`.zip_input_files` to get
590
+ Delegates to :py:meth:`.zip_input_files` to get
575
591
  the input files for each page, and then calls
576
592
  :py:meth:`.process_workspace_submit_page_task`.
577
593
 
@@ -586,7 +602,9 @@ class Processor():
586
602
  self._base_logger.debug("submitted %d processing tasks", len(tasks))
587
603
  return tasks
588
604
 
589
- def process_workspace_submit_page_task(self, executor : TExecutor, max_seconds : int, input_file_tuple : List[Optional[OcrdFileType]]) -> Tuple[TFuture, str, List[Optional[OcrdFileType]]]:
605
+ def process_workspace_submit_page_task(self, executor: TExecutor, max_seconds: int,
606
+ input_file_tuple: List[Optional[OcrdFileType]]) -> Tuple[
607
+ TFuture, str, List[Optional[OcrdFileType]]]:
590
608
  """
591
609
  Ensure all input files for a single page are
592
610
  downloaded to the workspace, then schedule
@@ -604,7 +622,7 @@ class Processor():
604
622
  - the corresponding pageId,
605
623
  - the corresponding input files.
606
624
  """
607
- input_files : List[Optional[OcrdFileType]] = [None] * len(input_file_tuple)
625
+ input_files: List[Optional[OcrdFileType]] = [None] * len(input_file_tuple)
608
626
  page_id = next(input_file.pageId
609
627
  for input_file in input_file_tuple
610
628
  if input_file)
@@ -625,7 +643,8 @@ class Processor():
625
643
  #executor.submit(self.process_page_file, *input_files)
626
644
  return executor.submit(_page_worker, max_seconds, *input_files), page_id, input_files
627
645
 
628
- def process_workspace_handle_tasks(self, tasks : Dict[TFuture, Tuple[str, List[Optional[OcrdFileType]]]]) -> Tuple[int, int, Dict[str, int], int]:
646
+ def process_workspace_handle_tasks(self, tasks: Dict[TFuture, Tuple[str, List[Optional[OcrdFileType]]]]) -> Tuple[
647
+ int, int, Dict[str, int], int]:
629
648
  """
630
649
  Look up scheduled per-page futures one by one,
631
650
  handle errors (exceptions) and gather results.
@@ -650,7 +669,7 @@ class Processor():
650
669
  # aggregate info for logging:
651
670
  nr_succeeded = 0
652
671
  nr_failed = 0
653
- nr_errors = defaultdict(int) # count causes
672
+ nr_errors = defaultdict(int) # count causes
654
673
  if config.OCRD_MISSING_OUTPUT == 'SKIP':
655
674
  reason = "skipped"
656
675
  elif config.OCRD_MISSING_OUTPUT == 'COPY':
@@ -666,7 +685,8 @@ class Processor():
666
685
  if config.OCRD_MAX_MISSING_OUTPUTS > 0 and nr_failed / len(tasks) > config.OCRD_MAX_MISSING_OUTPUTS:
667
686
  # already irredeemably many failures, stop short
668
687
  nr_errors = dict(nr_errors)
669
- raise Exception(f"too many failures with {reason} output ({nr_failed} of {nr_failed+nr_succeeded}, {str(nr_errors)})")
688
+ raise Exception(f"too many failures with {reason} output ({nr_failed} of {nr_failed+nr_succeeded}, "
689
+ f"{str(nr_errors)})")
670
690
  elif result:
671
691
  nr_succeeded += 1
672
692
  # else skipped - already exists
@@ -676,13 +696,15 @@ class Processor():
676
696
  if config.OCRD_MAX_MISSING_OUTPUTS > 0 and nr_failed / nr_all > config.OCRD_MAX_MISSING_OUTPUTS:
677
697
  raise Exception(f"too many failures with {reason} output ({nr_failed} of {nr_all}, {str(nr_errors)})")
678
698
  self._base_logger.warning("%s %d of %d pages due to %s", reason, nr_failed, nr_all, str(nr_errors))
679
- self._base_logger.debug("succeeded %d, missed %d of %d pages due to %s", nr_succeeded, nr_failed, nr_all, str(nr_errors))
699
+ self._base_logger.debug("succeeded %d, missed %d of %d pages due to %s",
700
+ nr_succeeded, nr_failed, nr_all, str(nr_errors))
680
701
  return nr_succeeded, nr_failed, nr_errors, len(tasks)
681
702
 
682
- def process_workspace_handle_page_task(self, page_id : str, input_files : List[Optional[OcrdFileType]], task : TFuture) -> Union[bool, Exception]:
703
+ def process_workspace_handle_page_task(self, page_id: str, input_files: List[Optional[OcrdFileType]],
704
+ task: TFuture) -> Union[bool, Exception]:
683
705
  """
684
706
  \b
685
- Await a single page result and handle errors (exceptions),
707
+ Await a single page result and handle errors (exceptions),
686
708
  enforcing policies configured by the following
687
709
  environment variables:
688
710
  - `OCRD_EXISTING_OUTPUT` (abort/skip/overwrite)
@@ -738,14 +760,14 @@ class Processor():
738
760
  raise ValueError(f"unknown configuration value {config.OCRD_MISSING_OUTPUT} - {desc}")
739
761
  return err
740
762
 
741
- def _copy_page_file(self, input_file : OcrdFileType) -> None:
763
+ def _copy_page_file(self, input_file: OcrdFileType) -> None:
742
764
  """
743
765
  Copy the given ``input_file`` of the :py:data:`workspace`,
744
766
  representing one physical page (passed as one opened
745
767
  :py:class:`~ocrd_models.OcrdFile` per input fileGrp)
746
768
  and add it as if it was a processing result.
747
769
  """
748
- input_pcgts : OcrdPage
770
+ input_pcgts: OcrdPage
749
771
  assert isinstance(input_file, get_args(OcrdFileType))
750
772
  self._base_logger.debug(f"parsing file {input_file.ID} for page {input_file.pageId}")
751
773
  try:
@@ -766,7 +788,7 @@ class Processor():
766
788
  content=to_xml(input_pcgts),
767
789
  )
768
790
 
769
- def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None:
791
+ def process_page_file(self, *input_files: Optional[OcrdFileType]) -> None:
770
792
  """
771
793
  Process the given ``input_files`` of the :py:data:`workspace`,
772
794
  representing one physical page (passed as one opened
@@ -777,7 +799,7 @@ class Processor():
777
799
  (This uses :py:meth:`.process_page_pcgts`, but should be overridden by subclasses
778
800
  to handle cases like multiple output fileGrps, non-PAGE input etc.)
779
801
  """
780
- input_pcgts : List[Optional[OcrdPage]] = [None] * len(input_files)
802
+ input_pcgts: List[Optional[OcrdPage]] = [None] * len(input_files)
781
803
  input_pos = next(i for i, input_file in enumerate(input_files) if input_file is not None)
782
804
  page_id = input_files[input_pos].pageId
783
805
  self._base_logger.info("processing page %s", page_id)
@@ -827,7 +849,7 @@ class Processor():
827
849
  elif isinstance(image_result.alternative_image, AlternativeImageType):
828
850
  image_result.alternative_image.set_filename(image_file_path)
829
851
  elif image_result.alternative_image is None:
830
- pass # do not reference in PAGE result
852
+ pass # do not reference in PAGE result
831
853
  else:
832
854
  raise ValueError(f"process_page_pcgts returned an OcrdPageResultImage of unknown type "
833
855
  f"{type(image_result.alternative_image)}")
@@ -849,7 +871,7 @@ class Processor():
849
871
  content=to_xml(result.pcgts),
850
872
  )
851
873
 
852
- def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult:
874
+ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult:
853
875
  """
854
876
  Process the given ``input_pcgts`` of the :py:data:`.workspace`,
855
877
  representing one physical page (passed as one parsed
@@ -876,24 +898,25 @@ class Processor():
876
898
  """
877
899
  metadata_obj = pcgts.get_Metadata()
878
900
  assert metadata_obj is not None
879
- metadata_obj.add_MetadataItem(
880
- MetadataItemType(type_="processingStep",
881
- name=self.ocrd_tool['steps'][0],
882
- value=self.ocrd_tool['executable'],
883
- Labels=[LabelsType(
884
- externalModel="ocrd-tool",
885
- externalId="parameters",
886
- Label=[LabelType(type_=name,
887
- value=self.parameter[name])
888
- for name in self.parameter.keys()]),
889
- LabelsType(
901
+ metadata_item = MetadataItemType(
902
+ type_="processingStep",
903
+ name=self.ocrd_tool['steps'][0],
904
+ value=self.ocrd_tool['executable'],
905
+ Labels=[LabelsType(
906
+ externalModel="ocrd-tool",
907
+ externalId="parameters",
908
+ Label=[LabelType(type_=name,
909
+ value=self.parameter[name])
910
+ for name in self.parameter.keys()]),
911
+ LabelsType(
890
912
  externalModel="ocrd-tool",
891
913
  externalId="version",
892
914
  Label=[LabelType(type_=self.ocrd_tool['executable'],
893
915
  value=self.version),
894
916
  LabelType(type_='ocrd/core',
895
917
  value=OCRD_VERSION)])
896
- ]))
918
+ ])
919
+ metadata_obj.add_MetadataItem(metadata_item)
897
920
 
898
921
  def resolve_resource(self, val):
899
922
  """
@@ -948,8 +971,8 @@ class Processor():
948
971
  mimetypes = get_processor_resource_types(None, self.ocrd_tool)
949
972
  for res in list_all_resources(self.ocrd_tool['executable'], moduled=self.moduledir):
950
973
  res = Path(res)
951
- if not '*/*' in mimetypes:
952
- if res.is_dir() and not 'text/directory' in mimetypes:
974
+ if '*/*' not in mimetypes:
975
+ if res.is_dir() and 'text/directory' not in mimetypes:
953
976
  continue
954
977
  # if we do not know all MIME types, then keep the file, otherwise require suffix match
955
978
  if res.is_file() and not any(res.suffix == MIME_TO_EXT.get(mime, res.suffix)
@@ -1070,16 +1093,18 @@ class Processor():
1070
1093
  continue
1071
1094
  ift = pages.setdefault(file_.pageId, [None]*len(ifgs))
1072
1095
  if ift[i]:
1073
- self._base_logger.debug(f"another file {file_.ID} for page {file_.pageId} in input file group {ifg}")
1096
+ self._base_logger.debug(
1097
+ f"another file {file_.ID} for page {file_.pageId} in input file group {ifg}")
1074
1098
  # fileGrp has multiple files for this page ID
1075
1099
  if mimetype:
1076
1100
  # filter was active, this must not happen
1077
- self._base_logger.warning(f"added file {file_.ID} for page {file_.pageId} in input file group {ifg} "
1078
- f"conflicts with file {ift[i].ID} of same MIME type {mimetype} - on_error={on_error}")
1101
+ self._base_logger.warning(
1102
+ f"added file {file_.ID} for page {file_.pageId} in input file group {ifg} "
1103
+ f"conflicts with file {ift[i].ID} of same MIME type {mimetype} - on_error={on_error}")
1079
1104
  if on_error == 'skip':
1080
1105
  ift[i] = None
1081
1106
  elif on_error == 'first':
1082
- pass # keep first match
1107
+ pass # keep first match
1083
1108
  elif on_error == 'last':
1084
1109
  ift[i] = file_
1085
1110
  elif on_error == 'abort':
@@ -1088,18 +1113,19 @@ class Processor():
1088
1113
  raise Exception("Unknown 'on_error' strategy '%s'" % on_error)
1089
1114
  elif (ift[i].mimetype == MIMETYPE_PAGE and
1090
1115
  file_.mimetype != MIMETYPE_PAGE):
1091
- pass # keep PAGE match
1116
+ pass # keep PAGE match
1092
1117
  elif (ift[i].mimetype == MIMETYPE_PAGE and
1093
1118
  file_.mimetype == MIMETYPE_PAGE):
1094
1119
  raise NonUniqueInputFile(ifg, file_.pageId, None)
1095
1120
  else:
1096
1121
  # filter was inactive but no PAGE is in control, this must not happen
1097
- self._base_logger.warning(f"added file {file_.ID} for page {file_.pageId} in input file group {ifg} "
1098
- f"conflicts with file {ift[i].ID} but no PAGE available - on_error={on_error}")
1122
+ self._base_logger.warning(
1123
+ f"added file {file_.ID} for page {file_.pageId} in input file group {ifg} "
1124
+ f"conflicts with file {ift[i].ID} but no PAGE available - on_error={on_error}")
1099
1125
  if on_error == 'skip':
1100
1126
  ift[i] = None
1101
1127
  elif on_error == 'first':
1102
- pass # keep first match
1128
+ pass # keep first match
1103
1129
  elif on_error == 'last':
1104
1130
  ift[i] = file_
1105
1131
  elif on_error == 'abort':
@@ -1133,6 +1159,7 @@ class Processor():
1133
1159
  ifts.append(tuple(ifiles))
1134
1160
  return ifts
1135
1161
 
1162
+
1136
1163
  _page_worker_processor = None
1137
1164
  """
1138
1165
  This global binding for the processor is required to avoid
@@ -1143,6 +1170,8 @@ in Processor.process_workspace. Forking allows inheriting global
1143
1170
  objects, and with the METS Server we do not mutate the local
1144
1171
  processor instance anyway.
1145
1172
  """
1173
+
1174
+
1146
1175
  def _page_worker_set_ctxt(processor, log_queue):
1147
1176
  """
1148
1177
  Overwrites `ocrd.processor.base._page_worker_processor` instance
@@ -1154,6 +1183,7 @@ def _page_worker_set_ctxt(processor, log_queue):
1154
1183
  # replace all log handlers with just one queue handler
1155
1184
  logging.root.handlers = [logging.handlers.QueueHandler(log_queue)]
1156
1185
 
1186
+
1157
1187
  def _page_worker(timeout, *input_files):
1158
1188
  """
1159
1189
  Wraps a `Processor.process_page_file` call as payload (call target)
@@ -1171,6 +1201,7 @@ def _page_worker(timeout, *input_files):
1171
1201
  _page_worker_processor.logger.debug("page worker timed out for page %s", page_id)
1172
1202
  raise
1173
1203
 
1204
+
1174
1205
  def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None):
1175
1206
  """Generate a string describing the full CLI of this processor including params.
1176
1207
 
@@ -1178,7 +1209,7 @@ def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None)
1178
1209
  ocrd_tool (dict): this processor's ``tools`` section of the module's ``ocrd-tool.json``
1179
1210
  processor_instance (object, optional): the processor implementation
1180
1211
  (for adding any module/class/function docstrings)
1181
- subcommand (string): 'worker' or 'server'
1212
+ subcommand (string, optional): 'worker'
1182
1213
  """
1183
1214
  doc_help = ''
1184
1215
  if processor_instance:
@@ -1204,7 +1235,6 @@ def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None)
1204
1235
  preserve_paragraphs=True)
1205
1236
  subcommands = '''\
1206
1237
  worker Start a processing worker rather than do local processing
1207
- server Start a processor server rather than do local processing
1208
1238
  '''
1209
1239
 
1210
1240
  processing_worker_options = '''\
@@ -1219,8 +1249,6 @@ def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None)
1219
1249
  '''
1220
1250
 
1221
1251
  processing_server_options = '''\
1222
- --address The Processor server address in format
1223
- "{host}:{port}"
1224
1252
  --database The MongoDB server address in format
1225
1253
  "mongodb://{host}:{port}"
1226
1254
  [mongodb://localhost:27018]
@@ -1265,8 +1293,8 @@ def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None)
1265
1293
  parameter_help = ' NONE\n'
1266
1294
  else:
1267
1295
  def wrap(s):
1268
- return wrap_text(s, initial_indent=' '*3,
1269
- subsequent_indent=' '*4,
1296
+ return wrap_text(s, initial_indent=' ' * 3,
1297
+ subsequent_indent=' ' * 4,
1270
1298
  width=72, preserve_paragraphs=True)
1271
1299
  for param_name, param in ocrd_tool['parameters'].items():
1272
1300
  parameter_help += wrap('"%s" [%s%s]' % (
@@ -1304,17 +1332,6 @@ Usage: {ocrd_tool['executable']} worker [OPTIONS]
1304
1332
 
1305
1333
  Options:
1306
1334
  {processing_worker_options}
1307
- '''
1308
- elif subcommand == 'server':
1309
- return f'''\
1310
- Usage: {ocrd_tool['executable']} server [OPTIONS]
1311
-
1312
- Run {ocrd_tool['executable']} as a processor sever.
1313
-
1314
- {ocrd_tool['description']}{doc_help}
1315
-
1316
- Options:
1317
- {processing_server_options}
1318
1335
  '''
1319
1336
  else:
1320
1337
  pass
@@ -16,6 +16,7 @@ from ocrd_utils import (
16
16
  )
17
17
  from ocrd_modelfactory import page_from_file
18
18
 
19
+
19
20
  class DummyProcessor(Processor):
20
21
  """
21
22
  Bare-bones processor creates PAGE-XML and optionally copies file from input group to output group
@@ -57,13 +58,14 @@ class DummyProcessor(Processor):
57
58
  page_id=input_file.pageId,
58
59
  local_filename=join(self.output_file_grp, file_id + '.xml'),
59
60
  mimetype=MIMETYPE_PAGE,
60
- content=to_xml(pcgts),
61
- )
61
+ content=to_xml(pcgts))
62
62
  else:
63
63
  if self.parameter['copy_files']:
64
- self.logger.info("Not copying %s because it is a PAGE-XML file, which gets identity-transformed", input_file.local_filename)
64
+ self.logger.info("Not copying %s because it is a PAGE-XML file, which gets identity-transformed",
65
+ input_file.local_filename)
65
66
  else:
66
- self.logger.info("Not copying %s because it is not a PAGE-XML file and copy_files was false", input_file.local_filename)
67
+ self.logger.info("Not copying %s because it is not a PAGE-XML file and copy_files was false",
68
+ input_file.local_filename)
67
69
  # we can rely on base implementation verbatim
68
70
  super().process_page_file(input_file)
69
71
 
@@ -75,6 +77,7 @@ class DummyProcessor(Processor):
75
77
  def executable(self):
76
78
  return 'ocrd-dummy'
77
79
 
80
+
78
81
  @click.command()
79
82
  @ocrd_cli_options
80
83
  def cli(*args, **kwargs):
@@ -1,7 +1,6 @@
1
1
  # pylint: disable=missing-module-docstring,invalid-name
2
2
  from typing import Optional
3
3
 
4
- from lxml import etree
5
4
  import click
6
5
 
7
6
  from ocrd import Processor, OcrdPageResult, OcrdPageResultImage
@@ -29,6 +28,7 @@ _SEGTYPES = [
29
28
  "Glyph"
30
29
  ]
31
30
 
31
+
32
32
  class FilterProcessor(Processor):
33
33
  def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult:
34
34
  """
@@ -57,7 +57,7 @@ class FilterProcessor(Processor):
57
57
  # but allow only hierarchy segments
58
58
  segments = [segment for segment in map(pcgts.revmap.get, nodes)
59
59
  if segment.__class__.__name__.replace('Type', '') in _SEGTYPES]
60
- if not(len(segments)):
60
+ if not len(segments):
61
61
  self.logger.info("no matches")
62
62
  return result
63
63
  rodict = pcgts.get_Page().get_ReadingOrderGroups()
@@ -102,6 +102,7 @@ class FilterProcessor(Processor):
102
102
  def executable(self):
103
103
  return 'ocrd-filter'
104
104
 
105
+
105
106
  @click.command()
106
107
  @ocrd_cli_options
107
108
  def cli(*args, **kwargs):
ocrd/processor/helpers.py CHANGED
@@ -5,7 +5,6 @@ from time import perf_counter, process_time
5
5
  from os import times
6
6
  from functools import lru_cache
7
7
  import json
8
- import inspect
9
8
  from subprocess import run
10
9
  from typing import List, Optional
11
10
 
@@ -28,6 +27,7 @@ def _get_workspace(workspace=None, resolver=None, mets_url=None, working_dir=Non
28
27
  workspace = resolver.workspace_from_url(mets_url, dst_dir=working_dir, mets_server_url=mets_server_url)
29
28
  return workspace
30
29
 
30
+
31
31
  def run_processor(
32
32
  processorClass,
33
33
  mets_url=None,
@@ -41,7 +41,7 @@ def run_processor(
41
41
  working_dir=None,
42
42
  mets_server_url=None,
43
43
  instance_caching=False
44
- ): # pylint: disable=too-many-locals
44
+ ): # pylint: disable=too-many-locals
45
45
  """
46
46
  Instantiate a Pythonic processor, open a workspace, run the processor and save the workspace.
47
47
 
@@ -66,8 +66,7 @@ def run_processor(
66
66
  when a match occurs - as long as the program is being run. They only get deleted (and
67
67
  their resources freed) when as many as :py:data:`~ocrd_utils.config.OCRD_MAX_PROCESSOR_CACHE`
68
68
  instances have already been cached while this particular parameter set was re-used
69
- least frequently. (See :py:class:`~ocrd_network.ProcessingWorker` and
70
- :py:class:`~ocrd_network.ProcessorServer` for use-cases.)
69
+ least frequently. (See :py:class:`~ocrd_network.ProcessingWorker` for use-cases.)
71
70
 
72
71
  Args:
73
72
  processorClass (object): Python class of the module processor.
@@ -104,7 +103,7 @@ def run_processor(
104
103
  t0_os = times()
105
104
  if any(x in config.OCRD_PROFILE for x in ['RSS', 'PSS']):
106
105
  backend = 'psutil_pss' if 'PSS' in config.OCRD_PROFILE else 'psutil'
107
- from memory_profiler import memory_usage # pylint: disable=import-outside-toplevel
106
+ from memory_profiler import memory_usage # pylint: disable=import-outside-toplevel
108
107
  try:
109
108
  mem_usage = memory_usage(proc=(processor.process_workspace, [workspace], {}),
110
109
  # only run process once
@@ -225,7 +224,6 @@ def run_cli(
225
224
  return result.returncode
226
225
 
227
226
 
228
-
229
227
  # not decorated here but at runtime (on first use)
230
228
  #@freeze_args
231
229
  #@lru_cache(maxsize=config.OCRD_MAX_PROCESSOR_CACHE)
@@ -245,6 +243,7 @@ def get_cached_processor(parameter: dict, processor_class):
245
243
  return processor
246
244
  return None
247
245
 
246
+
248
247
  def get_processor(
249
248
  processor_class,
250
249
  parameter: Optional[dict] = None,
@@ -5,13 +5,15 @@ from PIL.Image import Image
5
5
 
6
6
  from ocrd_models.ocrd_page_generateds import AlternativeImageType, PageType
7
7
 
8
+
8
9
  @dataclass
9
10
  class OcrdPageResultImage():
10
- pil : Image
11
- file_id_suffix : str
12
- alternative_image : Optional[Union[AlternativeImageType, PageType]]
11
+ pil: Image
12
+ file_id_suffix: str
13
+ alternative_image: Optional[Union[AlternativeImageType, PageType]]
14
+
13
15
 
14
16
  @dataclass
15
17
  class OcrdPageResult():
16
- pcgts : OcrdPage
17
- images : List[OcrdPageResultImage] = field(default_factory=list)
18
+ pcgts: OcrdPage
19
+ images: List[OcrdPageResultImage] = field(default_factory=list)