ocrd 3.5.1__py3-none-any.whl → 3.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
- ocrd/cli/__init__.py +6 -2
- ocrd/cli/bashlib.py +7 -2
- ocrd/cli/log.py +7 -2
- ocrd/cli/network.py +0 -2
- ocrd/cli/ocrd_tool.py +26 -4
- ocrd/cli/process.py +1 -0
- ocrd/cli/resmgr.py +0 -1
- ocrd/cli/validate.py +32 -13
- ocrd/cli/workspace.py +125 -52
- ocrd/cli/zip.py +13 -4
- ocrd/decorators/__init__.py +28 -52
- ocrd/decorators/loglevel_option.py +4 -0
- ocrd/decorators/mets_find_options.py +2 -1
- ocrd/decorators/ocrd_cli_options.py +3 -7
- ocrd/decorators/parameter_option.py +12 -11
- ocrd/lib.bash +6 -13
- ocrd/mets_server.py +6 -10
- ocrd/processor/base.py +88 -71
- ocrd/processor/builtin/dummy_processor.py +7 -4
- ocrd/processor/builtin/filter_processor.py +3 -2
- ocrd/processor/helpers.py +5 -6
- ocrd/processor/ocrd_page_result.py +7 -5
- ocrd/resolver.py +42 -32
- ocrd/task_sequence.py +11 -4
- ocrd/workspace.py +64 -54
- ocrd/workspace_backup.py +3 -0
- ocrd/workspace_bagger.py +15 -8
- {ocrd-3.5.1.dist-info → ocrd-3.6.0.dist-info}/METADATA +1 -1
- ocrd-3.6.0.dist-info/RECORD +125 -0
- ocrd_modelfactory/__init__.py +4 -2
- ocrd_models/constants.py +18 -1
- ocrd_models/ocrd_agent.py +1 -1
- ocrd_models/ocrd_exif.py +7 -3
- ocrd_models/ocrd_file.py +24 -19
- ocrd_models/ocrd_mets.py +90 -67
- ocrd_models/ocrd_page.py +17 -13
- ocrd_models/ocrd_xml_base.py +1 -0
- ocrd_models/report.py +2 -1
- ocrd_models/utils.py +4 -3
- ocrd_models/xpath_functions.py +3 -1
- ocrd_network/__init__.py +1 -2
- ocrd_network/cli/__init__.py +0 -2
- ocrd_network/cli/client.py +122 -50
- ocrd_network/cli/processing_server.py +1 -2
- ocrd_network/client.py +2 -2
- ocrd_network/client_utils.py +30 -13
- ocrd_network/constants.py +1 -6
- ocrd_network/database.py +3 -3
- ocrd_network/logging_utils.py +2 -7
- ocrd_network/models/__init__.py +0 -2
- ocrd_network/models/job.py +2 -5
- ocrd_network/models/workspace.py +1 -1
- ocrd_network/process_helpers.py +54 -17
- ocrd_network/processing_server.py +63 -114
- ocrd_network/processing_worker.py +6 -5
- ocrd_network/rabbitmq_utils/__init__.py +2 -0
- ocrd_network/rabbitmq_utils/helpers.py +24 -7
- ocrd_network/runtime_data/__init__.py +1 -2
- ocrd_network/runtime_data/deployer.py +12 -85
- ocrd_network/runtime_data/hosts.py +61 -130
- ocrd_network/runtime_data/network_agents.py +7 -31
- ocrd_network/runtime_data/network_services.py +1 -1
- ocrd_network/server_cache.py +1 -1
- ocrd_network/server_utils.py +13 -52
- ocrd_network/utils.py +1 -0
- ocrd_utils/__init__.py +4 -4
- ocrd_utils/config.py +86 -76
- ocrd_utils/deprecate.py +3 -0
- ocrd_utils/image.py +51 -23
- ocrd_utils/introspect.py +8 -3
- ocrd_utils/logging.py +12 -7
- ocrd_utils/os.py +16 -3
- ocrd_utils/str.py +32 -16
- ocrd_validators/json_validator.py +4 -1
- ocrd_validators/ocrd_tool_validator.py +2 -1
- ocrd_validators/ocrd_zip_validator.py +5 -4
- ocrd_validators/page_validator.py +21 -9
- ocrd_validators/parameter_validator.py +3 -2
- ocrd_validators/processing_server_config.schema.yml +1 -33
- ocrd_validators/resource_list_validator.py +3 -1
- ocrd_validators/workspace_validator.py +30 -20
- ocrd_validators/xsd_mets_validator.py +2 -1
- ocrd_validators/xsd_page_validator.py +2 -1
- ocrd_validators/xsd_validator.py +4 -2
- ocrd-3.5.1.dist-info/RECORD +0 -128
- ocrd_network/cli/processor_server.py +0 -31
- ocrd_network/models/ocrd_tool.py +0 -12
- ocrd_network/processor_server.py +0 -255
- {ocrd-3.5.1.dist-info → ocrd-3.6.0.dist-info}/LICENSE +0 -0
- {ocrd-3.5.1.dist-info → ocrd-3.6.0.dist-info}/WHEEL +0 -0
- {ocrd-3.5.1.dist-info → ocrd-3.6.0.dist-info}/entry_points.txt +0 -0
- {ocrd-3.5.1.dist-info → ocrd-3.6.0.dist-info}/top_level.txt +0 -0
ocrd/processor/base.py
CHANGED
```diff
@@ -16,7 +16,7 @@ import json
 import os
 from os import getcwd
 from pathlib import Path
-from typing import
+from typing import Dict, List, Optional, Tuple, Union, get_args
 import sys
 import logging
 import logging.handlers
@@ -68,7 +68,7 @@ from ocrd_modelfactory import page_from_file
 from ocrd_validators.ocrd_tool_validator import OcrdToolValidator
 
 # XXX imports must remain for backwards-compatibility
-from .helpers import run_cli, run_processor
+from .helpers import run_cli, run_processor # pylint: disable=unused-import
 
 
 class ResourceNotFoundError(FileNotFoundError):
@@ -83,6 +83,7 @@ class ResourceNotFoundError(FileNotFoundError):
             f"Try 'ocrd resmgr download {executable} {name}' to download this resource.")
         super().__init__(self.message)
 
+
 class NonUniqueInputFile(ValueError):
     """
    An exception signifying the specified fileGrp / pageId / mimetype
@@ -97,6 +98,7 @@ class NonUniqueInputFile(ValueError):
             f"and pageId {pageId} under mimetype {mimetype or 'PAGE+image(s)'}")
         super().__init__(self.message)
 
+
 class MissingInputFile(ValueError):
     """
     An exception signifying the specified fileGrp / pageId / mimetype
@@ -111,6 +113,7 @@ class MissingInputFile(ValueError):
             f"and pageId {pageId} under mimetype {mimetype or 'PAGE+image(s)'}")
         super().__init__(self.message)
 
+
 class DummyFuture:
     """
     Mimics some of `concurrent.futures.Future` but runs immediately.
@@ -119,8 +122,11 @@ class DummyFuture:
         self.fn = fn
         self.args = args
         self.kwargs = kwargs
+
     def result(self):
         return self.fn(*self.args, **self.kwargs)
+
+
 class DummyExecutor:
     """
     Mimics some of `concurrent.futures.ProcessPoolExecutor` but runs
@@ -128,15 +134,19 @@ class DummyExecutor:
     """
     def __init__(self, initializer=None, initargs=(), **kwargs):
         initializer(*initargs)
+
     def shutdown(self, **kwargs):
         # allow gc to catch processor instance (unless cached)
         _page_worker_set_ctxt(None, None)
+
     def submit(self, fn, *args, **kwargs) -> DummyFuture:
         return DummyFuture(fn, *args, **kwargs)
 
+
 TFuture = Union[DummyFuture, Future]
 TExecutor = Union[DummyExecutor, ProcessPoolExecutor]
 
+
 class Processor():
     """
     A processor is a tool that implements the uniform OCR-D
@@ -149,7 +159,7 @@ class Processor():
     parameters.
     """
 
-    max_instances
+    max_instances: int = -1
     """
     maximum number of cached instances (ignored if negative), to be applied on top of
     :py:data:`~ocrd_utils.config.OCRD_MAX_PROCESSOR_CACHE` (i.e. whatever is smaller).
@@ -157,7 +167,7 @@ class Processor():
     (Override this if you know how many instances fit into memory - GPU / CPU RAM - at once.)
     """
 
-    max_workers
+    max_workers: int = -1
     """
     maximum number of processor forks for page-parallel processing (ignored if negative),
     to be applied on top of :py:data:`~ocrd_utils.config.OCRD_MAX_PARALLEL_PAGES` (i.e.
@@ -167,7 +177,7 @@ class Processor():
     - at once, or if your class already creates threads prior to forking, e.g. during ``setup``.)
     """
 
-    max_page_seconds
+    max_page_seconds: int = -1
     """
     maximum number of seconds may be spent processing a single page (ignored if negative),
     to be applied on top of :py:data:`~ocrd_utils.config.OCRD_PROCESSING_PAGE_TIMEOUT`
@@ -284,7 +294,7 @@ class Processor():
         return None
 
     @parameter.setter
-    def parameter(self, parameter
+    def parameter(self, parameter: dict) -> None:
         if self.parameter is not None:
             self.shutdown()
         parameterValidator = ParameterValidator(self.ocrd_tool)
@@ -299,7 +309,7 @@ class Processor():
     def __init__(
             self,
             # FIXME: remove in favor of process_workspace(workspace)
-            workspace
+            workspace: Optional[Workspace],
             ocrd_tool=None,
             parameter=None,
             input_file_grp=None,
@@ -365,8 +375,10 @@ class Processor():
         if parameter is not None:
             self.parameter = parameter
         # workaround for deprecated#72 (@deprecated decorator does not work for subclasses):
-        setattr(self, 'process',
-
+        setattr(self, 'process', deprecated(
+            version='3.0', reason='process() should be replaced '
+            'with process_page_pcgts() or process_page_file() or process_workspace()')(
+            getattr(self, 'process')))
 
     def __del__(self):
         self._base_logger.debug("shutting down %s in %s", repr(self), mp.current_process().name)
@@ -394,7 +406,8 @@ class Processor():
         assert self.output_file_grp is not None
         input_file_grps = self.input_file_grp.split(',')
         output_file_grps = self.output_file_grp.split(',')
-
+
+        def assert_file_grp_cardinality(grps: List[str], spec: Union[int, List[int]], msg):
             if isinstance(spec, int):
                 if spec > 0:
                     assert len(grps) == spec, msg % (len(grps), str(spec))
@@ -418,10 +431,10 @@ class Processor():
             assert input_file_grp in self.workspace.mets.file_groups, \
                 f"input fileGrp {input_file_grp} does not exist in workspace {self.workspace}"
         for output_file_grp in output_file_grps:
-            assert output_file_grp not in self.workspace.mets.file_groups
-
-
-
+            assert (output_file_grp not in self.workspace.mets.file_groups
+                    or config.OCRD_EXISTING_OUTPUT in ['OVERWRITE', 'SKIP']
+                    or not any(self.workspace.mets.find_files(
+                        pageId=self.page_id, fileGrp=output_file_grp))), \
                 f"output fileGrp {output_file_grp} already exists in workspace {self.workspace}"
         # keep this for backwards compatibility:
         return True
@@ -465,7 +478,8 @@ class Processor():
         """
         pass
 
-    @deprecated(version='3.0', reason='process() should be replaced
+    @deprecated(version='3.0', reason='process() should be replaced '
+                'with process_page_pcgts() or process_page_file() or process_workspace()')
     def process(self) -> None:
         """
         Process all files of the :py:data:`workspace`
@@ -528,7 +542,8 @@ class Processor():
         )
         if max_workers > 1:
             # forward messages from log queue (in subprocesses) to all root handlers
-            log_listener = logging.handlers.QueueListener(log_queue, *logging.root.handlers,
+            log_listener = logging.handlers.QueueListener(log_queue, *logging.root.handlers,
+                                                          respect_handler_level=True)
             log_listener.start()
         tasks = None
         try:
@@ -553,7 +568,8 @@ class Processor():
                 # suppress the NotImplementedError context
                 raise err from None
 
-    def process_workspace_submit_tasks(self, executor
+    def process_workspace_submit_tasks(self, executor: TExecutor, max_seconds: int) -> Dict[
+            TFuture, Tuple[str, List[Optional[OcrdFileType]]]]:
         """
         Look up all input files of the given ``workspace``
         from the given :py:data:`input_file_grp`
@@ -571,7 +587,7 @@ class Processor():
         Otherwise, tasks are run sequentially in the
         current process.
 
-        Delegates to :py:meth:`.zip_input_files` to get
+        Delegates to :py:meth:`.zip_input_files` to get
         the input files for each page, and then calls
         :py:meth:`.process_workspace_submit_page_task`.
 
@@ -586,7 +602,9 @@ class Processor():
         self._base_logger.debug("submitted %d processing tasks", len(tasks))
         return tasks
 
-    def process_workspace_submit_page_task(self, executor
+    def process_workspace_submit_page_task(self, executor: TExecutor, max_seconds: int,
+                                           input_file_tuple: List[Optional[OcrdFileType]]) -> Tuple[
+            TFuture, str, List[Optional[OcrdFileType]]]:
         """
         Ensure all input files for a single page are
         downloaded to the workspace, then schedule
@@ -604,7 +622,7 @@ class Processor():
         - the corresponding pageId,
         - the corresponding input files.
         """
-        input_files
+        input_files: List[Optional[OcrdFileType]] = [None] * len(input_file_tuple)
         page_id = next(input_file.pageId
                        for input_file in input_file_tuple
                        if input_file)
@@ -625,7 +643,8 @@ class Processor():
         #executor.submit(self.process_page_file, *input_files)
         return executor.submit(_page_worker, max_seconds, *input_files), page_id, input_files
 
-    def process_workspace_handle_tasks(self, tasks
+    def process_workspace_handle_tasks(self, tasks: Dict[TFuture, Tuple[str, List[Optional[OcrdFileType]]]]) -> Tuple[
+            int, int, Dict[str, int], int]:
         """
         Look up scheduled per-page futures one by one,
         handle errors (exceptions) and gather results.
@@ -650,7 +669,7 @@ class Processor():
         # aggregate info for logging:
         nr_succeeded = 0
         nr_failed = 0
-        nr_errors = defaultdict(int)
+        nr_errors = defaultdict(int)  # count causes
         if config.OCRD_MISSING_OUTPUT == 'SKIP':
             reason = "skipped"
         elif config.OCRD_MISSING_OUTPUT == 'COPY':
@@ -666,7 +685,8 @@ class Processor():
                 if config.OCRD_MAX_MISSING_OUTPUTS > 0 and nr_failed / len(tasks) > config.OCRD_MAX_MISSING_OUTPUTS:
                     # already irredeemably many failures, stop short
                     nr_errors = dict(nr_errors)
-                    raise Exception(f"too many failures with {reason} output ({nr_failed} of {nr_failed+nr_succeeded},
+                    raise Exception(f"too many failures with {reason} output ({nr_failed} of {nr_failed+nr_succeeded}, "
+                                    f"{str(nr_errors)})")
             elif result:
                 nr_succeeded += 1
             # else skipped - already exists
@@ -676,13 +696,15 @@ class Processor():
             if config.OCRD_MAX_MISSING_OUTPUTS > 0 and nr_failed / nr_all > config.OCRD_MAX_MISSING_OUTPUTS:
                 raise Exception(f"too many failures with {reason} output ({nr_failed} of {nr_all}, {str(nr_errors)})")
             self._base_logger.warning("%s %d of %d pages due to %s", reason, nr_failed, nr_all, str(nr_errors))
-        self._base_logger.debug("succeeded %d, missed %d of %d pages due to %s",
+        self._base_logger.debug("succeeded %d, missed %d of %d pages due to %s",
+                                nr_succeeded, nr_failed, nr_all, str(nr_errors))
         return nr_succeeded, nr_failed, nr_errors, len(tasks)
 
-    def process_workspace_handle_page_task(self, page_id
+    def process_workspace_handle_page_task(self, page_id: str, input_files: List[Optional[OcrdFileType]],
+                                           task: TFuture) -> Union[bool, Exception]:
         """
         \b
-        Await a single page result and handle errors (exceptions),
+        Await a single page result and handle errors (exceptions),
         enforcing policies configured by the following
         environment variables:
         - `OCRD_EXISTING_OUTPUT` (abort/skip/overwrite)
@@ -738,14 +760,14 @@ class Processor():
             raise ValueError(f"unknown configuration value {config.OCRD_MISSING_OUTPUT} - {desc}")
         return err
 
-    def _copy_page_file(self, input_file
+    def _copy_page_file(self, input_file: OcrdFileType) -> None:
        """
        Copy the given ``input_file`` of the :py:data:`workspace`,
        representing one physical page (passed as one opened
        :py:class:`~ocrd_models.OcrdFile` per input fileGrp)
        and add it as if it was a processing result.
        """
-        input_pcgts
+        input_pcgts: OcrdPage
         assert isinstance(input_file, get_args(OcrdFileType))
         self._base_logger.debug(f"parsing file {input_file.ID} for page {input_file.pageId}")
         try:
@@ -766,7 +788,7 @@ class Processor():
             content=to_xml(input_pcgts),
         )
 
-    def process_page_file(self, *input_files
+    def process_page_file(self, *input_files: Optional[OcrdFileType]) -> None:
         """
         Process the given ``input_files`` of the :py:data:`workspace`,
         representing one physical page (passed as one opened
@@ -777,7 +799,7 @@ class Processor():
         (This uses :py:meth:`.process_page_pcgts`, but should be overridden by subclasses
         to handle cases like multiple output fileGrps, non-PAGE input etc.)
         """
-        input_pcgts
+        input_pcgts: List[Optional[OcrdPage]] = [None] * len(input_files)
         input_pos = next(i for i, input_file in enumerate(input_files) if input_file is not None)
         page_id = input_files[input_pos].pageId
         self._base_logger.info("processing page %s", page_id)
@@ -827,7 +849,7 @@ class Processor():
             elif isinstance(image_result.alternative_image, AlternativeImageType):
                 image_result.alternative_image.set_filename(image_file_path)
             elif image_result.alternative_image is None:
-                pass
+                pass  # do not reference in PAGE result
             else:
                 raise ValueError(f"process_page_pcgts returned an OcrdPageResultImage of unknown type "
                                  f"{type(image_result.alternative_image)}")
@@ -849,7 +871,7 @@ class Processor():
             content=to_xml(result.pcgts),
         )
 
-    def process_page_pcgts(self, *input_pcgts
+    def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult:
         """
         Process the given ``input_pcgts`` of the :py:data:`.workspace`,
         representing one physical page (passed as one parsed
@@ -876,24 +898,25 @@ class Processor():
         """
         metadata_obj = pcgts.get_Metadata()
         assert metadata_obj is not None
-
-
-
-
-
-
-
-
-
-
-
+        metadata_item = MetadataItemType(
+            type_="processingStep",
+            name=self.ocrd_tool['steps'][0],
+            value=self.ocrd_tool['executable'],
+            Labels=[LabelsType(
+                externalModel="ocrd-tool",
+                externalId="parameters",
+                Label=[LabelType(type_=name,
+                                 value=self.parameter[name])
+                       for name in self.parameter.keys()]),
+                LabelsType(
                     externalModel="ocrd-tool",
                     externalId="version",
                     Label=[LabelType(type_=self.ocrd_tool['executable'],
                                      value=self.version),
                            LabelType(type_='ocrd/core',
                                      value=OCRD_VERSION)])
-
+        ])
+        metadata_obj.add_MetadataItem(metadata_item)
 
     def resolve_resource(self, val):
         """
@@ -948,8 +971,8 @@ class Processor():
         mimetypes = get_processor_resource_types(None, self.ocrd_tool)
         for res in list_all_resources(self.ocrd_tool['executable'], moduled=self.moduledir):
             res = Path(res)
-            if
-                if res.is_dir() and
+            if '*/*' not in mimetypes:
+                if res.is_dir() and 'text/directory' not in mimetypes:
                     continue
                 # if we do not know all MIME types, then keep the file, otherwise require suffix match
                 if res.is_file() and not any(res.suffix == MIME_TO_EXT.get(mime, res.suffix)
@@ -1070,16 +1093,18 @@ class Processor():
                     continue
                 ift = pages.setdefault(file_.pageId, [None]*len(ifgs))
                 if ift[i]:
-                    self._base_logger.debug(
+                    self._base_logger.debug(
+                        f"another file {file_.ID} for page {file_.pageId} in input file group {ifg}")
                     # fileGrp has multiple files for this page ID
                     if mimetype:
                         # filter was active, this must not happen
-                        self._base_logger.warning(
-
+                        self._base_logger.warning(
+                            f"added file {file_.ID} for page {file_.pageId} in input file group {ifg} "
+                            f"conflicts with file {ift[i].ID} of same MIME type {mimetype} - on_error={on_error}")
                        if on_error == 'skip':
                            ift[i] = None
                        elif on_error == 'first':
-                            pass
+                            pass  # keep first match
                        elif on_error == 'last':
                            ift[i] = file_
                        elif on_error == 'abort':
@@ -1088,18 +1113,19 @@ class Processor():
                            raise Exception("Unknown 'on_error' strategy '%s'" % on_error)
                    elif (ift[i].mimetype == MIMETYPE_PAGE and
                          file_.mimetype != MIMETYPE_PAGE):
-                        pass
+                        pass  # keep PAGE match
                    elif (ift[i].mimetype == MIMETYPE_PAGE and
                          file_.mimetype == MIMETYPE_PAGE):
                        raise NonUniqueInputFile(ifg, file_.pageId, None)
                    else:
                        # filter was inactive but no PAGE is in control, this must not happen
-                        self._base_logger.warning(
-
+                        self._base_logger.warning(
+                            f"added file {file_.ID} for page {file_.pageId} in input file group {ifg} "
+                            f"conflicts with file {ift[i].ID} but no PAGE available - on_error={on_error}")
                        if on_error == 'skip':
                            ift[i] = None
                        elif on_error == 'first':
-                            pass
+                            pass  # keep first match
                        elif on_error == 'last':
                            ift[i] = file_
                        elif on_error == 'abort':
@@ -1133,6 +1159,7 @@ class Processor():
             ifts.append(tuple(ifiles))
         return ifts
 
+
 _page_worker_processor = None
 """
 This global binding for the processor is required to avoid
@@ -1143,6 +1170,8 @@ in Processor.process_workspace. Forking allows inheriting global
 objects, and with the METS Server we do not mutate the local
 processor instance anyway.
 """
+
+
 def _page_worker_set_ctxt(processor, log_queue):
     """
     Overwrites `ocrd.processor.base._page_worker_processor` instance
@@ -1154,6 +1183,7 @@ def _page_worker_set_ctxt(processor, log_queue):
         # replace all log handlers with just one queue handler
         logging.root.handlers = [logging.handlers.QueueHandler(log_queue)]
 
+
 def _page_worker(timeout, *input_files):
     """
     Wraps a `Processor.process_page_file` call as payload (call target)
@@ -1171,6 +1201,7 @@ def _page_worker(timeout, *input_files):
         _page_worker_processor.logger.debug("page worker timed out for page %s", page_id)
         raise
 
+
 def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None):
     """Generate a string describing the full CLI of this processor including params.
 
@@ -1178,7 +1209,7 @@ def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None)
         ocrd_tool (dict): this processor's ``tools`` section of the module's ``ocrd-tool.json``
         processor_instance (object, optional): the processor implementation
             (for adding any module/class/function docstrings)
-        subcommand (string): 'worker'
+        subcommand (string, optional): 'worker'
     """
     doc_help = ''
     if processor_instance:
@@ -1204,7 +1235,6 @@ def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None)
                              preserve_paragraphs=True)
     subcommands = '''\
     worker      Start a processing worker rather than do local processing
-    server      Start a processor server rather than do local processing
 '''
 
     processing_worker_options = '''\
@@ -1219,8 +1249,6 @@ def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None)
 '''
 
     processing_server_options = '''\
-  --address                       The Processor server address in format
-                                  "{host}:{port}"
  --database                      The MongoDB server address in format
                                  "mongodb://{host}:{port}"
                                  [mongodb://localhost:27018]
@@ -1265,8 +1293,8 @@ def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None)
         parameter_help = ' NONE\n'
     else:
         def wrap(s):
-            return wrap_text(s, initial_indent=' '*3,
-                             subsequent_indent=' '*4,
+            return wrap_text(s, initial_indent=' ' * 3,
+                             subsequent_indent=' ' * 4,
                              width=72, preserve_paragraphs=True)
         for param_name, param in ocrd_tool['parameters'].items():
             parameter_help += wrap('"%s" [%s%s]' % (
@@ -1304,17 +1332,6 @@ Usage: {ocrd_tool['executable']} worker [OPTIONS]
 
 Options:
 {processing_worker_options}
-'''
-    elif subcommand == 'server':
-        return f'''\
-Usage: {ocrd_tool['executable']} server [OPTIONS]
-
-  Run {ocrd_tool['executable']} as a processor sever.
-
-  {ocrd_tool['description']}{doc_help}
-
-  Options:
-  {processing_server_options}
 '''
     else:
         pass
```
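Taken together, the base.py changes mostly add type annotations to the page-processing API: `process_page_pcgts()` now explicitly takes parsed `OcrdPage` objects and returns an `OcrdPageResult`, while the `server` subcommand is dropped. A minimal downstream processor written against this 3.6 API could look like the sketch below; the `ocrd-example` executable name and its `ocrd-tool.json` entry are illustrative assumptions, not part of this diff.

```python
from typing import Optional

import click

from ocrd import Processor, OcrdPageResult
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
from ocrd_models.ocrd_page import OcrdPage


class ExampleProcessor(Processor):
    @property
    def executable(self):
        # hypothetical name; requires a matching entry in the module's ocrd-tool.json
        return 'ocrd-example'

    def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult:
        pcgts = input_pcgts[0]
        # ... mutate the PAGE-XML tree in place here ...
        self.logger.info("processed page %s", page_id)
        return OcrdPageResult(pcgts)


@click.command()
@ocrd_cli_options
def cli(*args, **kwargs):
    return ocrd_cli_wrap_processor(ExampleProcessor, *args, **kwargs)
```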
ocrd/processor/builtin/dummy_processor.py
CHANGED
```diff
@@ -16,6 +16,7 @@ from ocrd_utils import (
 )
 from ocrd_modelfactory import page_from_file
 
+
 class DummyProcessor(Processor):
     """
     Bare-bones processor creates PAGE-XML and optionally copies file from input group to output group
@@ -57,13 +58,14 @@ class DummyProcessor(Processor):
                 page_id=input_file.pageId,
                 local_filename=join(self.output_file_grp, file_id + '.xml'),
                 mimetype=MIMETYPE_PAGE,
-                content=to_xml(pcgts)
-            )
+                content=to_xml(pcgts))
         else:
             if self.parameter['copy_files']:
-                self.logger.info("Not copying %s because it is a PAGE-XML file, which gets identity-transformed",
+                self.logger.info("Not copying %s because it is a PAGE-XML file, which gets identity-transformed",
+                                 input_file.local_filename)
             else:
-                self.logger.info("Not copying %s because it is not a PAGE-XML file and copy_files was false",
+                self.logger.info("Not copying %s because it is not a PAGE-XML file and copy_files was false",
+                                 input_file.local_filename)
         # we can rely on base implementation verbatim
         super().process_page_file(input_file)
 
@@ -75,6 +77,7 @@ class DummyProcessor(Processor):
     def executable(self):
         return 'ocrd-dummy'
 
+
 @click.command()
 @ocrd_cli_options
 def cli(*args, **kwargs):
```
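The built-in processors inherit the per-page error and overwrite policies referenced in the base.py hunks above (`OCRD_EXISTING_OUTPUT`, `OCRD_MISSING_OUTPUT`, `OCRD_MAX_MISSING_OUTPUTS`), which the base.py docstring describes as environment variables read via `ocrd_utils.config`. A hedged sketch of setting them before a run; the concrete values are examples only, and setting them from Python assumes the config has not been read yet (they are normally exported in the shell):

```python
import os

# assumption: ocrd_utils.config resolves these from the environment on access
os.environ['OCRD_EXISTING_OUTPUT'] = 'OVERWRITE'   # write into an already existing output fileGrp
os.environ['OCRD_MISSING_OUTPUT'] = 'COPY'         # on page failure, copy the input file instead
os.environ['OCRD_MAX_MISSING_OUTPUTS'] = '0.1'     # abort once more than 10% of pages have failed
```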
ocrd/processor/builtin/filter_processor.py
CHANGED
```diff
@@ -1,7 +1,6 @@
 # pylint: disable=missing-module-docstring,invalid-name
 from typing import Optional
 
-from lxml import etree
 import click
 
 from ocrd import Processor, OcrdPageResult, OcrdPageResultImage
@@ -29,6 +28,7 @@ _SEGTYPES = [
     "Glyph"
 ]
 
+
 class FilterProcessor(Processor):
     def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult:
         """
@@ -57,7 +57,7 @@ class FilterProcessor(Processor):
         # but allow only hierarchy segments
         segments = [segment for segment in map(pcgts.revmap.get, nodes)
                     if segment.__class__.__name__.replace('Type', '') in _SEGTYPES]
-        if not
+        if not len(segments):
             self.logger.info("no matches")
             return result
         rodict = pcgts.get_Page().get_ReadingOrderGroups()
@@ -102,6 +102,7 @@ class FilterProcessor(Processor):
     def executable(self):
         return 'ocrd-filter'
 
+
 @click.command()
 @ocrd_cli_options
 def cli(*args, **kwargs):
```
ocrd/processor/helpers.py
CHANGED
```diff
@@ -5,7 +5,6 @@ from time import perf_counter, process_time
 from os import times
 from functools import lru_cache
 import json
-import inspect
 from subprocess import run
 from typing import List, Optional
 
@@ -28,6 +27,7 @@ def _get_workspace(workspace=None, resolver=None, mets_url=None, working_dir=Non
         workspace = resolver.workspace_from_url(mets_url, dst_dir=working_dir, mets_server_url=mets_server_url)
     return workspace
 
+
 def run_processor(
         processorClass,
         mets_url=None,
@@ -41,7 +41,7 @@ def run_processor(
         working_dir=None,
         mets_server_url=None,
         instance_caching=False
-):
+):  # pylint: disable=too-many-locals
     """
     Instantiate a Pythonic processor, open a workspace, run the processor and save the workspace.
 
@@ -66,8 +66,7 @@ def run_processor(
     when a match occurs - as long as the program is being run. They only get deleted (and
     their resources freed) when as many as :py:data:`~ocrd_utils.config.OCRD_MAX_PROCESSOR_CACHE`
     instances have already been cached while this particular parameter set was re-used
-    least frequently. (See :py:class:`~ocrd_network.ProcessingWorker`
-    :py:class:`~ocrd_network.ProcessorServer` for use-cases.)
+    least frequently. (See :py:class:`~ocrd_network.ProcessingWorker` for use-cases.)
 
     Args:
         processorClass (object): Python class of the module processor.
@@ -104,7 +103,7 @@ def run_processor(
     t0_os = times()
     if any(x in config.OCRD_PROFILE for x in ['RSS', 'PSS']):
         backend = 'psutil_pss' if 'PSS' in config.OCRD_PROFILE else 'psutil'
-        from memory_profiler import memory_usage
+        from memory_profiler import memory_usage  # pylint: disable=import-outside-toplevel
        try:
            mem_usage = memory_usage(proc=(processor.process_workspace, [workspace], {}),
                                     # only run process once
@@ -225,7 +224,6 @@ def run_cli(
     return result.returncode
 
 
-
 # not decorated here but at runtime (on first use)
 #@freeze_args
 #@lru_cache(maxsize=config.OCRD_MAX_PROCESSOR_CACHE)
@@ -245,6 +243,7 @@ def get_cached_processor(parameter: dict, processor_class):
             return processor
     return None
 
+
 def get_processor(
         processor_class,
         parameter: Optional[dict] = None,
```
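For reference, `run_processor` remains the programmatic entry point whose instance-caching docstring is touched above. A usage sketch under the signature visible in this hunk; the workspace path and fileGrp names are placeholders, and the keyword arguments beyond those shown in the diff are assumed from the 3.x API:

```python
from ocrd import Resolver
from ocrd.processor.builtin.dummy_processor import DummyProcessor
from ocrd.processor.helpers import run_processor

# placeholder path; any existing METS workspace with a DEFAULT fileGrp would do
workspace = Resolver().workspace_from_url('/path/to/workspace/mets.xml')

run_processor(
    DummyProcessor,
    workspace=workspace,
    input_file_grp='DEFAULT',
    output_file_grp='OCR-D-DUMMY',
    parameter={'copy_files': False},
    # reuse processor instances across calls with identical parameters,
    # bounded by OCRD_MAX_PROCESSOR_CACHE (cf. the docstring above)
    instance_caching=True,
)
```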
ocrd/processor/ocrd_page_result.py
CHANGED
```diff
@@ -5,13 +5,15 @@ from PIL.Image import Image
 
 from ocrd_models.ocrd_page_generateds import AlternativeImageType, PageType
 
+
 @dataclass
 class OcrdPageResultImage():
-    pil
-    file_id_suffix
-    alternative_image
+    pil: Image
+    file_id_suffix: str
+    alternative_image: Optional[Union[AlternativeImageType, PageType]]
+
 
 @dataclass
 class OcrdPageResult():
-    pcgts
-    images
+    pcgts: OcrdPage
+    images: List[OcrdPageResultImage] = field(default_factory=list)
```