ocrd 3.0.0b5__tar.gz → 3.0.0b7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ocrd-3.0.0b5/src/ocrd.egg-info → ocrd-3.0.0b7}/PKG-INFO +2 -1
- ocrd-3.0.0b7/VERSION +1 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/requirements.txt +1 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/cli/workspace.py +21 -11
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/decorators/__init__.py +6 -6
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/processor/base.py +302 -84
- ocrd-3.0.0b7/src/ocrd/processor/concurrent.py +909 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/processor/helpers.py +10 -2
- {ocrd-3.0.0b5 → ocrd-3.0.0b7/src/ocrd.egg-info}/PKG-INFO +2 -1
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd.egg-info/SOURCES.txt +1 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd.egg-info/requires.txt +1 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_models/ocrd_mets.py +9 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_utils/logging.py +27 -52
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_utils/ocrd_logging.conf +14 -16
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/tests/test_decorators.py +7 -10
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/tests/test_logging.py +6 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/tests/test_logging_conf.py +21 -28
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/tests/test_mets_server.py +19 -9
- ocrd-3.0.0b5/VERSION +0 -1
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/LICENSE +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/MANIFEST.in +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/README.md +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/README_bashlib.md +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/README_ocrd.md +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/README_ocrd_modelfactory.md +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/README_ocrd_models.md +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/README_ocrd_network.md +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/README_ocrd_utils.md +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/README_ocrd_validators.md +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/pyproject.toml +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/setup.cfg +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/__init__.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/cli/__init__.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/cli/bashlib.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/cli/log.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/cli/network.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/cli/ocrd_tool.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/cli/process.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/cli/resmgr.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/cli/validate.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/cli/zip.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/constants.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/decorators/loglevel_option.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/decorators/mets_find_options.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/decorators/ocrd_cli_options.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/decorators/parameter_option.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/lib.bash +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/mets_server.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/ocrd-all-tool.json +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/processor/__init__.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/processor/builtin/__init__.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/processor/builtin/dummy/__init__.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/processor/builtin/dummy/ocrd-tool.json +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/processor/builtin/dummy_processor.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/processor/ocrd_page_result.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/resolver.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/resource_list.yml +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/resource_manager.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/task_sequence.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/workspace.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/workspace_backup.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd/workspace_bagger.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd.egg-info/dependency_links.txt +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd.egg-info/entry_points.txt +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd.egg-info/top_level.txt +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_modelfactory/__init__.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_models/__init__.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_models/constants.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_models/mets-empty.xml +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_models/ocrd_agent.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_models/ocrd_exif.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_models/ocrd_file.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_models/ocrd_page.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_models/ocrd_page_generateds.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_models/ocrd_xml_base.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_models/report.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_models/utils.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/__init__.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/cli/__init__.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/cli/client.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/cli/processing_server.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/cli/processing_worker.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/cli/processor_server.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/client.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/client_utils.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/constants.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/database.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/logging_utils.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/models/__init__.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/models/job.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/models/messages.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/models/ocrd_tool.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/models/workflow.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/models/workspace.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/param_validators.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/process_helpers.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/processing_server.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/processing_worker.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/processor_server.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/rabbitmq_utils/__init__.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/rabbitmq_utils/connector.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/rabbitmq_utils/constants.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/rabbitmq_utils/consumer.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/rabbitmq_utils/helpers.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/rabbitmq_utils/ocrd_messages.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/rabbitmq_utils/publisher.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/runtime_data/__init__.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/runtime_data/config_parser.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/runtime_data/connection_clients.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/runtime_data/deployer.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/runtime_data/hosts.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/runtime_data/network_agents.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/runtime_data/network_services.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/server_cache.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/server_utils.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/tcp_to_uds_mets_proxy.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_network/utils.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_utils/__init__.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_utils/config.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_utils/constants.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_utils/deprecate.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_utils/image.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_utils/introspect.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_utils/os.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_utils/str.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_validators/__init__.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_validators/bagit-profile.yml +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_validators/constants.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_validators/json_validator.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_validators/message_processing.schema.yml +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_validators/message_result.schema.yml +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_validators/mets.xsd +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_validators/ocrd_network_message_validator.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_validators/ocrd_tool.schema.yml +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_validators/ocrd_tool_validator.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_validators/ocrd_zip_validator.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_validators/page.xsd +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_validators/page_validator.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_validators/parameter_validator.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_validators/processing_server_config.schema.yml +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_validators/processing_server_config_validator.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_validators/resource_list_validator.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_validators/workspace_validator.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_validators/xlink.xsd +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_validators/xsd_mets_validator.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_validators/xsd_page_validator.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/src/ocrd_validators/xsd_validator.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/tests/test_model_factory.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/tests/test_resolver.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/tests/test_resolver_oai.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/tests/test_resource_manager.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/tests/test_task_sequence.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/tests/test_utils.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/tests/test_version.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/tests/test_workspace.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b7}/tests/test_workspace_remove.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: ocrd
|
|
3
|
-
Version: 3.0.0b5
|
|
3
|
+
Version: 3.0.0b7
|
|
4
4
|
Summary: OCR-D framework
|
|
5
5
|
Author-email: Konstantin Baierer <unixprog@gmail.com>
|
|
6
6
|
License: Apache License 2.0
|
|
@@ -26,6 +26,7 @@ Requires-Dist: httpx>=0.22.0
|
|
|
26
26
|
Requires-Dist: importlib_metadata; python_version < "3.8"
|
|
27
27
|
Requires-Dist: importlib_resources; python_version < "3.10"
|
|
28
28
|
Requires-Dist: jsonschema>=4
|
|
29
|
+
Requires-Dist: loky
|
|
29
30
|
Requires-Dist: lxml
|
|
30
31
|
Requires-Dist: memory-profiler>=0.58.0
|
|
31
32
|
Requires-Dist: numpy
|
ocrd-3.0.0b7/VERSION
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.0.0b7
|
|
@@ -149,7 +149,8 @@ def workspace_clone(ctx, clobber_mets, download, file_grp, file_id, page_id, mim
|
|
|
149
149
|
LOG.warning(DeprecationWarning("Use 'ocrd workspace --directory DIR clone' instead of argument 'WORKSPACE_DIR' ('%s')" % workspace_dir))
|
|
150
150
|
ctx.directory = workspace_dir
|
|
151
151
|
|
|
152
|
-
assert not ctx.mets_server_url
|
|
152
|
+
assert not ctx.mets_server_url, \
|
|
153
|
+
f"clone cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
|
|
153
154
|
workspace = ctx.resolver.workspace_from_url(
|
|
154
155
|
mets_url,
|
|
155
156
|
dst_dir=ctx.directory,
|
|
@@ -185,7 +186,8 @@ def workspace_init(ctx, clobber_mets, directory):
|
|
|
185
186
|
if directory:
|
|
186
187
|
LOG.warning(DeprecationWarning("Use 'ocrd workspace --directory DIR init' instead of argument 'DIRECTORY' ('%s')" % directory))
|
|
187
188
|
ctx.directory = directory
|
|
188
|
-
assert not ctx.mets_server_url
|
|
189
|
+
assert not ctx.mets_server_url, \
|
|
190
|
+
f"init cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
|
|
189
191
|
workspace = ctx.resolver.workspace_from_nothing(
|
|
190
192
|
directory=ctx.directory,
|
|
191
193
|
mets_basename=ctx.mets_basename,
|
|
@@ -506,6 +508,8 @@ def workspace_remove_file(ctx, id, force, keep_file): # pylint: disable=redefin
|
|
|
506
508
|
(If any ``ID`` starts with ``//``, then its remainder
|
|
507
509
|
will be interpreted as a regular expression.)
|
|
508
510
|
"""
|
|
511
|
+
assert not ctx.mets_server_url, \
|
|
512
|
+
f"remove cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
|
|
509
513
|
workspace = ctx.workspace()
|
|
510
514
|
for i in id:
|
|
511
515
|
workspace.remove_file(i, force=force, keep_file=keep_file)
|
|
@@ -524,6 +528,8 @@ def rename_group(ctx, old, new):
|
|
|
524
528
|
"""
|
|
525
529
|
Rename fileGrp (USE attribute ``OLD`` to ``NEW``).
|
|
526
530
|
"""
|
|
531
|
+
assert not ctx.mets_server_url, \
|
|
532
|
+
f"rename-group cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
|
|
527
533
|
workspace = ctx.workspace()
|
|
528
534
|
workspace.rename_file_group(old, new)
|
|
529
535
|
workspace.save_mets()
|
|
@@ -545,6 +551,8 @@ def remove_group(ctx, group, recursive, force, keep_files):
|
|
|
545
551
|
(If any ``GROUP`` starts with ``//``, then its remainder
|
|
546
552
|
will be interpreted as a regular expression.)
|
|
547
553
|
"""
|
|
554
|
+
assert not ctx.mets_server_url, \
|
|
555
|
+
f"remove-group cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
|
|
548
556
|
workspace = ctx.workspace()
|
|
549
557
|
for g in group:
|
|
550
558
|
workspace.remove_file_group(g, recursive=recursive, force=force, keep_files=keep_files)
|
|
@@ -567,6 +575,8 @@ def prune_files(ctx, file_grp, mimetype, page_id, file_id):
|
|
|
567
575
|
(If any ``FILTER`` starts with ``//``, then its remainder
|
|
568
576
|
will be interpreted as a regular expression.)
|
|
569
577
|
"""
|
|
578
|
+
assert not ctx.mets_server_url, \
|
|
579
|
+
f"prune-files cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
|
|
570
580
|
workspace = ctx.workspace()
|
|
571
581
|
with pushd_popd(workspace.directory):
|
|
572
582
|
for f in workspace.find_files(
|
|
@@ -673,19 +683,15 @@ def list_pages(ctx, output_field, output_format, chunk_number, chunk_index, page
|
|
|
673
683
|
will be interpreted as a regular expression.)
|
|
674
684
|
"""
|
|
675
685
|
workspace = ctx.workspace()
|
|
676
|
-
find_kwargs = {}
|
|
677
|
-
if page_id_range and 'ID' in output_field:
|
|
678
|
-
find_kwargs['pageId'] = page_id_range
|
|
679
|
-
page_ids = sorted({x.pageId for x in workspace.mets.find_files(**find_kwargs) if x.pageId})
|
|
680
686
|
ret = []
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
ret = [[x] for x in page_ids]
|
|
684
|
-
else:
|
|
685
|
-
for i, page_div in enumerate(workspace.mets.get_physical_pages(for_pageIds=','.join(page_ids), return_divs=True)):
|
|
687
|
+
if page_id_range or list(output_field) != ['ID']:
|
|
688
|
+
for i, page_div in enumerate(workspace.mets.get_physical_pages(for_pageIds=page_id_range, return_divs=True)):
|
|
686
689
|
ret.append([])
|
|
687
690
|
for k in output_field:
|
|
688
691
|
ret[i].append(page_div.get(k, 'None'))
|
|
692
|
+
else:
|
|
693
|
+
for page_id in workspace.mets.physical_pages:
|
|
694
|
+
ret.append([page_id])
|
|
689
695
|
|
|
690
696
|
if numeric_range:
|
|
691
697
|
start, end = map(int, numeric_range.split('..'))
|
|
@@ -762,6 +768,8 @@ def update_page(ctx, attr_value_pairs, order, orderlabel, contentids, page_id):
|
|
|
762
768
|
if contentids:
|
|
763
769
|
update_kwargs['CONTENTIDS'] = contentids
|
|
764
770
|
try:
|
|
771
|
+
assert not ctx.mets_server_url, \
|
|
772
|
+
f"update-page cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
|
|
765
773
|
workspace = ctx.workspace()
|
|
766
774
|
workspace.mets.update_physical_page_attributes(page_id, **update_kwargs)
|
|
767
775
|
workspace.save_mets()
|
|
@@ -800,6 +808,8 @@ def merge(ctx, overwrite, force, copy_files, filegrp_mapping, fileid_mapping, pa
|
|
|
800
808
|
mets_path = Path(mets_path)
|
|
801
809
|
if filegrp_mapping:
|
|
802
810
|
filegrp_mapping = loads(filegrp_mapping)
|
|
811
|
+
assert not ctx.mets_server_url, \
|
|
812
|
+
f"merge cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
|
|
803
813
|
workspace = ctx.workspace()
|
|
804
814
|
other_workspace = Workspace(ctx.resolver, directory=str(mets_path.parent), mets_basename=str(mets_path.name))
|
|
805
815
|
workspace.merge(
|
|
@@ -13,7 +13,6 @@ from ocrd_utils import (
|
|
|
13
13
|
redirect_stderr_and_stdout_to_file,
|
|
14
14
|
)
|
|
15
15
|
from ocrd_validators import WorkspaceValidator
|
|
16
|
-
from ocrd_network import ProcessingWorker, ProcessorServer, AgentType
|
|
17
16
|
|
|
18
17
|
from ..resolver import Resolver
|
|
19
18
|
from ..processor.base import ResourceNotFoundError, run_processor
|
|
@@ -23,8 +22,6 @@ from .parameter_option import parameter_option, parameter_override_option
|
|
|
23
22
|
from .ocrd_cli_options import ocrd_cli_options
|
|
24
23
|
from .mets_find_options import mets_find_options
|
|
25
24
|
|
|
26
|
-
SUBCOMMANDS = [AgentType.PROCESSING_WORKER, AgentType.PROCESSOR_SERVER]
|
|
27
|
-
|
|
28
25
|
|
|
29
26
|
def ocrd_cli_wrap_processor(
|
|
30
27
|
processorClass,
|
|
@@ -88,11 +85,9 @@ def ocrd_cli_wrap_processor(
|
|
|
88
85
|
if list_resources:
|
|
89
86
|
processor.list_resources()
|
|
90
87
|
sys.exit()
|
|
91
|
-
if subcommand:
|
|
88
|
+
if subcommand or address or queue or database:
|
|
92
89
|
# Used for checking/starting network agents for the WebAPI architecture
|
|
93
90
|
check_and_run_network_agent(processorClass, subcommand, address, database, queue)
|
|
94
|
-
elif address or queue or database:
|
|
95
|
-
raise ValueError(f"Subcommand options --address --queue and --database are only valid for subcommands: {SUBCOMMANDS}")
|
|
96
91
|
|
|
97
92
|
# from here: single-run processing context
|
|
98
93
|
initLogging()
|
|
@@ -162,6 +157,11 @@ def ocrd_cli_wrap_processor(
|
|
|
162
157
|
def check_and_run_network_agent(ProcessorClass, subcommand: str, address: str, database: str, queue: str):
|
|
163
158
|
"""
|
|
164
159
|
"""
|
|
160
|
+
from ocrd_network import ProcessingWorker, ProcessorServer, AgentType
|
|
161
|
+
SUBCOMMANDS = [AgentType.PROCESSING_WORKER, AgentType.PROCESSOR_SERVER]
|
|
162
|
+
|
|
163
|
+
if not subcommand:
|
|
164
|
+
raise ValueError(f"Subcommand options --address --queue and --database are only valid for subcommands: {SUBCOMMANDS}")
|
|
165
165
|
if subcommand not in SUBCOMMANDS:
|
|
166
166
|
raise ValueError(f"SUBCOMMAND can only be one of {SUBCOMMANDS}")
|
|
167
167
|
|
|
@@ -16,14 +16,21 @@ import json
|
|
|
16
16
|
import os
|
|
17
17
|
from os import getcwd
|
|
18
18
|
from pathlib import Path
|
|
19
|
-
from typing import Any, List, Optional, Union, get_args
|
|
19
|
+
from typing import Any, Dict, List, Optional, Tuple, Union, get_args
|
|
20
20
|
import sys
|
|
21
|
+
import logging
|
|
22
|
+
import logging.handlers
|
|
21
23
|
import inspect
|
|
22
24
|
import tarfile
|
|
23
25
|
import io
|
|
24
|
-
import weakref
|
|
26
|
+
from collections import defaultdict
|
|
25
27
|
from frozendict import frozendict
|
|
26
|
-
|
|
28
|
+
# concurrent.futures is buggy in py38,
|
|
29
|
+
# this is where the fixes came from:
|
|
30
|
+
from loky import Future, ProcessPoolExecutor
|
|
31
|
+
import multiprocessing as mp
|
|
32
|
+
from threading import Timer
|
|
33
|
+
from _thread import interrupt_main
|
|
27
34
|
|
|
28
35
|
from click import wrap_text
|
|
29
36
|
from deprecated import deprecated
|
|
@@ -105,6 +112,31 @@ class MissingInputFile(ValueError):
|
|
|
105
112
|
f"and pageId {pageId} under mimetype {mimetype or 'PAGE+image(s)'}")
|
|
106
113
|
super().__init__(self.message)
|
|
107
114
|
|
|
115
|
+
class DummyFuture:
    """
    Mimics some of `concurrent.futures.Future`, but without any concurrency:
    the wrapped callable is run synchronously in the calling process,
    at the time :py:meth:`result` is requested.
    """
    def __init__(self, fn, *args, **kwargs):
        """
        Store the callable ``fn`` and the positional and keyword
        arguments it should later be called with.
        """
        self.fn = fn
        self.args = args
        self.kwargs = kwargs
    def result(self, timeout=None):
        """
        Call the stored function with the stored arguments and return
        its return value. Exceptions raised by the function propagate
        to the caller.

        ``timeout`` is accepted only for signature compatibility with
        `concurrent.futures.Future.result` and is ignored, since the call
        is made synchronously.

        (The function is called anew on every invocation - the result
        is not cached.)
        """
        return self.fn(*self.args, **self.kwargs)
class DummyExecutor:
    """
    Mimics some of `concurrent.futures.ProcessPoolExecutor` but runs
    everything in this process.
    """
    def __init__(self, initializer=None, initargs=(), **kwargs):
        """
        Run ``initializer(*initargs)`` right away, if an initializer was given.
        All other keyword arguments are accepted and ignored.
        """
        # the default is None, so calling it unconditionally would raise TypeError
        if initializer is not None:
            initializer(*initargs)
    def shutdown(self, **kwargs):
        """
        Do nothing (there are no workers to stop); any keyword arguments are ignored.
        """
        pass
    def submit(self, fn, *args, **kwargs) -> DummyFuture:
        """
        Wrap ``fn`` and its arguments in a :py:class:`DummyFuture`
        (which will make the actual call when its result is requested).
        """
        return DummyFuture(fn, *args, **kwargs)
|
|
136
|
+
|
|
137
|
+
TFuture = Union[DummyFuture, Future]
|
|
138
|
+
TExecutor = Union[DummyExecutor, ProcessPoolExecutor]
|
|
139
|
+
|
|
108
140
|
class Processor():
|
|
109
141
|
"""
|
|
110
142
|
A processor is a tool that implements the uniform OCR-D
|
|
@@ -127,12 +159,12 @@ class Processor():
|
|
|
127
159
|
|
|
128
160
|
max_workers : int = -1
|
|
129
161
|
"""
|
|
130
|
-
maximum number of processor
|
|
162
|
+
maximum number of processor forks for page-parallel processing (ignored if negative),
|
|
131
163
|
to be applied on top of :py:data:`~ocrd_utils.config.OCRD_MAX_PARALLEL_PAGES` (i.e.
|
|
132
164
|
whatever is smaller).
|
|
133
165
|
|
|
134
166
|
(Override this if you know how many pages fit into processing units - GPU shaders / CPU cores
|
|
135
|
-
- at once, or if your class
|
|
167
|
+
- at once, or if your class already creates threads prior to forking, e.g. during ``setup``.)
|
|
136
168
|
"""
|
|
137
169
|
|
|
138
170
|
max_page_seconds : int = -1
|
|
@@ -335,12 +367,14 @@ class Processor():
|
|
|
335
367
|
self._base_logger = getLogger('ocrd.processor.base')
|
|
336
368
|
if parameter is not None:
|
|
337
369
|
self.parameter = parameter
|
|
338
|
-
# ensure that shutdown gets called at destruction
|
|
339
|
-
self._finalizer = weakref.finalize(self, self.shutdown)
|
|
340
370
|
# workaround for deprecated#72 (@deprecated decorator does not work for subclasses):
|
|
341
371
|
setattr(self, 'process',
|
|
342
372
|
deprecated(version='3.0', reason='process() should be replaced with process_page_pcgts() or process_page_file() or process_workspace()')(getattr(self, 'process')))
|
|
343
373
|
|
|
374
|
+
def __del__(self):
    """
    Finalizer: log a debug message and call :py:meth:`.shutdown`,
    so that shutdown happens when the instance is destroyed.
    """
    # Python may run __del__ on an instance whose __init__ raised
    # before `_base_logger` was assigned; in that case skip quietly
    # instead of raising AttributeError inside the finalizer.
    logger = getattr(self, '_base_logger', None)
    if logger is None:
        return
    logger.debug("shutting down")
    self.shutdown()
|
|
377
|
+
|
|
344
378
|
def show_help(self, subcommand=None):
|
|
345
379
|
"""
|
|
346
380
|
Print a usage description including the standard CLI and all of this processor's ocrd-tool
|
|
@@ -456,6 +490,9 @@ class Processor():
|
|
|
456
490
|
for the given :py:data:`page_id` (or all pages)
|
|
457
491
|
under the given :py:data:`parameter`.
|
|
458
492
|
|
|
493
|
+
Delegates to :py:meth:`.process_workspace_submit_tasks`
|
|
494
|
+
and :py:meth:`.process_workspace_handle_tasks`.
|
|
495
|
+
|
|
459
496
|
(This will iterate over pages and files, calling
|
|
460
497
|
:py:meth:`.process_page_file` and handling exceptions.
|
|
461
498
|
It should be overridden by subclasses to handle cases
|
|
@@ -465,11 +502,7 @@ class Processor():
|
|
|
465
502
|
self.workspace = workspace
|
|
466
503
|
self.verify()
|
|
467
504
|
try:
|
|
468
|
-
|
|
469
|
-
nr_skipped = 0
|
|
470
|
-
nr_copied = 0
|
|
471
|
-
|
|
472
|
-
# set up multithreading
|
|
505
|
+
# set up multitasking
|
|
473
506
|
max_workers = max(0, config.OCRD_MAX_PARALLEL_PAGES)
|
|
474
507
|
if self.max_workers > 0 and self.max_workers < config.OCRD_MAX_PARALLEL_PAGES:
|
|
475
508
|
self._base_logger.info("limiting number of threads from %d to %d", max_workers, self.max_workers)
|
|
@@ -481,80 +514,34 @@ class Processor():
|
|
|
481
514
|
if self.max_page_seconds > 0 and self.max_page_seconds < config.OCRD_PROCESSING_PAGE_TIMEOUT:
|
|
482
515
|
self._base_logger.info("limiting page timeout from %d to %d sec", max_seconds, self.max_page_seconds)
|
|
483
516
|
max_seconds = self.max_page_seconds
|
|
484
|
-
|
|
517
|
+
|
|
518
|
+
if max_workers > 1:
|
|
519
|
+
executor_cls = ProcessPoolExecutor
|
|
520
|
+
log_queue = mp.Queue()
|
|
521
|
+
# forward messages from log queue (in subprocesses) to all root handlers
|
|
522
|
+
log_listener = logging.handlers.QueueListener(log_queue, *logging.root.handlers, respect_handler_level=True)
|
|
523
|
+
else:
|
|
524
|
+
executor_cls = DummyExecutor
|
|
525
|
+
log_queue = None
|
|
526
|
+
log_listener = None
|
|
527
|
+
executor = executor_cls(
|
|
485
528
|
max_workers=max_workers or 1,
|
|
486
|
-
|
|
529
|
+
# only forking method avoids pickling
|
|
530
|
+
context=mp.get_context('fork'),
|
|
531
|
+
# share processor instance as global to avoid pickling
|
|
532
|
+
initializer=_page_worker_set_ctxt,
|
|
533
|
+
initargs=(self, log_queue),
|
|
487
534
|
)
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
if input_file is None:
|
|
499
|
-
# file/page not found in this file grp
|
|
500
|
-
continue
|
|
501
|
-
input_files[i] = input_file
|
|
502
|
-
if not self.download:
|
|
503
|
-
continue
|
|
504
|
-
try:
|
|
505
|
-
input_files[i] = self.workspace.download_file(input_file)
|
|
506
|
-
except (ValueError, FileNotFoundError, HTTPError) as e:
|
|
507
|
-
self._base_logger.error(repr(e))
|
|
508
|
-
self._base_logger.warning(f"failed downloading file {input_file} for page {page_id}")
|
|
509
|
-
# process page
|
|
510
|
-
tasks[executor.submit(self.process_page_file, *input_files)] = (page_id, input_files)
|
|
511
|
-
self._base_logger.debug("submitted %d processing tasks", len(tasks))
|
|
512
|
-
|
|
513
|
-
for task in tasks:
|
|
514
|
-
# wait for results, handle errors
|
|
515
|
-
page_id, input_files = tasks[task]
|
|
516
|
-
# FIXME: differentiate error cases in various ways:
|
|
517
|
-
# - ResourceNotFoundError → use ResourceManager to download (once), then retry
|
|
518
|
-
# - transient (I/O or OOM) error → maybe sleep, retry
|
|
519
|
-
# - persistent (data) error → skip / dummy / raise
|
|
520
|
-
try:
|
|
521
|
-
self._base_logger.debug("waiting for output of task %s (page %s) max_seconds=%d", task, page_id, max_seconds)
|
|
522
|
-
task.result(timeout=max_seconds or None)
|
|
523
|
-
nr_succeeded += 1
|
|
524
|
-
# exclude NotImplementedError, so we can try process() below
|
|
525
|
-
except NotImplementedError:
|
|
526
|
-
raise
|
|
527
|
-
# handle input failures separately
|
|
528
|
-
except FileExistsError as err:
|
|
529
|
-
if config.OCRD_EXISTING_OUTPUT == 'ABORT':
|
|
530
|
-
raise err
|
|
531
|
-
if config.OCRD_EXISTING_OUTPUT == 'SKIP':
|
|
532
|
-
continue
|
|
533
|
-
if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE':
|
|
534
|
-
# too late here, must not happen
|
|
535
|
-
raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE")
|
|
536
|
-
# broad coverage of output failures (including TimeoutError)
|
|
537
|
-
except (Exception, TimeoutError) as err:
|
|
538
|
-
# FIXME: add re-usable/actionable logging
|
|
539
|
-
if config.OCRD_MISSING_OUTPUT == 'ABORT':
|
|
540
|
-
self._base_logger.error(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}")
|
|
541
|
-
raise err
|
|
542
|
-
self._base_logger.exception(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}")
|
|
543
|
-
if config.OCRD_MISSING_OUTPUT == 'SKIP':
|
|
544
|
-
nr_skipped += 1
|
|
545
|
-
continue
|
|
546
|
-
if config.OCRD_MISSING_OUTPUT == 'COPY':
|
|
547
|
-
self._copy_page_file(input_files[0])
|
|
548
|
-
nr_copied += 1
|
|
549
|
-
else:
|
|
550
|
-
desc = config.describe('OCRD_MISSING_OUTPUT', wrap_text=False, indent_text=False)
|
|
551
|
-
raise ValueError(f"unknown configuration value {config.OCRD_MISSING_OUTPUT} - {desc}")
|
|
552
|
-
|
|
553
|
-
if nr_skipped > 0 and nr_succeeded / nr_skipped < config.OCRD_MAX_MISSING_OUTPUTS:
|
|
554
|
-
raise Exception(f"too many failures with skipped output ({nr_skipped})")
|
|
555
|
-
if nr_copied > 0 and nr_succeeded / nr_copied < config.OCRD_MAX_MISSING_OUTPUTS:
|
|
556
|
-
raise Exception(f"too many failures with fallback output ({nr_skipped})")
|
|
557
|
-
executor.shutdown()
|
|
535
|
+
if max_workers > 1:
|
|
536
|
+
log_listener.start()
|
|
537
|
+
try:
|
|
538
|
+
self._base_logger.debug("started executor %s with %d workers", str(executor), max_workers or 1)
|
|
539
|
+
tasks = self.process_workspace_submit_tasks(executor, max_seconds)
|
|
540
|
+
stats = self.process_workspace_handle_tasks(tasks)
|
|
541
|
+
finally:
|
|
542
|
+
executor.shutdown(kill_workers=True, wait=False)
|
|
543
|
+
if max_workers > 1:
|
|
544
|
+
log_listener.stop()
|
|
558
545
|
|
|
559
546
|
except NotImplementedError:
|
|
560
547
|
# fall back to deprecated method
|
|
@@ -564,6 +551,190 @@ class Processor():
|
|
|
564
551
|
# suppress the NotImplementedError context
|
|
565
552
|
raise err from None
|
|
566
553
|
|
|
554
|
+
def process_workspace_submit_tasks(self, executor : TExecutor, max_seconds : int) -> Dict[TFuture, Tuple[str, List[Optional[OcrdFileType]]]]:
    """
    Schedule the processing of every page of the workspace.

    Retrieves the per-page tuples of input files from
    :py:meth:`.zip_input_files`, and hands each of them to
    :py:meth:`.process_workspace_submit_page_task`, which
    schedules :py:meth:`.process_page_file` for that page
    via `executor` (enforcing a per-page time limit of
    `max_seconds`).

    When running with `OCRD_MAX_PARALLEL_PAGES>1` and
    the workspace via METS Server, the executor will fork
    that many parallel worker subprocesses, each processing
    one page at a time. (Interprocess communication is
    done via task and result queues.)

    Otherwise, tasks are run sequentially in the
    current process.

    Returns a dict mapping the per-page tasks
    (i.e. futures submitted to the executor)
    to a tuple of their corresponding pageId and input files.
    """
    submitted = {}
    page_tuples = self.zip_input_files(on_error='abort', require_first=False)
    for page_tuple in page_tuples:
        future, page_id, input_files = self.process_workspace_submit_page_task(executor, max_seconds, page_tuple)
        submitted[future] = (page_id, input_files)
    self._base_logger.debug("submitted %d processing tasks", len(submitted))
    return submitted
|
|
586
|
+
|
|
587
|
+
def process_workspace_submit_page_task(self, executor : TExecutor, max_seconds : int, input_file_tuple : List[Optional[OcrdFileType]]) -> Tuple[TFuture, str, List[Optional[OcrdFileType]]]:
    """
    Prepare and schedule the processing of a single page.

    If :py:attr:`download` is set, tries to download every input
    file of the page into the workspace first. A file which fails
    to download is logged and passed on as is.

    Then submits :py:func:`_page_worker` to `executor`, which in
    turn calls :py:meth:`.process_page_file` on the input files
    (enforcing a per-page time limit of `max_seconds`). The worker
    function is used instead of the bound method, so the processor
    instance can be shared across forked processes.

    \b
    Returns a tuple of:
    - the scheduled future object,
    - the corresponding pageId,
    - the corresponding input files.
    """
    # the pageId is taken from the first input file which is present
    page_id = next(candidate.pageId
                   for candidate in input_file_tuple
                   if candidate)
    self._base_logger.info(f"preparing page {page_id}")
    input_files : List[Optional[OcrdFileType]] = []
    for candidate in input_file_tuple:
        # None means: no file for this page in that file group
        local_file = candidate
        if candidate is not None and self.download:
            try:
                local_file = self.workspace.download_file(candidate)
            except (ValueError, FileNotFoundError, HTTPError) as exc:
                # keep the original (non-downloaded) file entry
                self._base_logger.error(repr(exc))
                self._base_logger.warning(f"failed downloading file {candidate} for page {page_id}")
        input_files.append(local_file)
    # schedule the actual page processing
    future = executor.submit(_page_worker, max_seconds, *input_files)
    return future, page_id, input_files
|
|
625
|
+
|
|
626
|
+
def process_workspace_handle_tasks(self, tasks : Dict[TFuture, Tuple[str, List[Optional[OcrdFileType]]]]) -> Tuple[int, int, Dict[str, int], int]:
    """
    Collect the outcome of all scheduled per-page futures.

    Passes every task on to
    :py:meth:`.process_workspace_handle_page_task`, one by one,
    and counts successes, failures and the kinds of exceptions
    returned.

    \b
    Enforces policies configured by the following
    environment variables:
    - `OCRD_EXISTING_OUTPUT` (abort/skip/overwrite)
    - `OCRD_MISSING_OUTPUT` (abort/skip/fallback-copy)
    - `OCRD_MAX_MISSING_OUTPUTS` (abort after all).

    \b
    Returns a tuple of:
    - the number of successfully processed pages
    - the number of failed (i.e. skipped or copied) pages
    - a dict of the type and corresponding number of exceptions seen
    - the number of total requested pages (i.e. success+fail+existing).

    Raises an exception as soon as the rate of failed pages
    exceeds `OCRD_MAX_MISSING_OUTPUTS` (if that is positive).
    """
    def exceeds_limit(failed : int, total : int) -> bool:
        # a non-positive OCRD_MAX_MISSING_OUTPUTS disables the check
        return config.OCRD_MAX_MISSING_OUTPUTS > 0 and failed / total > config.OCRD_MAX_MISSING_OUTPUTS

    # wording for the log and error messages
    missing_policy = config.OCRD_MISSING_OUTPUT
    if missing_policy == 'SKIP':
        reason = "skipped"
    elif missing_policy == 'COPY':
        reason = "fallback-copied"
    # aggregate info for logging:
    succeeded = 0
    failed = 0
    causes = defaultdict(int)
    for future, (page_id, input_files) in tasks.items():
        # wait for the result, handle errors
        outcome = self.process_workspace_handle_page_task(page_id, input_files, future)
        if not isinstance(outcome, Exception):
            if outcome:
                succeeded += 1
            # otherwise skipped - output already exists
            continue
        causes[outcome.__class__.__name__] += 1
        failed += 1
        # FIXME: this is just prospective, because len(tasks)==failed+succeeded is not guaranteed
        if exceeds_limit(failed, len(tasks)):
            # already irredeemably many failures, stop short
            raise Exception(f"too many failures with {reason} output ({failed} of {failed+succeeded}, {str(dict(causes))})")
    causes = dict(causes)
    if failed > 0:
        processed = succeeded + failed
        if exceeds_limit(failed, processed):
            raise Exception(f"too many failures with {reason} output ({failed} of {processed}, {str(causes)})")
        self._base_logger.warning("%s %d of %d pages due to %s", reason, failed, processed, str(causes))
    return succeeded, failed, causes, len(tasks)
|
|
678
|
+
|
|
679
|
+
def process_workspace_handle_page_task(self, page_id : str, input_files : List[Optional[OcrdFileType]], task : TFuture) -> Union[bool, Exception]:
    """
    \b
    Wait for the result of a single page task and deal with
    any exception it raised, enforcing policies configured
    by the following environment variables:
    - `OCRD_EXISTING_OUTPUT` (abort/skip/overwrite)
    - `OCRD_MISSING_OUTPUT` (abort/skip/fallback-copy)
    - `OCRD_MAX_MISSING_OUTPUTS` (abort after all).

    \b
    Returns
    - true in case of success
    - false in case the output already exists
    - the exception in case of failure

    ``NotImplementedError`` and ``KeyboardInterrupt`` are
    always passed through to the caller.
    """
    # FIXME: differentiate error cases in various ways:
    # - ResourceNotFoundError → use ResourceManager to download (once), then retry
    # - transient (I/O or OOM) error → maybe sleep, retry
    # - persistent (data) error → skip / dummy / raise
    try:
        self._base_logger.debug("waiting for output of task %s (page %s)", task, page_id)
        # No timeout is passed here: on the future it would only
        # raise TimeoutError in the waiting caller without stopping
        # the running process/thread (and the executor offers nothing
        # to that effect), so the time limit is applied within the
        # worker function instead.
        task.result()
    except (NotImplementedError, KeyboardInterrupt):
        # NotImplementedError must get through, so the caller
        # can fall back to process()
        raise
    except FileExistsError as err:
        # input failures are handled separately
        existing_policy = config.OCRD_EXISTING_OUTPUT
        if existing_policy == 'ABORT':
            raise err
        if existing_policy == 'SKIP':
            return False
        if existing_policy == 'OVERWRITE':
            # too late here, must not happen
            raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE")
    except Exception as err:
        # broad coverage of output failures (including TimeoutError)
        # FIXME: add re-usable/actionable logging
        failure = f"Failure on page {page_id}: {str(err) or err.__class__.__name__}"
        if config.OCRD_MISSING_OUTPUT == 'ABORT':
            self._base_logger.error(failure)
            raise err
        self._base_logger.exception(failure)
        if config.OCRD_MISSING_OUTPUT == 'COPY':
            self._copy_page_file(input_files[0])
        elif config.OCRD_MISSING_OUTPUT != 'SKIP':
            desc = config.describe('OCRD_MISSING_OUTPUT', wrap_text=False, indent_text=False)
            raise ValueError(f"unknown configuration value {config.OCRD_MISSING_OUTPUT} - {desc}")
        return err
    else:
        return True
|
|
737
|
+
|
|
567
738
|
def _copy_page_file(self, input_file : OcrdFileType) -> None:
|
|
568
739
|
"""
|
|
569
740
|
Copy the given ``input_file`` of the :py:data:`workspace`,
|
|
@@ -618,6 +789,12 @@ class Processor():
|
|
|
618
789
|
# not PAGE and not an image to generate PAGE for
|
|
619
790
|
self._base_logger.error(f"non-PAGE input for page {page_id}: {err}")
|
|
620
791
|
output_file_id = make_file_id(input_files[0], self.output_file_grp)
|
|
792
|
+
output_file = next(self.workspace.mets.find_files(ID=output_file_id), None)
|
|
793
|
+
if output_file and config.OCRD_EXISTING_OUTPUT != 'OVERWRITE':
|
|
794
|
+
# short-cut avoiding useless computation:
|
|
795
|
+
raise FileExistsError(
|
|
796
|
+
f"A file with ID=={output_file_id} already exists {output_file} and neither force nor ignore are set"
|
|
797
|
+
)
|
|
621
798
|
result = self.process_page_pcgts(*input_pcgts, page_id=page_id)
|
|
622
799
|
for image_result in result.images:
|
|
623
800
|
image_file_id = f'{output_file_id}_{image_result.file_id_suffix}'
|
|
@@ -934,6 +1111,47 @@ class Processor():
|
|
|
934
1111
|
ifts.append(tuple(ifiles))
|
|
935
1112
|
return ifts
|
|
936
1113
|
|
|
1114
|
+
_page_worker_processor = None
"""
This global binding for the processor is required to avoid
squeezing the processor through a mp.Queue (which is impossible
due to unpicklable attributes like .workspace.mets._tree anyway)
when calling Processor.process_page_file as page worker processes
in Processor.process_workspace. Forking allows inheriting global
objects, and with the METS Server we do not mutate the local
processor instance anyway.
"""
def _page_worker_set_ctxt(processor, log_queue):
    """
    Overwrites `ocrd.processor.base._page_worker_processor` instance
    for sharing with subprocesses in ProcessPoolExecutor initializer.

    If ``log_queue`` is given, additionally replaces all handlers of
    the root logger with a single queue handler feeding that queue.
    """
    global _page_worker_processor
    _page_worker_processor = processor
    if log_queue:
        # replace all log handlers with just one queue handler
        logging.root.handlers = [logging.handlers.QueueHandler(log_queue)]

def _page_worker(timeout, *input_files):
    """
    Wraps a `Processor.process_page_file` call as payload (call target)
    of the ProcessPoolExecutor workers, but also enforces the given timeout.

    If ``timeout`` is positive, a timer interrupts the main thread after
    that many seconds, which is then reported as ``TimeoutError``.
    A ``KeyboardInterrupt`` which was not caused by that timer
    (e.g. because no timeout was requested at all) is re-raised as is,
    so a genuine interruption is not mistaken for a page timeout.
    """
    page_id = next((file.pageId for file in input_files
                    if hasattr(file, 'pageId')), "")
    # gets filled by the timer callback, so the handler below can
    # tell our own interruption from an external one
    timed_out = []
    timer = None
    if timeout > 0:
        def _expire():
            timed_out.append(True)
            interrupt_main()
        timer = Timer(timeout, _expire)
        timer.start()
    try:
        _page_worker_processor.process_page_file(*input_files)
        _page_worker_processor.logger.debug("page worker completed for page %s", page_id)
    except KeyboardInterrupt:
        if not timed_out:
            # not triggered by our timer: pass through unchanged
            raise
        _page_worker_processor.logger.debug("page worker timed out for page %s", page_id)
        raise TimeoutError()
    finally:
        if timer is not None:
            timer.cancel()
|
|
1154
|
+
|
|
937
1155
|
def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None):
|
|
938
1156
|
"""Generate a string describing the full CLI of this processor including params.
|
|
939
1157
|
|