ocrd 3.0.0b5__tar.gz → 3.0.0b6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ocrd-3.0.0b5/src/ocrd.egg-info → ocrd-3.0.0b6}/PKG-INFO +2 -1
- ocrd-3.0.0b6/VERSION +1 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/requirements.txt +1 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/cli/workspace.py +21 -11
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/decorators/__init__.py +6 -6
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/processor/base.py +282 -79
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/processor/helpers.py +10 -2
- {ocrd-3.0.0b5 → ocrd-3.0.0b6/src/ocrd.egg-info}/PKG-INFO +2 -1
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd.egg-info/requires.txt +1 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_models/ocrd_mets.py +9 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_utils/logging.py +6 -2
- ocrd-3.0.0b5/VERSION +0 -1
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/LICENSE +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/MANIFEST.in +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/README.md +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/README_bashlib.md +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/README_ocrd.md +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/README_ocrd_modelfactory.md +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/README_ocrd_models.md +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/README_ocrd_network.md +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/README_ocrd_utils.md +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/README_ocrd_validators.md +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/pyproject.toml +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/setup.cfg +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/__init__.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/cli/__init__.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/cli/bashlib.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/cli/log.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/cli/network.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/cli/ocrd_tool.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/cli/process.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/cli/resmgr.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/cli/validate.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/cli/zip.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/constants.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/decorators/loglevel_option.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/decorators/mets_find_options.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/decorators/ocrd_cli_options.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/decorators/parameter_option.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/lib.bash +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/mets_server.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/ocrd-all-tool.json +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/processor/__init__.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/processor/builtin/__init__.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/processor/builtin/dummy/__init__.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/processor/builtin/dummy/ocrd-tool.json +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/processor/builtin/dummy_processor.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/processor/ocrd_page_result.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/resolver.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/resource_list.yml +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/resource_manager.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/task_sequence.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/workspace.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/workspace_backup.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd/workspace_bagger.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd.egg-info/SOURCES.txt +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd.egg-info/dependency_links.txt +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd.egg-info/entry_points.txt +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd.egg-info/top_level.txt +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_modelfactory/__init__.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_models/__init__.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_models/constants.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_models/mets-empty.xml +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_models/ocrd_agent.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_models/ocrd_exif.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_models/ocrd_file.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_models/ocrd_page.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_models/ocrd_page_generateds.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_models/ocrd_xml_base.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_models/report.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_models/utils.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/__init__.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/cli/__init__.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/cli/client.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/cli/processing_server.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/cli/processing_worker.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/cli/processor_server.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/client.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/client_utils.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/constants.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/database.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/logging_utils.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/models/__init__.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/models/job.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/models/messages.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/models/ocrd_tool.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/models/workflow.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/models/workspace.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/param_validators.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/process_helpers.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/processing_server.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/processing_worker.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/processor_server.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/rabbitmq_utils/__init__.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/rabbitmq_utils/connector.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/rabbitmq_utils/constants.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/rabbitmq_utils/consumer.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/rabbitmq_utils/helpers.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/rabbitmq_utils/ocrd_messages.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/rabbitmq_utils/publisher.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/runtime_data/__init__.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/runtime_data/config_parser.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/runtime_data/connection_clients.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/runtime_data/deployer.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/runtime_data/hosts.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/runtime_data/network_agents.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/runtime_data/network_services.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/server_cache.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/server_utils.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/tcp_to_uds_mets_proxy.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_network/utils.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_utils/__init__.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_utils/config.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_utils/constants.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_utils/deprecate.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_utils/image.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_utils/introspect.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_utils/ocrd_logging.conf +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_utils/os.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_utils/str.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_validators/__init__.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_validators/bagit-profile.yml +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_validators/constants.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_validators/json_validator.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_validators/message_processing.schema.yml +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_validators/message_result.schema.yml +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_validators/mets.xsd +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_validators/ocrd_network_message_validator.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_validators/ocrd_tool.schema.yml +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_validators/ocrd_tool_validator.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_validators/ocrd_zip_validator.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_validators/page.xsd +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_validators/page_validator.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_validators/parameter_validator.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_validators/processing_server_config.schema.yml +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_validators/processing_server_config_validator.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_validators/resource_list_validator.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_validators/workspace_validator.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_validators/xlink.xsd +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_validators/xsd_mets_validator.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_validators/xsd_page_validator.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/src/ocrd_validators/xsd_validator.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/tests/test_decorators.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/tests/test_logging.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/tests/test_logging_conf.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/tests/test_mets_server.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/tests/test_model_factory.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/tests/test_resolver.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/tests/test_resolver_oai.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/tests/test_resource_manager.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/tests/test_task_sequence.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/tests/test_utils.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/tests/test_version.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/tests/test_workspace.py +0 -0
- {ocrd-3.0.0b5 → ocrd-3.0.0b6}/tests/test_workspace_remove.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: ocrd
|
|
3
|
-
Version: 3.0.0b5
|
|
3
|
+
Version: 3.0.0b6
|
|
4
4
|
Summary: OCR-D framework
|
|
5
5
|
Author-email: Konstantin Baierer <unixprog@gmail.com>
|
|
6
6
|
License: Apache License 2.0
|
|
@@ -26,6 +26,7 @@ Requires-Dist: httpx>=0.22.0
|
|
|
26
26
|
Requires-Dist: importlib_metadata; python_version < "3.8"
|
|
27
27
|
Requires-Dist: importlib_resources; python_version < "3.10"
|
|
28
28
|
Requires-Dist: jsonschema>=4
|
|
29
|
+
Requires-Dist: loky
|
|
29
30
|
Requires-Dist: lxml
|
|
30
31
|
Requires-Dist: memory-profiler>=0.58.0
|
|
31
32
|
Requires-Dist: numpy
|
ocrd-3.0.0b6/VERSION
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.0.0b6
|
|
@@ -149,7 +149,8 @@ def workspace_clone(ctx, clobber_mets, download, file_grp, file_id, page_id, mim
|
|
|
149
149
|
LOG.warning(DeprecationWarning("Use 'ocrd workspace --directory DIR clone' instead of argument 'WORKSPACE_DIR' ('%s')" % workspace_dir))
|
|
150
150
|
ctx.directory = workspace_dir
|
|
151
151
|
|
|
152
|
-
assert not ctx.mets_server_url
|
|
152
|
+
assert not ctx.mets_server_url, \
|
|
153
|
+
f"clone cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
|
|
153
154
|
workspace = ctx.resolver.workspace_from_url(
|
|
154
155
|
mets_url,
|
|
155
156
|
dst_dir=ctx.directory,
|
|
@@ -185,7 +186,8 @@ def workspace_init(ctx, clobber_mets, directory):
|
|
|
185
186
|
if directory:
|
|
186
187
|
LOG.warning(DeprecationWarning("Use 'ocrd workspace --directory DIR init' instead of argument 'DIRECTORY' ('%s')" % directory))
|
|
187
188
|
ctx.directory = directory
|
|
188
|
-
assert not ctx.mets_server_url
|
|
189
|
+
assert not ctx.mets_server_url, \
|
|
190
|
+
f"init cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
|
|
189
191
|
workspace = ctx.resolver.workspace_from_nothing(
|
|
190
192
|
directory=ctx.directory,
|
|
191
193
|
mets_basename=ctx.mets_basename,
|
|
@@ -506,6 +508,8 @@ def workspace_remove_file(ctx, id, force, keep_file): # pylint: disable=redefin
|
|
|
506
508
|
(If any ``ID`` starts with ``//``, then its remainder
|
|
507
509
|
will be interpreted as a regular expression.)
|
|
508
510
|
"""
|
|
511
|
+
assert not ctx.mets_server_url, \
|
|
512
|
+
f"remove cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
|
|
509
513
|
workspace = ctx.workspace()
|
|
510
514
|
for i in id:
|
|
511
515
|
workspace.remove_file(i, force=force, keep_file=keep_file)
|
|
@@ -524,6 +528,8 @@ def rename_group(ctx, old, new):
|
|
|
524
528
|
"""
|
|
525
529
|
Rename fileGrp (USE attribute ``NEW`` to ``OLD``).
|
|
526
530
|
"""
|
|
531
|
+
assert not ctx.mets_server_url, \
|
|
532
|
+
f"rename-group cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
|
|
527
533
|
workspace = ctx.workspace()
|
|
528
534
|
workspace.rename_file_group(old, new)
|
|
529
535
|
workspace.save_mets()
|
|
@@ -545,6 +551,8 @@ def remove_group(ctx, group, recursive, force, keep_files):
|
|
|
545
551
|
(If any ``GROUP`` starts with ``//``, then its remainder
|
|
546
552
|
will be interpreted as a regular expression.)
|
|
547
553
|
"""
|
|
554
|
+
assert not ctx.mets_server_url, \
|
|
555
|
+
f"remove-group cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
|
|
548
556
|
workspace = ctx.workspace()
|
|
549
557
|
for g in group:
|
|
550
558
|
workspace.remove_file_group(g, recursive=recursive, force=force, keep_files=keep_files)
|
|
@@ -567,6 +575,8 @@ def prune_files(ctx, file_grp, mimetype, page_id, file_id):
|
|
|
567
575
|
(If any ``FILTER`` starts with ``//``, then its remainder
|
|
568
576
|
will be interpreted as a regular expression.)
|
|
569
577
|
"""
|
|
578
|
+
assert not ctx.mets_server_url, \
|
|
579
|
+
f"prune-files cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
|
|
570
580
|
workspace = ctx.workspace()
|
|
571
581
|
with pushd_popd(workspace.directory):
|
|
572
582
|
for f in workspace.find_files(
|
|
@@ -673,19 +683,15 @@ def list_pages(ctx, output_field, output_format, chunk_number, chunk_index, page
|
|
|
673
683
|
will be interpreted as a regular expression.)
|
|
674
684
|
"""
|
|
675
685
|
workspace = ctx.workspace()
|
|
676
|
-
find_kwargs = {}
|
|
677
|
-
if page_id_range and 'ID' in output_field:
|
|
678
|
-
find_kwargs['pageId'] = page_id_range
|
|
679
|
-
page_ids = sorted({x.pageId for x in workspace.mets.find_files(**find_kwargs) if x.pageId})
|
|
680
686
|
ret = []
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
ret = [[x] for x in page_ids]
|
|
684
|
-
else:
|
|
685
|
-
for i, page_div in enumerate(workspace.mets.get_physical_pages(for_pageIds=','.join(page_ids), return_divs=True)):
|
|
687
|
+
if page_id_range or list(output_field) != ['ID']:
|
|
688
|
+
for i, page_div in enumerate(workspace.mets.get_physical_pages(for_pageIds=page_id_range, return_divs=True)):
|
|
686
689
|
ret.append([])
|
|
687
690
|
for k in output_field:
|
|
688
691
|
ret[i].append(page_div.get(k, 'None'))
|
|
692
|
+
else:
|
|
693
|
+
for page_id in workspace.mets.physical_pages:
|
|
694
|
+
ret.append([page_id])
|
|
689
695
|
|
|
690
696
|
if numeric_range:
|
|
691
697
|
start, end = map(int, numeric_range.split('..'))
|
|
@@ -762,6 +768,8 @@ def update_page(ctx, attr_value_pairs, order, orderlabel, contentids, page_id):
|
|
|
762
768
|
if contentids:
|
|
763
769
|
update_kwargs['CONTENTIDS'] = contentids
|
|
764
770
|
try:
|
|
771
|
+
assert not ctx.mets_server_url, \
|
|
772
|
+
f"update-page cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
|
|
765
773
|
workspace = ctx.workspace()
|
|
766
774
|
workspace.mets.update_physical_page_attributes(page_id, **update_kwargs)
|
|
767
775
|
workspace.save_mets()
|
|
@@ -800,6 +808,8 @@ def merge(ctx, overwrite, force, copy_files, filegrp_mapping, fileid_mapping, pa
|
|
|
800
808
|
mets_path = Path(mets_path)
|
|
801
809
|
if filegrp_mapping:
|
|
802
810
|
filegrp_mapping = loads(filegrp_mapping)
|
|
811
|
+
assert not ctx.mets_server_url, \
|
|
812
|
+
f"merge cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
|
|
803
813
|
workspace = ctx.workspace()
|
|
804
814
|
other_workspace = Workspace(ctx.resolver, directory=str(mets_path.parent), mets_basename=str(mets_path.name))
|
|
805
815
|
workspace.merge(
|
|
@@ -13,7 +13,6 @@ from ocrd_utils import (
|
|
|
13
13
|
redirect_stderr_and_stdout_to_file,
|
|
14
14
|
)
|
|
15
15
|
from ocrd_validators import WorkspaceValidator
|
|
16
|
-
from ocrd_network import ProcessingWorker, ProcessorServer, AgentType
|
|
17
16
|
|
|
18
17
|
from ..resolver import Resolver
|
|
19
18
|
from ..processor.base import ResourceNotFoundError, run_processor
|
|
@@ -23,8 +22,6 @@ from .parameter_option import parameter_option, parameter_override_option
|
|
|
23
22
|
from .ocrd_cli_options import ocrd_cli_options
|
|
24
23
|
from .mets_find_options import mets_find_options
|
|
25
24
|
|
|
26
|
-
SUBCOMMANDS = [AgentType.PROCESSING_WORKER, AgentType.PROCESSOR_SERVER]
|
|
27
|
-
|
|
28
25
|
|
|
29
26
|
def ocrd_cli_wrap_processor(
|
|
30
27
|
processorClass,
|
|
@@ -88,11 +85,9 @@ def ocrd_cli_wrap_processor(
|
|
|
88
85
|
if list_resources:
|
|
89
86
|
processor.list_resources()
|
|
90
87
|
sys.exit()
|
|
91
|
-
if subcommand:
|
|
88
|
+
if subcommand or address or queue or database:
|
|
92
89
|
# Used for checking/starting network agents for the WebAPI architecture
|
|
93
90
|
check_and_run_network_agent(processorClass, subcommand, address, database, queue)
|
|
94
|
-
elif address or queue or database:
|
|
95
|
-
raise ValueError(f"Subcommand options --address --queue and --database are only valid for subcommands: {SUBCOMMANDS}")
|
|
96
91
|
|
|
97
92
|
# from here: single-run processing context
|
|
98
93
|
initLogging()
|
|
@@ -162,6 +157,11 @@ def ocrd_cli_wrap_processor(
|
|
|
162
157
|
def check_and_run_network_agent(ProcessorClass, subcommand: str, address: str, database: str, queue: str):
|
|
163
158
|
"""
|
|
164
159
|
"""
|
|
160
|
+
from ocrd_network import ProcessingWorker, ProcessorServer, AgentType
|
|
161
|
+
SUBCOMMANDS = [AgentType.PROCESSING_WORKER, AgentType.PROCESSOR_SERVER]
|
|
162
|
+
|
|
163
|
+
if not subcommand:
|
|
164
|
+
raise ValueError(f"Subcommand options --address --queue and --database are only valid for subcommands: {SUBCOMMANDS}")
|
|
165
165
|
if subcommand not in SUBCOMMANDS:
|
|
166
166
|
raise ValueError(f"SUBCOMMAND can only be one of {SUBCOMMANDS}")
|
|
167
167
|
|
|
@@ -16,14 +16,20 @@ import json
|
|
|
16
16
|
import os
|
|
17
17
|
from os import getcwd
|
|
18
18
|
from pathlib import Path
|
|
19
|
-
from typing import Any, List, Optional, Union, get_args
|
|
19
|
+
from typing import Any, Dict, List, Optional, Tuple, Union, get_args
|
|
20
20
|
import sys
|
|
21
21
|
import inspect
|
|
22
22
|
import tarfile
|
|
23
23
|
import io
|
|
24
24
|
import weakref
|
|
25
|
+
from collections import defaultdict
|
|
25
26
|
from frozendict import frozendict
|
|
26
|
-
|
|
27
|
+
# concurrent.futures is buggy in py38,
|
|
28
|
+
# this is where the fixes came from:
|
|
29
|
+
from loky import Future, ProcessPoolExecutor
|
|
30
|
+
import multiprocessing as mp
|
|
31
|
+
from threading import Timer
|
|
32
|
+
from _thread import interrupt_main
|
|
27
33
|
|
|
28
34
|
from click import wrap_text
|
|
29
35
|
from deprecated import deprecated
|
|
@@ -105,6 +111,31 @@ class MissingInputFile(ValueError):
|
|
|
105
111
|
f"and pageId {pageId} under mimetype {mimetype or 'PAGE+image(s)'}")
|
|
106
112
|
super().__init__(self.message)
|
|
107
113
|
|
|
114
|
+
class DummyFuture:
|
|
115
|
+
"""
|
|
116
|
+
Mimics some of `concurrent.futures.Future` but runs immediately.
|
|
117
|
+
"""
|
|
118
|
+
def __init__(self, fn, *args, **kwargs):
|
|
119
|
+
self.fn = fn
|
|
120
|
+
self.args = args
|
|
121
|
+
self.kwargs = kwargs
|
|
122
|
+
def result(self):
|
|
123
|
+
return self.fn(*self.args, **self.kwargs)
|
|
124
|
+
class DummyExecutor:
|
|
125
|
+
"""
|
|
126
|
+
Mimics some of `concurrent.futures.ProcessPoolExecutor` but runs
|
|
127
|
+
everything immediately in this process.
|
|
128
|
+
"""
|
|
129
|
+
def __init__(self, initializer=None, initargs=(), **kwargs):
|
|
130
|
+
initializer(*initargs)
|
|
131
|
+
def shutdown(self, **kwargs):
|
|
132
|
+
pass
|
|
133
|
+
def submit(self, fn, *args, **kwargs) -> DummyFuture:
|
|
134
|
+
return DummyFuture(fn, *args, **kwargs)
|
|
135
|
+
|
|
136
|
+
TFuture = Union[DummyFuture, Future]
|
|
137
|
+
TExecutor = Union[DummyExecutor, ProcessPoolExecutor]
|
|
138
|
+
|
|
108
139
|
class Processor():
|
|
109
140
|
"""
|
|
110
141
|
A processor is a tool that implements the uniform OCR-D
|
|
@@ -456,6 +487,9 @@ class Processor():
|
|
|
456
487
|
for the given :py:data:`page_id` (or all pages)
|
|
457
488
|
under the given :py:data:`parameter`.
|
|
458
489
|
|
|
490
|
+
Delegates to :py:meth:`.process_workspace_submit_tasks`
|
|
491
|
+
and :py:meth:`.process_workspace_handle_tasks`.
|
|
492
|
+
|
|
459
493
|
(This will iterate over pages and files, calling
|
|
460
494
|
:py:meth:`.process_page_file` and handling exceptions.
|
|
461
495
|
It should be overridden by subclasses to handle cases
|
|
@@ -465,11 +499,7 @@ class Processor():
|
|
|
465
499
|
self.workspace = workspace
|
|
466
500
|
self.verify()
|
|
467
501
|
try:
|
|
468
|
-
|
|
469
|
-
nr_skipped = 0
|
|
470
|
-
nr_copied = 0
|
|
471
|
-
|
|
472
|
-
# set up multithreading
|
|
502
|
+
# set up multitasking
|
|
473
503
|
max_workers = max(0, config.OCRD_MAX_PARALLEL_PAGES)
|
|
474
504
|
if self.max_workers > 0 and self.max_workers < config.OCRD_MAX_PARALLEL_PAGES:
|
|
475
505
|
self._base_logger.info("limiting number of threads from %d to %d", max_workers, self.max_workers)
|
|
@@ -481,80 +511,25 @@ class Processor():
|
|
|
481
511
|
if self.max_page_seconds > 0 and self.max_page_seconds < config.OCRD_PROCESSING_PAGE_TIMEOUT:
|
|
482
512
|
self._base_logger.info("limiting page timeout from %d to %d sec", max_seconds, self.max_page_seconds)
|
|
483
513
|
max_seconds = self.max_page_seconds
|
|
484
|
-
|
|
514
|
+
|
|
515
|
+
if max_workers > 1:
|
|
516
|
+
executor_cls = ProcessPoolExecutor
|
|
517
|
+
else:
|
|
518
|
+
executor_cls = DummyExecutor
|
|
519
|
+
executor = executor_cls(
|
|
485
520
|
max_workers=max_workers or 1,
|
|
486
|
-
|
|
521
|
+
# only forking method avoids pickling
|
|
522
|
+
context=mp.get_context('fork'),
|
|
523
|
+
# share processor instance as global to avoid pickling
|
|
524
|
+
initializer=_page_worker_set_ctxt,
|
|
525
|
+
initargs=(self,),
|
|
487
526
|
)
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
for input_file in input_file_tuple
|
|
495
|
-
if input_file)
|
|
496
|
-
self._base_logger.info(f"preparing page {page_id}")
|
|
497
|
-
for i, input_file in enumerate(input_file_tuple):
|
|
498
|
-
if input_file is None:
|
|
499
|
-
# file/page not found in this file grp
|
|
500
|
-
continue
|
|
501
|
-
input_files[i] = input_file
|
|
502
|
-
if not self.download:
|
|
503
|
-
continue
|
|
504
|
-
try:
|
|
505
|
-
input_files[i] = self.workspace.download_file(input_file)
|
|
506
|
-
except (ValueError, FileNotFoundError, HTTPError) as e:
|
|
507
|
-
self._base_logger.error(repr(e))
|
|
508
|
-
self._base_logger.warning(f"failed downloading file {input_file} for page {page_id}")
|
|
509
|
-
# process page
|
|
510
|
-
tasks[executor.submit(self.process_page_file, *input_files)] = (page_id, input_files)
|
|
511
|
-
self._base_logger.debug("submitted %d processing tasks", len(tasks))
|
|
512
|
-
|
|
513
|
-
for task in tasks:
|
|
514
|
-
# wait for results, handle errors
|
|
515
|
-
page_id, input_files = tasks[task]
|
|
516
|
-
# FIXME: differentiate error cases in various ways:
|
|
517
|
-
# - ResourceNotFoundError → use ResourceManager to download (once), then retry
|
|
518
|
-
# - transient (I/O or OOM) error → maybe sleep, retry
|
|
519
|
-
# - persistent (data) error → skip / dummy / raise
|
|
520
|
-
try:
|
|
521
|
-
self._base_logger.debug("waiting for output of task %s (page %s) max_seconds=%d", task, page_id, max_seconds)
|
|
522
|
-
task.result(timeout=max_seconds or None)
|
|
523
|
-
nr_succeeded += 1
|
|
524
|
-
# exclude NotImplementedError, so we can try process() below
|
|
525
|
-
except NotImplementedError:
|
|
526
|
-
raise
|
|
527
|
-
# handle input failures separately
|
|
528
|
-
except FileExistsError as err:
|
|
529
|
-
if config.OCRD_EXISTING_OUTPUT == 'ABORT':
|
|
530
|
-
raise err
|
|
531
|
-
if config.OCRD_EXISTING_OUTPUT == 'SKIP':
|
|
532
|
-
continue
|
|
533
|
-
if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE':
|
|
534
|
-
# too late here, must not happen
|
|
535
|
-
raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE")
|
|
536
|
-
# broad coverage of output failures (including TimeoutError)
|
|
537
|
-
except (Exception, TimeoutError) as err:
|
|
538
|
-
# FIXME: add re-usable/actionable logging
|
|
539
|
-
if config.OCRD_MISSING_OUTPUT == 'ABORT':
|
|
540
|
-
self._base_logger.error(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}")
|
|
541
|
-
raise err
|
|
542
|
-
self._base_logger.exception(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}")
|
|
543
|
-
if config.OCRD_MISSING_OUTPUT == 'SKIP':
|
|
544
|
-
nr_skipped += 1
|
|
545
|
-
continue
|
|
546
|
-
if config.OCRD_MISSING_OUTPUT == 'COPY':
|
|
547
|
-
self._copy_page_file(input_files[0])
|
|
548
|
-
nr_copied += 1
|
|
549
|
-
else:
|
|
550
|
-
desc = config.describe('OCRD_MISSING_OUTPUT', wrap_text=False, indent_text=False)
|
|
551
|
-
raise ValueError(f"unknown configuration value {config.OCRD_MISSING_OUTPUT} - {desc}")
|
|
552
|
-
|
|
553
|
-
if nr_skipped > 0 and nr_succeeded / nr_skipped < config.OCRD_MAX_MISSING_OUTPUTS:
|
|
554
|
-
raise Exception(f"too many failures with skipped output ({nr_skipped})")
|
|
555
|
-
if nr_copied > 0 and nr_succeeded / nr_copied < config.OCRD_MAX_MISSING_OUTPUTS:
|
|
556
|
-
raise Exception(f"too many failures with fallback output ({nr_skipped})")
|
|
557
|
-
executor.shutdown()
|
|
527
|
+
try:
|
|
528
|
+
self._base_logger.debug("started executor %s with %d workers", str(executor), max_workers or 1)
|
|
529
|
+
tasks = self.process_workspace_submit_tasks(executor, max_seconds)
|
|
530
|
+
stats = self.process_workspace_handle_tasks(tasks)
|
|
531
|
+
finally:
|
|
532
|
+
executor.shutdown(kill_workers=True, wait=False)
|
|
558
533
|
|
|
559
534
|
except NotImplementedError:
|
|
560
535
|
# fall back to deprecated method
|
|
@@ -564,6 +539,190 @@ class Processor():
|
|
|
564
539
|
# suppress the NotImplementedError context
|
|
565
540
|
raise err from None
|
|
566
541
|
|
|
542
|
+
def process_workspace_submit_tasks(self, executor : TExecutor, max_seconds : int) -> Dict[TFuture, Tuple[str, List[Optional[OcrdFileType]]]]:
|
|
543
|
+
"""
|
|
544
|
+
Look up all input files of the given ``workspace``
|
|
545
|
+
from the given :py:data:`input_file_grp`
|
|
546
|
+
for the given :py:data:`page_id` (or all pages),
|
|
547
|
+
and schedules calling :py:meth:`.process_page_file`
|
|
548
|
+
on them for each page via `executor` (enforcing
|
|
549
|
+
a per-page time limit of `max_seconds`).
|
|
550
|
+
|
|
551
|
+
When running with `OCRD_MAX_PARALLEL_PAGES>1` and
|
|
552
|
+
the workspace via METS Server, the executor will fork
|
|
553
|
+
this many worker parallel subprocesses each processing
|
|
554
|
+
one page at a time. (Interprocess communication is
|
|
555
|
+
done via task and result queues.)
|
|
556
|
+
|
|
557
|
+
Otherwise, tasks are run sequentially in the
|
|
558
|
+
current process.
|
|
559
|
+
|
|
560
|
+
Delegates to :py:meth:`.zip_input_files` to get
|
|
561
|
+
the input files for each page, and then calls
|
|
562
|
+
:py:meth:`.process_workspace_submit_page_task`.
|
|
563
|
+
|
|
564
|
+
Returns a dict mapping the per-page tasks
|
|
565
|
+
(i.e. futures submitted to the executor)
|
|
566
|
+
to their corresponding pageId and input files.
|
|
567
|
+
"""
|
|
568
|
+
tasks = {}
|
|
569
|
+
for input_file_tuple in self.zip_input_files(on_error='abort', require_first=False):
|
|
570
|
+
task, page_id, input_files = self.process_workspace_submit_page_task(executor, max_seconds, input_file_tuple)
|
|
571
|
+
tasks[task] = (page_id, input_files)
|
|
572
|
+
self._base_logger.debug("submitted %d processing tasks", len(tasks))
|
|
573
|
+
return tasks
|
|
574
|
+
|
|
575
|
+
def process_workspace_submit_page_task(self, executor : TExecutor, max_seconds : int, input_file_tuple : List[Optional[OcrdFileType]]) -> Tuple[TFuture, str, List[Optional[OcrdFileType]]]:
|
|
576
|
+
"""
|
|
577
|
+
Ensure all input files for a single page are
|
|
578
|
+
downloaded to the workspace, then schedule
|
|
579
|
+
:py:meth:`.process_process_file` to be run on
|
|
580
|
+
them via `executor` (enforcing a per-page time
|
|
581
|
+
limit of `max_seconds`).
|
|
582
|
+
|
|
583
|
+
Delegates to :py:meth:`.process_page_file`
|
|
584
|
+
(wrapped in :py:func:`_page_worker` to share
|
|
585
|
+
the processor instance across forked processes).
|
|
586
|
+
|
|
587
|
+
\b
|
|
588
|
+
Returns a tuple of:
|
|
589
|
+
- the scheduled future object,
|
|
590
|
+
- the corresponding pageId,
|
|
591
|
+
- the corresponding input files.
|
|
592
|
+
"""
|
|
593
|
+
input_files : List[Optional[OcrdFileType]] = [None] * len(input_file_tuple)
|
|
594
|
+
page_id = next(input_file.pageId
|
|
595
|
+
for input_file in input_file_tuple
|
|
596
|
+
if input_file)
|
|
597
|
+
self._base_logger.info(f"preparing page {page_id}")
|
|
598
|
+
for i, input_file in enumerate(input_file_tuple):
|
|
599
|
+
if input_file is None:
|
|
600
|
+
# file/page not found in this file grp
|
|
601
|
+
continue
|
|
602
|
+
input_files[i] = input_file
|
|
603
|
+
if not self.download:
|
|
604
|
+
continue
|
|
605
|
+
try:
|
|
606
|
+
input_files[i] = self.workspace.download_file(input_file)
|
|
607
|
+
except (ValueError, FileNotFoundError, HTTPError) as e:
|
|
608
|
+
self._base_logger.error(repr(e))
|
|
609
|
+
self._base_logger.warning(f"failed downloading file {input_file} for page {page_id}")
|
|
610
|
+
# process page
|
|
611
|
+
#executor.submit(self.process_page_file, *input_files)
|
|
612
|
+
return executor.submit(_page_worker, max_seconds, *input_files), page_id, input_files
|
|
613
|
+
|
|
614
|
+
def process_workspace_handle_tasks(self, tasks : Dict[TFuture, Tuple[str, List[Optional[OcrdFileType]]]]) -> Tuple[int, int, Dict[str, int], int]:
    """
    Look up scheduled per-page futures one by one,
    handle errors (exceptions) and gather results.

    \b
    Enforces policies configured by the following
    environment variables:
    - `OCRD_EXISTING_OUTPUT` (abort/skip/overwrite)
    - `OCRD_MISSING_OUTPUT` (abort/skip/fallback-copy)
    - `OCRD_MAX_MISSING_OUTPUTS` (abort after all).

    \b
    Returns a tuple of:
    - the number of successfully processed pages
    - the number of failed (i.e. skipped or copied) pages
    - a dict of the type and corresponding number of exceptions seen
    - the number of total requested pages (i.e. success+fail+existing).

    Raises an :py:class:`Exception` as soon as the share of failed
    pages exceeds `OCRD_MAX_MISSING_OUTPUTS` (if that is positive).

    Delegates to :py:meth:`.process_workspace_handle_page_task`
    for each page.
    """
    # aggregate info for logging:
    nr_succeeded = 0
    nr_failed = 0
    nr_errors = defaultdict(int) # count causes
    # wording used in the summary messages below
    if config.OCRD_MISSING_OUTPUT == 'SKIP':
        reason = "skipped"
    elif config.OCRD_MISSING_OUTPUT == 'COPY':
        reason = "fallback-copied"
    else:
        # any other policy: the per-page handler normally raises instead of
        # returning the exception, but it can be overridden - make sure the
        # messages below can always be formatted
        reason = "failed"
    for task, (page_id, input_files) in tasks.items():
        # wait for results, handle errors
        result = self.process_workspace_handle_page_task(page_id, input_files, task)
        if isinstance(result, Exception):
            nr_errors[result.__class__.__name__] += 1
            nr_failed += 1
            # FIXME: this is just prospective, because len(tasks)==nr_failed+nr_succeeded is not guaranteed
            if config.OCRD_MAX_MISSING_OUTPUTS > 0 and nr_failed / len(tasks) > config.OCRD_MAX_MISSING_OUTPUTS:
                # already irredeemably many failures, stop short
                raise Exception(f"too many failures with {reason} output ({nr_failed} of {nr_failed+nr_succeeded}, {str(dict(nr_errors))})")
        elif result:
            nr_succeeded += 1
        # else skipped - already exists
    # plain dict for readable logging and for the return value
    nr_errors = dict(nr_errors)
    if nr_failed > 0:
        nr_all = nr_succeeded + nr_failed
        if config.OCRD_MAX_MISSING_OUTPUTS > 0 and nr_failed / nr_all > config.OCRD_MAX_MISSING_OUTPUTS:
            raise Exception(f"too many failures with {reason} output ({nr_failed} of {nr_all}, {str(nr_errors)})")
        self._base_logger.warning("%s %d of %d pages due to %s", reason, nr_failed, nr_all, str(nr_errors))
    return nr_succeeded, nr_failed, nr_errors, len(tasks)
|
|
666
|
+
|
|
667
|
+
def process_workspace_handle_page_task(self, page_id : str, input_files : List[Optional[OcrdFileType]], task : TFuture) -> Union[bool, Exception]:
    """
    \b
    Await a single page result and handle errors (exceptions),
    enforcing policies configured by the following
    environment variables:
    - `OCRD_EXISTING_OUTPUT` (abort/skip/overwrite)
    - `OCRD_MISSING_OUTPUT` (abort/skip/fallback-copy)
    - `OCRD_MAX_MISSING_OUTPUTS` (abort after all).

    \b
    Returns
    - true in case of success
    - false in case the output already exists
    - the exception in case of failure

    \b
    Raises
    - the original exception if the respective policy is abort
      (and always for NotImplementedError and KeyboardInterrupt)
    - ValueError if either policy has an unknown value
    """
    # FIXME: differentiate error cases in various ways:
    # - ResourceNotFoundError → use ResourceManager to download (once), then retry
    # - transient (I/O or OOM) error → maybe sleep, retry
    # - persistent (data) error → skip / dummy / raise
    try:
        self._base_logger.debug("waiting for output of task %s (page %s)", task, page_id)
        # timeout kwarg on future is useless: it only raises TimeoutError here,
        # but does not stop the running process/thread, and executor itself
        # offers nothing to that effect:
        # task.result(timeout=max_seconds or None)
        # so we instead applied the timeout within the worker function
        task.result()
        return True
    except NotImplementedError:
        # never treat this as a page failure - pass it on to the caller
        raise
    # handle input failures separately
    except FileExistsError as err:
        if config.OCRD_EXISTING_OUTPUT == 'ABORT':
            raise
        if config.OCRD_EXISTING_OUTPUT == 'SKIP':
            return False
        if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE':
            # too late here, must not happen
            raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE")
        # unknown policy: fail loudly instead of implicitly returning None
        # (which the caller would silently count as "already exists")
        desc = config.describe('OCRD_EXISTING_OUTPUT', wrap_text=False, indent_text=False)
        raise ValueError(f"unknown configuration value {config.OCRD_EXISTING_OUTPUT} - {desc}") from err
    except KeyboardInterrupt:
        raise
    # broad coverage of output failures (including TimeoutError)
    except Exception as err:
        # FIXME: add re-usable/actionable logging
        if config.OCRD_MISSING_OUTPUT == 'ABORT':
            self._base_logger.error(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}")
            raise
        # not aborting: log with traceback, then apply the fallback policy
        self._base_logger.exception(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}")
        if config.OCRD_MISSING_OUTPUT == 'SKIP':
            pass
        elif config.OCRD_MISSING_OUTPUT == 'COPY':
            self._copy_page_file(input_files[0])
        else:
            desc = config.describe('OCRD_MISSING_OUTPUT', wrap_text=False, indent_text=False)
            raise ValueError(f"unknown configuration value {config.OCRD_MISSING_OUTPUT} - {desc}")
        return err
|
|
725
|
+
|
|
567
726
|
def _copy_page_file(self, input_file : OcrdFileType) -> None:
|
|
568
727
|
"""
|
|
569
728
|
Copy the given ``input_file`` of the :py:data:`workspace`,
|
|
@@ -618,6 +777,12 @@ class Processor():
|
|
|
618
777
|
# not PAGE and not an image to generate PAGE for
|
|
619
778
|
self._base_logger.error(f"non-PAGE input for page {page_id}: {err}")
|
|
620
779
|
output_file_id = make_file_id(input_files[0], self.output_file_grp)
|
|
780
|
+
output_file = next(self.workspace.mets.find_files(ID=output_file_id), None)
|
|
781
|
+
if output_file and config.OCRD_EXISTING_OUTPUT != 'OVERWRITE':
|
|
782
|
+
# short-cut avoiding useless computation:
|
|
783
|
+
raise FileExistsError(
|
|
784
|
+
f"A file with ID=={output_file_id} already exists {output_file} and neither force nor ignore are set"
|
|
785
|
+
)
|
|
621
786
|
result = self.process_page_pcgts(*input_pcgts, page_id=page_id)
|
|
622
787
|
for image_result in result.images:
|
|
623
788
|
image_file_id = f'{output_file_id}_{image_result.file_id_suffix}'
|
|
@@ -934,6 +1099,44 @@ class Processor():
|
|
|
934
1099
|
ifts.append(tuple(ifiles))
|
|
935
1100
|
return ifts
|
|
936
1101
|
|
|
1102
|
+
# Module-level slot for the processor instance: assigned by
# _page_worker_set_ctxt() and read by _page_worker().
_page_worker_processor = None
"""
This global binding for the processor is required to avoid
squeezing the processor through a mp.Queue (which is impossible
due to unpicklable attributes like .workspace.mets._tree anyway)
when calling Processor.process_page_file as page worker processes
in Processor.process_workspace. Forking allows inheriting global
objects, and with the METS Server we do not mutate the local
processor instance anyway.
"""
|
|
1112
|
+
def _page_worker_set_ctxt(processor):
    """
    Install `processor` as the module-wide
    `ocrd.processor.base._page_worker_processor` instance.

    Intended as the ProcessPoolExecutor initializer, so that the
    worker subprocesses share the processor through this global.
    """
    global _page_worker_processor
    _page_worker_processor = processor
|
|
1119
|
+
|
|
1120
|
+
def _page_worker(timeout, *input_files):
    """
    Wraps a `Processor.process_page_file` call as payload (call target)
    of the ProcessPoolExecutor workers, but also enforces the given timeout.

    If `timeout` is positive, a timer interrupts the main thread after
    that many seconds, and the resulting KeyboardInterrupt is reported as
    a TimeoutError. Without a timer, a KeyboardInterrupt cannot be a
    timeout, so it is passed on unchanged.
    """
    # only used for log messages; empty if no input file carries a pageId
    page_id = next((file.pageId for file in input_files
                    if hasattr(file, 'pageId')), "")
    timer = None
    if timeout > 0:
        timer = Timer(timeout, interrupt_main)
        timer.start()
    try:
        _page_worker_processor.process_page_file(*input_files)
        _page_worker_processor.logger.debug("page worker completed for page %s", page_id)
    except KeyboardInterrupt:
        if timer is None:
            # no timer was armed, so this is a genuine interrupt
            raise
        _page_worker_processor.logger.debug("page worker timed out for page %s", page_id)
        raise TimeoutError()
    finally:
        # make sure a pending timer cannot fire after the page is done
        if timer is not None:
            timer.cancel()
|
|
1139
|
+
|
|
937
1140
|
def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None):
|
|
938
1141
|
"""Generate a string describing the full CLI of this processor including params.
|
|
939
1142
|
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
Helper methods for running and documenting processors
|
|
3
3
|
"""
|
|
4
4
|
from time import perf_counter, process_time
|
|
5
|
+
from os import times
|
|
5
6
|
from functools import lru_cache
|
|
6
7
|
import json
|
|
7
8
|
import inspect
|
|
@@ -94,6 +95,7 @@ def run_processor(
|
|
|
94
95
|
log.debug("Processor instance %s (%s doing %s)", processor, name, otherrole)
|
|
95
96
|
t0_wall = perf_counter()
|
|
96
97
|
t0_cpu = process_time()
|
|
98
|
+
t0_os = times()
|
|
97
99
|
if any(x in config.OCRD_PROFILE for x in ['RSS', 'PSS']):
|
|
98
100
|
backend = 'psutil_pss' if 'PSS' in config.OCRD_PROFILE else 'psutil'
|
|
99
101
|
from memory_profiler import memory_usage # pylint: disable=import-outside-toplevel
|
|
@@ -123,7 +125,13 @@ def run_processor(
|
|
|
123
125
|
|
|
124
126
|
t1_wall = perf_counter() - t0_wall
|
|
125
127
|
t1_cpu = process_time() - t0_cpu
|
|
126
|
-
|
|
128
|
+
t1_os = times()
|
|
129
|
+
# add CPU time from child processes (page worker etc)
|
|
130
|
+
t1_cpu += t1_os.children_user - t0_os.children_user
|
|
131
|
+
t1_cpu += t1_os.children_system - t0_os.children_system
|
|
132
|
+
logProfile.info(
|
|
133
|
+
"Executing processor '%s' took %fs (wall) %fs (CPU)( "
|
|
134
|
+
"[--input-file-grp='%s' --output-file-grp='%s' --parameter='%s' --page-id='%s']",
|
|
127
135
|
ocrd_tool['executable'],
|
|
128
136
|
t1_wall,
|
|
129
137
|
t1_cpu,
|
|
@@ -131,7 +139,7 @@ def run_processor(
|
|
|
131
139
|
processor.output_file_grp or '',
|
|
132
140
|
json.dumps(processor.parameter) or '',
|
|
133
141
|
processor.page_id or ''
|
|
134
|
-
)
|
|
142
|
+
)
|
|
135
143
|
workspace.mets.add_agent(
|
|
136
144
|
name=name,
|
|
137
145
|
_type='OTHER',
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: ocrd
|
|
3
|
-
Version: 3.0.
|
|
3
|
+
Version: 3.0.0b6
|
|
4
4
|
Summary: OCR-D framework
|
|
5
5
|
Author-email: Konstantin Baierer <unixprog@gmail.com>
|
|
6
6
|
License: Apache License 2.0
|
|
@@ -26,6 +26,7 @@ Requires-Dist: httpx>=0.22.0
|
|
|
26
26
|
Requires-Dist: importlib_metadata; python_version < "3.8"
|
|
27
27
|
Requires-Dist: importlib_resources; python_version < "3.10"
|
|
28
28
|
Requires-Dist: jsonschema>=4
|
|
29
|
+
Requires-Dist: loky
|
|
29
30
|
Requires-Dist: lxml
|
|
30
31
|
Requires-Dist: memory-profiler>=0.58.0
|
|
31
32
|
Requires-Dist: numpy
|
|
@@ -599,7 +599,16 @@ class OcrdMets(OcrdXmlDocument):
|
|
|
599
599
|
If return_divs is set, returns div memory objects instead of strings of ids
|
|
600
600
|
"""
|
|
601
601
|
if for_fileIds is None and for_pageIds is None:
|
|
602
|
+
if return_divs:
|
|
603
|
+
if self._cache_flag:
|
|
604
|
+
return list(self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID].values())
|
|
605
|
+
|
|
606
|
+
return [x for x in self._tree.getroot().xpath(
|
|
607
|
+
'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]',
|
|
608
|
+
namespaces=NS)]
|
|
609
|
+
|
|
602
610
|
return self.physical_pages
|
|
611
|
+
|
|
603
612
|
# log = getLogger('ocrd.models.ocrd_mets.get_physical_pages')
|
|
604
613
|
if for_pageIds is not None:
|
|
605
614
|
ret = []
|