ocrd 3.4.0__tar.gz → 3.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ocrd-3.4.0/src/ocrd.egg-info → ocrd-3.5.0}/PKG-INFO +2 -2
- ocrd-3.5.0/VERSION +1 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/requirements.txt +1 -1
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/cli/__init__.py +6 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/decorators/ocrd_cli_options.py +1 -1
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/processor/base.py +21 -13
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/workspace.py +15 -19
- {ocrd-3.4.0 → ocrd-3.5.0/src/ocrd.egg-info}/PKG-INFO +2 -2
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd.egg-info/requires.txt +1 -1
- ocrd-3.5.0/src/ocrd_models/constants.py +205 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_models/ocrd_mets.py +231 -97
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/constants.py +9 -5
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_utils/os.py +1 -1
- ocrd-3.4.0/VERSION +0 -1
- ocrd-3.4.0/src/ocrd_models/constants.py +0 -100
- {ocrd-3.4.0 → ocrd-3.5.0}/LICENSE +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/MANIFEST.in +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/README.md +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/README_bashlib.md +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/README_ocrd.md +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/README_ocrd_modelfactory.md +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/README_ocrd_models.md +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/README_ocrd_network.md +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/README_ocrd_utils.md +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/README_ocrd_validators.md +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/pyproject.toml +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/setup.cfg +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/__init__.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/cli/bashlib.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/cli/log.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/cli/network.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/cli/ocrd_tool.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/cli/process.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/cli/resmgr.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/cli/validate.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/cli/workspace.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/cli/zip.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/constants.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/decorators/__init__.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/decorators/loglevel_option.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/decorators/mets_find_options.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/decorators/parameter_option.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/lib.bash +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/mets_server.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/ocrd-all-tool.json +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/processor/__init__.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/processor/builtin/__init__.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/processor/builtin/dummy/__init__.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/processor/builtin/dummy/ocrd-tool.json +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/processor/builtin/dummy_processor.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/processor/builtin/filter_processor.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/processor/helpers.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/processor/ocrd_page_result.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/resolver.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/resource_list.yml +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/resource_manager.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/task_sequence.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/workspace_backup.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/workspace_bagger.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd.egg-info/SOURCES.txt +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd.egg-info/dependency_links.txt +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd.egg-info/entry_points.txt +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd.egg-info/top_level.txt +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_modelfactory/__init__.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_models/__init__.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_models/mets-empty.xml +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_models/ocrd_agent.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_models/ocrd_exif.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_models/ocrd_file.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_models/ocrd_page.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_models/ocrd_page_generateds.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_models/ocrd_xml_base.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_models/report.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_models/utils.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_models/xpath_functions.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/__init__.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/cli/__init__.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/cli/client.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/cli/processing_server.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/cli/processing_worker.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/cli/processor_server.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/client.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/client_utils.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/database.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/logging_utils.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/models/__init__.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/models/job.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/models/messages.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/models/ocrd_tool.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/models/workflow.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/models/workspace.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/param_validators.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/process_helpers.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/processing_server.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/processing_worker.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/processor_server.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/rabbitmq_utils/__init__.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/rabbitmq_utils/connector.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/rabbitmq_utils/constants.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/rabbitmq_utils/consumer.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/rabbitmq_utils/helpers.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/rabbitmq_utils/ocrd_messages.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/rabbitmq_utils/publisher.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/runtime_data/__init__.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/runtime_data/config_parser.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/runtime_data/connection_clients.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/runtime_data/deployer.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/runtime_data/hosts.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/runtime_data/network_agents.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/runtime_data/network_services.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/server_cache.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/server_utils.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/tcp_to_uds_mets_proxy.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/utils.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_utils/__init__.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_utils/config.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_utils/constants.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_utils/deprecate.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_utils/image.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_utils/introspect.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_utils/logging.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_utils/ocrd_logging.conf +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_utils/str.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_validators/__init__.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_validators/bagit-profile.yml +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_validators/constants.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_validators/json_validator.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_validators/message_processing.schema.yml +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_validators/message_result.schema.yml +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_validators/mets.xsd +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_validators/ocrd_network_message_validator.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_validators/ocrd_tool.schema.yml +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_validators/ocrd_tool_validator.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_validators/ocrd_zip_validator.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_validators/page.xsd +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_validators/page_validator.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_validators/parameter_validator.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_validators/processing_server_config.schema.yml +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_validators/processing_server_config_validator.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_validators/resource_list_validator.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_validators/workspace_validator.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_validators/xlink.xsd +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_validators/xsd_mets_validator.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_validators/xsd_page_validator.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_validators/xsd_validator.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/tests/test_decorators.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/tests/test_logging.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/tests/test_logging_conf.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/tests/test_mets_server.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/tests/test_model_factory.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/tests/test_resolver.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/tests/test_resolver_oai.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/tests/test_resource_manager.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/tests/test_task_sequence.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/tests/test_utils.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/tests/test_version.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/tests/test_workspace.py +0 -0
- {ocrd-3.4.0 → ocrd-3.5.0}/tests/test_workspace_remove.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: ocrd
|
|
3
|
-
Version: 3.4.0
|
|
3
|
+
Version: 3.5.0
|
|
4
4
|
Summary: OCR-D framework
|
|
5
5
|
Author-email: Konstantin Baierer <unixprog@gmail.com>
|
|
6
6
|
License: Apache License 2.0
|
|
@@ -21,7 +21,7 @@ Requires-Dist: elementpath
|
|
|
21
21
|
Requires-Dist: fastapi>=0.78.0
|
|
22
22
|
Requires-Dist: filetype
|
|
23
23
|
Requires-Dist: Flask
|
|
24
|
-
Requires-Dist: frozendict>=2.
|
|
24
|
+
Requires-Dist: frozendict>=2.4.0
|
|
25
25
|
Requires-Dist: gdown
|
|
26
26
|
Requires-Dist: httpx>=0.22.0
|
|
27
27
|
Requires-Dist: importlib_metadata; python_version < "3.8"
|
ocrd-3.5.0/VERSION
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.5.0
|
|
@@ -67,6 +67,12 @@ Variables:
|
|
|
67
67
|
\b
|
|
68
68
|
{config.describe('OCRD_EXISTING_OUTPUT', wrap_text=False)}
|
|
69
69
|
\b
|
|
70
|
+
{config.describe('OCRD_MAX_MISSING_OUTPUTS')}
|
|
71
|
+
\b
|
|
72
|
+
{config.describe('OCRD_MAX_PARALLEL_PAGES')}
|
|
73
|
+
\b
|
|
74
|
+
{config.describe('OCRD_PROCESSING_PAGE_TIMEOUT')}
|
|
75
|
+
\b
|
|
70
76
|
{config.describe('OCRD_METS_CACHING')}
|
|
71
77
|
\b
|
|
72
78
|
{config.describe('OCRD_MAX_PROCESSOR_CACHE')}
|
|
@@ -56,7 +56,7 @@ def ocrd_cli_options(f):
|
|
|
56
56
|
# subcommands. So we have to work around that by creating a
|
|
57
57
|
# pseudo-subcommand handled in ocrd_cli_wrap_processor
|
|
58
58
|
argument('subcommand', nargs=1, required=False,
|
|
59
|
-
type=click.Choice(
|
|
59
|
+
type=click.Choice(list(map(str, AgentType)))),
|
|
60
60
|
]
|
|
61
61
|
for param in params:
|
|
62
62
|
param(f)
|
|
@@ -29,8 +29,7 @@ from frozendict import frozendict
|
|
|
29
29
|
# this is where the fixes came from:
|
|
30
30
|
from loky import Future, ProcessPoolExecutor
|
|
31
31
|
import multiprocessing as mp
|
|
32
|
-
from threading import Timer
|
|
33
|
-
from _thread import interrupt_main
|
|
32
|
+
from multiprocessing.pool import ThreadPool
|
|
34
33
|
|
|
35
34
|
from click import wrap_text
|
|
36
35
|
from deprecated import deprecated
|
|
@@ -783,11 +782,16 @@ class Processor():
|
|
|
783
782
|
page_id = input_files[input_pos].pageId
|
|
784
783
|
self._base_logger.info("processing page %s", page_id)
|
|
785
784
|
for i, input_file in enumerate(input_files):
|
|
785
|
+
grp = self.input_file_grp.split(',')[i]
|
|
786
786
|
if input_file is None:
|
|
787
|
-
grp = self.input_file_grp.split(',')[i]
|
|
788
787
|
self._base_logger.debug(f"ignoring missing file for input fileGrp {grp} for page {page_id}")
|
|
789
788
|
continue
|
|
790
789
|
assert isinstance(input_file, get_args(OcrdFileType))
|
|
790
|
+
if not input_file.local_filename:
|
|
791
|
+
self._base_logger.error(f'No local file exists for page {page_id} in file group {grp}')
|
|
792
|
+
if config.OCRD_MISSING_INPUT == 'ABORT':
|
|
793
|
+
raise MissingInputFile(grp, page_id, input_file.mimetype)
|
|
794
|
+
continue
|
|
791
795
|
self._base_logger.debug(f"parsing file {input_file.ID} for page {page_id}")
|
|
792
796
|
try:
|
|
793
797
|
page_ = page_from_file(input_file)
|
|
@@ -796,6 +800,9 @@ class Processor():
|
|
|
796
800
|
except ValueError as err:
|
|
797
801
|
# not PAGE and not an image to generate PAGE for
|
|
798
802
|
self._base_logger.error(f"non-PAGE input for page {page_id}: {err}")
|
|
803
|
+
if not any(input_pcgts):
|
|
804
|
+
self._base_logger.warning(f'skipping page {page_id}')
|
|
805
|
+
return
|
|
799
806
|
output_file_id = make_file_id(input_files[input_pos], self.output_file_grp)
|
|
800
807
|
if input_files[input_pos].fileGrp == self.output_file_grp:
|
|
801
808
|
# input=output fileGrp: re-use ID exactly
|
|
@@ -1107,7 +1114,11 @@ class Processor():
|
|
|
1107
1114
|
self._base_logger.critical(f"Could not find any files for selected pageId {self.page_id}.\n"
|
|
1108
1115
|
f"compare '{self.page_id}' with the output of 'orcd workspace list-page'.")
|
|
1109
1116
|
ifts = []
|
|
1110
|
-
|
|
1117
|
+
# use physical page order
|
|
1118
|
+
for page in self.workspace.mets.physical_pages:
|
|
1119
|
+
if page not in pages:
|
|
1120
|
+
continue
|
|
1121
|
+
ifiles = pages[page]
|
|
1111
1122
|
for i, ifg in enumerate(ifgs):
|
|
1112
1123
|
if not ifiles[i]:
|
|
1113
1124
|
# could be from non-unique with on_error=skip or from true gap
|
|
@@ -1150,18 +1161,15 @@ def _page_worker(timeout, *input_files):
|
|
|
1150
1161
|
"""
|
|
1151
1162
|
page_id = next((file.pageId for file in input_files
|
|
1152
1163
|
if hasattr(file, 'pageId')), "")
|
|
1153
|
-
|
|
1154
|
-
timer = Timer(timeout, interrupt_main)
|
|
1155
|
-
timer.start()
|
|
1164
|
+
pool = ThreadPool(processes=1)
|
|
1156
1165
|
try:
|
|
1157
|
-
_page_worker_processor.process_page_file(*input_files)
|
|
1166
|
+
#_page_worker_processor.process_page_file(*input_files)
|
|
1167
|
+
async_result = pool.apply_async(_page_worker_processor.process_page_file, input_files)
|
|
1168
|
+
async_result.get(timeout or None)
|
|
1158
1169
|
_page_worker_processor.logger.debug("page worker completed for page %s", page_id)
|
|
1159
|
-
except KeyboardInterrupt:
|
|
1170
|
+
except mp.TimeoutError:
|
|
1160
1171
|
_page_worker_processor.logger.debug("page worker timed out for page %s", page_id)
|
|
1161
|
-
raise
|
|
1162
|
-
finally:
|
|
1163
|
-
if timeout > 0:
|
|
1164
|
-
timer.cancel()
|
|
1172
|
+
raise
|
|
1165
1173
|
|
|
1166
1174
|
def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None):
|
|
1167
1175
|
"""Generate a string describing the full CLI of this processor including params.
|
|
@@ -777,16 +777,14 @@ class Workspace():
|
|
|
777
777
|
raise Exception('Found no AlternativeImage that satisfies all requirements ' +
|
|
778
778
|
'filename="%s" in page "%s"' % (
|
|
779
779
|
filename, page_id))
|
|
780
|
-
if not all(feature in page_coords['features']
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
'filter="%s" in page "%s"' % (
|
|
789
|
-
feature_filter, page_id))
|
|
780
|
+
if (not all(feature in page_coords['features']
|
|
781
|
+
for feature in feature_selector.split(',') if feature) or
|
|
782
|
+
any(feature in page_coords['features']
|
|
783
|
+
for feature in feature_filter.split(',') if feature)):
|
|
784
|
+
raise Exception('Found no AlternativeImage that satisfies all requirements' +
|
|
785
|
+
' selector="%s"' % feature_selector +
|
|
786
|
+
' filter="%s"' % feature_filter +
|
|
787
|
+
' in page "%s"' % page_id)
|
|
790
788
|
# ensure DPI will be set in image meta-data again
|
|
791
789
|
if 'DPI' in page_coords:
|
|
792
790
|
dpi = page_coords['DPI']
|
|
@@ -1038,16 +1036,14 @@ class Workspace():
|
|
|
1038
1036
|
raise Exception('Found no AlternativeImage that satisfies all requirements ' +
|
|
1039
1037
|
'filename="%s" in segment "%s"' % (
|
|
1040
1038
|
filename, segment.id))
|
|
1041
|
-
if not all(feature in segment_coords['features']
|
|
1042
|
-
|
|
1039
|
+
if (not all(feature in segment_coords['features']
|
|
1040
|
+
for feature in feature_selector.split(',') if feature) or
|
|
1041
|
+
any(feature in segment_coords['features']
|
|
1042
|
+
for feature in feature_filter.split(',') if feature)):
|
|
1043
1043
|
raise Exception('Found no AlternativeImage that satisfies all requirements' +
|
|
1044
|
-
'selector="%s"
|
|
1045
|
-
|
|
1046
|
-
|
|
1047
|
-
for feature in feature_filter.split(',') if feature):
|
|
1048
|
-
raise Exception('Found no AlternativeImage that satisfies all requirements ' +
|
|
1049
|
-
'filter="%s" in segment "%s"' % (
|
|
1050
|
-
feature_filter, segment.id))
|
|
1044
|
+
' selector="%s"' % feature_selector +
|
|
1045
|
+
' filter="%s"' % feature_filter +
|
|
1046
|
+
' in segment "%s"' % segment.id)
|
|
1051
1047
|
# ensure DPI will be set in image meta-data again
|
|
1052
1048
|
if 'DPI' in segment_coords:
|
|
1053
1049
|
dpi = segment_coords['DPI']
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: ocrd
|
|
3
|
-
Version: 3.4.0
|
|
3
|
+
Version: 3.5.0
|
|
4
4
|
Summary: OCR-D framework
|
|
5
5
|
Author-email: Konstantin Baierer <unixprog@gmail.com>
|
|
6
6
|
License: Apache License 2.0
|
|
@@ -21,7 +21,7 @@ Requires-Dist: elementpath
|
|
|
21
21
|
Requires-Dist: fastapi>=0.78.0
|
|
22
22
|
Requires-Dist: filetype
|
|
23
23
|
Requires-Dist: Flask
|
|
24
|
-
Requires-Dist: frozendict>=2.
|
|
24
|
+
Requires-Dist: frozendict>=2.4.0
|
|
25
25
|
Requires-Dist: gdown
|
|
26
26
|
Requires-Dist: httpx>=0.22.0
|
|
27
27
|
Requires-Dist: importlib_metadata; python_version < "3.8"
|
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Constants for ocrd_models.
|
|
3
|
+
"""
|
|
4
|
+
from re import Pattern
|
|
5
|
+
from enum import Enum, auto
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
from abc import ABC, abstractmethod
|
|
8
|
+
from typing import Any, List, Optional, Union
|
|
9
|
+
from ocrd_utils import resource_string
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
'IDENTIFIER_PRIORITY',
|
|
13
|
+
'METS_XML_EMPTY',
|
|
14
|
+
'NAMESPACES',
|
|
15
|
+
'TAG_METS_AGENT',
|
|
16
|
+
'TAG_METS_DIV',
|
|
17
|
+
'TAG_METS_FILE',
|
|
18
|
+
'TAG_METS_FILEGRP',
|
|
19
|
+
'TAG_METS_FILESEC',
|
|
20
|
+
'TAG_METS_FPTR',
|
|
21
|
+
'TAG_METS_FLOCAT',
|
|
22
|
+
'TAG_METS_METSHDR',
|
|
23
|
+
'TAG_METS_NAME',
|
|
24
|
+
'TAG_METS_NOTE',
|
|
25
|
+
'TAG_METS_STRUCTMAP',
|
|
26
|
+
'TAG_MODS_IDENTIFIER',
|
|
27
|
+
'TAG_PAGE_ALTERNATIVEIMAGE',
|
|
28
|
+
'TAG_PAGE_COORDS',
|
|
29
|
+
'TAG_PAGE_READINGORDER',
|
|
30
|
+
'TAG_PAGE_REGIONREFINDEXED',
|
|
31
|
+
'TAG_PAGE_TEXTLINE',
|
|
32
|
+
'TAG_PAGE_TEXTEQUIV',
|
|
33
|
+
'TAG_PAGE_TEXTREGION',
|
|
34
|
+
'METS_PAGE_DIV_ATTRIBUTE',
|
|
35
|
+
'METS_STRUCT_DIV_ATTRIBUTE',
|
|
36
|
+
'METS_DIV_ATTRIBUTE_ATOM_PATTERN',
|
|
37
|
+
'METS_DIV_ATTRIBUTE_RANGE_PATTERN',
|
|
38
|
+
'METS_DIV_ATTRIBUTE_REGEX_PATTERN',
|
|
39
|
+
'PAGE_REGION_TYPES',
|
|
40
|
+
'PAGE_ALTIMG_FEATURES',
|
|
41
|
+
]
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
IDENTIFIER_PRIORITY = ['purl', 'urn', 'doi', 'url']
|
|
45
|
+
|
|
46
|
+
METS_XML_EMPTY = resource_string(__package__, 'mets-empty.xml')
|
|
47
|
+
|
|
48
|
+
NAMESPACES = {
|
|
49
|
+
'mets': "http://www.loc.gov/METS/",
|
|
50
|
+
'mods': "http://www.loc.gov/mods/v3",
|
|
51
|
+
'xlink': "http://www.w3.org/1999/xlink",
|
|
52
|
+
'page': "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15",
|
|
53
|
+
'xsl': 'http://www.w3.org/1999/XSL/Transform#',
|
|
54
|
+
'ocrd': 'https://ocr-d.de',
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
TAG_METS_AGENT = '{%s}agent' % NAMESPACES['mets']
|
|
58
|
+
TAG_METS_DIV = '{%s}div' % NAMESPACES['mets']
|
|
59
|
+
TAG_METS_FILE = '{%s}file' % NAMESPACES['mets']
|
|
60
|
+
TAG_METS_FILEGRP = '{%s}fileGrp' % NAMESPACES['mets']
|
|
61
|
+
TAG_METS_FILESEC = '{%s}fileSec' % NAMESPACES['mets']
|
|
62
|
+
TAG_METS_FPTR = '{%s}fptr' % NAMESPACES['mets']
|
|
63
|
+
TAG_METS_FLOCAT = '{%s}FLocat' % NAMESPACES['mets']
|
|
64
|
+
TAG_METS_METSHDR = '{%s}metsHdr' % NAMESPACES['mets']
|
|
65
|
+
TAG_METS_NAME = '{%s}name' % NAMESPACES['mets']
|
|
66
|
+
TAG_METS_NOTE = '{%s}note' % NAMESPACES['mets']
|
|
67
|
+
TAG_METS_STRUCTMAP = '{%s}structMap' % NAMESPACES['mets']
|
|
68
|
+
|
|
69
|
+
TAG_MODS_IDENTIFIER = '{%s}identifier' % NAMESPACES['mods']
|
|
70
|
+
|
|
71
|
+
TAG_PAGE_ALTERNATIVEIMAGE = '{%s}AlternativeImage' % NAMESPACES['page']
|
|
72
|
+
TAG_PAGE_COORDS = '{%s}Coords' % NAMESPACES['page']
|
|
73
|
+
TAG_PAGE_READINGORDER = '{%s}ReadingOrder' % NAMESPACES['page']
|
|
74
|
+
TAG_PAGE_REGIONREFINDEXED = '{%s}RegionRefIndexed' % NAMESPACES['page']
|
|
75
|
+
TAG_PAGE_TEXTLINE = '{%s}TextLine' % NAMESPACES['page']
|
|
76
|
+
TAG_PAGE_TEXTEQUIV = '{%s}TextEquiv' % NAMESPACES['page']
|
|
77
|
+
TAG_PAGE_TEXTREGION = '{%s}TextRegion' % NAMESPACES['page']
|
|
78
|
+
|
|
79
|
+
PAGE_REGION_TYPES = [
|
|
80
|
+
'Advert', 'Chart', 'Chem', 'Custom', 'Graphic', 'Image',
|
|
81
|
+
'LineDrawing', 'Map', 'Maths', 'Music', 'Noise',
|
|
82
|
+
'Separator', 'Table', 'Text', 'Unknown'
|
|
83
|
+
]
|
|
84
|
+
|
|
85
|
+
PAGE_ALTIMG_FEATURES = [
|
|
86
|
+
'binarized',
|
|
87
|
+
'grayscale_normalized',
|
|
88
|
+
'despeckled',
|
|
89
|
+
'cropped',
|
|
90
|
+
'deskewed',
|
|
91
|
+
'rotated-90',
|
|
92
|
+
'rotated-180',
|
|
93
|
+
'rotated-270',
|
|
94
|
+
'dewarped',
|
|
95
|
+
'clipped',
|
|
96
|
+
]
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
class METS_PAGE_DIV_ATTRIBUTE(Enum):
|
|
100
|
+
"""page selection attributes of PHYSICAL mets:structMap//mets:div"""
|
|
101
|
+
ID = auto()
|
|
102
|
+
ORDER = auto()
|
|
103
|
+
ORDERLABEL = auto()
|
|
104
|
+
LABEL = auto()
|
|
105
|
+
CONTENTIDS = auto()
|
|
106
|
+
|
|
107
|
+
@classmethod
|
|
108
|
+
def names(cls):
|
|
109
|
+
return [x.name for x in cls]
|
|
110
|
+
@classmethod
|
|
111
|
+
def type_prefix(cls):
|
|
112
|
+
"""disambiguation prefix to use for all subtypes"""
|
|
113
|
+
return "physical:"
|
|
114
|
+
def prefix(self):
|
|
115
|
+
"""disambiguation prefix to use for this attribute type"""
|
|
116
|
+
return self.type_prefix() + self.name.lower() + ":"
|
|
117
|
+
|
|
118
|
+
class METS_STRUCT_DIV_ATTRIBUTE(Enum):
|
|
119
|
+
"""page selection attributes of LOGICAL mets:structMap//mets:div"""
|
|
120
|
+
ID = auto()
|
|
121
|
+
DMDID = auto()
|
|
122
|
+
TYPE = auto()
|
|
123
|
+
LABEL = auto()
|
|
124
|
+
|
|
125
|
+
@classmethod
|
|
126
|
+
def names(cls):
|
|
127
|
+
return [x.name for x in cls]
|
|
128
|
+
@classmethod
|
|
129
|
+
def type_prefix(cls):
|
|
130
|
+
"""disambiguation prefix to use for all subtypes"""
|
|
131
|
+
return "logical:"
|
|
132
|
+
def prefix(self):
|
|
133
|
+
"""disambiguation prefix to use for this attribute type"""
|
|
134
|
+
return self.type_prefix() + self.name.lower() + ":"
|
|
135
|
+
|
|
136
|
+
@dataclass
|
|
137
|
+
class METS_DIV_ATTRIBUTE_PATTERN(ABC):
|
|
138
|
+
"""page selection pattern (abstract supertype)"""
|
|
139
|
+
|
|
140
|
+
expr: Any
|
|
141
|
+
"""pattern value to match a mets:div against"""
|
|
142
|
+
attr: List[Union[METS_PAGE_DIV_ATTRIBUTE, METS_STRUCT_DIV_ATTRIBUTE]] = field(
|
|
143
|
+
default_factory=lambda: list(METS_PAGE_DIV_ATTRIBUTE) + list(METS_STRUCT_DIV_ATTRIBUTE))
|
|
144
|
+
"""attribute type(s) to match a mets:div for
|
|
145
|
+
(pre-disambiguated with prefix syntax, or filled upon first match)
|
|
146
|
+
"""
|
|
147
|
+
has_matched: bool = field(init=False, default=False)
|
|
148
|
+
"""whether this pattern has already been matched"""
|
|
149
|
+
|
|
150
|
+
def attr_prefix(self):
|
|
151
|
+
"""attribute type disambiguation prefix corresponding to the current state of disambiguation"""
|
|
152
|
+
if self.attr == list(METS_PAGE_DIV_ATTRIBUTE) + list(METS_STRUCT_DIV_ATTRIBUTE):
|
|
153
|
+
return ""
|
|
154
|
+
if self.attr == list(METS_PAGE_DIV_ATTRIBUTE):
|
|
155
|
+
return METS_PAGE_DIV_ATTRIBUTE.type_prefix()
|
|
156
|
+
if self.attr == list(METS_STRUCT_DIV_ATTRIBUTE):
|
|
157
|
+
return METS_STRUCT_DIV_ATTRIBUTE.type_prefix()
|
|
158
|
+
assert len(self.attr) == 1, "unexpected type ambiguity: %s" % repr(self.attr)
|
|
159
|
+
return self.attr[0].prefix()
|
|
160
|
+
|
|
161
|
+
@abstractmethod
|
|
162
|
+
def _matches(self, input) -> bool:
|
|
163
|
+
return
|
|
164
|
+
def matches(self, input) -> bool:
|
|
165
|
+
"""does the selection pattern match on the given attribute value?"""
|
|
166
|
+
if (matched := self._matches(input)):
|
|
167
|
+
self.has_matched = True
|
|
168
|
+
return matched
|
|
169
|
+
|
|
170
|
+
@dataclass
|
|
171
|
+
class METS_DIV_ATTRIBUTE_ATOM_PATTERN(METS_DIV_ATTRIBUTE_PATTERN):
|
|
172
|
+
"""page selection pattern for literal (single value) matching"""
|
|
173
|
+
|
|
174
|
+
expr: str
|
|
175
|
+
def __repr__(self):
|
|
176
|
+
return "%s%s" % (self.attr_prefix(), self.expr)
|
|
177
|
+
def _matches(self, input):
|
|
178
|
+
return input == self.expr
|
|
179
|
+
|
|
180
|
+
@dataclass
|
|
181
|
+
class METS_DIV_ATTRIBUTE_RANGE_PATTERN(METS_DIV_ATTRIBUTE_PATTERN):
|
|
182
|
+
"""page selection pattern for interval (list expansion) matching"""
|
|
183
|
+
|
|
184
|
+
expr: List[str]
|
|
185
|
+
start: str = field(init=False)
|
|
186
|
+
"""first value of the range after expansion, before matching-exhausting"""
|
|
187
|
+
stop: str = field(init=False)
|
|
188
|
+
"""last value of the range after expansion, before matching-exhausting"""
|
|
189
|
+
def __post_init__(self):
|
|
190
|
+
self.start = self.expr[0]
|
|
191
|
+
self.stop = self.expr[-1]
|
|
192
|
+
def __repr__(self):
|
|
193
|
+
return "%s%s..%s" % (self.attr_prefix(), self.start, self.stop)
|
|
194
|
+
def _matches(self, input):
|
|
195
|
+
return input in self.expr
|
|
196
|
+
|
|
197
|
+
@dataclass
|
|
198
|
+
class METS_DIV_ATTRIBUTE_REGEX_PATTERN(METS_DIV_ATTRIBUTE_PATTERN):
|
|
199
|
+
"""page selection pattern for regular expression matching"""
|
|
200
|
+
|
|
201
|
+
expr: Pattern
|
|
202
|
+
def __repr__(self):
|
|
203
|
+
return "%s//%s" % (self.attr_prefix(), self.expr.pattern)
|
|
204
|
+
def _matches(self, input):
|
|
205
|
+
return bool(self.expr.fullmatch(input))
|