ocrd 3.4.1__tar.gz → 3.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ocrd-3.4.1/src/ocrd.egg-info → ocrd-3.5.0}/PKG-INFO +2 -2
- ocrd-3.5.0/VERSION +1 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/requirements.txt +1 -1
- {ocrd-3.4.1 → ocrd-3.5.0/src/ocrd.egg-info}/PKG-INFO +2 -2
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd.egg-info/requires.txt +1 -1
- ocrd-3.5.0/src/ocrd_models/constants.py +205 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_models/ocrd_mets.py +231 -97
- ocrd-3.4.1/VERSION +0 -1
- ocrd-3.4.1/src/ocrd_models/constants.py +0 -100
- {ocrd-3.4.1 → ocrd-3.5.0}/LICENSE +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/MANIFEST.in +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/README.md +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/README_bashlib.md +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/README_ocrd.md +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/README_ocrd_modelfactory.md +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/README_ocrd_models.md +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/README_ocrd_network.md +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/README_ocrd_utils.md +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/README_ocrd_validators.md +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/pyproject.toml +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/setup.cfg +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/__init__.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/cli/__init__.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/cli/bashlib.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/cli/log.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/cli/network.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/cli/ocrd_tool.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/cli/process.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/cli/resmgr.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/cli/validate.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/cli/workspace.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/cli/zip.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/constants.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/decorators/__init__.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/decorators/loglevel_option.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/decorators/mets_find_options.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/decorators/ocrd_cli_options.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/decorators/parameter_option.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/lib.bash +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/mets_server.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/ocrd-all-tool.json +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/processor/__init__.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/processor/base.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/processor/builtin/__init__.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/processor/builtin/dummy/__init__.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/processor/builtin/dummy/ocrd-tool.json +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/processor/builtin/dummy_processor.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/processor/builtin/filter_processor.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/processor/helpers.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/processor/ocrd_page_result.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/resolver.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/resource_list.yml +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/resource_manager.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/task_sequence.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/workspace.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/workspace_backup.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/workspace_bagger.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd.egg-info/SOURCES.txt +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd.egg-info/dependency_links.txt +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd.egg-info/entry_points.txt +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd.egg-info/top_level.txt +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_modelfactory/__init__.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_models/__init__.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_models/mets-empty.xml +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_models/ocrd_agent.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_models/ocrd_exif.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_models/ocrd_file.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_models/ocrd_page.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_models/ocrd_page_generateds.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_models/ocrd_xml_base.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_models/report.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_models/utils.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_models/xpath_functions.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/__init__.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/cli/__init__.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/cli/client.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/cli/processing_server.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/cli/processing_worker.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/cli/processor_server.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/client.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/client_utils.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/constants.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/database.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/logging_utils.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/models/__init__.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/models/job.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/models/messages.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/models/ocrd_tool.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/models/workflow.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/models/workspace.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/param_validators.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/process_helpers.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/processing_server.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/processing_worker.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/processor_server.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/rabbitmq_utils/__init__.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/rabbitmq_utils/connector.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/rabbitmq_utils/constants.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/rabbitmq_utils/consumer.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/rabbitmq_utils/helpers.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/rabbitmq_utils/ocrd_messages.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/rabbitmq_utils/publisher.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/runtime_data/__init__.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/runtime_data/config_parser.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/runtime_data/connection_clients.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/runtime_data/deployer.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/runtime_data/hosts.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/runtime_data/network_agents.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/runtime_data/network_services.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/server_cache.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/server_utils.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/tcp_to_uds_mets_proxy.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/utils.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_utils/__init__.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_utils/config.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_utils/constants.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_utils/deprecate.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_utils/image.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_utils/introspect.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_utils/logging.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_utils/ocrd_logging.conf +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_utils/os.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_utils/str.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_validators/__init__.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_validators/bagit-profile.yml +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_validators/constants.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_validators/json_validator.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_validators/message_processing.schema.yml +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_validators/message_result.schema.yml +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_validators/mets.xsd +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_validators/ocrd_network_message_validator.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_validators/ocrd_tool.schema.yml +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_validators/ocrd_tool_validator.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_validators/ocrd_zip_validator.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_validators/page.xsd +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_validators/page_validator.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_validators/parameter_validator.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_validators/processing_server_config.schema.yml +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_validators/processing_server_config_validator.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_validators/resource_list_validator.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_validators/workspace_validator.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_validators/xlink.xsd +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_validators/xsd_mets_validator.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_validators/xsd_page_validator.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_validators/xsd_validator.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/tests/test_decorators.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/tests/test_logging.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/tests/test_logging_conf.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/tests/test_mets_server.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/tests/test_model_factory.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/tests/test_resolver.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/tests/test_resolver_oai.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/tests/test_resource_manager.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/tests/test_task_sequence.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/tests/test_utils.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/tests/test_version.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/tests/test_workspace.py +0 -0
- {ocrd-3.4.1 → ocrd-3.5.0}/tests/test_workspace_remove.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: ocrd
|
|
3
|
-
Version: 3.
|
|
3
|
+
Version: 3.5.0
|
|
4
4
|
Summary: OCR-D framework
|
|
5
5
|
Author-email: Konstantin Baierer <unixprog@gmail.com>
|
|
6
6
|
License: Apache License 2.0
|
|
@@ -21,7 +21,7 @@ Requires-Dist: elementpath
|
|
|
21
21
|
Requires-Dist: fastapi>=0.78.0
|
|
22
22
|
Requires-Dist: filetype
|
|
23
23
|
Requires-Dist: Flask
|
|
24
|
-
Requires-Dist: frozendict>=2.
|
|
24
|
+
Requires-Dist: frozendict>=2.4.0
|
|
25
25
|
Requires-Dist: gdown
|
|
26
26
|
Requires-Dist: httpx>=0.22.0
|
|
27
27
|
Requires-Dist: importlib_metadata; python_version < "3.8"
|
ocrd-3.5.0/VERSION
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.5.0
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: ocrd
|
|
3
|
-
Version: 3.
|
|
3
|
+
Version: 3.5.0
|
|
4
4
|
Summary: OCR-D framework
|
|
5
5
|
Author-email: Konstantin Baierer <unixprog@gmail.com>
|
|
6
6
|
License: Apache License 2.0
|
|
@@ -21,7 +21,7 @@ Requires-Dist: elementpath
|
|
|
21
21
|
Requires-Dist: fastapi>=0.78.0
|
|
22
22
|
Requires-Dist: filetype
|
|
23
23
|
Requires-Dist: Flask
|
|
24
|
-
Requires-Dist: frozendict>=2.
|
|
24
|
+
Requires-Dist: frozendict>=2.4.0
|
|
25
25
|
Requires-Dist: gdown
|
|
26
26
|
Requires-Dist: httpx>=0.22.0
|
|
27
27
|
Requires-Dist: importlib_metadata; python_version < "3.8"
|
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Constants for ocrd_models.
|
|
3
|
+
"""
|
|
4
|
+
from re import Pattern
|
|
5
|
+
from enum import Enum, auto
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
from abc import ABC, abstractmethod
|
|
8
|
+
from typing import Any, List, Optional, Union
|
|
9
|
+
from ocrd_utils import resource_string
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
'IDENTIFIER_PRIORITY',
|
|
13
|
+
'METS_XML_EMPTY',
|
|
14
|
+
'NAMESPACES',
|
|
15
|
+
'TAG_METS_AGENT',
|
|
16
|
+
'TAG_METS_DIV',
|
|
17
|
+
'TAG_METS_FILE',
|
|
18
|
+
'TAG_METS_FILEGRP',
|
|
19
|
+
'TAG_METS_FILESEC',
|
|
20
|
+
'TAG_METS_FPTR',
|
|
21
|
+
'TAG_METS_FLOCAT',
|
|
22
|
+
'TAG_METS_METSHDR',
|
|
23
|
+
'TAG_METS_NAME',
|
|
24
|
+
'TAG_METS_NOTE',
|
|
25
|
+
'TAG_METS_STRUCTMAP',
|
|
26
|
+
'TAG_MODS_IDENTIFIER',
|
|
27
|
+
'TAG_PAGE_ALTERNATIVEIMAGE',
|
|
28
|
+
'TAG_PAGE_COORDS',
|
|
29
|
+
'TAG_PAGE_READINGORDER',
|
|
30
|
+
'TAG_PAGE_REGIONREFINDEXED',
|
|
31
|
+
'TAG_PAGE_TEXTLINE',
|
|
32
|
+
'TAG_PAGE_TEXTEQUIV',
|
|
33
|
+
'TAG_PAGE_TEXTREGION',
|
|
34
|
+
'METS_PAGE_DIV_ATTRIBUTE',
|
|
35
|
+
'METS_STRUCT_DIV_ATTRIBUTE',
|
|
36
|
+
'METS_DIV_ATTRIBUTE_ATOM_PATTERN',
|
|
37
|
+
'METS_DIV_ATTRIBUTE_RANGE_PATTERN',
|
|
38
|
+
'METS_DIV_ATTRIBUTE_REGEX_PATTERN',
|
|
39
|
+
'PAGE_REGION_TYPES',
|
|
40
|
+
'PAGE_ALTIMG_FEATURES',
|
|
41
|
+
]
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
IDENTIFIER_PRIORITY = ['purl', 'urn', 'doi', 'url']
|
|
45
|
+
|
|
46
|
+
METS_XML_EMPTY = resource_string(__package__, 'mets-empty.xml')
|
|
47
|
+
|
|
48
|
+
NAMESPACES = {
|
|
49
|
+
'mets': "http://www.loc.gov/METS/",
|
|
50
|
+
'mods': "http://www.loc.gov/mods/v3",
|
|
51
|
+
'xlink': "http://www.w3.org/1999/xlink",
|
|
52
|
+
'page': "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15",
|
|
53
|
+
'xsl': 'http://www.w3.org/1999/XSL/Transform#',
|
|
54
|
+
'ocrd': 'https://ocr-d.de',
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
TAG_METS_AGENT = '{%s}agent' % NAMESPACES['mets']
|
|
58
|
+
TAG_METS_DIV = '{%s}div' % NAMESPACES['mets']
|
|
59
|
+
TAG_METS_FILE = '{%s}file' % NAMESPACES['mets']
|
|
60
|
+
TAG_METS_FILEGRP = '{%s}fileGrp' % NAMESPACES['mets']
|
|
61
|
+
TAG_METS_FILESEC = '{%s}fileSec' % NAMESPACES['mets']
|
|
62
|
+
TAG_METS_FPTR = '{%s}fptr' % NAMESPACES['mets']
|
|
63
|
+
TAG_METS_FLOCAT = '{%s}FLocat' % NAMESPACES['mets']
|
|
64
|
+
TAG_METS_METSHDR = '{%s}metsHdr' % NAMESPACES['mets']
|
|
65
|
+
TAG_METS_NAME = '{%s}name' % NAMESPACES['mets']
|
|
66
|
+
TAG_METS_NOTE = '{%s}note' % NAMESPACES['mets']
|
|
67
|
+
TAG_METS_STRUCTMAP = '{%s}structMap' % NAMESPACES['mets']
|
|
68
|
+
|
|
69
|
+
TAG_MODS_IDENTIFIER = '{%s}identifier' % NAMESPACES['mods']
|
|
70
|
+
|
|
71
|
+
TAG_PAGE_ALTERNATIVEIMAGE = '{%s}AlternativeImage' % NAMESPACES['page']
|
|
72
|
+
TAG_PAGE_COORDS = '{%s}Coords' % NAMESPACES['page']
|
|
73
|
+
TAG_PAGE_READINGORDER = '{%s}ReadingOrder' % NAMESPACES['page']
|
|
74
|
+
TAG_PAGE_REGIONREFINDEXED = '{%s}RegionRefIndexed' % NAMESPACES['page']
|
|
75
|
+
TAG_PAGE_TEXTLINE = '{%s}TextLine' % NAMESPACES['page']
|
|
76
|
+
TAG_PAGE_TEXTEQUIV = '{%s}TextEquiv' % NAMESPACES['page']
|
|
77
|
+
TAG_PAGE_TEXTREGION = '{%s}TextRegion' % NAMESPACES['page']
|
|
78
|
+
|
|
79
|
+
PAGE_REGION_TYPES = [
|
|
80
|
+
'Advert', 'Chart', 'Chem', 'Custom', 'Graphic', 'Image',
|
|
81
|
+
'LineDrawing', 'Map', 'Maths', 'Music', 'Noise',
|
|
82
|
+
'Separator', 'Table', 'Text', 'Unknown'
|
|
83
|
+
]
|
|
84
|
+
|
|
85
|
+
PAGE_ALTIMG_FEATURES = [
|
|
86
|
+
'binarized',
|
|
87
|
+
'grayscale_normalized',
|
|
88
|
+
'despeckled',
|
|
89
|
+
'cropped',
|
|
90
|
+
'deskewed',
|
|
91
|
+
'rotated-90',
|
|
92
|
+
'rotated-180',
|
|
93
|
+
'rotated-270',
|
|
94
|
+
'dewarped',
|
|
95
|
+
'clipped',
|
|
96
|
+
]
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
class METS_PAGE_DIV_ATTRIBUTE(Enum):
|
|
100
|
+
"""page selection attributes of PHYSICAL mets:structMap//mets:div"""
|
|
101
|
+
ID = auto()
|
|
102
|
+
ORDER = auto()
|
|
103
|
+
ORDERLABEL = auto()
|
|
104
|
+
LABEL = auto()
|
|
105
|
+
CONTENTIDS = auto()
|
|
106
|
+
|
|
107
|
+
@classmethod
|
|
108
|
+
def names(cls):
|
|
109
|
+
return [x.name for x in cls]
|
|
110
|
+
@classmethod
|
|
111
|
+
def type_prefix(cls):
|
|
112
|
+
"""disambiguation prefix to use for all subtypes"""
|
|
113
|
+
return "physical:"
|
|
114
|
+
def prefix(self):
|
|
115
|
+
"""disambiguation prefix to use for this attribute type"""
|
|
116
|
+
return self.type_prefix() + self.name.lower() + ":"
|
|
117
|
+
|
|
118
|
+
class METS_STRUCT_DIV_ATTRIBUTE(Enum):
|
|
119
|
+
"""page selection attributes of LOGICAL mets:structMap//mets:div"""
|
|
120
|
+
ID = auto()
|
|
121
|
+
DMDID = auto()
|
|
122
|
+
TYPE = auto()
|
|
123
|
+
LABEL = auto()
|
|
124
|
+
|
|
125
|
+
@classmethod
|
|
126
|
+
def names(cls):
|
|
127
|
+
return [x.name for x in cls]
|
|
128
|
+
@classmethod
|
|
129
|
+
def type_prefix(cls):
|
|
130
|
+
"""disambiguation prefix to use for all subtypes"""
|
|
131
|
+
return "logical:"
|
|
132
|
+
def prefix(self):
|
|
133
|
+
"""disambiguation prefix to use for this attribute type"""
|
|
134
|
+
return self.type_prefix() + self.name.lower() + ":"
|
|
135
|
+
|
|
136
|
+
@dataclass
|
|
137
|
+
class METS_DIV_ATTRIBUTE_PATTERN(ABC):
|
|
138
|
+
"""page selection pattern (abstract supertype)"""
|
|
139
|
+
|
|
140
|
+
expr: Any
|
|
141
|
+
"""pattern value to match a mets:div against"""
|
|
142
|
+
attr: List[Union[METS_PAGE_DIV_ATTRIBUTE, METS_STRUCT_DIV_ATTRIBUTE]] = field(
|
|
143
|
+
default_factory=lambda: list(METS_PAGE_DIV_ATTRIBUTE) + list(METS_STRUCT_DIV_ATTRIBUTE))
|
|
144
|
+
"""attribute type(s) to match a mets:div for
|
|
145
|
+
(pre-disambiguated with prefix syntax, or filled upon first match)
|
|
146
|
+
"""
|
|
147
|
+
has_matched: bool = field(init=False, default=False)
|
|
148
|
+
"""whether this pattern has already been matched"""
|
|
149
|
+
|
|
150
|
+
def attr_prefix(self):
|
|
151
|
+
"""attribute type disambiguation prefix corresponding to the current state of disambiguation"""
|
|
152
|
+
if self.attr == list(METS_PAGE_DIV_ATTRIBUTE) + list(METS_STRUCT_DIV_ATTRIBUTE):
|
|
153
|
+
return ""
|
|
154
|
+
if self.attr == list(METS_PAGE_DIV_ATTRIBUTE):
|
|
155
|
+
return METS_PAGE_DIV_ATTRIBUTE.type_prefix()
|
|
156
|
+
if self.attr == list(METS_STRUCT_DIV_ATTRIBUTE):
|
|
157
|
+
return METS_STRUCT_DIV_ATTRIBUTE.type_prefix()
|
|
158
|
+
assert len(self.attr) == 1, "unexpected type ambiguity: %s" % repr(self.attr)
|
|
159
|
+
return self.attr[0].prefix()
|
|
160
|
+
|
|
161
|
+
@abstractmethod
|
|
162
|
+
def _matches(self, input) -> bool:
|
|
163
|
+
return
|
|
164
|
+
def matches(self, input) -> bool:
|
|
165
|
+
"""does the selection pattern match on the given attribute value?"""
|
|
166
|
+
if (matched := self._matches(input)):
|
|
167
|
+
self.has_matched = True
|
|
168
|
+
return matched
|
|
169
|
+
|
|
170
|
+
@dataclass
|
|
171
|
+
class METS_DIV_ATTRIBUTE_ATOM_PATTERN(METS_DIV_ATTRIBUTE_PATTERN):
|
|
172
|
+
"""page selection pattern for literal (single value) matching"""
|
|
173
|
+
|
|
174
|
+
expr: str
|
|
175
|
+
def __repr__(self):
|
|
176
|
+
return "%s%s" % (self.attr_prefix(), self.expr)
|
|
177
|
+
def _matches(self, input):
|
|
178
|
+
return input == self.expr
|
|
179
|
+
|
|
180
|
+
@dataclass
|
|
181
|
+
class METS_DIV_ATTRIBUTE_RANGE_PATTERN(METS_DIV_ATTRIBUTE_PATTERN):
|
|
182
|
+
"""page selection pattern for interval (list expansion) matching"""
|
|
183
|
+
|
|
184
|
+
expr: List[str]
|
|
185
|
+
start: str = field(init=False)
|
|
186
|
+
"""first value of the range after expansion, before matching-exhausting"""
|
|
187
|
+
stop: str = field(init=False)
|
|
188
|
+
"""last value of the range after expansion, before matching-exhausting"""
|
|
189
|
+
def __post_init__(self):
|
|
190
|
+
self.start = self.expr[0]
|
|
191
|
+
self.stop = self.expr[-1]
|
|
192
|
+
def __repr__(self):
|
|
193
|
+
return "%s%s..%s" % (self.attr_prefix(), self.start, self.stop)
|
|
194
|
+
def _matches(self, input):
|
|
195
|
+
return input in self.expr
|
|
196
|
+
|
|
197
|
+
@dataclass
|
|
198
|
+
class METS_DIV_ATTRIBUTE_REGEX_PATTERN(METS_DIV_ATTRIBUTE_PATTERN):
|
|
199
|
+
"""page selection pattern for regular expression matching"""
|
|
200
|
+
|
|
201
|
+
expr: Pattern
|
|
202
|
+
def __repr__(self):
|
|
203
|
+
return "%s//%s" % (self.attr_prefix(), self.expr.pattern)
|
|
204
|
+
def _matches(self, input):
|
|
205
|
+
return bool(self.expr.fullmatch(input))
|
|
@@ -29,7 +29,12 @@ from .constants import (
|
|
|
29
29
|
IDENTIFIER_PRIORITY,
|
|
30
30
|
TAG_MODS_IDENTIFIER,
|
|
31
31
|
METS_XML_EMPTY,
|
|
32
|
-
METS_PAGE_DIV_ATTRIBUTE
|
|
32
|
+
METS_PAGE_DIV_ATTRIBUTE,
|
|
33
|
+
METS_STRUCT_DIV_ATTRIBUTE,
|
|
34
|
+
METS_DIV_ATTRIBUTE_PATTERN,
|
|
35
|
+
METS_DIV_ATTRIBUTE_ATOM_PATTERN,
|
|
36
|
+
METS_DIV_ATTRIBUTE_RANGE_PATTERN,
|
|
37
|
+
METS_DIV_ATTRIBUTE_REGEX_PATTERN,
|
|
33
38
|
)
|
|
34
39
|
|
|
35
40
|
from .ocrd_xml_base import OcrdXmlDocument, ET # type: ignore
|
|
@@ -43,9 +48,11 @@ class OcrdMets(OcrdXmlDocument):
|
|
|
43
48
|
API to a single METS file
|
|
44
49
|
"""
|
|
45
50
|
_cache_flag : bool
|
|
46
|
-
# Cache for the pages (mets:div)
|
|
47
|
-
# The dictionary's
|
|
48
|
-
# The dictionary's
|
|
51
|
+
# Cache for the physical pages (mets:div) - two nested dictionaries
|
|
52
|
+
# The outer dictionary's key: attribute type
|
|
53
|
+
# The outer dictionary's value: inner dictionary
|
|
54
|
+
# The inner dictionary's key: attribute value (str)
|
|
55
|
+
# The inner dictionary's value: a 'div' object at some memory location
|
|
49
56
|
_page_cache : Dict[METS_PAGE_DIV_ATTRIBUTE, Dict[str, ET._Element]]
|
|
50
57
|
# Cache for the files (mets:file) - two nested dictionaries
|
|
51
58
|
# The outer dictionary's Key: 'fileGrp.USE'
|
|
@@ -59,6 +66,12 @@ class OcrdMets(OcrdXmlDocument):
|
|
|
59
66
|
# The inner dictionary's Key: 'fptr.FILEID'
|
|
60
67
|
# The inner dictionary's Value: a 'fptr' object at some memory location
|
|
61
68
|
_fptr_cache : Dict[str, Dict[str, ET._Element]]
|
|
69
|
+
# Cache for the logical structural divs (mets:div) - two nested dictionaries
|
|
70
|
+
# The outer dictionary's key: attribute type
|
|
71
|
+
# The outer dictionary's value: inner dictionary
|
|
72
|
+
# The inner dictionary's key: attribute value (str)
|
|
73
|
+
# The inner dictionary's value: a list of corresponding physical div.ID
|
|
74
|
+
_struct_cache : Dict[METS_STRUCT_DIV_ATTRIBUTE, Dict[str, List[str]]]
|
|
62
75
|
|
|
63
76
|
@staticmethod
|
|
64
77
|
def empty_mets(now : Optional[str] = None, cache_flag : bool = False):
|
|
@@ -111,7 +124,6 @@ class OcrdMets(OcrdXmlDocument):
|
|
|
111
124
|
return
|
|
112
125
|
|
|
113
126
|
log = getLogger('ocrd.models.ocrd_mets._fill_caches-files')
|
|
114
|
-
|
|
115
127
|
for el_fileGrp in el_fileSec.findall('mets:fileGrp', NS):
|
|
116
128
|
fileGrp_use = el_fileGrp.get('USE')
|
|
117
129
|
|
|
@@ -124,10 +136,10 @@ class OcrdMets(OcrdXmlDocument):
|
|
|
124
136
|
# log.info("File added to the cache: %s" % file_id)
|
|
125
137
|
|
|
126
138
|
# Fill with pages
|
|
139
|
+
log = getLogger('ocrd.models.ocrd_mets._fill_caches-pages')
|
|
127
140
|
el_div_list = tree_root.findall(".//mets:div[@TYPE='page']", NS)
|
|
128
141
|
if len(el_div_list) == 0:
|
|
129
142
|
return
|
|
130
|
-
log = getLogger('ocrd.models.ocrd_mets._fill_caches-pages')
|
|
131
143
|
|
|
132
144
|
for el_div in el_div_list:
|
|
133
145
|
div_id = el_div.get('ID')
|
|
@@ -148,11 +160,30 @@ class OcrdMets(OcrdXmlDocument):
|
|
|
148
160
|
# log.info("Len of page_cache: %s" % len(self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID]))
|
|
149
161
|
# log.info("Len of fptr_cache: %s" % len(self._fptr_cache))
|
|
150
162
|
|
|
163
|
+
# Fill with logical divs
|
|
164
|
+
log = getLogger('ocrd.models.ocrd_mets._fill_caches-structs')
|
|
165
|
+
el_struct_list = tree_root.findall("mets:structMap[@TYPE='LOGICAL']//mets:div", NS)
|
|
166
|
+
el_smlink_list = tree_root.findall("mets:structLink/mets:smLink", NS)
|
|
167
|
+
if len(el_struct_list) == 0 or len(el_smlink_list) == 0:
|
|
168
|
+
return
|
|
169
|
+
smlink_map = {}
|
|
170
|
+
for link in el_smlink_list:
|
|
171
|
+
link_log = link.get('{%s}from' % NS['xlink'])
|
|
172
|
+
link_phy = link.get('{%s}to' % NS['xlink'])
|
|
173
|
+
smlink_map.setdefault(link_log, list()).append(link_phy)
|
|
174
|
+
for el_div in el_struct_list:
|
|
175
|
+
for attr in METS_STRUCT_DIV_ATTRIBUTE:
|
|
176
|
+
val = self._struct_cache[attr].setdefault(str(el_div.get(attr.name)), list())
|
|
177
|
+
val.extend(smlink_map.get(el_div.get('ID'), []))
|
|
178
|
+
|
|
179
|
+
# log.info("Len of struct_cache: %s" % len(self._struct_cache[METS_STRUCT_DIV_ATTRIBUTE.ID]))
|
|
180
|
+
|
|
151
181
|
def _initialize_caches(self) -> None:
|
|
152
182
|
self._file_cache = {}
|
|
153
183
|
# NOTE we can only guarantee uniqueness for @ID and @ORDER
|
|
154
184
|
self._page_cache = {k : {} for k in METS_PAGE_DIV_ATTRIBUTE}
|
|
155
185
|
self._fptr_cache = {}
|
|
186
|
+
self._struct_cache = {k : {} for k in METS_STRUCT_DIV_ATTRIBUTE}
|
|
156
187
|
|
|
157
188
|
def _refresh_caches(self) -> None:
|
|
158
189
|
if self._cache_flag:
|
|
@@ -253,12 +284,20 @@ class OcrdMets(OcrdXmlDocument):
|
|
|
253
284
|
:py:attr:`url` and :py:attr:`mimetype` parameters can each be either a
|
|
254
285
|
literal string, or a regular expression if the string starts with
|
|
255
286
|
``//`` (double slash).
|
|
287
|
+
|
|
256
288
|
If it is a regex, the leading ``//`` is removed and candidates are matched
|
|
257
289
|
against the regex with `re.fullmatch`. If it is a literal string, comparison
|
|
258
290
|
is done with string equality.
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
291
|
+
|
|
292
|
+
The :py:attr:`pageId` parameter also supports comma-separated lists, as well
|
|
293
|
+
as the numeric range operator ``..`` and the negation operator ``~``.
|
|
294
|
+
|
|
295
|
+
For example, to find all files in pages ``PHYS_0001`` to ``PHYS_0003``, the
|
|
296
|
+
both expressions ``PHYS_0001..PHYS_0003`` and ``PHYS_0001,PHYS_0002,PHYS_0003``
|
|
297
|
+
will be expanded to the same 3 pages. To find all files above that subrange,
|
|
298
|
+
both expressions ``~PHYS_0001..PHYS_0003`` and ``~PHYS_0001,~PHYS_0002,~PHYS_0003``
|
|
299
|
+
will be expanded to ``PHYS_0004`` and upwards.
|
|
300
|
+
|
|
262
301
|
Keyword Args:
|
|
263
302
|
ID (string) : ``@ID`` of the ``mets:file``
|
|
264
303
|
fileGrp (string) : ``@USE`` of the ``mets:fileGrp`` to list files of
|
|
@@ -609,101 +648,73 @@ class OcrdMets(OcrdXmlDocument):
|
|
|
609
648
|
|
|
610
649
|
return self.physical_pages
|
|
611
650
|
|
|
612
|
-
|
|
651
|
+
log = getLogger('ocrd.models.ocrd_mets.get_physical_pages')
|
|
613
652
|
if for_pageIds is not None:
|
|
614
|
-
ret = []
|
|
615
653
|
page_attr_patterns = []
|
|
616
|
-
|
|
617
|
-
for pageId_token in
|
|
654
|
+
page_attr_antipatterns = []
|
|
655
|
+
for pageId_token in re.split(r',', for_pageIds):
|
|
656
|
+
pageId_token_raw = pageId_token
|
|
657
|
+
# prefix for disambiguation of attribute?
|
|
658
|
+
attr = list(METS_PAGE_DIV_ATTRIBUTE) + list(METS_STRUCT_DIV_ATTRIBUTE)
|
|
659
|
+
for attr_type in [METS_STRUCT_DIV_ATTRIBUTE, METS_PAGE_DIV_ATTRIBUTE]:
|
|
660
|
+
if pageId_token.startswith(attr_type.type_prefix()):
|
|
661
|
+
for attr_val in list(attr_type):
|
|
662
|
+
if pageId_token.startswith(attr_val.prefix()):
|
|
663
|
+
# disambiguated to e.g. "logical:label:"
|
|
664
|
+
attr = [attr_val]
|
|
665
|
+
pageId_token = pageId_token[len(attr_val.prefix()):]
|
|
666
|
+
break
|
|
667
|
+
if len(attr) > 1:
|
|
668
|
+
# just "logical:" or "physical:"
|
|
669
|
+
attr = list(attr_type)
|
|
670
|
+
pageId_token = pageId_token[len(attr_type.type_prefix()):]
|
|
671
|
+
break
|
|
672
|
+
if not pageId_token:
|
|
673
|
+
raise ValueError("invalid pageId syntax '%s': empty after type prefix" % pageId_token_raw)
|
|
674
|
+
# negation prefix
|
|
675
|
+
if pageId_token.startswith('~'):
|
|
676
|
+
page_attr_xpatterns = page_attr_antipatterns
|
|
677
|
+
pageId_token = pageId_token[1:]
|
|
678
|
+
else:
|
|
679
|
+
page_attr_xpatterns = page_attr_patterns
|
|
680
|
+
if not pageId_token:
|
|
681
|
+
raise ValueError("invalid pageId syntax '%s': empty after negator prefix" % pageId_token_raw)
|
|
682
|
+
# operator prefix
|
|
618
683
|
if pageId_token.startswith(REGEX_PREFIX):
|
|
619
|
-
|
|
684
|
+
pageId_token = pageId_token[REGEX_PREFIX_LEN:]
|
|
685
|
+
if not pageId_token:
|
|
686
|
+
raise ValueError("invalid pageId syntax '%s': empty after regex prefix" % pageId_token_raw)
|
|
687
|
+
val_expr = re.compile(pageId_token)
|
|
688
|
+
page_attr_xpatterns.append(
|
|
689
|
+
METS_DIV_ATTRIBUTE_REGEX_PATTERN(val_expr, attr))
|
|
620
690
|
elif '..' in pageId_token:
|
|
621
|
-
|
|
622
|
-
|
|
691
|
+
try:
|
|
692
|
+
val_range = generate_range(*pageId_token.split('..', 1))
|
|
693
|
+
except ValueError as e:
|
|
694
|
+
raise ValueError("invalid pageId syntax '%s': %s" % (pageId_token_raw, str(e))) from None
|
|
695
|
+
page_attr_xpatterns.append(
|
|
696
|
+
METS_DIV_ATTRIBUTE_RANGE_PATTERN(val_range, attr))
|
|
623
697
|
else:
|
|
624
|
-
|
|
625
|
-
|
|
698
|
+
if not pageId_token:
|
|
699
|
+
raise ValueError("invalid pageId syntax '%s': empty" % pageId_token_raw)
|
|
700
|
+
page_attr_xpatterns.append(
|
|
701
|
+
METS_DIV_ATTRIBUTE_ATOM_PATTERN(pageId_token, attr))
|
|
702
|
+
log.debug("parsed pattern '%s' to %s", pageId_token_raw, page_attr_xpatterns[-1])
|
|
703
|
+
if not page_attr_patterns and not page_attr_antipatterns:
|
|
626
704
|
return []
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
if self._cache_flag:
|
|
630
|
-
for pat in page_attr_patterns:
|
|
631
|
-
try:
|
|
632
|
-
attr : METS_PAGE_DIV_ATTRIBUTE
|
|
633
|
-
if isinstance(pat, str):
|
|
634
|
-
attr = next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) if pat in self._page_cache[a])
|
|
635
|
-
cache_keys = [pat]
|
|
636
|
-
elif isinstance(pat, list):
|
|
637
|
-
attr = next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) if any(x in self._page_cache[a] for x in pat))
|
|
638
|
-
cache_keys = [v for v in pat if v in self._page_cache[attr]]
|
|
639
|
-
for k in cache_keys:
|
|
640
|
-
pat.remove(k)
|
|
641
|
-
elif isinstance(pat, tuple):
|
|
642
|
-
_, re_pat = pat
|
|
643
|
-
attr = next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) for v in self._page_cache[a] if re_pat.fullmatch(v))
|
|
644
|
-
cache_keys = [v for v in self._page_cache[attr] if re_pat.fullmatch(v)]
|
|
645
|
-
else:
|
|
646
|
-
raise ValueError
|
|
647
|
-
if return_divs:
|
|
648
|
-
ret += [self._page_cache[attr][v] for v in cache_keys]
|
|
649
|
-
else:
|
|
650
|
-
ret += [self._page_cache[attr][v].get('ID') for v in cache_keys]
|
|
651
|
-
except StopIteration:
|
|
652
|
-
raise ValueError(f"{pat} matches none of the keys of any of the _page_caches.")
|
|
705
|
+
if page_attr_patterns:
|
|
706
|
+
divs = self.get_physical_page_patterns(page_attr_patterns)
|
|
653
707
|
else:
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
patterns_exhausted.append(pat)
|
|
665
|
-
elif isinstance(pat, list):
|
|
666
|
-
if not isinstance(pat[0], METS_PAGE_DIV_ATTRIBUTE):
|
|
667
|
-
pat.insert(0, next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) if any(x == page.get(a.name) for x in pat)))
|
|
668
|
-
attr_val = page.get(pat[0].name)
|
|
669
|
-
if attr_val in pat:
|
|
670
|
-
pat.remove(attr_val)
|
|
671
|
-
ret.append(page if return_divs else page.get('ID'))
|
|
672
|
-
if len(pat) == 1:
|
|
673
|
-
patterns_exhausted.append(pat)
|
|
674
|
-
elif isinstance(pat, tuple):
|
|
675
|
-
attr, re_pat = pat
|
|
676
|
-
if not attr:
|
|
677
|
-
attr = next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) if re_pat.fullmatch(page.get(a.name) or ''))
|
|
678
|
-
page_attr_patterns[pat_idx] = (attr, re_pat)
|
|
679
|
-
if re_pat.fullmatch(page.get(attr.name) or ''):
|
|
680
|
-
ret.append(page if return_divs else page.get('ID'))
|
|
681
|
-
else:
|
|
682
|
-
raise ValueError
|
|
683
|
-
page_attr_patterns_matched.append(pat)
|
|
684
|
-
except StopIteration:
|
|
685
|
-
continue
|
|
686
|
-
for p in patterns_exhausted:
|
|
687
|
-
page_attr_patterns.remove(p)
|
|
688
|
-
unmatched = [x for x in page_attr_patterns_copy if x not in page_attr_patterns_matched]
|
|
689
|
-
if unmatched:
|
|
690
|
-
raise ValueError(f"Patterns {unmatched} match none of the pages")
|
|
691
|
-
|
|
692
|
-
ranges_without_start_match = []
|
|
693
|
-
ranges_without_last_match = []
|
|
694
|
-
for idx, pat in enumerate(page_attr_patterns_copy):
|
|
695
|
-
if isinstance(pat, list):
|
|
696
|
-
start, last = range_patterns_first_last[idx]
|
|
697
|
-
if start in pat:
|
|
698
|
-
print(pat, start, last)
|
|
699
|
-
ranges_without_start_match.append(page_attr_patterns_raw[idx])
|
|
700
|
-
# if last in pat:
|
|
701
|
-
# ranges_without_last_match.append(page_attr_patterns_raw[idx])
|
|
702
|
-
if ranges_without_start_match:
|
|
703
|
-
raise ValueError(f"Start of range patterns {ranges_without_start_match} not matched - invalid range")
|
|
704
|
-
# if ranges_without_last_match:
|
|
705
|
-
# raise ValueError(f"End of range patterns {ranges_without_last_match} not matched - invalid range")
|
|
706
|
-
return ret
|
|
708
|
+
all_pages = [METS_DIV_ATTRIBUTE_REGEX_PATTERN(
|
|
709
|
+
re.compile(".*"), [METS_PAGE_DIV_ATTRIBUTE.ID])]
|
|
710
|
+
divs = self.get_physical_page_patterns(all_pages)
|
|
711
|
+
if page_attr_antipatterns:
|
|
712
|
+
antidivs = self.get_physical_page_patterns(page_attr_antipatterns)
|
|
713
|
+
divs = [div for div in divs if div not in antidivs]
|
|
714
|
+
if return_divs:
|
|
715
|
+
return divs
|
|
716
|
+
else:
|
|
717
|
+
return [div.get('ID') for div in divs]
|
|
707
718
|
|
|
708
719
|
if for_fileIds == []:
|
|
709
720
|
return []
|
|
@@ -731,6 +742,129 @@ class OcrdMets(OcrdXmlDocument):
|
|
|
731
742
|
ret[index] = page.get('ID')
|
|
732
743
|
return ret
|
|
733
744
|
|
|
745
|
+
def get_physical_page_patterns(self, page_attr_patterns: List[METS_DIV_ATTRIBUTE_PATTERN]) -> List[ET._Element]:
    """
    Resolve page selection patterns to physical page ``mets:div`` elements.

    Every pattern is tried against its candidate attributes (``pat.attr``)
    in order; the first attribute that yields a match is used and the
    remaining candidates are not tried. Attributes of type
    ``METS_PAGE_DIV_ATTRIBUTE`` are matched against the physical page divs
    themselves. All other attributes are matched against the divs of the
    logical structMap and resolved to physical pages through the
    ``mets:structLink/mets:smLink`` entries.

    Range patterns are never matched against ``@TYPE`` or ``@LABEL``.

    Arguments:
        page_attr_patterns: the patterns to resolve.
            NOTE: the argument is modified. Matched values are removed
            from ``pat.expr`` of range patterns. Without caching,
            ``pat.attr`` is additionally narrowed to the attribute that
            matched, and exhausted atom/range patterns are removed from
            this very list.

    Returns:
        The matching page elements. With caching, results are collected
        pattern by pattern; without caching, page by page in document
        order. A page is appended once for every pattern it matches.

    Raises:
        ValueError: if some pattern matches nothing, or if the start
            value of a range pattern has not been matched.
    """
    log = getLogger('ocrd.models.ocrd_mets.get_physical_pages')
    ret = []
    # shallow copy: survives the in-place removal of exhausted patterns below,
    # but still references the same (mutated) pattern objects
    page_attr_patterns_copy = list(page_attr_patterns)
    if self._cache_flag:
        for pat in page_attr_patterns:
            # Reset per pattern: if every candidate attribute gets skipped
            # below (or there is none), the check after the loop must not
            # see an unbound name or the previous pattern's matches.
            cache_keys = []
            for attr in pat.attr:
                if isinstance(attr, METS_PAGE_DIV_ATTRIBUTE):
                    cache = self._page_cache[attr]
                else:
                    cache = self._struct_cache[attr]
                if (isinstance(pat, METS_DIV_ATTRIBUTE_RANGE_PATTERN) and
                    # @TYPE makes no sense in range expressions
                    # @LABEL makes no sense in range expressions
                    attr in [METS_STRUCT_DIV_ATTRIBUTE.TYPE,
                             METS_STRUCT_DIV_ATTRIBUTE.LABEL]):
                    continue
                if cache_keys := [v for v in cache if pat.matches(v)]:
                    if isinstance(attr, METS_PAGE_DIV_ATTRIBUTE):
                        # page cache maps attribute values to page elements
                        ret += [cache[v] for v in cache_keys]
                        log.debug('physical matches for %s: %s', pat, str(cache_keys))
                    else:
                        # struct cache maps attribute values to page IDs
                        for v in cache_keys:
                            ret += [self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID][p]
                                    for p in cache[v]]
                        log.debug('logical matches for %s: %s', pat, str(cache_keys))
                    if isinstance(pat, METS_DIV_ATTRIBUTE_RANGE_PATTERN):
                        # remove matches for final range check
                        for v in cache_keys:
                            pat.expr.remove(v)
                    break  # no more attributes for this pattern
            if not cache_keys:
                raise ValueError(f"{pat} matches none of the keys of any of the _page_caches and _struct_caches.")
    else:
        # cache logical structmap:
        el_struct_list = self._tree.getroot().findall("mets:structMap[@TYPE='LOGICAL']//mets:div", NS)
        el_smlink_list = self._tree.getroot().findall("mets:structLink/mets:smLink", NS)
        # logical div ID -> IDs of the physical pages linked to it
        smlink_map = {}
        for link in el_smlink_list:
            link_log = link.get('{%s}from' % NS['xlink'])
            link_phy = link.get('{%s}to' % NS['xlink'])
            smlink_map.setdefault(link_log, []).append(link_phy)
        # attribute -> attribute value -> IDs of the linked physical pages
        struct_cache = {k: {} for k in METS_STRUCT_DIV_ATTRIBUTE}
        for el_div in el_struct_list:
            for attr in METS_STRUCT_DIV_ATTRIBUTE:
                if not el_div.get(attr.name):
                    # avoid mapping None indiscriminately
                    continue
                val = struct_cache[attr].setdefault(str(el_div.get(attr.name)), [])
                val.extend(smlink_map.get(el_div.get('ID'), []))
        log.debug("found %d smLink entries for %d logical divs", len(el_smlink_list), len(el_struct_list))
        for page in self._tree.getroot().xpath(
                'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]',
                namespaces=NS):
            patterns_exhausted = []
            for pat in page_attr_patterns:
                for attr in pat.attr:
                    if isinstance(attr, METS_PAGE_DIV_ATTRIBUTE):
                        # single candidate: this page's own attribute value
                        cache = [page.get(attr.name) or '']
                    else:
                        cache = struct_cache[attr]
                    if (isinstance(pat, METS_DIV_ATTRIBUTE_RANGE_PATTERN) and
                        # @TYPE makes no sense in range expressions
                        # @LABEL makes no sense in range expressions
                        attr in [METS_STRUCT_DIV_ATTRIBUTE.TYPE,
                                 METS_STRUCT_DIV_ATTRIBUTE.LABEL]):
                        continue
                    if cache_keys := [v for v in cache if pat.matches(v)]:
                        pat.attr = [attr]  # disambiguate next
                        if isinstance(attr, METS_PAGE_DIV_ATTRIBUTE):
                            ret.append(page)
                            log.debug('physical match for %s on page %s', pat, page.get('ID'))
                            if isinstance(pat, METS_DIV_ATTRIBUTE_ATOM_PATTERN):
                                patterns_exhausted.append(pat)
                            elif isinstance(pat, METS_DIV_ATTRIBUTE_RANGE_PATTERN):
                                # remove for efficiency and final range check
                                pat.expr.remove(cache_keys[0])
                                if not pat.expr:
                                    patterns_exhausted.append(pat)
                        elif cache_key := next((v for v in cache_keys
                                                if page.get('ID') in cache[v]), None):
                            ret.append(page)
                            log.debug('logical match for %s on page %s', pat, page.get('ID'))
                            cache[cache_key].remove(page.get('ID'))
                            # remove for efficiency and final range check
                            if not cache[cache_key]:
                                if isinstance(pat, METS_DIV_ATTRIBUTE_ATOM_PATTERN):
                                    patterns_exhausted.append(pat)
                                elif isinstance(pat, METS_DIV_ATTRIBUTE_RANGE_PATTERN):
                                    pat.expr.remove(cache_key)
                                    if not pat.expr:
                                        patterns_exhausted.append(pat)
                        break  # no more attributes for this pattern
                # keep matching in order to exhaust and consume pattern list
                # (i.e. do not stop at the first pattern matching this page)
            for p in patterns_exhausted:
                page_attr_patterns.remove(p)
        unmatched = [pat for pat in page_attr_patterns_copy
                     if not pat.has_matched]
        if unmatched:
            raise ValueError(f"Patterns {unmatched} match none of the pages")

    ranges_without_start_match = []
    for pat in page_attr_patterns_copy:
        if isinstance(pat, METS_DIV_ATTRIBUTE_RANGE_PATTERN):
            # range expression, expanded to pattern list
            # list items get consumed (pat.expr.remove) when matched,
            # exhausted patterns also get consumed (page_attr_patterns.remove)
            # (but top-level list copy references the same list objects)
            if pat.start in pat.expr:
                log.debug((pat, pat.expr))
                ranges_without_start_match.append(pat)
    if ranges_without_start_match:
        raise ValueError(f"Start of range patterns {ranges_without_start_match} not matched - invalid range")
    # NOTE: only the start of a range is validated here; there is
    # no corresponding check for the end of a range (pat.stop).
    return ret
|
|
867
|
+
|
|
734
868
|
def set_physical_page_for_file(self, pageId : str, ocrd_file : OcrdFile,
|
|
735
869
|
order : Optional[str] = None, orderlabel : Optional[str] = None) -> None:
|
|
736
870
|
"""
|