ocrd 3.4.1__tar.gz → 3.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (158) hide show
  1. {ocrd-3.4.1/src/ocrd.egg-info → ocrd-3.5.0}/PKG-INFO +2 -2
  2. ocrd-3.5.0/VERSION +1 -0
  3. {ocrd-3.4.1 → ocrd-3.5.0}/requirements.txt +1 -1
  4. {ocrd-3.4.1 → ocrd-3.5.0/src/ocrd.egg-info}/PKG-INFO +2 -2
  5. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd.egg-info/requires.txt +1 -1
  6. ocrd-3.5.0/src/ocrd_models/constants.py +205 -0
  7. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_models/ocrd_mets.py +231 -97
  8. ocrd-3.4.1/VERSION +0 -1
  9. ocrd-3.4.1/src/ocrd_models/constants.py +0 -100
  10. {ocrd-3.4.1 → ocrd-3.5.0}/LICENSE +0 -0
  11. {ocrd-3.4.1 → ocrd-3.5.0}/MANIFEST.in +0 -0
  12. {ocrd-3.4.1 → ocrd-3.5.0}/README.md +0 -0
  13. {ocrd-3.4.1 → ocrd-3.5.0}/README_bashlib.md +0 -0
  14. {ocrd-3.4.1 → ocrd-3.5.0}/README_ocrd.md +0 -0
  15. {ocrd-3.4.1 → ocrd-3.5.0}/README_ocrd_modelfactory.md +0 -0
  16. {ocrd-3.4.1 → ocrd-3.5.0}/README_ocrd_models.md +0 -0
  17. {ocrd-3.4.1 → ocrd-3.5.0}/README_ocrd_network.md +0 -0
  18. {ocrd-3.4.1 → ocrd-3.5.0}/README_ocrd_utils.md +0 -0
  19. {ocrd-3.4.1 → ocrd-3.5.0}/README_ocrd_validators.md +0 -0
  20. {ocrd-3.4.1 → ocrd-3.5.0}/pyproject.toml +0 -0
  21. {ocrd-3.4.1 → ocrd-3.5.0}/setup.cfg +0 -0
  22. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/__init__.py +0 -0
  23. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/cli/__init__.py +0 -0
  24. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/cli/bashlib.py +0 -0
  25. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/cli/log.py +0 -0
  26. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/cli/network.py +0 -0
  27. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/cli/ocrd_tool.py +0 -0
  28. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/cli/process.py +0 -0
  29. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/cli/resmgr.py +0 -0
  30. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/cli/validate.py +0 -0
  31. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/cli/workspace.py +0 -0
  32. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/cli/zip.py +0 -0
  33. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/constants.py +0 -0
  34. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/decorators/__init__.py +0 -0
  35. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/decorators/loglevel_option.py +0 -0
  36. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/decorators/mets_find_options.py +0 -0
  37. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/decorators/ocrd_cli_options.py +0 -0
  38. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/decorators/parameter_option.py +0 -0
  39. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/lib.bash +0 -0
  40. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/mets_server.py +0 -0
  41. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/ocrd-all-tool.json +0 -0
  42. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/processor/__init__.py +0 -0
  43. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/processor/base.py +0 -0
  44. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/processor/builtin/__init__.py +0 -0
  45. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/processor/builtin/dummy/__init__.py +0 -0
  46. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/processor/builtin/dummy/ocrd-tool.json +0 -0
  47. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/processor/builtin/dummy_processor.py +0 -0
  48. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/processor/builtin/filter_processor.py +0 -0
  49. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/processor/helpers.py +0 -0
  50. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/processor/ocrd_page_result.py +0 -0
  51. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/resolver.py +0 -0
  52. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/resource_list.yml +0 -0
  53. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/resource_manager.py +0 -0
  54. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/task_sequence.py +0 -0
  55. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/workspace.py +0 -0
  56. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/workspace_backup.py +0 -0
  57. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd/workspace_bagger.py +0 -0
  58. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd.egg-info/SOURCES.txt +0 -0
  59. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd.egg-info/dependency_links.txt +0 -0
  60. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd.egg-info/entry_points.txt +0 -0
  61. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd.egg-info/top_level.txt +0 -0
  62. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_modelfactory/__init__.py +0 -0
  63. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_models/__init__.py +0 -0
  64. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_models/mets-empty.xml +0 -0
  65. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_models/ocrd_agent.py +0 -0
  66. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_models/ocrd_exif.py +0 -0
  67. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_models/ocrd_file.py +0 -0
  68. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_models/ocrd_page.py +0 -0
  69. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_models/ocrd_page_generateds.py +0 -0
  70. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_models/ocrd_xml_base.py +0 -0
  71. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_models/report.py +0 -0
  72. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_models/utils.py +0 -0
  73. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_models/xpath_functions.py +0 -0
  74. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/__init__.py +0 -0
  75. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/cli/__init__.py +0 -0
  76. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/cli/client.py +0 -0
  77. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/cli/processing_server.py +0 -0
  78. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/cli/processing_worker.py +0 -0
  79. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/cli/processor_server.py +0 -0
  80. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/client.py +0 -0
  81. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/client_utils.py +0 -0
  82. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/constants.py +0 -0
  83. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/database.py +0 -0
  84. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/logging_utils.py +0 -0
  85. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/models/__init__.py +0 -0
  86. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/models/job.py +0 -0
  87. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/models/messages.py +0 -0
  88. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/models/ocrd_tool.py +0 -0
  89. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/models/workflow.py +0 -0
  90. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/models/workspace.py +0 -0
  91. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/param_validators.py +0 -0
  92. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/process_helpers.py +0 -0
  93. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/processing_server.py +0 -0
  94. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/processing_worker.py +0 -0
  95. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/processor_server.py +0 -0
  96. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/rabbitmq_utils/__init__.py +0 -0
  97. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/rabbitmq_utils/connector.py +0 -0
  98. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/rabbitmq_utils/constants.py +0 -0
  99. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/rabbitmq_utils/consumer.py +0 -0
  100. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/rabbitmq_utils/helpers.py +0 -0
  101. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/rabbitmq_utils/ocrd_messages.py +0 -0
  102. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/rabbitmq_utils/publisher.py +0 -0
  103. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/runtime_data/__init__.py +0 -0
  104. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/runtime_data/config_parser.py +0 -0
  105. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/runtime_data/connection_clients.py +0 -0
  106. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/runtime_data/deployer.py +0 -0
  107. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/runtime_data/hosts.py +0 -0
  108. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/runtime_data/network_agents.py +0 -0
  109. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/runtime_data/network_services.py +0 -0
  110. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/server_cache.py +0 -0
  111. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/server_utils.py +0 -0
  112. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/tcp_to_uds_mets_proxy.py +0 -0
  113. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_network/utils.py +0 -0
  114. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_utils/__init__.py +0 -0
  115. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_utils/config.py +0 -0
  116. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_utils/constants.py +0 -0
  117. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_utils/deprecate.py +0 -0
  118. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_utils/image.py +0 -0
  119. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_utils/introspect.py +0 -0
  120. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_utils/logging.py +0 -0
  121. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_utils/ocrd_logging.conf +0 -0
  122. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_utils/os.py +0 -0
  123. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_utils/str.py +0 -0
  124. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_validators/__init__.py +0 -0
  125. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_validators/bagit-profile.yml +0 -0
  126. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_validators/constants.py +0 -0
  127. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_validators/json_validator.py +0 -0
  128. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_validators/message_processing.schema.yml +0 -0
  129. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_validators/message_result.schema.yml +0 -0
  130. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_validators/mets.xsd +0 -0
  131. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_validators/ocrd_network_message_validator.py +0 -0
  132. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_validators/ocrd_tool.schema.yml +0 -0
  133. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_validators/ocrd_tool_validator.py +0 -0
  134. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_validators/ocrd_zip_validator.py +0 -0
  135. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_validators/page.xsd +0 -0
  136. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_validators/page_validator.py +0 -0
  137. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_validators/parameter_validator.py +0 -0
  138. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_validators/processing_server_config.schema.yml +0 -0
  139. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_validators/processing_server_config_validator.py +0 -0
  140. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_validators/resource_list_validator.py +0 -0
  141. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_validators/workspace_validator.py +0 -0
  142. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_validators/xlink.xsd +0 -0
  143. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_validators/xsd_mets_validator.py +0 -0
  144. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_validators/xsd_page_validator.py +0 -0
  145. {ocrd-3.4.1 → ocrd-3.5.0}/src/ocrd_validators/xsd_validator.py +0 -0
  146. {ocrd-3.4.1 → ocrd-3.5.0}/tests/test_decorators.py +0 -0
  147. {ocrd-3.4.1 → ocrd-3.5.0}/tests/test_logging.py +0 -0
  148. {ocrd-3.4.1 → ocrd-3.5.0}/tests/test_logging_conf.py +0 -0
  149. {ocrd-3.4.1 → ocrd-3.5.0}/tests/test_mets_server.py +0 -0
  150. {ocrd-3.4.1 → ocrd-3.5.0}/tests/test_model_factory.py +0 -0
  151. {ocrd-3.4.1 → ocrd-3.5.0}/tests/test_resolver.py +0 -0
  152. {ocrd-3.4.1 → ocrd-3.5.0}/tests/test_resolver_oai.py +0 -0
  153. {ocrd-3.4.1 → ocrd-3.5.0}/tests/test_resource_manager.py +0 -0
  154. {ocrd-3.4.1 → ocrd-3.5.0}/tests/test_task_sequence.py +0 -0
  155. {ocrd-3.4.1 → ocrd-3.5.0}/tests/test_utils.py +0 -0
  156. {ocrd-3.4.1 → ocrd-3.5.0}/tests/test_version.py +0 -0
  157. {ocrd-3.4.1 → ocrd-3.5.0}/tests/test_workspace.py +0 -0
  158. {ocrd-3.4.1 → ocrd-3.5.0}/tests/test_workspace_remove.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ocrd
3
- Version: 3.4.1
3
+ Version: 3.5.0
4
4
  Summary: OCR-D framework
5
5
  Author-email: Konstantin Baierer <unixprog@gmail.com>
6
6
  License: Apache License 2.0
@@ -21,7 +21,7 @@ Requires-Dist: elementpath
21
21
  Requires-Dist: fastapi>=0.78.0
22
22
  Requires-Dist: filetype
23
23
  Requires-Dist: Flask
24
- Requires-Dist: frozendict>=2.3.4
24
+ Requires-Dist: frozendict>=2.4.0
25
25
  Requires-Dist: gdown
26
26
  Requires-Dist: httpx>=0.22.0
27
27
  Requires-Dist: importlib_metadata; python_version < "3.8"
ocrd-3.5.0/VERSION ADDED
@@ -0,0 +1 @@
1
+ 3.5.0
@@ -8,7 +8,7 @@ elementpath
8
8
  fastapi>=0.78.0
9
9
  filetype
10
10
  Flask
11
- frozendict>=2.3.4
11
+ frozendict>=2.4.0
12
12
  gdown
13
13
  httpx>=0.22.0
14
14
  importlib_metadata ; python_version < '3.8'
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ocrd
3
- Version: 3.4.1
3
+ Version: 3.5.0
4
4
  Summary: OCR-D framework
5
5
  Author-email: Konstantin Baierer <unixprog@gmail.com>
6
6
  License: Apache License 2.0
@@ -21,7 +21,7 @@ Requires-Dist: elementpath
21
21
  Requires-Dist: fastapi>=0.78.0
22
22
  Requires-Dist: filetype
23
23
  Requires-Dist: Flask
24
- Requires-Dist: frozendict>=2.3.4
24
+ Requires-Dist: frozendict>=2.4.0
25
25
  Requires-Dist: gdown
26
26
  Requires-Dist: httpx>=0.22.0
27
27
  Requires-Dist: importlib_metadata; python_version < "3.8"
@@ -8,7 +8,7 @@ elementpath
8
8
  fastapi>=0.78.0
9
9
  filetype
10
10
  Flask
11
- frozendict>=2.3.4
11
+ frozendict>=2.4.0
12
12
  gdown
13
13
  httpx>=0.22.0
14
14
  jsonschema>=4
@@ -0,0 +1,205 @@
1
+ """
2
+ Constants for ocrd_models.
3
+ """
4
+ from re import Pattern
5
+ from enum import Enum, auto
6
+ from dataclasses import dataclass, field
7
+ from abc import ABC, abstractmethod
8
+ from typing import Any, List, Optional, Union
9
+ from ocrd_utils import resource_string
10
+
11
+ __all__ = [
12
+ 'IDENTIFIER_PRIORITY',
13
+ 'METS_XML_EMPTY',
14
+ 'NAMESPACES',
15
+ 'TAG_METS_AGENT',
16
+ 'TAG_METS_DIV',
17
+ 'TAG_METS_FILE',
18
+ 'TAG_METS_FILEGRP',
19
+ 'TAG_METS_FILESEC',
20
+ 'TAG_METS_FPTR',
21
+ 'TAG_METS_FLOCAT',
22
+ 'TAG_METS_METSHDR',
23
+ 'TAG_METS_NAME',
24
+ 'TAG_METS_NOTE',
25
+ 'TAG_METS_STRUCTMAP',
26
+ 'TAG_MODS_IDENTIFIER',
27
+ 'TAG_PAGE_ALTERNATIVEIMAGE',
28
+ 'TAG_PAGE_COORDS',
29
+ 'TAG_PAGE_READINGORDER',
30
+ 'TAG_PAGE_REGIONREFINDEXED',
31
+ 'TAG_PAGE_TEXTLINE',
32
+ 'TAG_PAGE_TEXTEQUIV',
33
+ 'TAG_PAGE_TEXTREGION',
34
+ 'METS_PAGE_DIV_ATTRIBUTE',
35
+ 'METS_STRUCT_DIV_ATTRIBUTE',
36
+ 'METS_DIV_ATTRIBUTE_ATOM_PATTERN',
37
+ 'METS_DIV_ATTRIBUTE_RANGE_PATTERN',
38
+ 'METS_DIV_ATTRIBUTE_REGEX_PATTERN',
39
+ 'PAGE_REGION_TYPES',
40
+ 'PAGE_ALTIMG_FEATURES',
41
+ ]
42
+
43
+
44
+ IDENTIFIER_PRIORITY = ['purl', 'urn', 'doi', 'url']
45
+
46
+ METS_XML_EMPTY = resource_string(__package__, 'mets-empty.xml')
47
+
48
+ NAMESPACES = {
49
+ 'mets': "http://www.loc.gov/METS/",
50
+ 'mods': "http://www.loc.gov/mods/v3",
51
+ 'xlink': "http://www.w3.org/1999/xlink",
52
+ 'page': "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15",
53
+ 'xsl': 'http://www.w3.org/1999/XSL/Transform#',
54
+ 'ocrd': 'https://ocr-d.de',
55
+ }
56
+
57
+ TAG_METS_AGENT = '{%s}agent' % NAMESPACES['mets']
58
+ TAG_METS_DIV = '{%s}div' % NAMESPACES['mets']
59
+ TAG_METS_FILE = '{%s}file' % NAMESPACES['mets']
60
+ TAG_METS_FILEGRP = '{%s}fileGrp' % NAMESPACES['mets']
61
+ TAG_METS_FILESEC = '{%s}fileSec' % NAMESPACES['mets']
62
+ TAG_METS_FPTR = '{%s}fptr' % NAMESPACES['mets']
63
+ TAG_METS_FLOCAT = '{%s}FLocat' % NAMESPACES['mets']
64
+ TAG_METS_METSHDR = '{%s}metsHdr' % NAMESPACES['mets']
65
+ TAG_METS_NAME = '{%s}name' % NAMESPACES['mets']
66
+ TAG_METS_NOTE = '{%s}note' % NAMESPACES['mets']
67
+ TAG_METS_STRUCTMAP = '{%s}structMap' % NAMESPACES['mets']
68
+
69
+ TAG_MODS_IDENTIFIER = '{%s}identifier' % NAMESPACES['mods']
70
+
71
+ TAG_PAGE_ALTERNATIVEIMAGE = '{%s}AlternativeImage' % NAMESPACES['page']
72
+ TAG_PAGE_COORDS = '{%s}Coords' % NAMESPACES['page']
73
+ TAG_PAGE_READINGORDER = '{%s}ReadingOrder' % NAMESPACES['page']
74
+ TAG_PAGE_REGIONREFINDEXED = '{%s}RegionRefIndexed' % NAMESPACES['page']
75
+ TAG_PAGE_TEXTLINE = '{%s}TextLine' % NAMESPACES['page']
76
+ TAG_PAGE_TEXTEQUIV = '{%s}TextEquiv' % NAMESPACES['page']
77
+ TAG_PAGE_TEXTREGION = '{%s}TextRegion' % NAMESPACES['page']
78
+
79
+ PAGE_REGION_TYPES = [
80
+ 'Advert', 'Chart', 'Chem', 'Custom', 'Graphic', 'Image',
81
+ 'LineDrawing', 'Map', 'Maths', 'Music', 'Noise',
82
+ 'Separator', 'Table', 'Text', 'Unknown'
83
+ ]
84
+
85
+ PAGE_ALTIMG_FEATURES = [
86
+ 'binarized',
87
+ 'grayscale_normalized',
88
+ 'despeckled',
89
+ 'cropped',
90
+ 'deskewed',
91
+ 'rotated-90',
92
+ 'rotated-180',
93
+ 'rotated-270',
94
+ 'dewarped',
95
+ 'clipped',
96
+ ]
97
+
98
+
99
+ class METS_PAGE_DIV_ATTRIBUTE(Enum):
100
+ """page selection attributes of PHYSICAL mets:structMap//mets:div"""
101
+ ID = auto()
102
+ ORDER = auto()
103
+ ORDERLABEL = auto()
104
+ LABEL = auto()
105
+ CONTENTIDS = auto()
106
+
107
+ @classmethod
108
+ def names(cls):
109
+ return [x.name for x in cls]
110
+ @classmethod
111
+ def type_prefix(cls):
112
+ """disambiguation prefix to use for all subtypes"""
113
+ return "physical:"
114
+ def prefix(self):
115
+ """disambiguation prefix to use for this attribute type"""
116
+ return self.type_prefix() + self.name.lower() + ":"
117
+
118
+ class METS_STRUCT_DIV_ATTRIBUTE(Enum):
119
+ """page selection attributes of LOGICAL mets:structMap//mets:div"""
120
+ ID = auto()
121
+ DMDID = auto()
122
+ TYPE = auto()
123
+ LABEL = auto()
124
+
125
+ @classmethod
126
+ def names(cls):
127
+ return [x.name for x in cls]
128
+ @classmethod
129
+ def type_prefix(cls):
130
+ """disambiguation prefix to use for all subtypes"""
131
+ return "logical:"
132
+ def prefix(self):
133
+ """disambiguation prefix to use for this attribute type"""
134
+ return self.type_prefix() + self.name.lower() + ":"
135
+
136
+ @dataclass
137
+ class METS_DIV_ATTRIBUTE_PATTERN(ABC):
138
+ """page selection pattern (abstract supertype)"""
139
+
140
+ expr: Any
141
+ """pattern value to match a mets:div against"""
142
+ attr: List[Union[METS_PAGE_DIV_ATTRIBUTE, METS_STRUCT_DIV_ATTRIBUTE]] = field(
143
+ default_factory=lambda: list(METS_PAGE_DIV_ATTRIBUTE) + list(METS_STRUCT_DIV_ATTRIBUTE))
144
+ """attribute type(s) to match a mets:div for
145
+ (pre-disambiguated with prefix syntax, or filled upon first match)
146
+ """
147
+ has_matched: bool = field(init=False, default=False)
148
+ """whether this pattern has already been matched"""
149
+
150
+ def attr_prefix(self):
151
+ """attribute type disambiguation prefix corresponding to the current state of disambiguation"""
152
+ if self.attr == list(METS_PAGE_DIV_ATTRIBUTE) + list(METS_STRUCT_DIV_ATTRIBUTE):
153
+ return ""
154
+ if self.attr == list(METS_PAGE_DIV_ATTRIBUTE):
155
+ return METS_PAGE_DIV_ATTRIBUTE.type_prefix()
156
+ if self.attr == list(METS_STRUCT_DIV_ATTRIBUTE):
157
+ return METS_STRUCT_DIV_ATTRIBUTE.type_prefix()
158
+ assert len(self.attr) == 1, "unexpected type ambiguity: %s" % repr(self.attr)
159
+ return self.attr[0].prefix()
160
+
161
+ @abstractmethod
162
+ def _matches(self, input) -> bool:
163
+ return
164
+ def matches(self, input) -> bool:
165
+ """does the selection pattern match on the given attribute value?"""
166
+ if (matched := self._matches(input)):
167
+ self.has_matched = True
168
+ return matched
169
+
170
+ @dataclass
171
+ class METS_DIV_ATTRIBUTE_ATOM_PATTERN(METS_DIV_ATTRIBUTE_PATTERN):
172
+ """page selection pattern for literal (single value) matching"""
173
+
174
+ expr: str
175
+ def __repr__(self):
176
+ return "%s%s" % (self.attr_prefix(), self.expr)
177
+ def _matches(self, input):
178
+ return input == self.expr
179
+
180
+ @dataclass
181
+ class METS_DIV_ATTRIBUTE_RANGE_PATTERN(METS_DIV_ATTRIBUTE_PATTERN):
182
+ """page selection pattern for interval (list expansion) matching"""
183
+
184
+ expr: List[str]
185
+ start: str = field(init=False)
186
+ """first value of the range after expansion, before matched values are removed"""
187
+ stop: str = field(init=False)
188
+ """last value of the range after expansion, before matched values are removed"""
189
+ def __post_init__(self):
190
+ self.start = self.expr[0]
191
+ self.stop = self.expr[-1]
192
+ def __repr__(self):
193
+ return "%s%s..%s" % (self.attr_prefix(), self.start, self.stop)
194
+ def _matches(self, input):
195
+ return input in self.expr
196
+
197
+ @dataclass
198
+ class METS_DIV_ATTRIBUTE_REGEX_PATTERN(METS_DIV_ATTRIBUTE_PATTERN):
199
+ """page selection pattern for regular expression matching"""
200
+
201
+ expr: Pattern
202
+ def __repr__(self):
203
+ return "%s//%s" % (self.attr_prefix(), self.expr.pattern)
204
+ def _matches(self, input):
205
+ return bool(self.expr.fullmatch(input))
@@ -29,7 +29,12 @@ from .constants import (
29
29
  IDENTIFIER_PRIORITY,
30
30
  TAG_MODS_IDENTIFIER,
31
31
  METS_XML_EMPTY,
32
- METS_PAGE_DIV_ATTRIBUTE
32
+ METS_PAGE_DIV_ATTRIBUTE,
33
+ METS_STRUCT_DIV_ATTRIBUTE,
34
+ METS_DIV_ATTRIBUTE_PATTERN,
35
+ METS_DIV_ATTRIBUTE_ATOM_PATTERN,
36
+ METS_DIV_ATTRIBUTE_RANGE_PATTERN,
37
+ METS_DIV_ATTRIBUTE_REGEX_PATTERN,
33
38
  )
34
39
 
35
40
  from .ocrd_xml_base import OcrdXmlDocument, ET # type: ignore
@@ -43,9 +48,11 @@ class OcrdMets(OcrdXmlDocument):
43
48
  API to a single METS file
44
49
  """
45
50
  _cache_flag : bool
46
- # Cache for the pages (mets:div)
47
- # The dictionary's Key: 'div.ID'
48
- # The dictionary's Value: a 'div' object at some memory location
51
+ # Cache for the physical pages (mets:div) - two nested dictionaries
52
+ # The outer dictionary's key: attribute type
53
+ # The outer dictionary's value: inner dictionary
54
+ # The inner dictionary's key: attribute value (str)
55
+ # The inner dictionary's value: a 'div' object at some memory location
49
56
  _page_cache : Dict[METS_PAGE_DIV_ATTRIBUTE, Dict[str, ET._Element]]
50
57
  # Cache for the files (mets:file) - two nested dictionaries
51
58
  # The outer dictionary's Key: 'fileGrp.USE'
@@ -59,6 +66,12 @@ class OcrdMets(OcrdXmlDocument):
59
66
  # The inner dictionary's Key: 'fptr.FILEID'
60
67
  # The inner dictionary's Value: a 'fptr' object at some memory location
61
68
  _fptr_cache : Dict[str, Dict[str, ET._Element]]
69
+ # Cache for the logical structural divs (mets:div) - two nested dictionaries
70
+ # The outer dictionary's key: attribute type
71
+ # The outer dictionary's value: inner dictionary
72
+ # The inner dictionary's key: attribute value (str)
73
+ # The inner dictionary's value: a list of corresponding physical div.ID
74
+ _struct_cache : Dict[METS_STRUCT_DIV_ATTRIBUTE, Dict[str, List[str]]]
62
75
 
63
76
  @staticmethod
64
77
  def empty_mets(now : Optional[str] = None, cache_flag : bool = False):
@@ -111,7 +124,6 @@ class OcrdMets(OcrdXmlDocument):
111
124
  return
112
125
 
113
126
  log = getLogger('ocrd.models.ocrd_mets._fill_caches-files')
114
-
115
127
  for el_fileGrp in el_fileSec.findall('mets:fileGrp', NS):
116
128
  fileGrp_use = el_fileGrp.get('USE')
117
129
 
@@ -124,10 +136,10 @@ class OcrdMets(OcrdXmlDocument):
124
136
  # log.info("File added to the cache: %s" % file_id)
125
137
 
126
138
  # Fill with pages
139
+ log = getLogger('ocrd.models.ocrd_mets._fill_caches-pages')
127
140
  el_div_list = tree_root.findall(".//mets:div[@TYPE='page']", NS)
128
141
  if len(el_div_list) == 0:
129
142
  return
130
- log = getLogger('ocrd.models.ocrd_mets._fill_caches-pages')
131
143
 
132
144
  for el_div in el_div_list:
133
145
  div_id = el_div.get('ID')
@@ -148,11 +160,30 @@ class OcrdMets(OcrdXmlDocument):
148
160
  # log.info("Len of page_cache: %s" % len(self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID]))
149
161
  # log.info("Len of fptr_cache: %s" % len(self._fptr_cache))
150
162
 
163
+ # Fill with logical divs
164
+ log = getLogger('ocrd.models.ocrd_mets._fill_caches-structs')
165
+ el_struct_list = tree_root.findall("mets:structMap[@TYPE='LOGICAL']//mets:div", NS)
166
+ el_smlink_list = tree_root.findall("mets:structLink/mets:smLink", NS)
167
+ if len(el_struct_list) == 0 or len(el_smlink_list) == 0:
168
+ return
169
+ smlink_map = {}
170
+ for link in el_smlink_list:
171
+ link_log = link.get('{%s}from' % NS['xlink'])
172
+ link_phy = link.get('{%s}to' % NS['xlink'])
173
+ smlink_map.setdefault(link_log, list()).append(link_phy)
174
+ for el_div in el_struct_list:
175
+ for attr in METS_STRUCT_DIV_ATTRIBUTE:
176
+ val = self._struct_cache[attr].setdefault(str(el_div.get(attr.name)), list())
177
+ val.extend(smlink_map.get(el_div.get('ID'), []))
178
+
179
+ # log.info("Len of struct_cache: %s" % len(self._struct_cache[METS_STRUCT_DIV_ATTRIBUTE.ID]))
180
+
151
181
  def _initialize_caches(self) -> None:
152
182
  self._file_cache = {}
153
183
  # NOTE we can only guarantee uniqueness for @ID and @ORDER
154
184
  self._page_cache = {k : {} for k in METS_PAGE_DIV_ATTRIBUTE}
155
185
  self._fptr_cache = {}
186
+ self._struct_cache = {k : {} for k in METS_STRUCT_DIV_ATTRIBUTE}
156
187
 
157
188
  def _refresh_caches(self) -> None:
158
189
  if self._cache_flag:
@@ -253,12 +284,20 @@ class OcrdMets(OcrdXmlDocument):
253
284
  :py:attr:`url` and :py:attr:`mimetype` parameters can each be either a
254
285
  literal string, or a regular expression if the string starts with
255
286
  ``//`` (double slash).
287
+
256
288
  If it is a regex, the leading ``//`` is removed and candidates are matched
257
289
  against the regex with `re.fullmatch`. If it is a literal string, comparison
258
290
  is done with string equality.
259
- The :py:attr:`pageId` parameter supports the numeric range operator ``..``. For
260
- example, to find all files in pages ``PHYS_0001`` to ``PHYS_0003``,
261
- ``PHYS_0001..PHYS_0003`` will be expanded to ``PHYS_0001,PHYS_0002,PHYS_0003``.
291
+
292
+ The :py:attr:`pageId` parameter also supports comma-separated lists, as well
293
+ as the numeric range operator ``..`` and the negation operator ``~``.
294
+
295
+ For example, to find all files in pages ``PHYS_0001`` to ``PHYS_0003``,
296
+ both expressions ``PHYS_0001..PHYS_0003`` and ``PHYS_0001,PHYS_0002,PHYS_0003``
297
+ will be expanded to the same 3 pages. To find all files above that subrange,
298
+ both expressions ``~PHYS_0001..PHYS_0003`` and ``~PHYS_0001,~PHYS_0002,~PHYS_0003``
299
+ will be expanded to ``PHYS_0004`` and upwards.
300
+
262
301
  Keyword Args:
263
302
  ID (string) : ``@ID`` of the ``mets:file``
264
303
  fileGrp (string) : ``@USE`` of the ``mets:fileGrp`` to list files of
@@ -609,101 +648,73 @@ class OcrdMets(OcrdXmlDocument):
609
648
 
610
649
  return self.physical_pages
611
650
 
612
- # log = getLogger('ocrd.models.ocrd_mets.get_physical_pages')
651
+ log = getLogger('ocrd.models.ocrd_mets.get_physical_pages')
613
652
  if for_pageIds is not None:
614
- ret = []
615
653
  page_attr_patterns = []
616
- page_attr_patterns_raw = re.split(r',', for_pageIds)
617
- for pageId_token in page_attr_patterns_raw:
654
+ page_attr_antipatterns = []
655
+ for pageId_token in re.split(r',', for_pageIds):
656
+ pageId_token_raw = pageId_token
657
+ # prefix for disambiguation of attribute?
658
+ attr = list(METS_PAGE_DIV_ATTRIBUTE) + list(METS_STRUCT_DIV_ATTRIBUTE)
659
+ for attr_type in [METS_STRUCT_DIV_ATTRIBUTE, METS_PAGE_DIV_ATTRIBUTE]:
660
+ if pageId_token.startswith(attr_type.type_prefix()):
661
+ for attr_val in list(attr_type):
662
+ if pageId_token.startswith(attr_val.prefix()):
663
+ # disambiguated to e.g. "logical:label:"
664
+ attr = [attr_val]
665
+ pageId_token = pageId_token[len(attr_val.prefix()):]
666
+ break
667
+ if len(attr) > 1:
668
+ # just "logical:" or "physical:"
669
+ attr = list(attr_type)
670
+ pageId_token = pageId_token[len(attr_type.type_prefix()):]
671
+ break
672
+ if not pageId_token:
673
+ raise ValueError("invalid pageId syntax '%s': empty after type prefix" % pageId_token_raw)
674
+ # negation prefix
675
+ if pageId_token.startswith('~'):
676
+ page_attr_xpatterns = page_attr_antipatterns
677
+ pageId_token = pageId_token[1:]
678
+ else:
679
+ page_attr_xpatterns = page_attr_patterns
680
+ if not pageId_token:
681
+ raise ValueError("invalid pageId syntax '%s': empty after negator prefix" % pageId_token_raw)
682
+ # operator prefix
618
683
  if pageId_token.startswith(REGEX_PREFIX):
619
- page_attr_patterns.append((None, re.compile(pageId_token[REGEX_PREFIX_LEN:])))
684
+ pageId_token = pageId_token[REGEX_PREFIX_LEN:]
685
+ if not pageId_token:
686
+ raise ValueError("invalid pageId syntax '%s': empty after regex prefix" % pageId_token_raw)
687
+ val_expr = re.compile(pageId_token)
688
+ page_attr_xpatterns.append(
689
+ METS_DIV_ATTRIBUTE_REGEX_PATTERN(val_expr, attr))
620
690
  elif '..' in pageId_token:
621
- val_range = generate_range(*pageId_token.split('..', 1))
622
- page_attr_patterns.append(val_range)
691
+ try:
692
+ val_range = generate_range(*pageId_token.split('..', 1))
693
+ except ValueError as e:
694
+ raise ValueError("invalid pageId syntax '%s': %s" % (pageId_token_raw, str(e))) from None
695
+ page_attr_xpatterns.append(
696
+ METS_DIV_ATTRIBUTE_RANGE_PATTERN(val_range, attr))
623
697
  else:
624
- page_attr_patterns.append(pageId_token)
625
- if not page_attr_patterns:
698
+ if not pageId_token:
699
+ raise ValueError("invalid pageId syntax '%s': empty" % pageId_token_raw)
700
+ page_attr_xpatterns.append(
701
+ METS_DIV_ATTRIBUTE_ATOM_PATTERN(pageId_token, attr))
702
+ log.debug("parsed pattern '%s' to %s", pageId_token_raw, page_attr_xpatterns[-1])
703
+ if not page_attr_patterns and not page_attr_antipatterns:
626
704
  return []
627
- range_patterns_first_last = [(x[0], x[-1]) if isinstance(x, list) else None for x in page_attr_patterns]
628
- page_attr_patterns_copy = list(page_attr_patterns)
629
- if self._cache_flag:
630
- for pat in page_attr_patterns:
631
- try:
632
- attr : METS_PAGE_DIV_ATTRIBUTE
633
- if isinstance(pat, str):
634
- attr = next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) if pat in self._page_cache[a])
635
- cache_keys = [pat]
636
- elif isinstance(pat, list):
637
- attr = next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) if any(x in self._page_cache[a] for x in pat))
638
- cache_keys = [v for v in pat if v in self._page_cache[attr]]
639
- for k in cache_keys:
640
- pat.remove(k)
641
- elif isinstance(pat, tuple):
642
- _, re_pat = pat
643
- attr = next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) for v in self._page_cache[a] if re_pat.fullmatch(v))
644
- cache_keys = [v for v in self._page_cache[attr] if re_pat.fullmatch(v)]
645
- else:
646
- raise ValueError
647
- if return_divs:
648
- ret += [self._page_cache[attr][v] for v in cache_keys]
649
- else:
650
- ret += [self._page_cache[attr][v].get('ID') for v in cache_keys]
651
- except StopIteration:
652
- raise ValueError(f"{pat} matches none of the keys of any of the _page_caches.")
705
+ if page_attr_patterns:
706
+ divs = self.get_physical_page_patterns(page_attr_patterns)
653
707
  else:
654
- page_attr_patterns_matched = []
655
- for page in self._tree.getroot().xpath(
656
- 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]',
657
- namespaces=NS):
658
- patterns_exhausted = []
659
- for pat_idx, pat in enumerate(page_attr_patterns):
660
- try:
661
- if isinstance(pat, str):
662
- attr = next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) if pat == page.get(a.name))
663
- ret.append(page if return_divs else page.get('ID'))
664
- patterns_exhausted.append(pat)
665
- elif isinstance(pat, list):
666
- if not isinstance(pat[0], METS_PAGE_DIV_ATTRIBUTE):
667
- pat.insert(0, next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) if any(x == page.get(a.name) for x in pat)))
668
- attr_val = page.get(pat[0].name)
669
- if attr_val in pat:
670
- pat.remove(attr_val)
671
- ret.append(page if return_divs else page.get('ID'))
672
- if len(pat) == 1:
673
- patterns_exhausted.append(pat)
674
- elif isinstance(pat, tuple):
675
- attr, re_pat = pat
676
- if not attr:
677
- attr = next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) if re_pat.fullmatch(page.get(a.name) or ''))
678
- page_attr_patterns[pat_idx] = (attr, re_pat)
679
- if re_pat.fullmatch(page.get(attr.name) or ''):
680
- ret.append(page if return_divs else page.get('ID'))
681
- else:
682
- raise ValueError
683
- page_attr_patterns_matched.append(pat)
684
- except StopIteration:
685
- continue
686
- for p in patterns_exhausted:
687
- page_attr_patterns.remove(p)
688
- unmatched = [x for x in page_attr_patterns_copy if x not in page_attr_patterns_matched]
689
- if unmatched:
690
- raise ValueError(f"Patterns {unmatched} match none of the pages")
691
-
692
- ranges_without_start_match = []
693
- ranges_without_last_match = []
694
- for idx, pat in enumerate(page_attr_patterns_copy):
695
- if isinstance(pat, list):
696
- start, last = range_patterns_first_last[idx]
697
- if start in pat:
698
- print(pat, start, last)
699
- ranges_without_start_match.append(page_attr_patterns_raw[idx])
700
- # if last in pat:
701
- # ranges_without_last_match.append(page_attr_patterns_raw[idx])
702
- if ranges_without_start_match:
703
- raise ValueError(f"Start of range patterns {ranges_without_start_match} not matched - invalid range")
704
- # if ranges_without_last_match:
705
- # raise ValueError(f"End of range patterns {ranges_without_last_match} not matched - invalid range")
706
- return ret
708
+ all_pages = [METS_DIV_ATTRIBUTE_REGEX_PATTERN(
709
+ re.compile(".*"), [METS_PAGE_DIV_ATTRIBUTE.ID])]
710
+ divs = self.get_physical_page_patterns(all_pages)
711
+ if page_attr_antipatterns:
712
+ antidivs = self.get_physical_page_patterns(page_attr_antipatterns)
713
+ divs = [div for div in divs if div not in antidivs]
714
+ if return_divs:
715
+ return divs
716
+ else:
717
+ return [div.get('ID') for div in divs]
707
718
 
708
719
  if for_fileIds == []:
709
720
  return []
@@ -731,6 +742,129 @@ class OcrdMets(OcrdXmlDocument):
731
742
  ret[index] = page.get('ID')
732
743
  return ret
733
744
 
745
def get_physical_page_patterns(self, page_attr_patterns: List[METS_DIV_ATTRIBUTE_PATTERN]) -> List[ET._Element]:
    """
    Resolve page selection patterns to the physical page divs they match.

    Every pattern is tried against the attributes listed in its ``attr``,
    in order; the first attribute yielding a match wins. Attributes of type
    ``METS_PAGE_DIV_ATTRIBUTE`` are matched against the physical page divs
    themselves. All other attributes are matched against divs of the logical
    structMap and translated to pages (through ``self._struct_cache`` when
    caching is enabled, otherwise through ``mets:structLink/mets:smLink``).
    For range patterns the logical ``TYPE`` and ``LABEL`` attributes are
    skipped.

    Side effects on the arguments (the final range check relies on them):

    * range patterns get matched values removed from ``pat.expr``;
    * without caching, ``pat.attr`` is narrowed to the attribute that
      matched, and exhausted patterns are removed from
      ``page_attr_patterns`` itself.

    Args:
        page_attr_patterns: patterns to resolve (mutated, see above).

    Returns:
        The matching page ``mets:div`` elements. A page is added once per
        pattern that matches it, so overlapping patterns yield repeats.

    Raises:
        ValueError: if a pattern matches nothing, or if the start value of
            a range pattern was never matched.
    """
    log = getLogger('ocrd.models.ocrd_mets.get_physical_pages')
    ret = []
    # Shallow copy: survives the removal of exhausted patterns below, but
    # still references the same pattern objects (and thus their consumed
    # ``expr`` lists), which is what the final range check inspects.
    page_attr_patterns_copy = list(page_attr_patterns)
    # @TYPE makes no sense in range expressions
    # @LABEL makes no sense in range expressions
    attrs_invalid_for_ranges = [METS_STRUCT_DIV_ATTRIBUTE.TYPE,
                                METS_STRUCT_DIV_ATTRIBUTE.LABEL]
    if self._cache_flag:
        for pat in page_attr_patterns:
            # Reset per pattern: otherwise, when no attribute is tried at all
            # (empty ``pat.attr`` or every attribute skipped), the check after
            # the loop would see the previous pattern's keys (or an unbound
            # name for the first pattern) instead of "no match".
            cache_keys = []
            for attr in pat.attr:
                if isinstance(attr, METS_PAGE_DIV_ATTRIBUTE):
                    cache = self._page_cache[attr]
                else:
                    cache = self._struct_cache[attr]
                if (isinstance(pat, METS_DIV_ATTRIBUTE_RANGE_PATTERN) and
                        attr in attrs_invalid_for_ranges):
                    continue
                if cache_keys := [v for v in cache if pat.matches(v)]:
                    if isinstance(attr, METS_PAGE_DIV_ATTRIBUTE):
                        # page cache maps attribute value -> page div
                        ret += [cache[v] for v in cache_keys]
                        log.debug('physical matches for %s: %s', pat, str(cache_keys))
                    else:
                        # struct cache maps attribute value -> page IDs
                        for v in cache_keys:
                            ret += [self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID][p]
                                    for p in cache[v]]
                        log.debug('logical matches for %s: %s', pat, str(cache_keys))
                    if isinstance(pat, METS_DIV_ATTRIBUTE_RANGE_PATTERN):
                        # remove matches for final range check
                        for v in cache_keys:
                            pat.expr.remove(v)
                    break  # no more attributes for this pattern
            if not cache_keys:
                raise ValueError(f"{pat} matches none of the keys of any of the _page_caches and _struct_caches.")
    else:
        # cache logical structmap:
        el_struct_list = self._tree.getroot().findall("mets:structMap[@TYPE='LOGICAL']//mets:div", NS)
        el_smlink_list = self._tree.getroot().findall("mets:structLink/mets:smLink", NS)
        # logical div ID -> IDs of the physical divs linked to it
        smlink_map = {}
        for link in el_smlink_list:
            link_log = link.get('{%s}from' % NS['xlink'])
            link_phy = link.get('{%s}to' % NS['xlink'])
            smlink_map.setdefault(link_log, []).append(link_phy)
        # attribute -> attribute value -> linked physical div IDs
        struct_cache = {k: {} for k in METS_STRUCT_DIV_ATTRIBUTE}
        for el_div in el_struct_list:
            for attr in METS_STRUCT_DIV_ATTRIBUTE:
                if not el_div.get(attr.name):
                    # avoid mapping None indiscriminately
                    continue
                val = struct_cache[attr].setdefault(str(el_div.get(attr.name)), [])
                val.extend(smlink_map.get(el_div.get('ID'), []))
        log.debug("found %d smLink entries for %d logical divs", len(el_smlink_list), len(el_struct_list))
        for page in self._tree.getroot().xpath(
                'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]',
                namespaces=NS):
            patterns_exhausted = []
            for pat in page_attr_patterns:
                for attr in pat.attr:
                    if isinstance(attr, METS_PAGE_DIV_ATTRIBUTE):
                        # single candidate: this page's own attribute value
                        cache = [page.get(attr.name) or '']
                    else:
                        cache = struct_cache[attr]
                    if (isinstance(pat, METS_DIV_ATTRIBUTE_RANGE_PATTERN) and
                            attr in attrs_invalid_for_ranges):
                        continue
                    if cache_keys := [v for v in cache if pat.matches(v)]:
                        pat.attr = [attr]  # disambiguate next
                        if isinstance(attr, METS_PAGE_DIV_ATTRIBUTE):
                            ret.append(page)
                            log.debug('physical match for %s on page %s', pat, page.get('ID'))
                            if isinstance(pat, METS_DIV_ATTRIBUTE_ATOM_PATTERN):
                                patterns_exhausted.append(pat)
                            elif isinstance(pat, METS_DIV_ATTRIBUTE_RANGE_PATTERN):
                                # remove for efficiency and final range check
                                pat.expr.remove(cache_keys[0])
                                if not pat.expr:
                                    patterns_exhausted.append(pat)
                        elif cache_key := next((v for v in cache_keys
                                                if page.get('ID') in cache[v]), None):
                            ret.append(page)
                            log.debug('logical match for %s on page %s', pat, page.get('ID'))
                            cache[cache_key].remove(page.get('ID'))
                            # remove for efficiency and final range check
                            if not cache[cache_key]:
                                if isinstance(pat, METS_DIV_ATTRIBUTE_ATOM_PATTERN):
                                    patterns_exhausted.append(pat)
                                elif isinstance(pat, METS_DIV_ATTRIBUTE_RANGE_PATTERN):
                                    pat.expr.remove(cache_key)
                                    if not pat.expr:
                                        patterns_exhausted.append(pat)
                        break  # no more attributes for this pattern
                # Keep matching the remaining patterns even if this page is
                # already selected, in order to exhaust and consume the
                # pattern list.
            for p in patterns_exhausted:
                page_attr_patterns.remove(p)
        # NOTE(review): has_matched is presumably set by pat.matches() - confirm
        unmatched = [pat for pat in page_attr_patterns_copy
                     if not pat.has_matched]
        if unmatched:
            raise ValueError(f"Patterns {unmatched} match none of the pages")

    ranges_without_start_match = []
    for pat in page_attr_patterns_copy:
        if isinstance(pat, METS_DIV_ATTRIBUTE_RANGE_PATTERN):
            # range expression, expanded to pattern list
            # list items get consumed (pat.expr.remove) when matched,
            # exhausted patterns also get consumed (page_attr_patterns.remove)
            # (but top-level list copy references the same list objects)
            if pat.start in pat.expr:
                log.debug((pat, pat.expr))
                ranges_without_start_match.append(pat)
            # The analogous check for pat.stop is currently disabled.
    if ranges_without_start_match:
        raise ValueError(f"Start of range patterns {ranges_without_start_match} not matched - invalid range")
    return ret
867
+
734
868
  def set_physical_page_for_file(self, pageId : str, ocrd_file : OcrdFile,
735
869
  order : Optional[str] = None, orderlabel : Optional[str] = None) -> None:
736
870
  """