ocrd 3.4.0__tar.gz → 3.5.0__tar.gz

This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (158)
  1. {ocrd-3.4.0/src/ocrd.egg-info → ocrd-3.5.0}/PKG-INFO +2 -2
  2. ocrd-3.5.0/VERSION +1 -0
  3. {ocrd-3.4.0 → ocrd-3.5.0}/requirements.txt +1 -1
  4. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/cli/__init__.py +6 -0
  5. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/decorators/ocrd_cli_options.py +1 -1
  6. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/processor/base.py +21 -13
  7. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/workspace.py +15 -19
  8. {ocrd-3.4.0 → ocrd-3.5.0/src/ocrd.egg-info}/PKG-INFO +2 -2
  9. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd.egg-info/requires.txt +1 -1
  10. ocrd-3.5.0/src/ocrd_models/constants.py +205 -0
  11. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_models/ocrd_mets.py +231 -97
  12. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/constants.py +9 -5
  13. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_utils/os.py +1 -1
  14. ocrd-3.4.0/VERSION +0 -1
  15. ocrd-3.4.0/src/ocrd_models/constants.py +0 -100
  16. {ocrd-3.4.0 → ocrd-3.5.0}/LICENSE +0 -0
  17. {ocrd-3.4.0 → ocrd-3.5.0}/MANIFEST.in +0 -0
  18. {ocrd-3.4.0 → ocrd-3.5.0}/README.md +0 -0
  19. {ocrd-3.4.0 → ocrd-3.5.0}/README_bashlib.md +0 -0
  20. {ocrd-3.4.0 → ocrd-3.5.0}/README_ocrd.md +0 -0
  21. {ocrd-3.4.0 → ocrd-3.5.0}/README_ocrd_modelfactory.md +0 -0
  22. {ocrd-3.4.0 → ocrd-3.5.0}/README_ocrd_models.md +0 -0
  23. {ocrd-3.4.0 → ocrd-3.5.0}/README_ocrd_network.md +0 -0
  24. {ocrd-3.4.0 → ocrd-3.5.0}/README_ocrd_utils.md +0 -0
  25. {ocrd-3.4.0 → ocrd-3.5.0}/README_ocrd_validators.md +0 -0
  26. {ocrd-3.4.0 → ocrd-3.5.0}/pyproject.toml +0 -0
  27. {ocrd-3.4.0 → ocrd-3.5.0}/setup.cfg +0 -0
  28. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/__init__.py +0 -0
  29. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/cli/bashlib.py +0 -0
  30. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/cli/log.py +0 -0
  31. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/cli/network.py +0 -0
  32. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/cli/ocrd_tool.py +0 -0
  33. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/cli/process.py +0 -0
  34. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/cli/resmgr.py +0 -0
  35. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/cli/validate.py +0 -0
  36. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/cli/workspace.py +0 -0
  37. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/cli/zip.py +0 -0
  38. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/constants.py +0 -0
  39. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/decorators/__init__.py +0 -0
  40. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/decorators/loglevel_option.py +0 -0
  41. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/decorators/mets_find_options.py +0 -0
  42. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/decorators/parameter_option.py +0 -0
  43. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/lib.bash +0 -0
  44. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/mets_server.py +0 -0
  45. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/ocrd-all-tool.json +0 -0
  46. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/processor/__init__.py +0 -0
  47. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/processor/builtin/__init__.py +0 -0
  48. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/processor/builtin/dummy/__init__.py +0 -0
  49. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/processor/builtin/dummy/ocrd-tool.json +0 -0
  50. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/processor/builtin/dummy_processor.py +0 -0
  51. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/processor/builtin/filter_processor.py +0 -0
  52. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/processor/helpers.py +0 -0
  53. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/processor/ocrd_page_result.py +0 -0
  54. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/resolver.py +0 -0
  55. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/resource_list.yml +0 -0
  56. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/resource_manager.py +0 -0
  57. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/task_sequence.py +0 -0
  58. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/workspace_backup.py +0 -0
  59. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd/workspace_bagger.py +0 -0
  60. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd.egg-info/SOURCES.txt +0 -0
  61. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd.egg-info/dependency_links.txt +0 -0
  62. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd.egg-info/entry_points.txt +0 -0
  63. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd.egg-info/top_level.txt +0 -0
  64. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_modelfactory/__init__.py +0 -0
  65. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_models/__init__.py +0 -0
  66. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_models/mets-empty.xml +0 -0
  67. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_models/ocrd_agent.py +0 -0
  68. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_models/ocrd_exif.py +0 -0
  69. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_models/ocrd_file.py +0 -0
  70. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_models/ocrd_page.py +0 -0
  71. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_models/ocrd_page_generateds.py +0 -0
  72. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_models/ocrd_xml_base.py +0 -0
  73. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_models/report.py +0 -0
  74. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_models/utils.py +0 -0
  75. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_models/xpath_functions.py +0 -0
  76. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/__init__.py +0 -0
  77. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/cli/__init__.py +0 -0
  78. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/cli/client.py +0 -0
  79. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/cli/processing_server.py +0 -0
  80. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/cli/processing_worker.py +0 -0
  81. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/cli/processor_server.py +0 -0
  82. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/client.py +0 -0
  83. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/client_utils.py +0 -0
  84. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/database.py +0 -0
  85. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/logging_utils.py +0 -0
  86. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/models/__init__.py +0 -0
  87. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/models/job.py +0 -0
  88. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/models/messages.py +0 -0
  89. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/models/ocrd_tool.py +0 -0
  90. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/models/workflow.py +0 -0
  91. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/models/workspace.py +0 -0
  92. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/param_validators.py +0 -0
  93. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/process_helpers.py +0 -0
  94. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/processing_server.py +0 -0
  95. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/processing_worker.py +0 -0
  96. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/processor_server.py +0 -0
  97. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/rabbitmq_utils/__init__.py +0 -0
  98. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/rabbitmq_utils/connector.py +0 -0
  99. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/rabbitmq_utils/constants.py +0 -0
  100. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/rabbitmq_utils/consumer.py +0 -0
  101. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/rabbitmq_utils/helpers.py +0 -0
  102. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/rabbitmq_utils/ocrd_messages.py +0 -0
  103. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/rabbitmq_utils/publisher.py +0 -0
  104. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/runtime_data/__init__.py +0 -0
  105. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/runtime_data/config_parser.py +0 -0
  106. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/runtime_data/connection_clients.py +0 -0
  107. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/runtime_data/deployer.py +0 -0
  108. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/runtime_data/hosts.py +0 -0
  109. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/runtime_data/network_agents.py +0 -0
  110. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/runtime_data/network_services.py +0 -0
  111. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/server_cache.py +0 -0
  112. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/server_utils.py +0 -0
  113. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/tcp_to_uds_mets_proxy.py +0 -0
  114. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_network/utils.py +0 -0
  115. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_utils/__init__.py +0 -0
  116. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_utils/config.py +0 -0
  117. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_utils/constants.py +0 -0
  118. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_utils/deprecate.py +0 -0
  119. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_utils/image.py +0 -0
  120. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_utils/introspect.py +0 -0
  121. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_utils/logging.py +0 -0
  122. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_utils/ocrd_logging.conf +0 -0
  123. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_utils/str.py +0 -0
  124. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_validators/__init__.py +0 -0
  125. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_validators/bagit-profile.yml +0 -0
  126. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_validators/constants.py +0 -0
  127. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_validators/json_validator.py +0 -0
  128. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_validators/message_processing.schema.yml +0 -0
  129. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_validators/message_result.schema.yml +0 -0
  130. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_validators/mets.xsd +0 -0
  131. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_validators/ocrd_network_message_validator.py +0 -0
  132. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_validators/ocrd_tool.schema.yml +0 -0
  133. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_validators/ocrd_tool_validator.py +0 -0
  134. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_validators/ocrd_zip_validator.py +0 -0
  135. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_validators/page.xsd +0 -0
  136. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_validators/page_validator.py +0 -0
  137. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_validators/parameter_validator.py +0 -0
  138. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_validators/processing_server_config.schema.yml +0 -0
  139. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_validators/processing_server_config_validator.py +0 -0
  140. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_validators/resource_list_validator.py +0 -0
  141. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_validators/workspace_validator.py +0 -0
  142. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_validators/xlink.xsd +0 -0
  143. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_validators/xsd_mets_validator.py +0 -0
  144. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_validators/xsd_page_validator.py +0 -0
  145. {ocrd-3.4.0 → ocrd-3.5.0}/src/ocrd_validators/xsd_validator.py +0 -0
  146. {ocrd-3.4.0 → ocrd-3.5.0}/tests/test_decorators.py +0 -0
  147. {ocrd-3.4.0 → ocrd-3.5.0}/tests/test_logging.py +0 -0
  148. {ocrd-3.4.0 → ocrd-3.5.0}/tests/test_logging_conf.py +0 -0
  149. {ocrd-3.4.0 → ocrd-3.5.0}/tests/test_mets_server.py +0 -0
  150. {ocrd-3.4.0 → ocrd-3.5.0}/tests/test_model_factory.py +0 -0
  151. {ocrd-3.4.0 → ocrd-3.5.0}/tests/test_resolver.py +0 -0
  152. {ocrd-3.4.0 → ocrd-3.5.0}/tests/test_resolver_oai.py +0 -0
  153. {ocrd-3.4.0 → ocrd-3.5.0}/tests/test_resource_manager.py +0 -0
  154. {ocrd-3.4.0 → ocrd-3.5.0}/tests/test_task_sequence.py +0 -0
  155. {ocrd-3.4.0 → ocrd-3.5.0}/tests/test_utils.py +0 -0
  156. {ocrd-3.4.0 → ocrd-3.5.0}/tests/test_version.py +0 -0
  157. {ocrd-3.4.0 → ocrd-3.5.0}/tests/test_workspace.py +0 -0
  158. {ocrd-3.4.0 → ocrd-3.5.0}/tests/test_workspace_remove.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ocrd
3
- Version: 3.4.0
3
+ Version: 3.5.0
4
4
  Summary: OCR-D framework
5
5
  Author-email: Konstantin Baierer <unixprog@gmail.com>
6
6
  License: Apache License 2.0
@@ -21,7 +21,7 @@ Requires-Dist: elementpath
21
21
  Requires-Dist: fastapi>=0.78.0
22
22
  Requires-Dist: filetype
23
23
  Requires-Dist: Flask
24
- Requires-Dist: frozendict>=2.3.4
24
+ Requires-Dist: frozendict>=2.4.0
25
25
  Requires-Dist: gdown
26
26
  Requires-Dist: httpx>=0.22.0
27
27
  Requires-Dist: importlib_metadata; python_version < "3.8"
ocrd-3.5.0/VERSION ADDED
@@ -0,0 +1 @@
1
+ 3.5.0
@@ -8,7 +8,7 @@ elementpath
8
8
  fastapi>=0.78.0
9
9
  filetype
10
10
  Flask
11
- frozendict>=2.3.4
11
+ frozendict>=2.4.0
12
12
  gdown
13
13
  httpx>=0.22.0
14
14
  importlib_metadata ; python_version < '3.8'
@@ -67,6 +67,12 @@ Variables:
67
67
  \b
68
68
  {config.describe('OCRD_EXISTING_OUTPUT', wrap_text=False)}
69
69
  \b
70
+ {config.describe('OCRD_MAX_MISSING_OUTPUTS')}
71
+ \b
72
+ {config.describe('OCRD_MAX_PARALLEL_PAGES')}
73
+ \b
74
+ {config.describe('OCRD_PROCESSING_PAGE_TIMEOUT')}
75
+ \b
70
76
  {config.describe('OCRD_METS_CACHING')}
71
77
  \b
72
78
  {config.describe('OCRD_MAX_PROCESSOR_CACHE')}
@@ -56,7 +56,7 @@ def ocrd_cli_options(f):
56
56
  # subcommands. So we have to work around that by creating a
57
57
  # pseudo-subcommand handled in ocrd_cli_wrap_processor
58
58
  argument('subcommand', nargs=1, required=False,
59
- type=click.Choice([AgentType.PROCESSING_WORKER, AgentType.PROCESSOR_SERVER])),
59
+ type=click.Choice(list(map(str, AgentType)))),
60
60
  ]
61
61
  for param in params:
62
62
  param(f)
@@ -29,8 +29,7 @@ from frozendict import frozendict
29
29
  # this is where the fixes came from:
30
30
  from loky import Future, ProcessPoolExecutor
31
31
  import multiprocessing as mp
32
- from threading import Timer
33
- from _thread import interrupt_main
32
+ from multiprocessing.pool import ThreadPool
34
33
 
35
34
  from click import wrap_text
36
35
  from deprecated import deprecated
@@ -783,11 +782,16 @@ class Processor():
783
782
  page_id = input_files[input_pos].pageId
784
783
  self._base_logger.info("processing page %s", page_id)
785
784
  for i, input_file in enumerate(input_files):
785
+ grp = self.input_file_grp.split(',')[i]
786
786
  if input_file is None:
787
- grp = self.input_file_grp.split(',')[i]
788
787
  self._base_logger.debug(f"ignoring missing file for input fileGrp {grp} for page {page_id}")
789
788
  continue
790
789
  assert isinstance(input_file, get_args(OcrdFileType))
790
+ if not input_file.local_filename:
791
+ self._base_logger.error(f'No local file exists for page {page_id} in file group {grp}')
792
+ if config.OCRD_MISSING_INPUT == 'ABORT':
793
+ raise MissingInputFile(grp, page_id, input_file.mimetype)
794
+ continue
791
795
  self._base_logger.debug(f"parsing file {input_file.ID} for page {page_id}")
792
796
  try:
793
797
  page_ = page_from_file(input_file)
@@ -796,6 +800,9 @@ class Processor():
796
800
  except ValueError as err:
797
801
  # not PAGE and not an image to generate PAGE for
798
802
  self._base_logger.error(f"non-PAGE input for page {page_id}: {err}")
803
+ if not any(input_pcgts):
804
+ self._base_logger.warning(f'skipping page {page_id}')
805
+ return
799
806
  output_file_id = make_file_id(input_files[input_pos], self.output_file_grp)
800
807
  if input_files[input_pos].fileGrp == self.output_file_grp:
801
808
  # input=output fileGrp: re-use ID exactly
@@ -1107,7 +1114,11 @@ class Processor():
1107
1114
  self._base_logger.critical(f"Could not find any files for selected pageId {self.page_id}.\n"
1108
1115
  f"compare '{self.page_id}' with the output of 'orcd workspace list-page'.")
1109
1116
  ifts = []
1110
- for page, ifiles in pages.items():
1117
+ # use physical page order
1118
+ for page in self.workspace.mets.physical_pages:
1119
+ if page not in pages:
1120
+ continue
1121
+ ifiles = pages[page]
1111
1122
  for i, ifg in enumerate(ifgs):
1112
1123
  if not ifiles[i]:
1113
1124
  # could be from non-unique with on_error=skip or from true gap
@@ -1150,18 +1161,15 @@ def _page_worker(timeout, *input_files):
1150
1161
  """
1151
1162
  page_id = next((file.pageId for file in input_files
1152
1163
  if hasattr(file, 'pageId')), "")
1153
- if timeout > 0:
1154
- timer = Timer(timeout, interrupt_main)
1155
- timer.start()
1164
+ pool = ThreadPool(processes=1)
1156
1165
  try:
1157
- _page_worker_processor.process_page_file(*input_files)
1166
+ #_page_worker_processor.process_page_file(*input_files)
1167
+ async_result = pool.apply_async(_page_worker_processor.process_page_file, input_files)
1168
+ async_result.get(timeout or None)
1158
1169
  _page_worker_processor.logger.debug("page worker completed for page %s", page_id)
1159
- except KeyboardInterrupt:
1170
+ except mp.TimeoutError:
1160
1171
  _page_worker_processor.logger.debug("page worker timed out for page %s", page_id)
1161
- raise TimeoutError()
1162
- finally:
1163
- if timeout > 0:
1164
- timer.cancel()
1172
+ raise
1165
1173
 
1166
1174
  def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None):
1167
1175
  """Generate a string describing the full CLI of this processor including params.
@@ -777,16 +777,14 @@ class Workspace():
777
777
  raise Exception('Found no AlternativeImage that satisfies all requirements ' +
778
778
  'filename="%s" in page "%s"' % (
779
779
  filename, page_id))
780
- if not all(feature in page_coords['features']
781
- for feature in feature_selector.split(',') if feature):
782
- raise Exception('Found no AlternativeImage that satisfies all requirements ' +
783
- 'selector="%s" in page "%s"' % (
784
- feature_selector, page_id))
785
- if any(feature in page_coords['features']
786
- for feature in feature_filter.split(',') if feature):
787
- raise Exception('Found no AlternativeImage that satisfies all requirements ' +
788
- 'filter="%s" in page "%s"' % (
789
- feature_filter, page_id))
780
+ if (not all(feature in page_coords['features']
781
+ for feature in feature_selector.split(',') if feature) or
782
+ any(feature in page_coords['features']
783
+ for feature in feature_filter.split(',') if feature)):
784
+ raise Exception('Found no AlternativeImage that satisfies all requirements' +
785
+ ' selector="%s"' % feature_selector +
786
+ ' filter="%s"' % feature_filter +
787
+ ' in page "%s"' % page_id)
790
788
  # ensure DPI will be set in image meta-data again
791
789
  if 'DPI' in page_coords:
792
790
  dpi = page_coords['DPI']
@@ -1038,16 +1036,14 @@ class Workspace():
1038
1036
  raise Exception('Found no AlternativeImage that satisfies all requirements ' +
1039
1037
  'filename="%s" in segment "%s"' % (
1040
1038
  filename, segment.id))
1041
- if not all(feature in segment_coords['features']
1042
- for feature in feature_selector.split(',') if feature):
1039
+ if (not all(feature in segment_coords['features']
1040
+ for feature in feature_selector.split(',') if feature) or
1041
+ any(feature in segment_coords['features']
1042
+ for feature in feature_filter.split(',') if feature)):
1043
1043
  raise Exception('Found no AlternativeImage that satisfies all requirements' +
1044
- 'selector="%s" in segment "%s"' % (
1045
- feature_selector, segment.id))
1046
- if any(feature in segment_coords['features']
1047
- for feature in feature_filter.split(',') if feature):
1048
- raise Exception('Found no AlternativeImage that satisfies all requirements ' +
1049
- 'filter="%s" in segment "%s"' % (
1050
- feature_filter, segment.id))
1044
+ ' selector="%s"' % feature_selector +
1045
+ ' filter="%s"' % feature_filter +
1046
+ ' in segment "%s"' % segment.id)
1051
1047
  # ensure DPI will be set in image meta-data again
1052
1048
  if 'DPI' in segment_coords:
1053
1049
  dpi = segment_coords['DPI']
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ocrd
3
- Version: 3.4.0
3
+ Version: 3.5.0
4
4
  Summary: OCR-D framework
5
5
  Author-email: Konstantin Baierer <unixprog@gmail.com>
6
6
  License: Apache License 2.0
@@ -21,7 +21,7 @@ Requires-Dist: elementpath
21
21
  Requires-Dist: fastapi>=0.78.0
22
22
  Requires-Dist: filetype
23
23
  Requires-Dist: Flask
24
- Requires-Dist: frozendict>=2.3.4
24
+ Requires-Dist: frozendict>=2.4.0
25
25
  Requires-Dist: gdown
26
26
  Requires-Dist: httpx>=0.22.0
27
27
  Requires-Dist: importlib_metadata; python_version < "3.8"
@@ -8,7 +8,7 @@ elementpath
8
8
  fastapi>=0.78.0
9
9
  filetype
10
10
  Flask
11
- frozendict>=2.3.4
11
+ frozendict>=2.4.0
12
12
  gdown
13
13
  httpx>=0.22.0
14
14
  jsonschema>=4
@@ -0,0 +1,205 @@
1
+ """
2
+ Constants for ocrd_models.
3
+ """
4
+ from re import Pattern
5
+ from enum import Enum, auto
6
+ from dataclasses import dataclass, field
7
+ from abc import ABC, abstractmethod
8
+ from typing import Any, List, Optional, Union
9
+ from ocrd_utils import resource_string
10
+
11
+ __all__ = [
12
+ 'IDENTIFIER_PRIORITY',
13
+ 'METS_XML_EMPTY',
14
+ 'NAMESPACES',
15
+ 'TAG_METS_AGENT',
16
+ 'TAG_METS_DIV',
17
+ 'TAG_METS_FILE',
18
+ 'TAG_METS_FILEGRP',
19
+ 'TAG_METS_FILESEC',
20
+ 'TAG_METS_FPTR',
21
+ 'TAG_METS_FLOCAT',
22
+ 'TAG_METS_METSHDR',
23
+ 'TAG_METS_NAME',
24
+ 'TAG_METS_NOTE',
25
+ 'TAG_METS_STRUCTMAP',
26
+ 'TAG_MODS_IDENTIFIER',
27
+ 'TAG_PAGE_ALTERNATIVEIMAGE',
28
+ 'TAG_PAGE_COORDS',
29
+ 'TAG_PAGE_READINGORDER',
30
+ 'TAG_PAGE_REGIONREFINDEXED',
31
+ 'TAG_PAGE_TEXTLINE',
32
+ 'TAG_PAGE_TEXTEQUIV',
33
+ 'TAG_PAGE_TEXTREGION',
34
+ 'METS_PAGE_DIV_ATTRIBUTE',
35
+ 'METS_STRUCT_DIV_ATTRIBUTE',
36
+ 'METS_DIV_ATTRIBUTE_ATOM_PATTERN',
37
+ 'METS_DIV_ATTRIBUTE_RANGE_PATTERN',
38
+ 'METS_DIV_ATTRIBUTE_REGEX_PATTERN',
39
+ 'PAGE_REGION_TYPES',
40
+ 'PAGE_ALTIMG_FEATURES',
41
+ ]
42
+
43
+
44
+ IDENTIFIER_PRIORITY = ['purl', 'urn', 'doi', 'url']
45
+
46
+ METS_XML_EMPTY = resource_string(__package__, 'mets-empty.xml')
47
+
48
+ NAMESPACES = {
49
+ 'mets': "http://www.loc.gov/METS/",
50
+ 'mods': "http://www.loc.gov/mods/v3",
51
+ 'xlink': "http://www.w3.org/1999/xlink",
52
+ 'page': "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15",
53
+ 'xsl': 'http://www.w3.org/1999/XSL/Transform#',
54
+ 'ocrd': 'https://ocr-d.de',
55
+ }
56
+
57
+ TAG_METS_AGENT = '{%s}agent' % NAMESPACES['mets']
58
+ TAG_METS_DIV = '{%s}div' % NAMESPACES['mets']
59
+ TAG_METS_FILE = '{%s}file' % NAMESPACES['mets']
60
+ TAG_METS_FILEGRP = '{%s}fileGrp' % NAMESPACES['mets']
61
+ TAG_METS_FILESEC = '{%s}fileSec' % NAMESPACES['mets']
62
+ TAG_METS_FPTR = '{%s}fptr' % NAMESPACES['mets']
63
+ TAG_METS_FLOCAT = '{%s}FLocat' % NAMESPACES['mets']
64
+ TAG_METS_METSHDR = '{%s}metsHdr' % NAMESPACES['mets']
65
+ TAG_METS_NAME = '{%s}name' % NAMESPACES['mets']
66
+ TAG_METS_NOTE = '{%s}note' % NAMESPACES['mets']
67
+ TAG_METS_STRUCTMAP = '{%s}structMap' % NAMESPACES['mets']
68
+
69
+ TAG_MODS_IDENTIFIER = '{%s}identifier' % NAMESPACES['mods']
70
+
71
+ TAG_PAGE_ALTERNATIVEIMAGE = '{%s}AlternativeImage' % NAMESPACES['page']
72
+ TAG_PAGE_COORDS = '{%s}Coords' % NAMESPACES['page']
73
+ TAG_PAGE_READINGORDER = '{%s}ReadingOrder' % NAMESPACES['page']
74
+ TAG_PAGE_REGIONREFINDEXED = '{%s}RegionRefIndexed' % NAMESPACES['page']
75
+ TAG_PAGE_TEXTLINE = '{%s}TextLine' % NAMESPACES['page']
76
+ TAG_PAGE_TEXTEQUIV = '{%s}TextEquiv' % NAMESPACES['page']
77
+ TAG_PAGE_TEXTREGION = '{%s}TextRegion' % NAMESPACES['page']
78
+
79
+ PAGE_REGION_TYPES = [
80
+ 'Advert', 'Chart', 'Chem', 'Custom', 'Graphic', 'Image',
81
+ 'LineDrawing', 'Map', 'Maths', 'Music', 'Noise',
82
+ 'Separator', 'Table', 'Text', 'Unknown'
83
+ ]
84
+
85
+ PAGE_ALTIMG_FEATURES = [
86
+ 'binarized',
87
+ 'grayscale_normalized',
88
+ 'despeckled',
89
+ 'cropped',
90
+ 'deskewed',
91
+ 'rotated-90',
92
+ 'rotated-180',
93
+ 'rotated-270',
94
+ 'dewarped',
95
+ 'clipped',
96
+ ]
97
+
98
+
99
+ class METS_PAGE_DIV_ATTRIBUTE(Enum):
100
+ """page selection attributes of PHYSICAL mets:structMap//mets:div"""
101
+ ID = auto()
102
+ ORDER = auto()
103
+ ORDERLABEL = auto()
104
+ LABEL = auto()
105
+ CONTENTIDS = auto()
106
+
107
+ @classmethod
108
+ def names(cls):
109
+ return [x.name for x in cls]
110
+ @classmethod
111
+ def type_prefix(cls):
112
+ """disambiguation prefix to use for all subtypes"""
113
+ return "physical:"
114
+ def prefix(self):
115
+ """disambiguation prefix to use for this attribute type"""
116
+ return self.type_prefix() + self.name.lower() + ":"
117
+
118
+ class METS_STRUCT_DIV_ATTRIBUTE(Enum):
119
+ """page selection attributes of LOGICAL mets:structMap//mets:div"""
120
+ ID = auto()
121
+ DMDID = auto()
122
+ TYPE = auto()
123
+ LABEL = auto()
124
+
125
+ @classmethod
126
+ def names(cls):
127
+ return [x.name for x in cls]
128
+ @classmethod
129
+ def type_prefix(cls):
130
+ """disambiguation prefix to use for all subtypes"""
131
+ return "logical:"
132
+ def prefix(self):
133
+ """disambiguation prefix to use for this attribute type"""
134
+ return self.type_prefix() + self.name.lower() + ":"
135
+
136
+ @dataclass
137
+ class METS_DIV_ATTRIBUTE_PATTERN(ABC):
138
+ """page selection pattern (abstract supertype)"""
139
+
140
+ expr: Any
141
+ """pattern value to match a mets:div against"""
142
+ attr: List[Union[METS_PAGE_DIV_ATTRIBUTE, METS_STRUCT_DIV_ATTRIBUTE]] = field(
143
+ default_factory=lambda: list(METS_PAGE_DIV_ATTRIBUTE) + list(METS_STRUCT_DIV_ATTRIBUTE))
144
+ """attribute type(s) to match a mets:div for
145
+ (pre-disambiguated with prefix syntax, or filled upon first match)
146
+ """
147
+ has_matched: bool = field(init=False, default=False)
148
+ """whether this pattern has already been matched"""
149
+
150
+ def attr_prefix(self):
151
+ """attribute type disambiguation prefix corresponding to the current state of disambiguation"""
152
+ if self.attr == list(METS_PAGE_DIV_ATTRIBUTE) + list(METS_STRUCT_DIV_ATTRIBUTE):
153
+ return ""
154
+ if self.attr == list(METS_PAGE_DIV_ATTRIBUTE):
155
+ return METS_PAGE_DIV_ATTRIBUTE.type_prefix()
156
+ if self.attr == list(METS_STRUCT_DIV_ATTRIBUTE):
157
+ return METS_STRUCT_DIV_ATTRIBUTE.type_prefix()
158
+ assert len(self.attr) == 1, "unexpected type ambiguity: %s" % repr(self.attr)
159
+ return self.attr[0].prefix()
160
+
161
+ @abstractmethod
162
+ def _matches(self, input) -> bool:
163
+ return
164
+ def matches(self, input) -> bool:
165
+ """does the selection pattern match on the given attribute value?"""
166
+ if (matched := self._matches(input)):
167
+ self.has_matched = True
168
+ return matched
169
+
170
+ @dataclass
171
+ class METS_DIV_ATTRIBUTE_ATOM_PATTERN(METS_DIV_ATTRIBUTE_PATTERN):
172
+ """page selection pattern for literal (single value) matching"""
173
+
174
+ expr: str
175
+ def __repr__(self):
176
+ return "%s%s" % (self.attr_prefix(), self.expr)
177
+ def _matches(self, input):
178
+ return input == self.expr
179
+
180
+ @dataclass
181
+ class METS_DIV_ATTRIBUTE_RANGE_PATTERN(METS_DIV_ATTRIBUTE_PATTERN):
182
+ """page selection pattern for interval (list expansion) matching"""
183
+
184
+ expr: List[str]
185
+ start: str = field(init=False)
186
+ """first value of the range after expansion, before matching-exhausting"""
187
+ stop: str = field(init=False)
188
+ """last value of the range after expansion, before matching-exhausting"""
189
+ def __post_init__(self):
190
+ self.start = self.expr[0]
191
+ self.stop = self.expr[-1]
192
+ def __repr__(self):
193
+ return "%s%s..%s" % (self.attr_prefix(), self.start, self.stop)
194
+ def _matches(self, input):
195
+ return input in self.expr
196
+
197
+ @dataclass
198
+ class METS_DIV_ATTRIBUTE_REGEX_PATTERN(METS_DIV_ATTRIBUTE_PATTERN):
199
+ """page selection pattern for regular expression matching"""
200
+
201
+ expr: Pattern
202
+ def __repr__(self):
203
+ return "%s//%s" % (self.attr_prefix(), self.expr.pattern)
204
+ def _matches(self, input):
205
+ return bool(self.expr.fullmatch(input))