ocrd 3.7.0__tar.gz → 3.8.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (161) hide show
  1. {ocrd-3.7.0/src/ocrd.egg-info → ocrd-3.8.0}/PKG-INFO +22 -3
  2. {ocrd-3.7.0 → ocrd-3.8.0}/README.md +19 -1
  3. ocrd-3.8.0/VERSION +1 -0
  4. {ocrd-3.7.0 → ocrd-3.8.0}/pyproject.toml +2 -0
  5. {ocrd-3.7.0 → ocrd-3.8.0}/requirements.txt +2 -1
  6. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd/cli/network.py +2 -0
  7. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd/cli/resmgr.py +29 -65
  8. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd/constants.py +0 -2
  9. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd/processor/base.py +6 -16
  10. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd/processor/builtin/dummy/ocrd-tool.json +25 -0
  11. ocrd-3.8.0/src/ocrd/processor/builtin/merge_processor.py +131 -0
  12. ocrd-3.8.0/src/ocrd/processor/builtin/param_command_header2unordered.json +7 -0
  13. ocrd-3.8.0/src/ocrd/processor/builtin/param_command_heading2unordered.json +7 -0
  14. ocrd-3.8.0/src/ocrd/processor/builtin/param_command_lines2orientation.json +6 -0
  15. ocrd-3.8.0/src/ocrd/processor/builtin/param_command_page-update-version.json +5 -0
  16. ocrd-3.8.0/src/ocrd/processor/builtin/param_command_transkribus-to-prima.json +8 -0
  17. ocrd-3.8.0/src/ocrd/processor/builtin/shell_processor.py +128 -0
  18. ocrd-3.8.0/src/ocrd/resource_manager.py +469 -0
  19. {ocrd-3.7.0 → ocrd-3.8.0/src/ocrd.egg-info}/PKG-INFO +22 -3
  20. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd.egg-info/SOURCES.txt +9 -1
  21. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd.egg-info/entry_points.txt +2 -0
  22. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd.egg-info/requires.txt +2 -1
  23. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_models/ocrd_agent.py +3 -3
  24. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_network/__init__.py +1 -0
  25. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_network/cli/__init__.py +2 -0
  26. ocrd-3.8.0/src/ocrd_network/cli/resmgr_server.py +23 -0
  27. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_network/constants.py +3 -0
  28. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_network/logging_utils.py +5 -0
  29. ocrd-3.8.0/src/ocrd_network/resource_manager_server.py +182 -0
  30. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_network/runtime_data/connection_clients.py +1 -1
  31. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_network/runtime_data/hosts.py +43 -16
  32. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_network/runtime_data/network_agents.py +15 -1
  33. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_utils/__init__.py +5 -1
  34. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_utils/constants.py +5 -0
  35. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_utils/os.py +141 -61
  36. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_validators/ocrd_tool.schema.yml +7 -4
  37. {ocrd-3.7.0 → ocrd-3.8.0}/tests/test_resource_manager.py +37 -16
  38. ocrd-3.7.0/VERSION +0 -1
  39. ocrd-3.7.0/src/ocrd/resource_list.yml +0 -61
  40. ocrd-3.7.0/src/ocrd/resource_manager.py +0 -380
  41. {ocrd-3.7.0 → ocrd-3.8.0}/LICENSE +0 -0
  42. {ocrd-3.7.0 → ocrd-3.8.0}/MANIFEST.in +0 -0
  43. {ocrd-3.7.0 → ocrd-3.8.0}/README_ocrd.md +0 -0
  44. {ocrd-3.7.0 → ocrd-3.8.0}/README_ocrd_modelfactory.md +0 -0
  45. {ocrd-3.7.0 → ocrd-3.8.0}/README_ocrd_models.md +0 -0
  46. {ocrd-3.7.0 → ocrd-3.8.0}/README_ocrd_network.md +0 -0
  47. {ocrd-3.7.0 → ocrd-3.8.0}/README_ocrd_utils.md +0 -0
  48. {ocrd-3.7.0 → ocrd-3.8.0}/README_ocrd_validators.md +0 -0
  49. {ocrd-3.7.0 → ocrd-3.8.0}/setup.cfg +0 -0
  50. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd/__init__.py +0 -0
  51. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd/cli/__init__.py +0 -0
  52. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd/cli/bashlib.py +0 -0
  53. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd/cli/ocrd_tool.py +0 -0
  54. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd/cli/process.py +0 -0
  55. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd/cli/validate.py +0 -0
  56. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd/cli/workspace.py +0 -0
  57. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd/cli/zip.py +0 -0
  58. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd/decorators/__init__.py +0 -0
  59. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd/decorators/loglevel_option.py +0 -0
  60. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd/decorators/mets_find_options.py +0 -0
  61. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd/decorators/ocrd_cli_options.py +0 -0
  62. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd/decorators/parameter_option.py +0 -0
  63. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd/mets_server.py +0 -0
  64. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd/ocrd-all-tool.json +0 -0
  65. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd/processor/__init__.py +0 -0
  66. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd/processor/builtin/__init__.py +0 -0
  67. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd/processor/builtin/dummy/__init__.py +0 -0
  68. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd/processor/builtin/dummy_processor.py +0 -0
  69. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd/processor/builtin/filter_processor.py +0 -0
  70. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd/processor/helpers.py +0 -0
  71. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd/processor/ocrd_page_result.py +0 -0
  72. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd/resolver.py +0 -0
  73. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd/task_sequence.py +0 -0
  74. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd/workspace.py +0 -0
  75. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd/workspace_backup.py +0 -0
  76. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd/workspace_bagger.py +0 -0
  77. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd.egg-info/dependency_links.txt +0 -0
  78. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd.egg-info/top_level.txt +0 -0
  79. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_modelfactory/__init__.py +0 -0
  80. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_models/__init__.py +0 -0
  81. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_models/constants.py +0 -0
  82. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_models/mets-empty.xml +0 -0
  83. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_models/ocrd_exif.py +0 -0
  84. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_models/ocrd_file.py +0 -0
  85. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_models/ocrd_mets.py +0 -0
  86. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_models/ocrd_page.py +0 -0
  87. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_models/ocrd_page_generateds.py +0 -0
  88. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_models/ocrd_xml_base.py +0 -0
  89. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_models/report.py +0 -0
  90. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_models/utils.py +0 -0
  91. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_models/xpath_functions.py +0 -0
  92. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_network/cli/client.py +0 -0
  93. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_network/cli/processing_server.py +0 -0
  94. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_network/cli/processing_worker.py +0 -0
  95. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_network/client.py +0 -0
  96. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_network/client_utils.py +0 -0
  97. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_network/database.py +0 -0
  98. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_network/models/__init__.py +0 -0
  99. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_network/models/job.py +0 -0
  100. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_network/models/messages.py +0 -0
  101. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_network/models/workflow.py +0 -0
  102. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_network/models/workspace.py +0 -0
  103. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_network/param_validators.py +0 -0
  104. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_network/process_helpers.py +0 -0
  105. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_network/processing_server.py +0 -0
  106. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_network/processing_worker.py +0 -0
  107. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_network/rabbitmq_utils/__init__.py +0 -0
  108. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_network/rabbitmq_utils/connector.py +0 -0
  109. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_network/rabbitmq_utils/constants.py +0 -0
  110. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_network/rabbitmq_utils/consumer.py +0 -0
  111. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_network/rabbitmq_utils/helpers.py +0 -0
  112. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_network/rabbitmq_utils/ocrd_messages.py +0 -0
  113. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_network/rabbitmq_utils/publisher.py +0 -0
  114. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_network/runtime_data/__init__.py +0 -0
  115. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_network/runtime_data/config_parser.py +0 -0
  116. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_network/runtime_data/deployer.py +0 -0
  117. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_network/runtime_data/network_services.py +0 -0
  118. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_network/server_cache.py +0 -0
  119. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_network/server_utils.py +0 -0
  120. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_network/tcp_to_uds_mets_proxy.py +0 -0
  121. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_network/utils.py +0 -0
  122. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_utils/config.py +0 -0
  123. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_utils/deprecate.py +0 -0
  124. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_utils/image.py +0 -0
  125. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_utils/introspect.py +0 -0
  126. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_utils/logging.py +0 -0
  127. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_utils/ocrd_logging.conf +0 -0
  128. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_utils/str.py +0 -0
  129. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_validators/__init__.py +0 -0
  130. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_validators/bagit-profile.yml +0 -0
  131. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_validators/constants.py +0 -0
  132. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_validators/json_validator.py +0 -0
  133. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_validators/message_processing.schema.yml +0 -0
  134. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_validators/message_result.schema.yml +0 -0
  135. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_validators/mets.xsd +0 -0
  136. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_validators/ocrd_network_message_validator.py +0 -0
  137. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_validators/ocrd_tool_validator.py +0 -0
  138. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_validators/ocrd_zip_validator.py +0 -0
  139. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_validators/page.xsd +0 -0
  140. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_validators/page_validator.py +0 -0
  141. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_validators/parameter_validator.py +0 -0
  142. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_validators/processing_server_config.schema.yml +0 -0
  143. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_validators/processing_server_config_validator.py +0 -0
  144. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_validators/resource_list_validator.py +0 -0
  145. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_validators/workspace_validator.py +0 -0
  146. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_validators/xlink.xsd +0 -0
  147. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_validators/xsd_mets_validator.py +0 -0
  148. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_validators/xsd_page_validator.py +0 -0
  149. {ocrd-3.7.0 → ocrd-3.8.0}/src/ocrd_validators/xsd_validator.py +0 -0
  150. {ocrd-3.7.0 → ocrd-3.8.0}/tests/test_decorators.py +0 -0
  151. {ocrd-3.7.0 → ocrd-3.8.0}/tests/test_logging.py +0 -0
  152. {ocrd-3.7.0 → ocrd-3.8.0}/tests/test_logging_conf.py +0 -0
  153. {ocrd-3.7.0 → ocrd-3.8.0}/tests/test_mets_server.py +0 -0
  154. {ocrd-3.7.0 → ocrd-3.8.0}/tests/test_model_factory.py +0 -0
  155. {ocrd-3.7.0 → ocrd-3.8.0}/tests/test_resolver.py +0 -0
  156. {ocrd-3.7.0 → ocrd-3.8.0}/tests/test_resolver_oai.py +0 -0
  157. {ocrd-3.7.0 → ocrd-3.8.0}/tests/test_task_sequence.py +0 -0
  158. {ocrd-3.7.0 → ocrd-3.8.0}/tests/test_utils.py +0 -0
  159. {ocrd-3.7.0 → ocrd-3.8.0}/tests/test_version.py +0 -0
  160. {ocrd-3.7.0 → ocrd-3.8.0}/tests/test_workspace.py +0 -0
  161. {ocrd-3.7.0 → ocrd-3.8.0}/tests/test_workspace_remove.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ocrd
3
- Version: 3.7.0
3
+ Version: 3.8.0
4
4
  Summary: OCR-D framework
5
5
  Author-email: Konstantin Baierer <unixprog@gmail.com>
6
6
  License: Apache License 2.0
@@ -16,12 +16,13 @@ Requires-Dist: beanie~=1.7
16
16
  Requires-Dist: click>=7
17
17
  Requires-Dist: cryptography<43.0.0
18
18
  Requires-Dist: Deprecated==1.2.0
19
- Requires-Dist: docker
19
+ Requires-Dist: docker>=7.1.0
20
20
  Requires-Dist: elementpath
21
21
  Requires-Dist: fastapi>=0.78.0
22
22
  Requires-Dist: filetype
23
23
  Requires-Dist: Flask
24
24
  Requires-Dist: frozendict>=2.4.0
25
+ Requires-Dist: gitpython
25
26
  Requires-Dist: gdown
26
27
  Requires-Dist: httpx>=0.22.0
27
28
  Requires-Dist: importlib_metadata; python_version < "3.8"
@@ -68,6 +69,9 @@ Requires-Dist: uvicorn>=0.17.6
68
69
  * [Command line tools](#command-line-tools)
69
70
  * [`ocrd` CLI](#ocrd-cli)
70
71
  * [`ocrd-dummy` CLI](#ocrd-dummy-cli)
72
+ * [`ocrd-filter` CLI](#ocrd-filter-cli)
73
+ * [`ocrd-command` CLI](#ocrd-command-cli)
74
+ * [`ocrd-merge` CLI](#ocrd-merge-cli)
71
75
  * [Configuration](#configuration)
72
76
  * [Packages](#packages)
73
77
  * [ocrd_utils](#ocrd_utils)
@@ -76,7 +80,6 @@ Requires-Dist: uvicorn>=0.17.6
76
80
  * [ocrd_validators](#ocrd_validators)
77
81
  * [ocrd_network](#ocrd_network)
78
82
  * [ocrd](#ocrd)
79
- * [bash library](#bash-library)
80
83
  * [Testing](#testing)
81
84
  * [See Also](#see-also)
82
85
 
@@ -121,6 +124,22 @@ supported flags, options and arguments.
121
124
 
122
125
  A minimal [OCR-D processor](https://ocr-d.de/en/user_guide#using-the-ocr-d-processors) that copies from `-I/-input-file-grp` to `-O/-output-file-grp`
123
126
 
127
+ ### `ocrd-filter` CLI
128
+
129
+ A simple [OCR-D processor](https://ocr-d.de/en/user_guide#using-the-ocr-d-processors) that removes segments in PAGE-XML files from `-I/-input-file-grp` to `-O/-output-file-grp` with arbitrary selection based on powerful XPath 2.0 expressions.
130
+
131
+ ### `ocrd-command` CLI
132
+
133
+ A simple [OCR-D processor](https://ocr-d.de/en/user_guide#using-the-ocr-d-processors) that runs arbitrary shell commands to transform PAGE-XML files from `-I/-input-file-grp` to `-O/-output-file-grp` (in effect "wrapping" them for OCR-D).
134
+
135
+ ### `ocrd-merge` CLI
136
+
137
+ A simple [OCR-D processor](https://ocr-d.de/en/user_guide#using-the-ocr-d-processors) that (for every page) joins PAGE-XML files from multiple `-I/-input-file-grp` into a single `-O/-output-file-grp`, ensuring that
138
+ - `Border` polygons are joined
139
+ - all regions are concatenated, while
140
+ - ensuring segment identifiers do not clash,
141
+ - and the reading order simply gets concatenated.
142
+
124
143
  ## Configuration
125
144
 
126
145
  Almost all behaviour of the OCR-D/core software is configured via CLI options and flags, which can be listed with the `--help` flag that all CLIs support.
@@ -18,6 +18,9 @@
18
18
  * [Command line tools](#command-line-tools)
19
19
  * [`ocrd` CLI](#ocrd-cli)
20
20
  * [`ocrd-dummy` CLI](#ocrd-dummy-cli)
21
+ * [`ocrd-filter` CLI](#ocrd-filter-cli)
22
+ * [`ocrd-command` CLI](#ocrd-command-cli)
23
+ * [`ocrd-merge` CLI](#ocrd-merge-cli)
21
24
  * [Configuration](#configuration)
22
25
  * [Packages](#packages)
23
26
  * [ocrd_utils](#ocrd_utils)
@@ -26,7 +29,6 @@
26
29
  * [ocrd_validators](#ocrd_validators)
27
30
  * [ocrd_network](#ocrd_network)
28
31
  * [ocrd](#ocrd)
29
- * [bash library](#bash-library)
30
32
  * [Testing](#testing)
31
33
  * [See Also](#see-also)
32
34
 
@@ -71,6 +73,22 @@ supported flags, options and arguments.
71
73
 
72
74
  A minimal [OCR-D processor](https://ocr-d.de/en/user_guide#using-the-ocr-d-processors) that copies from `-I/-input-file-grp` to `-O/-output-file-grp`
73
75
 
76
+ ### `ocrd-filter` CLI
77
+
78
+ A simple [OCR-D processor](https://ocr-d.de/en/user_guide#using-the-ocr-d-processors) that removes segments in PAGE-XML files from `-I/-input-file-grp` to `-O/-output-file-grp` with arbitrary selection based on powerful XPath 2.0 expressions.
79
+
80
+ ### `ocrd-command` CLI
81
+
82
+ A simple [OCR-D processor](https://ocr-d.de/en/user_guide#using-the-ocr-d-processors) that runs arbitrary shell commands to transform PAGE-XML files from `-I/-input-file-grp` to `-O/-output-file-grp` (in effect "wrapping" them for OCR-D).
83
+
84
+ ### `ocrd-merge` CLI
85
+
86
+ A simple [OCR-D processor](https://ocr-d.de/en/user_guide#using-the-ocr-d-processors) that (for every page) joins PAGE-XML files from multiple `-I/-input-file-grp` into a single `-O/-output-file-grp`, ensuring that
87
+ - `Border` polygons are joined
88
+ - all regions are concatenated, while
89
+ - ensuring segment identifiers do not clash,
90
+ - and the reading order simply gets concatenated.
91
+
74
92
  ## Configuration
75
93
 
76
94
  Almost all behaviour of the OCR-D/core software is configured via CLI options and flags, which can be listed with the `--help` flag that all CLIs support.
ocrd-3.8.0/VERSION ADDED
@@ -0,0 +1 @@
1
+ 3.8.0
@@ -35,6 +35,8 @@ Issues = "https://github.com/OCR-D/core/issues"
35
35
  ocrd = "ocrd.cli:cli"
36
36
  ocrd-dummy = "ocrd.processor.builtin.dummy_processor:cli"
37
37
  ocrd-filter = "ocrd.processor.builtin.filter_processor:cli"
38
+ ocrd-command = "ocrd.processor.builtin.shell_processor:cli"
39
+ ocrd-merge = "ocrd.processor.builtin.merge_processor:cli"
38
40
 
39
41
  [tool.setuptools]
40
42
  include-package-data = true
@@ -3,12 +3,13 @@ beanie~=1.7
3
3
  click >=7
4
4
  cryptography < 43.0.0
5
5
  Deprecated == 1.2.0
6
- docker
6
+ docker>=7.1.0
7
7
  elementpath
8
8
  fastapi>=0.78.0
9
9
  filetype
10
10
  Flask
11
11
  frozendict>=2.4.0
12
+ gitpython
12
13
  gdown
13
14
  httpx>=0.22.0
14
15
  importlib_metadata ; python_version < '3.8'
@@ -12,6 +12,7 @@ from ocrd_network.cli import (
12
12
  client_cli,
13
13
  processing_server_cli,
14
14
  processing_worker_cli,
15
+ resource_manager_server_cli
15
16
  )
16
17
 
17
18
 
@@ -26,3 +27,4 @@ def network_cli():
26
27
  network_cli.add_command(client_cli)
27
28
  network_cli.add_command(processing_server_cli)
28
29
  network_cli.add_command(processing_worker_cli)
30
+ network_cli.add_command(resource_manager_server_cli)
@@ -20,6 +20,7 @@ from ocrd_utils import (
20
20
  get_ocrd_tool_json,
21
21
  initLogging,
22
22
  RESOURCE_LOCATIONS,
23
+ RESOURCE_TYPES
23
24
  )
24
25
  from ocrd.constants import RESOURCE_USER_LIST_COMMENT
25
26
 
@@ -70,16 +71,16 @@ def list_installed(executable=None):
70
71
  @resmgr_cli.command('download')
71
72
  @click.option('-n', '--any-url', default='', help='URL of unregistered resource to download/copy from')
72
73
  @click.option('-D', '--no-dynamic', default=False, is_flag=True,
73
- help="Whether to skip looking into each processor's --dump-{json,module-dir} for module-level resources")
74
- @click.option('-t', '--resource-type', type=click.Choice(['file', 'directory', 'archive']), default='file',
75
- help='Type of resource',)
76
- @click.option('-P', '--path-in-archive', default='.', help='Path to extract in case of archive type')
74
+ help="Skip looking into each processor's --dump-{json,module-dir} module-registered resources")
75
+ @click.option('-t', '--resource-type', type=click.Choice(RESOURCE_TYPES), default='file',
76
+ help='Type of resource (when unregistered or incomplete)',)
77
+ @click.option('-P', '--path-in-archive', default='.', help='Path to extract in case of archive type (when unregistered or incomplete)')
77
78
  @click.option('-a', '--allow-uninstalled', is_flag=True,
78
- help="Allow installing resources for uninstalled processors",)
79
+ help="Allow installing resources for not installed processors",)
79
80
  @click.option('-o', '--overwrite', help='Overwrite existing resources', is_flag=True)
80
- @click.option('-l', '--location', type=click.Choice(RESOURCE_LOCATIONS),
81
+ @click.option('-l', '--location', type=click.Choice(RESOURCE_LOCATIONS),
81
82
  help="Where to store resources - defaults to first location in processor's 'resource_locations' "
82
- "list or finally 'data'")
83
+ "list, i.e. usually 'data'")
83
84
  @click.argument('executable', required=True)
84
85
  @click.argument('name', required=False)
85
86
  def download(any_url, no_dynamic, resource_type, path_in_archive, allow_uninstalled, overwrite, location, executable,
@@ -106,8 +107,6 @@ def download(any_url, no_dynamic, resource_type, path_in_archive, allow_uninstal
106
107
  executable = None
107
108
  if name == '*':
108
109
  name = None
109
- is_url = (any_url.startswith('https://') or any_url.startswith('http://')) if any_url else False
110
- is_filename = Path(any_url).exists() if any_url else False
111
110
  if executable and not which(executable):
112
111
  if not allow_uninstalled:
113
112
  log.error(f"Executable '{executable}' is not installed. "
@@ -126,65 +125,30 @@ def download(any_url, no_dynamic, resource_type, path_in_archive, allow_uninstal
126
125
  'path_in_archive': path_in_archive}]
127
126
  )]
128
127
  for this_executable, this_reslist in reslist:
129
- for resdict in this_reslist:
130
- if 'size' in resdict:
131
- registered = "registered"
132
- else:
133
- registered = "unregistered"
134
- if any_url:
135
- resdict['url'] = any_url
136
- if resdict['url'] == '???':
137
- log.warning(f"Cannot download user resource {resdict['name']}")
138
- continue
139
- if resdict['url'].startswith('https://') or resdict['url'].startswith('http://'):
140
- log.info(f"Downloading {registered} resource '{resdict['name']}' ({resdict['url']})")
141
- if 'size' not in resdict:
142
- with requests.head(resdict['url']) as r:
143
- resdict['size'] = int(r.headers.get('content-length', 0))
144
- else:
145
- log.info(f"Copying {registered} resource '{resdict['name']}' ({resdict['url']})")
146
- urlpath = Path(resdict['url'])
147
- resdict['url'] = str(urlpath.resolve())
148
- if Path(urlpath).is_dir():
149
- resdict['size'] = directory_size(urlpath)
150
- else:
151
- resdict['size'] = urlpath.stat().st_size
152
- if not location:
153
- location = get_ocrd_tool_json(this_executable)['resource_locations'][0]
154
- elif location not in get_ocrd_tool_json(this_executable)['resource_locations']:
155
- log.error(f"The selected --location {location} is not in the {this_executable}'s resource search path, "
156
- f"refusing to install to invalid location")
157
- sys.exit(1)
158
- if location != 'module':
159
- basedir = resmgr.location_to_resource_dir(location)
160
- else:
161
- basedir = get_moduledir(this_executable)
162
- if not basedir:
163
- basedir = resmgr.location_to_resource_dir('data')
164
-
128
+ resource_locations = get_ocrd_tool_json(this_executable)['resource_locations']
129
+ if not location:
130
+ location = resource_locations[0]
131
+ elif location not in resource_locations:
132
+ log.warning(f"The selected --location {location} is not in the {this_executable}'s resource search path, "
133
+ f"refusing to install to invalid location. Instead installing to: {resource_locations[0]}")
134
+ res_dest_dir = resmgr.build_resource_dest_dir(location=location, executable=this_executable)
135
+ for res_dict in this_reslist:
165
136
  try:
166
- with click.progressbar(length=resdict['size']) as bar:
167
- fpath = resmgr.download(
168
- this_executable,
169
- resdict['url'],
170
- basedir,
171
- name=resdict['name'],
172
- resource_type=resdict.get('type', resource_type),
173
- path_in_archive=resdict.get('path_in_archive', path_in_archive),
174
- overwrite=overwrite,
175
- no_subdir=location in ['cwd', 'module'],
176
- progress_cb=lambda delta: bar.update(delta)
177
- )
178
- if registered == 'unregistered':
179
- log.info(f"{this_executable} resource '{name}' ({any_url}) not a known resource, creating stub "
180
- f"in {resmgr.user_list}'")
181
- resmgr.add_to_user_database(this_executable, fpath, url=any_url)
182
- resmgr.save_user_list()
183
- log.info(f"Installed resource {resdict['url']} under {fpath}")
137
+ fpath = resmgr.handle_resource(
138
+ res_dict=res_dict,
139
+ executable=this_executable,
140
+ dest_dir=res_dest_dir,
141
+ any_url=any_url,
142
+ overwrite=overwrite,
143
+ resource_type=resource_type,
144
+ path_in_archive=path_in_archive
145
+ )
146
+ if not fpath:
147
+ continue
184
148
  except FileExistsError as exc:
185
149
  log.info(str(exc))
186
- log.info(f"Use in parameters as "
187
- f"'{resmgr.parameter_usage(resdict['name'], usage=resdict.get('parameter_usage', 'as-is'))}'")
150
+ usage = res_dict.get('parameter_usage', 'as-is')
151
+ log.info(f"Use in parameters as '{resmgr.parameter_usage(res_dict['name'], usage)}'")
188
152
 
189
153
 
190
154
  @resmgr_cli.command('migrate')
@@ -9,7 +9,6 @@ __all__ = [
9
9
  'DOWNLOAD_DIR',
10
10
  'DEFAULT_REPOSITORY_URL',
11
11
  'BASHLIB_FILENAME',
12
- 'RESOURCE_LIST_FILENAME',
13
12
  'BACKUP_DIR',
14
13
  'RESOURCE_USER_LIST_COMMENT',
15
14
  ]
@@ -19,6 +18,5 @@ DEFAULT_UPLOAD_FOLDER = '/tmp/uploads-ocrd-core'
19
18
  DOWNLOAD_DIR = '/tmp/ocrd-core-downloads'
20
19
  DEFAULT_REPOSITORY_URL = 'http://localhost:5000/'
21
20
  BASHLIB_FILENAME = resource_filename(__package__, 'lib.bash')
22
- RESOURCE_LIST_FILENAME = resource_filename(__package__, 'resource_list.yml')
23
21
  RESOURCE_USER_LIST_COMMENT = "# OCR-D private resource list (consider sending a PR with your own resources to OCR-D/core)"
24
22
  BACKUP_DIR = '.backup'
@@ -42,15 +42,14 @@ from .ocrd_page_result import OcrdPageResult
42
42
  from ocrd_utils import (
43
43
  VERSION as OCRD_VERSION,
44
44
  MIMETYPE_PAGE,
45
- MIME_TO_EXT,
46
45
  config,
47
46
  getLogger,
48
47
  list_resource_candidates,
49
- pushd_popd,
50
48
  list_all_resources,
51
49
  get_processor_resource_types,
52
50
  resource_filename,
53
51
  parse_json_file_with_comments,
52
+ pushd_popd,
54
53
  make_file_id,
55
54
  deprecation_warning
56
55
  )
@@ -608,7 +607,7 @@ class Processor():
608
607
  """
609
608
  Ensure all input files for a single page are
610
609
  downloaded to the workspace, then schedule
611
- :py:meth:`.process_process_file` to be run on
610
+ :py:meth:`.process_page_file` to be run on
612
611
  them via `executor` (enforcing a per-page time
613
612
  limit of `max_seconds`).
614
613
 
@@ -935,9 +934,8 @@ class Processor():
935
934
  cwd = self.old_pwd
936
935
  else:
937
936
  cwd = getcwd()
938
- ret = [cand for cand in list_resource_candidates(executable, val,
939
- cwd=cwd, moduled=self.moduledir)
940
- if exists(cand)]
937
+ ret = list(filter(exists, list_resource_candidates(executable, val,
938
+ cwd=cwd, moduled=self.moduledir)))
941
939
  if ret:
942
940
  self._base_logger.debug("Resolved %s to absolute path %s" % (val, ret[0]))
943
941
  return ret[0]
@@ -968,17 +966,9 @@ class Processor():
968
966
  """
969
967
  List all resources found in the filesystem and matching content-type by filename suffix
970
968
  """
971
- mimetypes = get_processor_resource_types(None, self.ocrd_tool)
972
- for res in list_all_resources(self.ocrd_tool['executable'], moduled=self.moduledir):
969
+ for res in list_all_resources(self.executable, ocrd_tool=self.ocrd_tool, moduled=self.moduledir):
973
970
  res = Path(res)
974
- if '*/*' not in mimetypes:
975
- if res.is_dir() and 'text/directory' not in mimetypes:
976
- continue
977
- # if we do not know all MIME types, then keep the file, otherwise require suffix match
978
- if res.is_file() and not any(res.suffix == MIME_TO_EXT.get(mime, res.suffix)
979
- for mime in mimetypes):
980
- continue
981
- yield res
971
+ yield res.name
982
972
 
983
973
  @property
984
974
  def module(self):
@@ -37,6 +37,31 @@
37
37
  "description": "Whether to extract an image for each filtered segment and write to the output fileGrp."
38
38
  }
39
39
  }
40
+ },
41
+ "ocrd-command": {
42
+ "executable": "ocrd-command",
43
+ "description": "Bare-bones processor runs shell commands to process PAGE files",
44
+ "steps": ["recognition/text-recognition", "recognition/font-identification", "recognition/post-correction", "layout/segmentation", "layout/analysis"],
45
+ "categories": [],
46
+ "input_file_grp_cardinality": [1, -1],
47
+ "output_file_grp_cardinality": 1,
48
+ "parameters": {
49
+ "command": {
50
+ "type": "string",
51
+ "default": "cat @INFILE > @OUTFILE",
52
+ "description": "Shell command to operate on PAGE files, with @INFILE as place-holder for the input file path(s), and @OUTFILE as place-holder for the output file path. If running on multiple input fileGrps, then @INFILE must be repeated as many times."
53
+ }
54
+ }
55
+ },
56
+ "ocrd-merge": {
57
+ "executable": "ocrd-merge",
58
+ "description": "Bare-bones processor merges annotations from multiple fileGrps",
59
+ "steps": ["layout/segmentation"],
60
+ "categories": [],
61
+ "input_file_grp_cardinality": [1, -1],
62
+ "output_file_grp_cardinality": 1,
63
+ "parameters": {
64
+ }
40
65
  }
41
66
  }
42
67
  }
@@ -0,0 +1,131 @@
1
+ # pylint: disable=missing-module-docstring,invalid-name
2
+ from typing import Optional
3
+ from itertools import count
4
+ from collections import OrderedDict as odict
5
+
6
+ import click
7
+
8
+ from ocrd import Processor, OcrdPageResult
9
+ from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
10
+ from ocrd_modelfactory import page_from_file
11
+ from ocrd_models import OcrdPage
12
+ from ocrd_models.ocrd_page import (
13
+ BorderType,
14
+ CoordsType,
15
+ ReadingOrderType,
16
+ UnorderedGroupType,
17
+ )
18
+ from ocrd_utils import bbox_from_points
19
+
20
# Element names of the PAGE segment hierarchy (all region types plus the
# line/word/glyph levels). Only objects whose class name, minus the
# ``Type`` suffix, appears here get renamed by rename_segments().
_SEGTYPES = [
    "NoiseRegion",
    "LineDrawingRegion",
    "AdvertRegion",
    "ImageRegion",
    "ChartRegion",
    "MusicRegion",
    "GraphicRegion",
    "UnknownRegion",
    "CustomRegion",
    "SeparatorRegion",
    "MathsRegion",
    "TextRegion",
    "MapRegion",
    "ChemRegion",
    "TableRegion",
    "TextLine",
    "Word",
    "Glyph"
]


def get_border_bbox(pcgts):
    """Return the bounding box of the page's ``Border``.

    Without a ``Border`` element the full image frame
    ``[0, 0, imageWidth, imageHeight]`` is returned; otherwise the result
    of ``bbox_from_points`` on the border polygon (presumably in the same
    ``minx, miny, maxx, maxy`` order - the caller unpacks it that way).
    """
    if pcgts.Page.Border is None:
        return [0, 0, pcgts.Page.imageWidth, pcgts.Page.imageHeight]
    return bbox_from_points(pcgts.Page.Border.Coords.points)


def rename_segments(pcgts, start=1):
    """Give all hierarchy segments of ``pcgts`` fresh sequential identifiers.

    Every element carrying an ``@id`` whose PAGE class is listed in
    ``_SEGTYPES`` is renamed to ``seg`` followed by an 11-digit zero-padded
    number, counting upwards from ``start``. Reading order entries that
    reference a renamed region are updated to the new identifier.

    Args:
        pcgts: the PAGE document to modify in place.
        start: number to assign to the first renamed segment.

    Returns:
        The next unused number, i.e. ``start`` plus the number of renamed
        segments (``start`` itself when nothing was renamed). Passing this
        value as ``start`` for the next document keeps identifiers
        clash-free across documents.
    """
    renamed = {}
    rodict = pcgts.Page.get_ReadingOrderGroups()
    # get everything that has an identifier
    nodes = pcgts.xpath("//*[@id]")
    # map etree nodes to their PAGE objects, but keep only hierarchy segments
    segments = [segment for segment in map(pcgts.revmap.get, nodes)
                if segment.__class__.__name__.replace('Type', '') in _SEGTYPES]
    # count segments and rename them
    # fixme: or perhaps better to have each segment type named and counted differently?
    for offset, segment in enumerate(segments):
        newname = "seg%011d" % (start + offset)
        # sanity check: input identifiers are expected to be unique
        assert segment.id not in renamed
        if segment.original_tagname_.endswith('Region') and segment.id in rodict:
            # update reading order
            rodict[segment.id].regionRef = newname
        renamed[segment.id] = newname
        segment.id = newname
    # Hand back the first number NOT used yet: returning the last used one
    # would make the next document start on an already assigned identifier.
    return start + len(segments)
73
+
74
class MergeProcessor(Processor):
    """Builtin processor (``ocrd-merge``) joining PAGE files of several input fileGrps."""

    def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult:
        """
        Merge PAGE segment hierarchy elements from all input file groups.

        For each page, take the deserialised PAGE input files that are
        actually present. Rename all elements of the segment hierarchy to new
        (clash-free) identifiers. Redefine the `Border` coordinates as the
        axis-aligned rectangle enclosing the bounding boxes of all input
        borders. Then add all regions from all input files, concatenating
        their reading order groups under a single `UnorderedGroup` in the
        order of input file groups.

        Produce a new PAGE output file by serialising the resulting hierarchy.

        Raises:
            AssertionError: if no input file is present for this page, or if
                the input files do not all reference the same @imageFilename.
        """
        actual_pcgts = list(filter(None, input_pcgts))
        # Explicit raises instead of ``assert`` statements, so the checks
        # also run under ``python -O``.
        if not actual_pcgts:
            raise AssertionError("no input file present for this page")
        if len(set(pcgts.Page.imageFilename for pcgts in actual_pcgts)) != 1:
            raise AssertionError("input files must all reference the same @imageFilename")
        # create new PAGE for image
        result = OcrdPageResult(page_from_file(actual_pcgts[0].Page.imageFilename))
        # unify Border: transpose the per-file boxes into four coordinate
        # tuples and take the outermost value of each
        borders = [get_border_bbox(pcgts) for pcgts in actual_pcgts]
        minx, miny, maxx, maxy = zip(*borders)
        minx = min(minx)
        miny = min(miny)
        maxx = max(maxx)
        maxy = max(maxy)
        result.pcgts.Page.set_Border(
            BorderType(CoordsType(
                points=f"{minx},{miny} {maxx},{miny} {maxx},{maxy} {minx},{maxy}")))
        # rename all segments, threading the running counter through the files
        num = 1
        for pcgts in actual_pcgts:
            num = rename_segments(pcgts, num)
        # concatenate all regions
        ug = UnorderedGroupType(id="merged")
        result.pcgts.Page.set_ReadingOrder(ReadingOrderType(UnorderedGroup=ug))
        for pcgts in actual_pcgts:
            for region in pcgts.Page.get_AllRegions():
                # pick the add_* method matching the region's element name
                adder = getattr(result.pcgts.Page, 'add_' + region.original_tagname_)
                adder(region)
            if pcgts.Page.ReadingOrder:
                # nest the file's top-level group (ordered one preferred)
                # under the merged group
                group = pcgts.Page.ReadingOrder.OrderedGroup or pcgts.Page.ReadingOrder.UnorderedGroup
                adder = getattr(ug, 'add_' + group.original_tagname_)
                adder(group)
        return result

    @property
    def metadata_filename(self):
        # the tool description is shared with the other builtin processors
        return 'processor/builtin/dummy/ocrd-tool.json'

    @property
    def executable(self):
        return 'ocrd-merge'
126
+
127
+
128
@click.command()
@ocrd_cli_options
def cli(*args, **kwargs):
    # Command-line entry point: hand all parsed options over to the generic
    # processor wrapper, which runs MergeProcessor.
    processor_class = MergeProcessor
    return ocrd_cli_wrap_processor(processor_class, *args, **kwargs)
@@ -0,0 +1,7 @@
1
+ {
2
+ # requires https://github.com/bertsky/workflow-configuration installed
3
+ # partitions PAGE-XML ReadingOrder from single OrderedGroup to top
4
+ # UnorderedGroup divided into OrderedGroups starting at every @type=header
5
+ # text regions.
6
+ "command": "page-header2unordered @INFILE > @OUTFILE"
7
+ }
@@ -0,0 +1,7 @@
1
+ {
2
+ # requires https://github.com/bertsky/workflow-configuration installed
3
+ # partitions PAGE-XML ReadingOrder from single OrderedGroup to top
4
+ # UnorderedGroup divided into OrderedGroups starting at every @type=heading
5
+ # text regions.
6
+ "command": "page-heading2unordered @INFILE > @OUTFILE"
7
+ }
@@ -0,0 +1,6 @@
1
+ {
2
+ # requires https://github.com/bertsky/workflow-configuration installed
3
+ # retrieves lines from all paragraphs and geometrically determines
4
+ # their average skew, annotating the result under /Page/@orientation
5
+ "command": "page-lines2orientation @INFILE > @OUTFILE"
6
+ }
@@ -0,0 +1,5 @@
1
+ {
2
+ # requires https://github.com/PRImA-Research-Lab/prima-page-converter installed
3
+ # with the main JAR file copied to /usr/local/share/
4
+ "command": "java -jar /usr/local/share/PageConverter.jar -source-xml @INFILE -convert-to LATEST -target-xml @OUTFILE"
5
+ }
@@ -0,0 +1,8 @@
1
+ {
2
+ # requires https://github.com/kba/transkribus-to-prima installed
3
+ # converts PAGE from Transkribus dialect to PRImA standard
4
+ # also runs various fixes often necessary to make this work:
5
+ # - ensuring coordinates range within the image size
6
+ # - ensuring segment identifiers do not start with numbers
7
+ "command": "page-fix-coordinates @INFILE - | sed -e 's/ id=\"/ id=\"id/' -e 's/regionRef=\"/regionRef=\"id/' | transkribus-to-prima -V - @OUTFILE"
8
+ }