ocrd 3.6.0__py3-none-any.whl → 3.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. ocrd/cli/__init__.py +2 -4
  2. ocrd/cli/bashlib.py +6 -117
  3. ocrd/cli/network.py +2 -0
  4. ocrd/cli/resmgr.py +29 -65
  5. ocrd/constants.py +0 -2
  6. ocrd/mets_server.py +5 -5
  7. ocrd/processor/base.py +6 -16
  8. ocrd/processor/builtin/dummy/ocrd-tool.json +25 -0
  9. ocrd/processor/builtin/merge_processor.py +131 -0
  10. ocrd/processor/builtin/param_command_header2unordered.json +7 -0
  11. ocrd/processor/builtin/param_command_heading2unordered.json +7 -0
  12. ocrd/processor/builtin/param_command_lines2orientation.json +6 -0
  13. ocrd/processor/builtin/param_command_page-update-version.json +5 -0
  14. ocrd/processor/builtin/param_command_transkribus-to-prima.json +8 -0
  15. ocrd/processor/builtin/shell_processor.py +128 -0
  16. ocrd/resource_manager.py +213 -124
  17. {ocrd-3.6.0.dist-info → ocrd-3.8.0.dist-info}/METADATA +23 -10
  18. {ocrd-3.6.0.dist-info → ocrd-3.8.0.dist-info}/RECORD +40 -34
  19. {ocrd-3.6.0.dist-info → ocrd-3.8.0.dist-info}/entry_points.txt +2 -0
  20. ocrd_models/ocrd_agent.py +3 -3
  21. ocrd_network/__init__.py +1 -0
  22. ocrd_network/cli/__init__.py +2 -0
  23. ocrd_network/cli/resmgr_server.py +23 -0
  24. ocrd_network/constants.py +3 -0
  25. ocrd_network/logging_utils.py +5 -0
  26. ocrd_network/models/job.py +29 -28
  27. ocrd_network/models/messages.py +3 -2
  28. ocrd_network/models/workspace.py +4 -4
  29. ocrd_network/resource_manager_server.py +182 -0
  30. ocrd_network/runtime_data/connection_clients.py +1 -1
  31. ocrd_network/runtime_data/hosts.py +43 -16
  32. ocrd_network/runtime_data/network_agents.py +15 -1
  33. ocrd_utils/__init__.py +5 -1
  34. ocrd_utils/constants.py +5 -0
  35. ocrd_utils/logging.py +3 -0
  36. ocrd_utils/os.py +142 -62
  37. ocrd_validators/ocrd_tool.schema.yml +7 -4
  38. ocrd/cli/log.py +0 -56
  39. ocrd/lib.bash +0 -310
  40. ocrd/resource_list.yml +0 -61
  41. {ocrd-3.6.0.dist-info → ocrd-3.8.0.dist-info}/LICENSE +0 -0
  42. {ocrd-3.6.0.dist-info → ocrd-3.8.0.dist-info}/WHEEL +0 -0
  43. {ocrd-3.6.0.dist-info → ocrd-3.8.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,182 @@
1
+ from datetime import datetime
2
+ from os import getpid
3
+ from shutil import which
4
+ from typing import Any
5
+ from uvicorn import run as uvicorn_run
6
+ from fastapi import APIRouter, FastAPI, HTTPException, status
7
+
8
+ from ocrd import OcrdResourceManager
9
+ from ocrd_utils import getLogger, get_ocrd_tool_json, initLogging
10
+ from .logging_utils import configure_file_handler_with_formatter, get_resource_manager_server_logging_file_path
11
+
12
+
13
class ResourceManagerServer(FastAPI):
    """FastAPI application exposing the OCR-D Resource Manager over HTTP.

    Endpoints:
      GET /               - server info / home page
      GET /list_available - resources known for an executable (registry + dynamic)
      GET /list_installed - resources actually installed for an executable
      GET /download       - download/install resources for an executable
    """

    def __init__(self, host: str, port: int) -> None:
        self.title = "OCR-D Resource Manager Server"
        super().__init__(
            title=self.title,
            on_startup=[self.on_startup],
            on_shutdown=[self.on_shutdown],
            description=self.title
        )
        initLogging()
        self.log = getLogger("ocrd_network.resource_manager_server")
        log_file = get_resource_manager_server_logging_file_path(pid=getpid())
        configure_file_handler_with_formatter(self.log, log_file=log_file, mode="a")

        # Single shared resource manager instance, reused by all endpoints
        self.resmgr_instance = OcrdResourceManager()

        self.hostname = host
        self.port = port

        self.add_api_routes()

    def start(self):
        """Run the server (blocking) under uvicorn."""
        uvicorn_run(self, host=self.hostname, port=int(self.port))

    async def on_startup(self):
        self.log.info(f"Starting {self.title}")

    async def on_shutdown(self) -> None:
        pass

    def _reject_wildcard_executable(self, executable) -> None:
        """Raise 422 when '*' is passed as an executable name (not meaningful here)."""
        if executable == '*':
            message = "'*' is not an acceptable executable name! Try with a specific executable."
            self.log.error(message)
            raise HTTPException(status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, detail=message)

    def add_api_routes(self):
        """Register all API routes on a router and include it in the app."""
        base_router = APIRouter()
        base_router.add_api_route(
            path="/",
            endpoint=self.home_page,
            methods=["GET"],
            status_code=status.HTTP_200_OK,
            summary="Get information about the OCR-D Resource Manager Server"
        )
        base_router.add_api_route(
            path="/list_available",
            endpoint=self.list_available_resources,
            methods=["GET"],
            status_code=status.HTTP_200_OK,
            summary=""
        )
        base_router.add_api_route(
            path="/list_installed",
            endpoint=self.list_installed_resources,
            methods=["GET"],
            status_code=status.HTTP_200_OK,
            summary=""
        )
        base_router.add_api_route(
            path="/download",
            endpoint=self.download_resource,
            methods=["GET"],
            status_code=status.HTTP_200_OK,
            summary=""
        )
        self.include_router(base_router)

    async def home_page(self):
        """Return a small info payload with the server title and current time."""
        message = f"The home page of the {self.title}"
        json_message = {
            "message": message,
            "time": datetime.now().strftime("%Y-%m-%d %H:%M")
        }
        return json_message

    async def list_available_resources(
        self,
        executable: Any = "ocrd-dummy",
        dynamic: bool = True,
        name: Any = None,
        database: Any = None,
        url: Any = None
    ):
        """List resources known (registry and, optionally, dynamic) for an executable.

        Raises:
            HTTPException(422): when executable is the wildcard '*'.
        """
        self._reject_wildcard_executable(executable)
        result = self.resmgr_instance.list_available(executable, dynamic, name, database, url)
        return {"result": result}

    async def list_installed_resources(self, executable: Any = None):
        """List resources actually installed on this host for an executable.

        Raises:
            HTTPException(422): when executable is the wildcard '*'.
        """
        self._reject_wildcard_executable(executable)
        # Bug fix: previously this called list_available(), which returns
        # registry entries rather than what is installed on disk.
        result = self.resmgr_instance.list_installed(executable)
        return {"result": result}

    async def download_resource(
        self,
        executable: str = "ocrd-dummy",
        name: Any = None,
        location: Any = None,
        any_url: str = '',
        no_dynamic: bool = False,
        resource_type: str = 'file',
        path_in_archive: str = '.',
        allow_uninstalled: bool = True,
        overwrite: bool = True
    ):
        """Download/install resources for an executable; returns progress messages.

        Raises:
            HTTPException(422): wildcard executable, or executable not installed
                while allow_uninstalled is False.
        """
        # Reuse the shared manager instead of constructing a fresh one per request
        resmgr = self.resmgr_instance
        response = []
        self._reject_wildcard_executable(executable)
        if name == '*':
            name = None
        if executable and not which(executable):
            if not allow_uninstalled:
                message = (f"Executable '{executable}' is not installed. To download resources anyway, "
                           f"use the -a/--allow-uninstalled flag")
                self.log.error(message)
                raise HTTPException(status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, detail=message)
            else:
                message = f"Executable '{executable}' is not installed, but downloading resources anyway."
                self.log.info(message)
                response.append(message)
        reslist = resmgr.list_available(executable=executable, dynamic=not no_dynamic, name=name)
        if not any(r[1] for r in reslist):
            message = f"No resources {name} found in registry for executable {executable}"
            self.log.info(message)
            response.append(message)
            if executable and name:
                # Fall back to a synthetic entry so an explicit URL download still works
                reslist = [(executable, [{
                    'url': any_url or '???',
                    'name': name,
                    'type': resource_type,
                    'path_in_archive': path_in_archive}]
                )]
        for this_executable, this_reslist in reslist:
            resource_locations = get_ocrd_tool_json(this_executable)['resource_locations']
            # Resolve the destination per executable so the choice for one
            # executable does not leak into the next loop iteration.
            if not location:
                this_location = resource_locations[0]
            elif location not in resource_locations:
                response.append(
                    f"The selected --location {location} is not in the {this_executable}'s resource search path, "
                    f"refusing to install to invalid location. Instead installing to: {resource_locations[0]}")
                # Bug fix: actually install to the first valid location as the
                # message above promises, instead of the invalid one.
                this_location = resource_locations[0]
            else:
                this_location = location
            res_dest_dir = resmgr.build_resource_dest_dir(location=this_location, executable=this_executable)
            for res_dict in this_reslist:
                try:
                    fpath = resmgr.handle_resource(
                        res_dict=res_dict,
                        executable=this_executable,
                        dest_dir=res_dest_dir,
                        any_url=any_url,
                        overwrite=overwrite,
                        resource_type=resource_type,
                        path_in_archive=path_in_archive
                    )
                    if not fpath:
                        continue
                except FileExistsError as exc:
                    response.append(str(exc))
                usage = res_dict.get('parameter_usage', 'as-is')
                response.append(f"Use in parameters as '{resmgr.parameter_usage(res_dict['name'], usage)}'")
        return {"result": response}
@@ -36,7 +36,7 @@ class CustomDockerClient(DockerClient):
36
36
  raise ValueError("Both 'password' and 'keypath' provided - one must be provided")
37
37
  if ("password" not in kwargs) and ("keypath" not in kwargs):
38
38
  raise ValueError("Missing 'password' or 'keypath' - one must be provided")
39
- self.api = APIClient(base_url=f"ssh://{host}", use_ssh_client=True, version="1.41")
39
+ self.api = APIClient(base_url=f"ssh://{host}", use_ssh_client=True, version="auto")
40
40
  self.api.mount(
41
41
  prefix="http+docker://ssh", adapter=self.CustomSshHttpAdapter(base_url=f"ssh://{user}@{host}:22", **kwargs)
42
42
  )
@@ -1,9 +1,13 @@
1
1
  from logging import Logger
2
- from time import sleep
3
- from typing import Dict, List
2
+ from typing import Dict, List, Optional
4
3
 
5
- from .connection_clients import create_docker_client, create_ssh_client
6
- from .network_agents import DataProcessingWorker, DeployType
4
+ from docker import APIClient
5
+ from paramiko import SSHClient
6
+
7
+ from ..constants import RESOURCE_MANAGER_SERVER_PORT
8
+ from .connection_clients import CustomDockerClient, create_docker_client, create_ssh_client
9
+ from .network_agents import (
10
+ DataProcessingWorker, DeployType, deploy_agent_native_get_pid_hack)
7
11
 
8
12
 
9
13
  class DataHost:
@@ -11,6 +15,8 @@ class DataHost:
11
15
  self, host: str, username: str, password: str, keypath: str, workers: List[Dict], servers: List[Dict]
12
16
  ) -> None:
13
17
  self.host = host
18
+ self.resource_manager_port = RESOURCE_MANAGER_SERVER_PORT
19
+ self.resource_manager_pid = None
14
20
  self.username = username
15
21
  self.password = password
16
22
  self.keypath = keypath
@@ -22,14 +28,11 @@ class DataHost:
22
28
 
23
29
  # Connection clients, ssh for native deployment, docker for docker deployment
24
30
  self.ssh_client = None
25
- self.docker_client = None
26
-
27
- # Time to wait between deploying single workers
28
- self.wait_between_deploys: float = 0.3
31
+ self.docker_client: Optional[CustomDockerClient] = None
29
32
 
30
- # Lists of Processing Workers based on their deployment type
31
- self.workers_native = []
32
- self.workers_docker = []
33
+ # Lists of network agents based on their agent and deployment type
34
+ self.workers_native: List[DataProcessingWorker] = []
35
+ self.workers_docker: List[DataProcessingWorker] = []
33
36
 
34
37
  if not workers:
35
38
  workers = []
@@ -68,6 +71,13 @@ class DataHost:
68
71
  self.docker_client = create_docker_client(self.host, self.username, self.password, self.keypath)
69
72
  return self.docker_client
70
73
 
74
+ def __deploy_network_agent_resource_manager_server(self, logger: Logger):
75
+ logger.info(f"Deploying resource manager server on host: {self.host}:{self.resource_manager_port}")
76
+ start_cmd = f"ocrd network resmgr-server --address {self.host}:{self.resource_manager_port} &"
77
+ pid = deploy_agent_native_get_pid_hack(logger, self.ssh_client, start_cmd)
78
+ logger.info(f"Deployed: OCR-D Resource Manager Server [{pid}]: {self.host}:{self.resource_manager_port}")
79
+ self.resource_manager_pid = pid
80
+
71
81
  def __deploy_single_worker(
72
82
  self, logger: Logger, worker_data: DataProcessingWorker,
73
83
  mongodb_url: str, rabbitmq_url: str
@@ -86,7 +96,6 @@ class DataHost:
86
96
  connection_client = self.docker_client
87
97
 
88
98
  worker_data.deploy_network_agent(logger, connection_client, mongodb_url, rabbitmq_url)
89
- sleep(self.wait_between_deploys)
90
99
 
91
100
  def __deploy_all_workers(self, logger: Logger, mongodb_url: str, rabbitmq_url: str):
92
101
  logger.info(f"Deploying processing workers on host: {self.host}")
@@ -95,17 +104,24 @@ class DataHost:
95
104
  logger.info("No processing workers found to be deployed")
96
105
  for data_worker in self.workers_native:
97
106
  self.__deploy_single_worker(logger, data_worker, mongodb_url, rabbitmq_url)
107
+ logger.info(f"Deployed: {data_worker}")
98
108
  for data_worker in self.workers_docker:
99
109
  self.__deploy_single_worker(logger, data_worker, mongodb_url, rabbitmq_url)
110
+ logger.info(f"Deployed: {data_worker}")
100
111
 
101
112
  def deploy_workers(self, logger: Logger, mongodb_url: str, rabbitmq_url: str) -> None:
102
113
  if self.needs_ssh_connector and not self.ssh_client:
103
114
  logger.debug("Creating missing ssh connector before deploying")
104
- self.ssh_client = self.create_connection_client(client_type="ssh")
115
+ client = self.create_connection_client(client_type="ssh")
116
+ assert isinstance(client, SSHClient)
117
+ self.ssh_client = client
105
118
  if self.needs_docker_connector:
106
119
  logger.debug("Creating missing docker connector before deploying")
107
- self.docker_client = self.create_connection_client(client_type="docker")
120
+ client = self.create_connection_client(client_type="docker")
121
+ assert isinstance(client, CustomDockerClient)
122
+ self.docker_client = client
108
123
 
124
+ self.__deploy_network_agent_resource_manager_server(logger)
109
125
  self.__deploy_all_workers(logger=logger, mongodb_url=mongodb_url, rabbitmq_url=rabbitmq_url)
110
126
 
111
127
  if self.ssh_client:
@@ -115,6 +131,12 @@ class DataHost:
115
131
  self.docker_client.close()
116
132
  self.docker_client = None
117
133
 
134
+ def __stop_network_agent_resource_manager_server(self, logger: Logger):
135
+ logger.info(f"Stopping OCR-D Resource Manager Server [{self.resource_manager_pid}]: "
136
+ f"{self.host}:{self.resource_manager_port}")
137
+ assert self.ssh_client, "SSH client connection missing"
138
+ self.ssh_client.exec_command(f"kill {self.resource_manager_pid}")
139
+
118
140
  def __stop_worker(self, logger: Logger, name: str, deploy_type: DeployType, pid: str):
119
141
  worker_info = f"Processing Worker: deploy: {deploy_type}, name: {name}"
120
142
  if not pid:
@@ -132,10 +154,15 @@ class DataHost:
132
154
  def stop_workers(self, logger: Logger):
133
155
  if self.needs_ssh_connector and not self.ssh_client:
134
156
  logger.debug("Creating missing ssh connector before stopping")
135
- self.ssh_client = self.create_connection_client(client_type="ssh")
157
+ client = self.create_connection_client(client_type="ssh")
158
+ assert isinstance(client, SSHClient)
159
+ self.ssh_client = client
136
160
  if self.needs_docker_connector and not self.docker_client:
137
161
  logger.debug("Creating missing docker connector before stopping")
138
- self.docker_client = self.create_connection_client(client_type="docker")
162
+ client = self.create_connection_client(client_type="docker")
163
+ assert isinstance(client, CustomDockerClient)
164
+ self.docker_client = client
165
+ self.__stop_network_agent_resource_manager_server(logger=logger)
139
166
 
140
167
  logger.info(f"Stopping processing workers on host: {self.host}")
141
168
  amount_workers = len(self.workers_native) + len(self.workers_docker)
@@ -1,4 +1,5 @@
1
1
  from logging import Logger
2
+ from time import sleep
2
3
  from typing import Any
3
4
 
4
5
  from re import search as re_search
@@ -25,7 +26,8 @@ def deploy_agent_native_get_pid_hack(logger: Logger, ssh_client, start_cmd: str)
25
26
  output = stdout.read().decode("utf-8")
26
27
  stdout.close()
27
28
  stdin.close()
28
- return re_search(r"xyz([0-9]+)xyz", output).group(1) # type: ignore
29
+ pid = re_search(r"xyz([0-9]+)xyz", output).group(1) # type: ignore
30
+ return pid
29
31
 
30
32
 
31
33
  # TODO: Implement the actual method that is missing
@@ -51,6 +53,12 @@ class DataNetworkAgent:
51
53
  # The id is assigned when the agent is deployed
52
54
  self.pid = pid
53
55
 
56
+ # Time to wait between deploying agents
57
+ self.wait_between_agent_deploys: float = 0.3
58
+
59
+ def __str__(self):
60
+ return f"{self.pid} {self.deploy_type} {self.processor_name} on host: {self.host}"
61
+
54
62
  def _start_native_instance(self, logger: Logger, ssh_client, start_cmd: str):
55
63
  if self.deploy_type != DeployType.NATIVE:
56
64
  raise RuntimeError(f"Mismatch of deploy type when starting network agent: {self.processor_name}")
@@ -76,11 +84,17 @@ class DataProcessingWorker(DataNetworkAgent):
76
84
  def deploy_network_agent(self, logger: Logger, connector_client, database_url: str, queue_url: str):
77
85
  if self.deploy_type == DeployType.NATIVE:
78
86
  start_cmd = f"{self.processor_name} --database {database_url} --queue {queue_url} &"
87
+ assert connector_client, f"SSH client connection missing."
79
88
  self.pid = self._start_native_instance(logger, connector_client, start_cmd)
89
+ sleep(self.wait_between_agent_deploys)
80
90
  return self.pid
81
91
  if self.deploy_type == DeployType.DOCKER:
82
92
  # TODO: add real command to start processing worker in docker here
83
93
  start_cmd = ""
94
+ assert connector_client, f"Docker client connection missing."
95
+ if not start_cmd:
96
+ raise RuntimeError("Missing start command for the Processing Worker in docker mode")
84
97
  self.pid = self._start_docker_instance(logger, connector_client, start_cmd)
98
+ sleep(self.wait_between_agent_deploys)
85
99
  return self.pid
86
100
  raise RuntimeError(f"Unknown deploy type of {self.__dict__}")
ocrd_utils/__init__.py CHANGED
@@ -70,7 +70,8 @@ Utility functions and constants usable in various circumstances.
70
70
 
71
71
  filesystem-related utilities
72
72
 
73
- * :py:func:`is_string`,
73
+ * :py:func:`is_git_url`,
74
+ :py:func:`is_string`,
74
75
  :py:func:`membername`,
75
76
  :py:func:`concat_padded`,
76
77
  :py:func:`nth_url_segment`,
@@ -118,6 +119,7 @@ from .constants import (
118
119
  REGEX_PREFIX,
119
120
  REGEX_FILE_ID,
120
121
  RESOURCE_LOCATIONS,
122
+ RESOURCE_TYPES,
121
123
  LOG_FORMAT,
122
124
  LOG_TIMEFMT,
123
125
  VERSION,
@@ -184,9 +186,11 @@ from .os import (
184
186
  get_processor_resource_types,
185
187
  get_ocrd_tool_json,
186
188
  get_moduledir,
189
+ get_env_locations,
187
190
  guess_media_type,
188
191
  list_all_resources,
189
192
  is_file_in_directory,
193
+ is_git_url,
190
194
  list_resource_candidates,
191
195
  atomic_write,
192
196
  pushd_popd,
ocrd_utils/constants.py CHANGED
@@ -5,6 +5,7 @@ from .introspect import dist_version
5
5
  from re import compile as regex_compile
6
6
 
7
7
  __all__ = [
8
+ 'DEFAULT_METS_BASENAME',
8
9
  'EXT_TO_MIME',
9
10
  'LOG_FORMAT',
10
11
  'LOG_TIMEFMT',
@@ -14,7 +15,9 @@ __all__ = [
14
15
  'PIL_TO_MIME',
15
16
  'REGEX_PREFIX',
16
17
  'REGEX_FILE_ID',
18
+ 'RESOURCES_DIR_SYSTEM',
17
19
  'RESOURCE_LOCATIONS',
20
+ 'RESOURCE_TYPES',
18
21
  'VERSION',
19
22
  ]
20
23
 
@@ -108,6 +111,8 @@ LOG_FORMAT = r'%(asctime)s.%(msecs)03d %(levelname)s %(name)s - %(message)s'
108
111
  LOG_TIMEFMT = r'%H:%M:%S'
109
112
 
110
113
  RESOURCE_LOCATIONS = ['data', 'cwd', 'system', 'module']
114
+ RESOURCE_TYPES = ['file', 'directory', 'archive']
115
+ RESOURCES_DIR_SYSTEM = '/usr/local/share/ocrd-resources'
111
116
 
112
117
  DEFAULT_METS_BASENAME = 'mets.xml'
113
118
 
ocrd_utils/logging.py CHANGED
@@ -75,6 +75,9 @@ _ocrdLevel2pythonLevel = {
75
75
 
76
76
 
77
77
  def tf_disable_interactive_logs():
78
+ """
79
+ Disable the interactive logging of tf/keras and set the log level to error or higher
80
+ """
78
81
  try:
79
82
  from os import environ # pylint: disable=import-outside-toplevel
80
83
  # This env variable must be set before importing from Keras