ocrd 3.5.0__py3-none-any.whl → 3.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ocrd/cli/__init__.py +6 -2
- ocrd/cli/bashlib.py +7 -2
- ocrd/cli/log.py +7 -2
- ocrd/cli/network.py +0 -2
- ocrd/cli/ocrd_tool.py +26 -4
- ocrd/cli/process.py +1 -0
- ocrd/cli/resmgr.py +0 -1
- ocrd/cli/validate.py +32 -13
- ocrd/cli/workspace.py +125 -52
- ocrd/cli/zip.py +13 -4
- ocrd/decorators/__init__.py +28 -52
- ocrd/decorators/loglevel_option.py +4 -0
- ocrd/decorators/mets_find_options.py +2 -1
- ocrd/decorators/ocrd_cli_options.py +3 -7
- ocrd/decorators/parameter_option.py +12 -11
- ocrd/lib.bash +6 -13
- ocrd/mets_server.py +6 -10
- ocrd/processor/base.py +88 -71
- ocrd/processor/builtin/dummy_processor.py +7 -4
- ocrd/processor/builtin/filter_processor.py +3 -2
- ocrd/processor/helpers.py +5 -6
- ocrd/processor/ocrd_page_result.py +7 -5
- ocrd/resolver.py +42 -32
- ocrd/task_sequence.py +11 -4
- ocrd/workspace.py +64 -54
- ocrd/workspace_backup.py +3 -0
- ocrd/workspace_bagger.py +15 -8
- {ocrd-3.5.0.dist-info → ocrd-3.6.0.dist-info}/METADATA +3 -2
- ocrd-3.6.0.dist-info/RECORD +125 -0
- ocrd_modelfactory/__init__.py +4 -2
- ocrd_models/constants.py +18 -1
- ocrd_models/ocrd_agent.py +1 -1
- ocrd_models/ocrd_exif.py +7 -3
- ocrd_models/ocrd_file.py +24 -19
- ocrd_models/ocrd_mets.py +90 -67
- ocrd_models/ocrd_page.py +17 -13
- ocrd_models/ocrd_xml_base.py +1 -0
- ocrd_models/report.py +2 -1
- ocrd_models/utils.py +4 -3
- ocrd_models/xpath_functions.py +3 -1
- ocrd_network/__init__.py +1 -2
- ocrd_network/cli/__init__.py +0 -2
- ocrd_network/cli/client.py +122 -50
- ocrd_network/cli/processing_server.py +1 -2
- ocrd_network/client.py +2 -2
- ocrd_network/client_utils.py +30 -13
- ocrd_network/constants.py +1 -6
- ocrd_network/database.py +3 -3
- ocrd_network/logging_utils.py +2 -7
- ocrd_network/models/__init__.py +0 -2
- ocrd_network/models/job.py +2 -5
- ocrd_network/models/workspace.py +1 -1
- ocrd_network/process_helpers.py +54 -17
- ocrd_network/processing_server.py +63 -114
- ocrd_network/processing_worker.py +6 -5
- ocrd_network/rabbitmq_utils/__init__.py +2 -0
- ocrd_network/rabbitmq_utils/helpers.py +24 -7
- ocrd_network/runtime_data/__init__.py +1 -2
- ocrd_network/runtime_data/deployer.py +12 -85
- ocrd_network/runtime_data/hosts.py +61 -130
- ocrd_network/runtime_data/network_agents.py +7 -31
- ocrd_network/runtime_data/network_services.py +1 -1
- ocrd_network/server_cache.py +1 -1
- ocrd_network/server_utils.py +13 -52
- ocrd_network/utils.py +1 -0
- ocrd_utils/__init__.py +4 -4
- ocrd_utils/config.py +86 -76
- ocrd_utils/deprecate.py +3 -0
- ocrd_utils/image.py +51 -23
- ocrd_utils/introspect.py +8 -3
- ocrd_utils/logging.py +12 -7
- ocrd_utils/os.py +16 -3
- ocrd_utils/str.py +32 -16
- ocrd_validators/json_validator.py +4 -1
- ocrd_validators/ocrd_tool_validator.py +2 -1
- ocrd_validators/ocrd_zip_validator.py +5 -4
- ocrd_validators/page_validator.py +21 -9
- ocrd_validators/parameter_validator.py +3 -2
- ocrd_validators/processing_server_config.schema.yml +1 -33
- ocrd_validators/resource_list_validator.py +3 -1
- ocrd_validators/workspace_validator.py +30 -20
- ocrd_validators/xsd_mets_validator.py +2 -1
- ocrd_validators/xsd_page_validator.py +2 -1
- ocrd_validators/xsd_validator.py +4 -2
- ocrd-3.5.0.dist-info/RECORD +0 -128
- ocrd_network/cli/processor_server.py +0 -31
- ocrd_network/models/ocrd_tool.py +0 -12
- ocrd_network/processor_server.py +0 -255
- {ocrd-3.5.0.dist-info → ocrd-3.6.0.dist-info}/LICENSE +0 -0
- {ocrd-3.5.0.dist-info → ocrd-3.6.0.dist-info}/WHEEL +0 -0
- {ocrd-3.5.0.dist-info → ocrd-3.6.0.dist-info}/entry_points.txt +0 -0
- {ocrd-3.5.0.dist-info → ocrd-3.6.0.dist-info}/top_level.txt +0 -0
ocrd_network/processing_server.py

@@ -10,7 +10,7 @@ from fastapi.responses import FileResponse, JSONResponse, PlainTextResponse
 from ocrd.task_sequence import ProcessorTask
 from ocrd_utils import initLogging, getLogger
-from .constants import
+from .constants import JobState, ServerApiTags
 from .database import (
     initiate_database,
     db_get_processing_job,
@@ -34,14 +34,13 @@ from .models import (
 from .rabbitmq_utils import (
     check_if_queue_exists,
     connect_rabbitmq_publisher,
-
+    get_message_queues,
     OcrdProcessingMessage
 )
 from .server_cache import CacheLockedPages, CacheProcessingRequests
 from .server_utils import (
     create_processing_message,
     create_workspace_if_not_exists,
-    forward_job_to_processor_server,
     _get_processor_job,
     _get_processor_job_log,
     get_page_ids_list,
@@ -51,7 +50,6 @@ from .server_utils import (
     kill_mets_server_zombies,
     parse_workflow_tasks,
     raise_http_exception,
-    request_processor_server_tool_json,
     validate_and_return_mets_path,
     validate_first_task_input_file_groups_existence,
     validate_job_input,
@@ -91,7 +89,7 @@ class ProcessingServer(FastAPI):
         log_file = get_processing_server_logging_file_path(pid=getpid())
         configure_file_handler_with_formatter(self.log, log_file=log_file, mode="a")

-        self.log.info(
+        self.log.info("Loading ocrd-all-tool.json")
         self.ocrd_all_tool_json = load_ocrd_all_tool_json()
         self.hostname = host
         self.port = port
@@ -104,7 +102,7 @@ class ProcessingServer(FastAPI):
         self.mets_server_proxy = MetsServerProxy()
         self.use_tcp_mets = self.deployer.use_tcp_mets
         # If set, all Mets Server UDS requests are multiplexed over TCP
-        # Used by processing workers
+        # Used by processing workers to report back the results
         if self.deployer.internal_callback_url:
             host = self.deployer.internal_callback_url
             self.internal_job_callback_url = f"{host.rstrip('/')}/result_callback"
@@ -153,16 +151,10 @@ class ProcessingServer(FastAPI):
             # The RMQPublisher is initialized and a connection to the RabbitMQ is performed
             self.rmq_publisher = connect_rabbitmq_publisher(self.log, self.rmq_data, enable_acks=True)

-
-                worker_only=True, str_names_only=True, unique_only=True
-            )
-            self.log.info(f"Creating message queues on RabbitMQ instance url: {self.rabbitmq_url}")
-            create_message_queues(logger=self.log, rmq_publisher=self.rmq_publisher, queue_names=queue_names)
-
-            self.deployer.deploy_network_agents(mongodb_url=self.mongodb_url, rabbitmq_url=self.rabbitmq_url)
+            self.deployer.deploy_workers(mongodb_url=self.mongodb_url, rabbitmq_url=self.rabbitmq_url)
         except Exception as error:
             self.log.exception(f"Failed to start the Processing Server, error: {error}")
-            self.log.warning("Trying to stop previously deployed services and
+            self.log.warning("Trying to stop previously deployed services and workers.")
             self.deployer.stop_all()
             raise
         uvicorn_run(self, host=self.hostname, port=int(self.port))
@@ -208,7 +200,8 @@ class ProcessingServer(FastAPI):
             methods=["DELETE"],
             tags=[ServerApiTags.WORKFLOW, ServerApiTags.PROCESSING],
             status_code=status.HTTP_200_OK,
-            summary="!! Workaround Do Not Use Unless You Have A Reason
+            summary="!! Workaround Do Not Use Unless You Have A Reason "
+                    "!! Kill all METS servers on this machine that have been created more than 60 minutes ago."
         )
         self.include_router(others_router)

@@ -224,7 +217,7 @@ class ProcessingServer(FastAPI):
         )
         processing_router.add_api_route(
             path="/processor/info/{processor_name}",
-            endpoint=self.
+            endpoint=self.get_worker_ocrd_tool,
             methods=["GET"],
             tags=[ServerApiTags.PROCESSING, ServerApiTags.DISCOVERY],
             status_code=status.HTTP_200_OK,
@@ -232,7 +225,7 @@ class ProcessingServer(FastAPI):
         )
         processing_router.add_api_route(
             path="/processor/run/{processor_name}",
-            endpoint=self.
+            endpoint=self.validate_and_forward_job_to_worker,
             methods=["POST"],
             tags=[ServerApiTags.PROCESSING],
             status_code=status.HTTP_200_OK,
@@ -266,7 +259,7 @@ class ProcessingServer(FastAPI):
             methods=["POST"],
             tags=[ServerApiTags.PROCESSING],
             status_code=status.HTTP_200_OK,
-            summary="Callback used by a worker
+            summary="Callback used by a worker for reporting result of a processing request"
         )
         self.include_router(processing_router)

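Of the routes registered above, /processor/info/{processor_name} (GET) is now handled by get_worker_ocrd_tool and /processor/run/{processor_name} (POST) by validate_and_forward_job_to_worker; both handlers are rewritten further down in this diff. A minimal client sketch for the discovery route, with the server address as an assumption:

import requests

PROCESSING_SERVER = "http://localhost:8000"  # assumed Processing Server address

resp = requests.get(f"{PROCESSING_SERVER}/processor/info/ocrd-dummy")
resp.raise_for_status()   # 404 if no Processing Worker of that name is known
print(resp.json())        # the processor's entry from the bundled ocrd-all-tool.json
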
@@ -350,68 +343,38 @@ class ProcessingServer(FastAPI):
     async def stop_deployed_agents(self) -> None:
         self.deployer.stop_all()

-    def
-        processor_server_base_url = self.deployer.resolve_processor_server_url(processor_name)
-        if processor_server_base_url == '':
-            message = f"Processor Server URL of '{processor_name}' not found"
-            raise_http_exception(self.log, status.HTTP_404_NOT_FOUND, message=message)
-        return request_processor_server_tool_json(self.log, processor_server_base_url=processor_server_base_url)
-
-    async def get_network_agent_ocrd_tool(
-        self, processor_name: str, agent_type: AgentType = AgentType.PROCESSING_WORKER
-    ) -> Dict:
+    async def get_worker_ocrd_tool(self, processor_name: str) -> Dict:
         ocrd_tool = {}
-
-        if agent_type != AgentType.PROCESSING_WORKER and agent_type != AgentType.PROCESSOR_SERVER:
-            message = f"Unknown agent type: {agent_type}, {type(agent_type)}"
-            raise_http_exception(self.log, status_code=status.HTTP_501_NOT_IMPLEMENTED, message=message)
-        if agent_type == AgentType.PROCESSING_WORKER:
-            ocrd_tool = self.ocrd_all_tool_json.get(processor_name, None)
-        if agent_type == AgentType.PROCESSOR_SERVER:
-            ocrd_tool = self.query_ocrd_tool_json_from_server(processor_name)
+        ocrd_tool = self.ocrd_all_tool_json.get(processor_name, None)
         if not ocrd_tool:
-            raise_http_exception(self.log, status.HTTP_404_NOT_FOUND,
+            raise_http_exception(self.log, status.HTTP_404_NOT_FOUND,
+                                 f"Processing Worker '{processor_name}' not found.")
         return ocrd_tool

-    def
-        processor_server_url = self.deployer.resolve_processor_server_url(processor_name)
-        return bool(processor_server_url)
-
-    def network_agent_exists_worker(self, processor_name: str) -> bool:
+    def exists_worker(self, processor_name: str) -> bool:
         # TODO: Reconsider and refactor this.
         # Added ocrd-dummy by default if not available for the integration tests.
-        # A proper Processing Worker
-        # is needed on the Processing Server side
+        # A proper Processing Worker registration endpoint is needed on the Processing Server side
         if processor_name == 'ocrd-dummy':
             return True
         return bool(check_if_queue_exists(self.log, self.rmq_data, processor_name=processor_name))

-    def
-
-        if
-
-        elif agent_type == AgentType.PROCESSING_WORKER:
-            agent_exists = self.network_agent_exists_worker(processor_name=processor_name)
-        else:
-            message = f"Unknown agent type: {agent_type}, {type(agent_type)}"
-            raise_http_exception(self.log, status_code=status.HTTP_501_NOT_IMPLEMENTED, message=message)
-        if not agent_exists:
-            message = f"Network agent of type '{agent_type}' for processor '{processor_name}' not found."
+    def validate_worker_existence(self, processor_name: str) -> None:
+        worker_exists = self.exists_worker(processor_name=processor_name)
+        if not worker_exists:
+            message = f"Processing Worker '{processor_name}' not found."
             raise_http_exception(self.log, status.HTTP_422_UNPROCESSABLE_ENTITY, message)

-    async def
+    async def validate_and_forward_job_to_worker(self, processor_name: str, data: PYJobInput) -> PYJobOutput:
         # Append the processor name to the request itself
         data.processor_name = processor_name
-        self.
+        self.validate_worker_existence(processor_name=data.processor_name)
         if data.job_id:
             message = f"Processing request job id field is set but must not be: {data.job_id}"
             raise_http_exception(self.log, status.HTTP_422_UNPROCESSABLE_ENTITY, message)
         # Generate processing job id
         data.job_id = generate_id()
-        ocrd_tool = await self.
-            processor_name=data.processor_name,
-            agent_type=data.agent_type
-        )
+        ocrd_tool = await self.get_worker_ocrd_tool(processor_name=data.processor_name)
         validate_job_input(self.log, data.processor_name, ocrd_tool, data)

         if data.workspace_id:
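The rewritten validate_and_forward_job_to_worker flow is: check that a worker queue exists, reject requests that already carry a job_id, generate one, look up the ocrd-tool description, validate the job input, then queue the job. A hedged sketch of submitting a job through the /processor/run/{processor_name} route registered above; field names beyond those visible in this diff are assumptions about the PYJobInput model:

import requests

PROCESSING_SERVER = "http://localhost:8000"  # assumed address

job_input = {
    "path_to_mets": "/data/workspace/mets.xml",
    "input_file_grps": ["OCR-D-IMG"],        # assumed field spelling
    "output_file_grps": ["OCR-D-DUMMY"],     # assumed field spelling
    "page_id": "PHYS_0001",
    "parameters": {},
    # "job_id" must not be set by the client; the server generates it
}
resp = requests.post(f"{PROCESSING_SERVER}/processor/run/ocrd-dummy", json=job_input)
resp.raise_for_status()
print(resp.json()["job_id"])
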
@@ -491,19 +454,13 @@ class ProcessingServer(FastAPI):
         )
         await db_queued_job.insert()
         self.cache_processing_requests.update_request_counter(workspace_key=workspace_key, by_value=1)
-        job_output = await self.
+        job_output = await self.push_job_to_worker(data=data, db_job=db_queued_job)
         return job_output

-    async def
-        if data.agent_type != AgentType.PROCESSING_WORKER and data.agent_type != AgentType.PROCESSOR_SERVER:
-            message = f"Unknown agent type: {data.agent_type}, {type(data.agent_type)}"
-            raise_http_exception(self.log, status_code=status.HTTP_501_NOT_IMPLEMENTED, message=message)
+    async def push_job_to_worker(self, data: PYJobInput, db_job: DBProcessorJob) -> PYJobOutput:
         job_output = None
-        self.log.debug(f"Pushing to
-
-            job_output = await self.push_job_to_processing_queue(db_job=db_job)
-        if data.agent_type == AgentType.PROCESSOR_SERVER:
-            job_output = await self.push_job_to_processor_server(job_input=data)
+        self.log.debug(f"Pushing to Processing Worker: {data.processor_name}, {data.page_id}, {data.job_id}")
+        job_output = await self.push_job_to_processing_queue(db_job=db_job)
         if not job_output:
             message = f"Failed to create job output for job input: {data}"
             raise_http_exception(self.log, status.HTTP_500_INTERNAL_SERVER_ERROR, message)
@@ -525,12 +482,6 @@ class ProcessingServer(FastAPI):
             raise_http_exception(self.log, status.HTTP_500_INTERNAL_SERVER_ERROR, message, error)
         return db_job.to_job_output()

-    async def push_job_to_processor_server(self, job_input: PYJobInput) -> PYJobOutput:
-        processor_server_base_url = self.deployer.resolve_processor_server_url(job_input.processor_name)
-        return await forward_job_to_processor_server(
-            self.log, job_input=job_input, processor_server_base_url=processor_server_base_url
-        )
-
     async def get_processor_job(self, job_id: str) -> PYJobOutput:
         return await _get_processor_job(self.log, job_id)

@@ -556,7 +507,7 @@
             page_ids=page_ids
         )

-    async def
+    async def push_cached_jobs_to_workers(self, processing_jobs: List[PYJobInput]) -> None:
         if not len(processing_jobs):
             self.log.debug("No processing jobs were consumed from the requests cache")
             return
@@ -573,7 +524,7 @@
            )

            self.cache_processing_requests.update_request_counter(workspace_key=workspace_key, by_value=1)
-           job_output = await self.
+           job_output = await self.push_job_to_worker(data=data, db_job=db_consumed_job)
            if not job_output:
                self.log.exception(f"Failed to create job output for job input data: {data}")

@@ -653,22 +604,16 @@
         consumed_cached_jobs = await self._consume_cached_jobs_of_workspace(
             workspace_key=workspace_key, mets_server_url=mets_server_url, path_to_mets=path_to_mets
         )
-        await self.
+        await self.push_cached_jobs_to_workers(processing_jobs=consumed_cached_jobs)

     async def list_processors(self) -> List[str]:
-
-        processor_names_list = self.deployer.find_matching_network_agents(
-            docker_only=False, native_only=False, worker_only=False, server_only=False,
-            str_names_only=True, unique_only=True, sort=True
-        )
-        return processor_names_list
+        return get_message_queues(self.log, self.rmq_data)

     async def task_sequence_to_processing_jobs(
         self,
         tasks: List[ProcessorTask],
         mets_path: str,
         page_id: str,
-        agent_type: AgentType = AgentType.PROCESSING_WORKER
     ) -> List[PYJobOutput]:
         temp_file_group_cache = {}
         responses = []
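list_processors no longer derives its answer from the deployer configuration; it now returns the queue names that actually exist on the RabbitMQ broker, i.e. the processors for which a worker queue has been created. A short sketch of the new behaviour, with the server instance and the result values as assumptions:

# processors discovered from live RabbitMQ queues rather than static deployment data
available = await processing_server.list_processors()
print(available)  # e.g. ["ocrd-dummy", "ocrd-cis-ocropy-binarize"]
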
@@ -687,10 +632,9 @@
                 output_file_grps=task.output_file_grps,
                 page_id=page_id,
                 parameters=task.parameters,
-                agent_type=agent_type,
                 depends_on=dependent_jobs,
             )
-            response = await self.
+            response = await self.validate_and_forward_job_to_worker(
                 processor_name=job_input_data.processor_name,
                 data=job_input_data
             )
@@ -699,27 +643,26 @@
             responses.append(response)
         return responses

-    def
-
+    def validate_tasks_worker_existence(self, tasks: List[ProcessorTask]) -> None:
+        missing_workers = []
         for task in tasks:
             try:
-                self.
+                self.validate_worker_existence(processor_name=task.executable)
             except HTTPException:
                 # catching the error is not relevant here
-
-        if
+                missing_workers.append({task.executable})
+        if missing_workers:
             message = (
-                "Workflow validation has failed. The desired
-                f"Missing
+                "Workflow validation has failed. The desired Processing Worker was not found. "
+                f"Missing Processing Workers: {missing_workers}"
             )
             raise_http_exception(self.log, status.HTTP_406_NOT_ACCEPTABLE, message)

     async def run_workflow(
         self,
         mets_path: str,
-        workflow: Union[UploadFile, None] = File(None),
+        workflow: Union[UploadFile, str, None] = File(None),
         workflow_id: str = None,
-        agent_type: AgentType = AgentType.PROCESSING_WORKER,
         page_id: str = None,
         page_wise: bool = False,
         workflow_callback_url: str = None
@@ -731,28 +674,27 @@
         # Validate the input file groups of the first task in the workflow
         validate_first_task_input_file_groups_existence(self.log, mets_path, processing_tasks[0].input_file_grps)

-        # Validate existence of
+        # Validate existence of Processing Workers
         # for the ocr-d processors referenced inside tasks
-        self.
+        self.validate_tasks_worker_existence(processing_tasks)

+        # for page_wise mode, we need to expand the list of pages
+        # for the database, it's better to keep a short string
+        page_id = page_id or ''
         page_ids = get_page_ids_list(self.log, mets_path, page_id)

-        # TODO: Reconsider this, the compact page range may not always work if the page_ids are hashes!
-        compact_page_range = f"{page_ids[0]}..{page_ids[-1]}"
-
         if not page_wise:
             responses = await self.task_sequence_to_processing_jobs(
                 tasks=processing_tasks,
                 mets_path=mets_path,
-                page_id=
-                agent_type=agent_type
+                page_id=page_id,
             )
             processing_job_ids = [response.job_id for response in responses]
             db_workflow_job = DBWorkflowJob(
                 job_id=generate_id(),
-                page_id=
+                page_id=page_id,
                 page_wise=page_wise,
-                processing_job_ids={
+                processing_job_ids={page_id: processing_job_ids},
                 path_to_mets=mets_path,
                 workflow_callback_url=workflow_callback_url
             )
@@ -765,13 +707,12 @@
                     tasks=processing_tasks,
                     mets_path=mets_path,
                     page_id=current_page,
-                    agent_type=agent_type
                 )
                 processing_job_ids = [response.job_id for response in responses]
                 all_pages_job_ids[current_page] = processing_job_ids
             db_workflow_job = DBWorkflowJob(
                 job_id=generate_id(),
-                page_id=
+                page_id=page_id,
                 page_wise=page_wise,
                 processing_job_ids=all_pages_job_ids,
                 path_to_mets=mets_path,
@@ -825,7 +766,7 @@
         response = self._produce_workflow_status_response(processing_jobs=jobs)
         return response

-    async def kill_mets_server_zombies(self, minutes_ago
+    async def kill_mets_server_zombies(self, minutes_ago: Optional[int] = None, dry_run: Optional[bool] = None) -> List[int]:
         pids_killed = kill_mets_server_zombies(minutes_ago=minutes_ago, dry_run=dry_run)
         return pids_killed

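kill_mets_server_zombies keeps delegating to the server_utils helper of the same name, but both arguments are now explicit Optional keyword arguments, matching the DELETE route registered earlier whose summary marks it as a workaround. A hedged sketch of calling the handler directly:

# Sketch only (assumed usage); with dry_run=True the helper is expected to report which
# METS server PIDs would be killed without actually terminating them.
pids = await processing_server.kill_mets_server_zombies(minutes_ago=60, dry_run=True)
print(pids)  # e.g. [12345, 12367]
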
@@ -843,10 +784,14 @@
         workflow_job_state = self._produce_workflow_status_simple_response(processing_jobs=jobs)
         return {"state": workflow_job_state}

-    async def upload_workflow(self, workflow: UploadFile) -> Dict[str, str]:
+    async def upload_workflow(self, workflow: Union[UploadFile, str]) -> Dict[str, str]:
         """ Store a script for a workflow in the database
         """
-
+        if isinstance(workflow, str):
+            with open(workflow) as wf_file:
+                workflow_content = wf_file.read()
+        else:
+            workflow_content = await generate_workflow_content(workflow)
         validate_workflow(self.log, workflow_content)
         content_hash = generate_workflow_content_hash(workflow_content)
         try:
@@ -865,12 +810,16 @@
         await db_workflow_script.insert()
         return {"workflow_id": workflow_id}

-    async def replace_workflow(self, workflow_id, workflow: UploadFile) -> Dict[str, str]:
+    async def replace_workflow(self, workflow_id, workflow: Union[UploadFile, str]) -> Dict[str, str]:
         """ Update a workflow script file in the database
         """
         try:
             db_workflow_script = await db_get_workflow_script(workflow_id)
-
+            if isinstance(workflow, str):
+                with open(workflow) as wf_file:
+                    workflow_content = wf_file.read()
+            else:
+                workflow_content = await generate_workflow_content(workflow)
             validate_workflow(self.log, workflow_content)
             db_workflow_script.content = workflow_content
             content_hash = generate_workflow_content_hash(workflow_content)
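Both workflow endpoints now accept either a FastAPI UploadFile (multipart upload) or a plain string, which is treated as a path to a workflow script on the server's filesystem; run_workflow's workflow: Union[UploadFile, str, None] parameter mirrors the same change. A caller-side sketch, with the server instance and the path as assumptions:

# Path-string variant: the handler opens and reads the file itself.
result = await processing_server.upload_workflow("/data/workflows/default_workflow.txt")
workflow_id = result["workflow_id"]

# UploadFile variant: unchanged behaviour, content is read via generate_workflow_content().
# result = await processing_server.upload_workflow(uploaded_file)
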
ocrd_network/processing_worker.py

@@ -9,7 +9,7 @@ is a single OCR-D Processor instance.
 """

 from datetime import datetime
-from os import getpid
+from os import getpid
 from pika import BasicProperties
 from pika.adapters.blocking_connection import BlockingChannel
 from pika.spec import Basic
@@ -36,7 +36,7 @@ from .utils import calculate_execution_time, post_to_callback_url
 class ProcessingWorker:
     def __init__(self, rabbitmq_addr, mongodb_addr, processor_name, ocrd_tool: dict, processor_class=None) -> None:
         initLogging()
-        self.log = getLogger(
+        self.log = getLogger('ocrd_network.processing_worker')
         log_file = get_processing_worker_logging_file_path(processor_name=processor_name, pid=getpid())
         configure_file_handler_with_formatter(self.log, log_file=log_file, mode="a")

@@ -120,7 +120,7 @@ class ProcessingWorker:
             channel.basic_nack(delivery_tag=delivery_tag, multiple=False, requeue=False)
             raise Exception(message)

-        self.log.info(
+        self.log.info("Successfully processed RabbitMQ message")
         self.log.debug(ack_message)
         channel.basic_ack(delivery_tag=delivery_tag, multiple=False)

@@ -134,8 +134,9 @@ class ProcessingWorker:
             self.log.info(f"Starting consuming from queue: {self.processor_name}")
             # Starting consuming is a blocking action
             self.rmq_consumer.start_consuming()
+            self.log.info(f"Consuming stopped for queue: {self.processor_name}")
         else:
-            msg =
+            msg = "The RMQConsumer is not connected/configured properly."
             self.log.exception(msg)
             raise Exception(msg)

@@ -165,7 +166,7 @@ class ProcessingWorker:
         parameters = processing_message.parameters if processing_message.parameters else {}

         if not path_to_mets and not workspace_id:
-            msg =
+            msg = "Both 'path_to_mets' and 'workspace_id' are missing in the OcrdProcessingMessage."
             self.log.exception(msg)
             raise ValueError(msg)

ocrd_network/rabbitmq_utils/__init__.py

@@ -3,6 +3,7 @@ __all__ = [
     "connect_rabbitmq_consumer",
     "connect_rabbitmq_publisher",
     "create_message_queues",
+    "get_message_queues",
     "verify_and_parse_mq_uri",
     "verify_rabbitmq_available",
     "RMQConsumer",
@@ -19,6 +20,7 @@ from .helpers import (
     connect_rabbitmq_consumer,
     connect_rabbitmq_publisher,
     create_message_queues,
+    get_message_queues,
     verify_and_parse_mq_uri,
     verify_rabbitmq_available
 )
ocrd_network/rabbitmq_utils/helpers.py

@@ -4,6 +4,9 @@ from pika.exceptions import AMQPConnectionError, ChannelClosedByBroker
 from re import match as re_match
 from time import sleep
 from typing import Dict, List, Union
+from requests import get
+from requests.auth import HTTPBasicAuth
+from requests.exceptions import RequestException, HTTPError

 from .constants import RABBITMQ_URI_PATTERN, RECONNECT_TRIES, RECONNECT_WAIT
 from .consumer import RMQConsumer
@@ -42,7 +45,7 @@ def __connect_rabbitmq_client(

 def connect_rabbitmq_consumer(logger: Logger, rmq_data: Dict) -> RMQConsumer:
     rmq_consumer = __connect_rabbitmq_client(logger=logger, client_type="consumer", rmq_data=rmq_data)
-    logger.info(
+    logger.info("Successfully connected RMQConsumer")
     return rmq_consumer


@@ -68,12 +71,6 @@ def check_if_queue_exists(logger: Logger, rmq_data: Dict, processor_name: str) -


 def create_message_queues(logger: Logger, rmq_publisher: RMQPublisher, queue_names: List[str]) -> None:
-    # TODO: Reconsider and refactor this.
-    # Added ocrd-dummy by default if not available for the integration tests.
-    # A proper Processing Worker / Processor Server registration endpoint is needed on the Processing Server side
-    if "ocrd-dummy" not in queue_names:
-        queue_names.append("ocrd-dummy")
-
     for queue_name in queue_names:
         # The existence/validity of the worker.name is not tested.
         # Even if an ocr-d processor does not exist, the queue is created
@@ -81,6 +78,26 @@ def create_message_queues(logger: Logger, rmq_publisher: RMQPublisher, queue_names: List[str]) -> None:
         rmq_publisher.create_queue(queue_name=queue_name)


+def get_message_queues(logger: Logger, rmq_data: Dict) -> List:
+    try:
+        response = get(
+            f"http://{rmq_data['host']}:{15672}/api/queues",
+            auth=HTTPBasicAuth(rmq_data["username"], rmq_data["password"])
+        )
+        response.raise_for_status()
+        queues = response.json()
+        return [queue['name'] for queue in queues]
+    except HTTPError:
+        logger.warn(
+            f"Error requesting all queue-names from rabbitmq. Status code: {response.status_code}. "
+            f"Response-Text: {response.text}"
+        )
+        return []
+    except RequestException as e:
+        logger.warn(f"Error querying RabbitMQ API: {e}")
+        return []
+
+
 def verify_and_parse_mq_uri(rabbitmq_address: str):
     """
     Check the full list of available parameters in the docs here:
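The new get_message_queues helper queries the RabbitMQ management HTTP API (default port 15672) with basic-auth credentials taken from the rmq_data dict and returns the names of all declared queues; on an HTTP or connection error it logs a warning and returns an empty list. This is what list_processors on the Processing Server now returns directly. A usage sketch with placeholder credentials:

from logging import getLogger
from ocrd_network.rabbitmq_utils import get_message_queues

rmq_data = {"host": "localhost", "username": "admin", "password": "admin"}  # placeholder values
queue_names = get_message_queues(getLogger("example"), rmq_data)
print(queue_names)  # one queue per deployed Processing Worker, e.g. ["ocrd-dummy"]
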
ocrd_network/runtime_data/__init__.py

@@ -5,10 +5,9 @@ __all__ = [
     "DataNetworkAgent",
     "DataRabbitMQ",
     "DataProcessingWorker",
-    "DataProcessorServer"
 ]

 from .deployer import Deployer
 from .hosts import DataHost
-from .network_agents import DataNetworkAgent, DataProcessingWorker
+from .network_agents import DataNetworkAgent, DataProcessingWorker
 from .network_services import DataMongoDB, DataRabbitMQ