ocrd 3.5.1__py3-none-any.whl → 3.6.0__py3-none-any.whl

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (92)
  1. ocrd/cli/__init__.py +6 -2
  2. ocrd/cli/bashlib.py +7 -2
  3. ocrd/cli/log.py +7 -2
  4. ocrd/cli/network.py +0 -2
  5. ocrd/cli/ocrd_tool.py +26 -4
  6. ocrd/cli/process.py +1 -0
  7. ocrd/cli/resmgr.py +0 -1
  8. ocrd/cli/validate.py +32 -13
  9. ocrd/cli/workspace.py +125 -52
  10. ocrd/cli/zip.py +13 -4
  11. ocrd/decorators/__init__.py +28 -52
  12. ocrd/decorators/loglevel_option.py +4 -0
  13. ocrd/decorators/mets_find_options.py +2 -1
  14. ocrd/decorators/ocrd_cli_options.py +3 -7
  15. ocrd/decorators/parameter_option.py +12 -11
  16. ocrd/lib.bash +6 -13
  17. ocrd/mets_server.py +6 -10
  18. ocrd/processor/base.py +88 -71
  19. ocrd/processor/builtin/dummy_processor.py +7 -4
  20. ocrd/processor/builtin/filter_processor.py +3 -2
  21. ocrd/processor/helpers.py +5 -6
  22. ocrd/processor/ocrd_page_result.py +7 -5
  23. ocrd/resolver.py +42 -32
  24. ocrd/task_sequence.py +11 -4
  25. ocrd/workspace.py +64 -54
  26. ocrd/workspace_backup.py +3 -0
  27. ocrd/workspace_bagger.py +15 -8
  28. {ocrd-3.5.1.dist-info → ocrd-3.6.0.dist-info}/METADATA +1 -1
  29. ocrd-3.6.0.dist-info/RECORD +125 -0
  30. ocrd_modelfactory/__init__.py +4 -2
  31. ocrd_models/constants.py +18 -1
  32. ocrd_models/ocrd_agent.py +1 -1
  33. ocrd_models/ocrd_exif.py +7 -3
  34. ocrd_models/ocrd_file.py +24 -19
  35. ocrd_models/ocrd_mets.py +90 -67
  36. ocrd_models/ocrd_page.py +17 -13
  37. ocrd_models/ocrd_xml_base.py +1 -0
  38. ocrd_models/report.py +2 -1
  39. ocrd_models/utils.py +4 -3
  40. ocrd_models/xpath_functions.py +3 -1
  41. ocrd_network/__init__.py +1 -2
  42. ocrd_network/cli/__init__.py +0 -2
  43. ocrd_network/cli/client.py +122 -50
  44. ocrd_network/cli/processing_server.py +1 -2
  45. ocrd_network/client.py +2 -2
  46. ocrd_network/client_utils.py +30 -13
  47. ocrd_network/constants.py +1 -6
  48. ocrd_network/database.py +3 -3
  49. ocrd_network/logging_utils.py +2 -7
  50. ocrd_network/models/__init__.py +0 -2
  51. ocrd_network/models/job.py +2 -5
  52. ocrd_network/models/workspace.py +1 -1
  53. ocrd_network/process_helpers.py +54 -17
  54. ocrd_network/processing_server.py +63 -114
  55. ocrd_network/processing_worker.py +6 -5
  56. ocrd_network/rabbitmq_utils/__init__.py +2 -0
  57. ocrd_network/rabbitmq_utils/helpers.py +24 -7
  58. ocrd_network/runtime_data/__init__.py +1 -2
  59. ocrd_network/runtime_data/deployer.py +12 -85
  60. ocrd_network/runtime_data/hosts.py +61 -130
  61. ocrd_network/runtime_data/network_agents.py +7 -31
  62. ocrd_network/runtime_data/network_services.py +1 -1
  63. ocrd_network/server_cache.py +1 -1
  64. ocrd_network/server_utils.py +13 -52
  65. ocrd_network/utils.py +1 -0
  66. ocrd_utils/__init__.py +4 -4
  67. ocrd_utils/config.py +86 -76
  68. ocrd_utils/deprecate.py +3 -0
  69. ocrd_utils/image.py +51 -23
  70. ocrd_utils/introspect.py +8 -3
  71. ocrd_utils/logging.py +12 -7
  72. ocrd_utils/os.py +16 -3
  73. ocrd_utils/str.py +32 -16
  74. ocrd_validators/json_validator.py +4 -1
  75. ocrd_validators/ocrd_tool_validator.py +2 -1
  76. ocrd_validators/ocrd_zip_validator.py +5 -4
  77. ocrd_validators/page_validator.py +21 -9
  78. ocrd_validators/parameter_validator.py +3 -2
  79. ocrd_validators/processing_server_config.schema.yml +1 -33
  80. ocrd_validators/resource_list_validator.py +3 -1
  81. ocrd_validators/workspace_validator.py +30 -20
  82. ocrd_validators/xsd_mets_validator.py +2 -1
  83. ocrd_validators/xsd_page_validator.py +2 -1
  84. ocrd_validators/xsd_validator.py +4 -2
  85. ocrd-3.5.1.dist-info/RECORD +0 -128
  86. ocrd_network/cli/processor_server.py +0 -31
  87. ocrd_network/models/ocrd_tool.py +0 -12
  88. ocrd_network/processor_server.py +0 -255
  89. {ocrd-3.5.1.dist-info → ocrd-3.6.0.dist-info}/LICENSE +0 -0
  90. {ocrd-3.5.1.dist-info → ocrd-3.6.0.dist-info}/WHEEL +0 -0
  91. {ocrd-3.5.1.dist-info → ocrd-3.6.0.dist-info}/entry_points.txt +0 -0
  92. {ocrd-3.5.1.dist-info → ocrd-3.6.0.dist-info}/top_level.txt +0 -0
ocrd_network/processing_server.py +63 -114

@@ -10,7 +10,7 @@ from fastapi.responses import FileResponse, JSONResponse, PlainTextResponse
 
 from ocrd.task_sequence import ProcessorTask
 from ocrd_utils import initLogging, getLogger
-from .constants import AgentType, JobState, ServerApiTags
+from .constants import JobState, ServerApiTags
 from .database import (
     initiate_database,
     db_get_processing_job,
@@ -34,14 +34,13 @@ from .models import (
 from .rabbitmq_utils import (
     check_if_queue_exists,
     connect_rabbitmq_publisher,
-    create_message_queues,
+    get_message_queues,
     OcrdProcessingMessage
 )
 from .server_cache import CacheLockedPages, CacheProcessingRequests
 from .server_utils import (
     create_processing_message,
     create_workspace_if_not_exists,
-    forward_job_to_processor_server,
     _get_processor_job,
     _get_processor_job_log,
     get_page_ids_list,
@@ -51,7 +50,6 @@ from .server_utils import (
     kill_mets_server_zombies,
     parse_workflow_tasks,
     raise_http_exception,
-    request_processor_server_tool_json,
     validate_and_return_mets_path,
     validate_first_task_input_file_groups_existence,
     validate_job_input,
@@ -91,7 +89,7 @@ class ProcessingServer(FastAPI):
         log_file = get_processing_server_logging_file_path(pid=getpid())
         configure_file_handler_with_formatter(self.log, log_file=log_file, mode="a")
 
-        self.log.info(f"Loading ocrd all tool json")
+        self.log.info("Loading ocrd-all-tool.json")
         self.ocrd_all_tool_json = load_ocrd_all_tool_json()
         self.hostname = host
         self.port = port
@@ -104,7 +102,7 @@ class ProcessingServer(FastAPI):
         self.mets_server_proxy = MetsServerProxy()
         self.use_tcp_mets = self.deployer.use_tcp_mets
         # If set, all Mets Server UDS requests are multiplexed over TCP
-        # Used by processing workers and/or processor servers to report back the results
+        # Used by processing workers to report back the results
         if self.deployer.internal_callback_url:
             host = self.deployer.internal_callback_url
             self.internal_job_callback_url = f"{host.rstrip('/')}/result_callback"
@@ -153,16 +151,10 @@ class ProcessingServer(FastAPI):
             # The RMQPublisher is initialized and a connection to the RabbitMQ is performed
             self.rmq_publisher = connect_rabbitmq_publisher(self.log, self.rmq_data, enable_acks=True)
 
-            queue_names = self.deployer.find_matching_network_agents(
-                worker_only=True, str_names_only=True, unique_only=True
-            )
-            self.log.info(f"Creating message queues on RabbitMQ instance url: {self.rabbitmq_url}")
-            create_message_queues(logger=self.log, rmq_publisher=self.rmq_publisher, queue_names=queue_names)
-
-            self.deployer.deploy_network_agents(mongodb_url=self.mongodb_url, rabbitmq_url=self.rabbitmq_url)
+            self.deployer.deploy_workers(mongodb_url=self.mongodb_url, rabbitmq_url=self.rabbitmq_url)
         except Exception as error:
             self.log.exception(f"Failed to start the Processing Server, error: {error}")
-            self.log.warning("Trying to stop previously deployed services and network agents.")
+            self.log.warning("Trying to stop previously deployed services and workers.")
             self.deployer.stop_all()
             raise
         uvicorn_run(self, host=self.hostname, port=int(self.port))
@@ -208,7 +200,8 @@ class ProcessingServer(FastAPI):
             methods=["DELETE"],
             tags=[ServerApiTags.WORKFLOW, ServerApiTags.PROCESSING],
             status_code=status.HTTP_200_OK,
-            summary="!! Workaround Do Not Use Unless You Have A Reason !! Kill all METS servers on this machine that have been created more than 60 minutes ago."
+            summary="!! Workaround Do Not Use Unless You Have A Reason "
+                    "!! Kill all METS servers on this machine that have been created more than 60 minutes ago."
         )
         self.include_router(others_router)
 
@@ -224,7 +217,7 @@ class ProcessingServer(FastAPI):
         )
         processing_router.add_api_route(
             path="/processor/info/{processor_name}",
-            endpoint=self.get_network_agent_ocrd_tool,
+            endpoint=self.get_worker_ocrd_tool,
             methods=["GET"],
             tags=[ServerApiTags.PROCESSING, ServerApiTags.DISCOVERY],
             status_code=status.HTTP_200_OK,
@@ -232,7 +225,7 @@ class ProcessingServer(FastAPI):
         )
         processing_router.add_api_route(
             path="/processor/run/{processor_name}",
-            endpoint=self.validate_and_forward_job_to_network_agent,
+            endpoint=self.validate_and_forward_job_to_worker,
             methods=["POST"],
             tags=[ServerApiTags.PROCESSING],
             status_code=status.HTTP_200_OK,
@@ -266,7 +259,7 @@ class ProcessingServer(FastAPI):
             methods=["POST"],
             tags=[ServerApiTags.PROCESSING],
             status_code=status.HTTP_200_OK,
-            summary="Callback used by a worker or processor server for reporting result of a processing request"
+            summary="Callback used by a worker for reporting result of a processing request"
         )
         self.include_router(processing_router)
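With the Processor Server code path removed, the route registered above, POST /processor/run/{processor_name}, is the single entry point for processing requests, and every accepted request is queued for a Processing Worker (the former agent_type field is no longer needed). A minimal client-side sketch follows; the server address and the exact request field names are assumptions here, since the PYJobInput model lives in ocrd_network.models and is not part of this diff:

# Hypothetical illustration: submit a processing job to the worker-only endpoint.
# Assumes a Processing Server on localhost:8000 and a deployed ocrd-dummy worker;
# the payload keys mirror PYJobInput as used elsewhere in ocrd_network.
import requests

job_request = {
    "path_to_mets": "/data/workspace/mets.xml",
    "input_file_grps": ["OCR-D-IMG"],
    "output_file_grps": ["OCR-D-DUMMY"],
    "page_id": "PHYS_0001",
    "parameters": {},
}
response = requests.post("http://localhost:8000/processor/run/ocrd-dummy", json=job_request)
response.raise_for_status()
print(response.json()["job_id"])  # the returned job_id can be used to poll the job status endpoint

The remaining hunks of ocrd_network/processing_server.py continue below.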
@@ -350,68 +343,38 @@ class ProcessingServer(FastAPI):
     async def stop_deployed_agents(self) -> None:
         self.deployer.stop_all()
 
-    def query_ocrd_tool_json_from_server(self, processor_name: str) -> Dict:
-        processor_server_base_url = self.deployer.resolve_processor_server_url(processor_name)
-        if processor_server_base_url == '':
-            message = f"Processor Server URL of '{processor_name}' not found"
-            raise_http_exception(self.log, status.HTTP_404_NOT_FOUND, message=message)
-        return request_processor_server_tool_json(self.log, processor_server_base_url=processor_server_base_url)
-
-    async def get_network_agent_ocrd_tool(
-        self, processor_name: str, agent_type: AgentType = AgentType.PROCESSING_WORKER
-    ) -> Dict:
+    async def get_worker_ocrd_tool(self, processor_name: str) -> Dict:
         ocrd_tool = {}
-        error_message = f"Network agent of type '{agent_type}' for processor '{processor_name}' not found."
-        if agent_type != AgentType.PROCESSING_WORKER and agent_type != AgentType.PROCESSOR_SERVER:
-            message = f"Unknown agent type: {agent_type}, {type(agent_type)}"
-            raise_http_exception(self.log, status_code=status.HTTP_501_NOT_IMPLEMENTED, message=message)
-        if agent_type == AgentType.PROCESSING_WORKER:
-            ocrd_tool = self.ocrd_all_tool_json.get(processor_name, None)
-        if agent_type == AgentType.PROCESSOR_SERVER:
-            ocrd_tool = self.query_ocrd_tool_json_from_server(processor_name)
+        ocrd_tool = self.ocrd_all_tool_json.get(processor_name, None)
         if not ocrd_tool:
-            raise_http_exception(self.log, status.HTTP_404_NOT_FOUND, error_message)
+            raise_http_exception(self.log, status.HTTP_404_NOT_FOUND,
+                                 f"Processing Worker '{processor_name}' not found.")
         return ocrd_tool
 
-    def network_agent_exists_server(self, processor_name: str) -> bool:
-        processor_server_url = self.deployer.resolve_processor_server_url(processor_name)
-        return bool(processor_server_url)
-
-    def network_agent_exists_worker(self, processor_name: str) -> bool:
+    def exists_worker(self, processor_name: str) -> bool:
         # TODO: Reconsider and refactor this.
         # Added ocrd-dummy by default if not available for the integration tests.
-        # A proper Processing Worker / Processor Server registration endpoint
-        # is needed on the Processing Server side
+        # A proper Processing Worker registration endpoint is needed on the Processing Server side
         if processor_name == 'ocrd-dummy':
             return True
         return bool(check_if_queue_exists(self.log, self.rmq_data, processor_name=processor_name))
 
-    def validate_agent_type_and_existence(self, processor_name: str, agent_type: AgentType) -> None:
-        agent_exists = False
-        if agent_type == AgentType.PROCESSOR_SERVER:
-            agent_exists = self.network_agent_exists_server(processor_name=processor_name)
-        elif agent_type == AgentType.PROCESSING_WORKER:
-            agent_exists = self.network_agent_exists_worker(processor_name=processor_name)
-        else:
-            message = f"Unknown agent type: {agent_type}, {type(agent_type)}"
-            raise_http_exception(self.log, status_code=status.HTTP_501_NOT_IMPLEMENTED, message=message)
-        if not agent_exists:
-            message = f"Network agent of type '{agent_type}' for processor '{processor_name}' not found."
+    def validate_worker_existence(self, processor_name: str) -> None:
+        worker_exists = self.exists_worker(processor_name=processor_name)
+        if not worker_exists:
+            message = f"Processing Worker '{processor_name}' not found."
             raise_http_exception(self.log, status.HTTP_422_UNPROCESSABLE_ENTITY, message)
 
-    async def validate_and_forward_job_to_network_agent(self, processor_name: str, data: PYJobInput) -> PYJobOutput:
+    async def validate_and_forward_job_to_worker(self, processor_name: str, data: PYJobInput) -> PYJobOutput:
         # Append the processor name to the request itself
         data.processor_name = processor_name
-        self.validate_agent_type_and_existence(processor_name=data.processor_name, agent_type=data.agent_type)
+        self.validate_worker_existence(processor_name=data.processor_name)
         if data.job_id:
             message = f"Processing request job id field is set but must not be: {data.job_id}"
             raise_http_exception(self.log, status.HTTP_422_UNPROCESSABLE_ENTITY, message)
         # Generate processing job id
         data.job_id = generate_id()
-        ocrd_tool = await self.get_network_agent_ocrd_tool(
-            processor_name=data.processor_name,
-            agent_type=data.agent_type
-        )
+        ocrd_tool = await self.get_worker_ocrd_tool(processor_name=data.processor_name)
         validate_job_input(self.log, data.processor_name, ocrd_tool, data)
 
         if data.workspace_id:
@@ -491,19 +454,13 @@ class ProcessingServer(FastAPI):
         )
         await db_queued_job.insert()
         self.cache_processing_requests.update_request_counter(workspace_key=workspace_key, by_value=1)
-        job_output = await self.push_job_to_network_agent(data=data, db_job=db_queued_job)
+        job_output = await self.push_job_to_worker(data=data, db_job=db_queued_job)
         return job_output
 
-    async def push_job_to_network_agent(self, data: PYJobInput, db_job: DBProcessorJob) -> PYJobOutput:
-        if data.agent_type != AgentType.PROCESSING_WORKER and data.agent_type != AgentType.PROCESSOR_SERVER:
-            message = f"Unknown agent type: {data.agent_type}, {type(data.agent_type)}"
-            raise_http_exception(self.log, status_code=status.HTTP_501_NOT_IMPLEMENTED, message=message)
+    async def push_job_to_worker(self, data: PYJobInput, db_job: DBProcessorJob) -> PYJobOutput:
         job_output = None
-        self.log.debug(f"Pushing to {data.agent_type}: {data.processor_name}, {data.page_id}, {data.job_id}")
-        if data.agent_type == AgentType.PROCESSING_WORKER:
-            job_output = await self.push_job_to_processing_queue(db_job=db_job)
-        if data.agent_type == AgentType.PROCESSOR_SERVER:
-            job_output = await self.push_job_to_processor_server(job_input=data)
+        self.log.debug(f"Pushing to Processing Worker: {data.processor_name}, {data.page_id}, {data.job_id}")
+        job_output = await self.push_job_to_processing_queue(db_job=db_job)
         if not job_output:
             message = f"Failed to create job output for job input: {data}"
             raise_http_exception(self.log, status.HTTP_500_INTERNAL_SERVER_ERROR, message)
@@ -525,12 +482,6 @@ class ProcessingServer(FastAPI):
             raise_http_exception(self.log, status.HTTP_500_INTERNAL_SERVER_ERROR, message, error)
         return db_job.to_job_output()
 
-    async def push_job_to_processor_server(self, job_input: PYJobInput) -> PYJobOutput:
-        processor_server_base_url = self.deployer.resolve_processor_server_url(job_input.processor_name)
-        return await forward_job_to_processor_server(
-            self.log, job_input=job_input, processor_server_base_url=processor_server_base_url
-        )
-
     async def get_processor_job(self, job_id: str) -> PYJobOutput:
         return await _get_processor_job(self.log, job_id)
 
@@ -556,7 +507,7 @@ class ProcessingServer(FastAPI):
             page_ids=page_ids
         )
 
-    async def push_cached_jobs_to_agents(self, processing_jobs: List[PYJobInput]) -> None:
+    async def push_cached_jobs_to_workers(self, processing_jobs: List[PYJobInput]) -> None:
         if not len(processing_jobs):
             self.log.debug("No processing jobs were consumed from the requests cache")
             return
@@ -573,7 +524,7 @@ class ProcessingServer(FastAPI):
             )
 
             self.cache_processing_requests.update_request_counter(workspace_key=workspace_key, by_value=1)
-            job_output = await self.push_job_to_network_agent(data=data, db_job=db_consumed_job)
+            job_output = await self.push_job_to_worker(data=data, db_job=db_consumed_job)
             if not job_output:
                 self.log.exception(f"Failed to create job output for job input data: {data}")
 
@@ -653,22 +604,16 @@ class ProcessingServer(FastAPI):
         consumed_cached_jobs = await self._consume_cached_jobs_of_workspace(
             workspace_key=workspace_key, mets_server_url=mets_server_url, path_to_mets=path_to_mets
         )
-        await self.push_cached_jobs_to_agents(processing_jobs=consumed_cached_jobs)
+        await self.push_cached_jobs_to_workers(processing_jobs=consumed_cached_jobs)
 
     async def list_processors(self) -> List[str]:
-        # There is no caching on the Processing Server side
-        processor_names_list = self.deployer.find_matching_network_agents(
-            docker_only=False, native_only=False, worker_only=False, server_only=False,
-            str_names_only=True, unique_only=True, sort=True
-        )
-        return processor_names_list
+        return get_message_queues(self.log, self.rmq_data)
 
     async def task_sequence_to_processing_jobs(
         self,
         tasks: List[ProcessorTask],
         mets_path: str,
         page_id: str,
-        agent_type: AgentType = AgentType.PROCESSING_WORKER
     ) -> List[PYJobOutput]:
         temp_file_group_cache = {}
         responses = []
@@ -687,10 +632,9 @@ class ProcessingServer(FastAPI):
                 output_file_grps=task.output_file_grps,
                 page_id=page_id,
                 parameters=task.parameters,
-                agent_type=agent_type,
                 depends_on=dependent_jobs,
             )
-            response = await self.validate_and_forward_job_to_network_agent(
+            response = await self.validate_and_forward_job_to_worker(
                 processor_name=job_input_data.processor_name,
                 data=job_input_data
             )
@@ -699,27 +643,26 @@ class ProcessingServer(FastAPI):
             responses.append(response)
         return responses
 
-    def validate_tasks_agents_existence(self, tasks: List[ProcessorTask], agent_type: AgentType) -> None:
-        missing_agents = []
+    def validate_tasks_worker_existence(self, tasks: List[ProcessorTask]) -> None:
+        missing_workers = []
         for task in tasks:
             try:
-                self.validate_agent_type_and_existence(processor_name=task.executable, agent_type=agent_type)
+                self.validate_worker_existence(processor_name=task.executable)
             except HTTPException:
                 # catching the error is not relevant here
-                missing_agents.append({task.executable, agent_type})
-        if missing_agents:
+                missing_workers.append({task.executable})
+        if missing_workers:
             message = (
-                "Workflow validation has failed. The desired network agents not found. "
-                f"Missing processing agents: {missing_agents}"
+                "Workflow validation has failed. The desired Processing Worker was not found. "
+                f"Missing Processing Workers: {missing_workers}"
            )
            raise_http_exception(self.log, status.HTTP_406_NOT_ACCEPTABLE, message)
 
     async def run_workflow(
         self,
         mets_path: str,
-        workflow: Union[UploadFile, None] = File(None),
+        workflow: Union[UploadFile, str, None] = File(None),
         workflow_id: str = None,
-        agent_type: AgentType = AgentType.PROCESSING_WORKER,
         page_id: str = None,
         page_wise: bool = False,
         workflow_callback_url: str = None
@@ -731,28 +674,27 @@ class ProcessingServer(FastAPI):
         # Validate the input file groups of the first task in the workflow
         validate_first_task_input_file_groups_existence(self.log, mets_path, processing_tasks[0].input_file_grps)
 
-        # Validate existence of agents (processing workers/processor servers)
+        # Validate existence of Processing Workers
         # for the ocr-d processors referenced inside tasks
-        self.validate_tasks_agents_existence(processing_tasks, agent_type)
+        self.validate_tasks_worker_existence(processing_tasks)
 
+        # for page_wise mode, we need to expand the list of pages
+        # for the database, it's better to keep a short string
+        page_id = page_id or ''
         page_ids = get_page_ids_list(self.log, mets_path, page_id)
 
-        # TODO: Reconsider this, the compact page range may not always work if the page_ids are hashes!
-        compact_page_range = f"{page_ids[0]}..{page_ids[-1]}"
-
         if not page_wise:
             responses = await self.task_sequence_to_processing_jobs(
                 tasks=processing_tasks,
                 mets_path=mets_path,
-                page_id=compact_page_range,
-                agent_type=agent_type
+                page_id=page_id,
             )
             processing_job_ids = [response.job_id for response in responses]
             db_workflow_job = DBWorkflowJob(
                 job_id=generate_id(),
-                page_id=compact_page_range,
+                page_id=page_id,
                 page_wise=page_wise,
-                processing_job_ids={compact_page_range: processing_job_ids},
+                processing_job_ids={page_id: processing_job_ids},
                 path_to_mets=mets_path,
                 workflow_callback_url=workflow_callback_url
             )
@@ -765,13 +707,12 @@ class ProcessingServer(FastAPI):
                     tasks=processing_tasks,
                     mets_path=mets_path,
                     page_id=current_page,
-                    agent_type=agent_type
                 )
                 processing_job_ids = [response.job_id for response in responses]
                 all_pages_job_ids[current_page] = processing_job_ids
             db_workflow_job = DBWorkflowJob(
                 job_id=generate_id(),
-                page_id=compact_page_range,
+                page_id=page_id,
                 page_wise=page_wise,
                 processing_job_ids=all_pages_job_ids,
                 path_to_mets=mets_path,
@@ -825,7 +766,7 @@ class ProcessingServer(FastAPI):
         response = self._produce_workflow_status_response(processing_jobs=jobs)
         return response
 
-    async def kill_mets_server_zombies(self, minutes_ago : Optional[int] = None, dry_run : Optional[bool] = None) -> List[int]:
+    async def kill_mets_server_zombies(self, minutes_ago: Optional[int] = None, dry_run: Optional[bool] = None) -> List[int]:
         pids_killed = kill_mets_server_zombies(minutes_ago=minutes_ago, dry_run=dry_run)
         return pids_killed
 
@@ -843,10 +784,14 @@ class ProcessingServer(FastAPI):
         workflow_job_state = self._produce_workflow_status_simple_response(processing_jobs=jobs)
         return {"state": workflow_job_state}
 
-    async def upload_workflow(self, workflow: UploadFile) -> Dict[str, str]:
+    async def upload_workflow(self, workflow: Union[UploadFile, str]) -> Dict[str, str]:
         """ Store a script for a workflow in the database
         """
-        workflow_content = await generate_workflow_content(workflow)
+        if isinstance(workflow, str):
+            with open(workflow) as wf_file:
+                workflow_content = wf_file.read()
+        else:
+            workflow_content = await generate_workflow_content(workflow)
         validate_workflow(self.log, workflow_content)
         content_hash = generate_workflow_content_hash(workflow_content)
         try:
@@ -865,12 +810,16 @@ class ProcessingServer(FastAPI):
         await db_workflow_script.insert()
         return {"workflow_id": workflow_id}
 
-    async def replace_workflow(self, workflow_id, workflow: UploadFile) -> Dict[str, str]:
+    async def replace_workflow(self, workflow_id, workflow: Union[UploadFile, str]) -> Dict[str, str]:
         """ Update a workflow script file in the database
         """
         try:
             db_workflow_script = await db_get_workflow_script(workflow_id)
-            workflow_content = await generate_workflow_content(workflow)
+            if isinstance(workflow, str):
+                with open(workflow) as wf_file:
+                    workflow_content = wf_file.read()
+            else:
+                workflow_content = await generate_workflow_content(workflow)
             validate_workflow(self.log, workflow_content)
             db_workflow_script.content = workflow_content
             content_hash = generate_workflow_content_hash(workflow_content)
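upload_workflow and replace_workflow now accept either a multipart UploadFile or a plain string; in the string case the Processing Server opens the given path itself, so the path must be readable on the server host. A small illustrative sketch of the string call path follows (the server instance and the path are placeholders, not part of this diff):

# Illustrative sketch only: the str branch reads the workflow script from the
# server-local filesystem, the UploadFile branch reads it from the request body.
import asyncio

async def store_workflow_from_path(server, path: str = "/data/workflows/default.txt"):
    result = await server.upload_workflow(workflow=path)  # str -> open(path).read()
    return result["workflow_id"]

# workflow_id = asyncio.run(store_workflow_from_path(server))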
ocrd_network/processing_worker.py +6 -5

@@ -9,7 +9,7 @@ is a single OCR-D Processor instance.
 """
 
 from datetime import datetime
-from os import getpid, getppid
+from os import getpid
 from pika import BasicProperties
 from pika.adapters.blocking_connection import BlockingChannel
 from pika.spec import Basic
@@ -36,7 +36,7 @@ from .utils import calculate_execution_time, post_to_callback_url
 class ProcessingWorker:
     def __init__(self, rabbitmq_addr, mongodb_addr, processor_name, ocrd_tool: dict, processor_class=None) -> None:
         initLogging()
-        self.log = getLogger(f'ocrd_network.processing_worker')
+        self.log = getLogger('ocrd_network.processing_worker')
         log_file = get_processing_worker_logging_file_path(processor_name=processor_name, pid=getpid())
         configure_file_handler_with_formatter(self.log, log_file=log_file, mode="a")
 
@@ -120,7 +120,7 @@ class ProcessingWorker:
             channel.basic_nack(delivery_tag=delivery_tag, multiple=False, requeue=False)
             raise Exception(message)
 
-        self.log.info(f"Successfully processed RabbitMQ message")
+        self.log.info("Successfully processed RabbitMQ message")
         self.log.debug(ack_message)
         channel.basic_ack(delivery_tag=delivery_tag, multiple=False)
 
@@ -134,8 +134,9 @@ class ProcessingWorker:
             self.log.info(f"Starting consuming from queue: {self.processor_name}")
             # Starting consuming is a blocking action
             self.rmq_consumer.start_consuming()
+            self.log.info(f"Consuming stopped for queue: {self.processor_name}")
         else:
-            msg = f"The RMQConsumer is not connected/configured properly."
+            msg = "The RMQConsumer is not connected/configured properly."
             self.log.exception(msg)
             raise Exception(msg)
 
@@ -165,7 +166,7 @@ class ProcessingWorker:
         parameters = processing_message.parameters if processing_message.parameters else {}
 
         if not path_to_mets and not workspace_id:
-            msg = f"Both 'path_to_mets' and 'workspace_id' are missing in the OcrdProcessingMessage."
+            msg = "Both 'path_to_mets' and 'workspace_id' are missing in the OcrdProcessingMessage."
             self.log.exception(msg)
             raise ValueError(msg)
 
ocrd_network/rabbitmq_utils/__init__.py +2 -0

@@ -3,6 +3,7 @@ __all__ = [
     "connect_rabbitmq_consumer",
     "connect_rabbitmq_publisher",
     "create_message_queues",
+    "get_message_queues",
     "verify_and_parse_mq_uri",
     "verify_rabbitmq_available",
     "RMQConsumer",
@@ -19,6 +20,7 @@ from .helpers import (
     connect_rabbitmq_consumer,
     connect_rabbitmq_publisher,
     create_message_queues,
+    get_message_queues,
     verify_and_parse_mq_uri,
     verify_rabbitmq_available
 )
ocrd_network/rabbitmq_utils/helpers.py +24 -7

@@ -4,6 +4,9 @@ from pika.exceptions import AMQPConnectionError, ChannelClosedByBroker
 from re import match as re_match
 from time import sleep
 from typing import Dict, List, Union
+from requests import get
+from requests.auth import HTTPBasicAuth
+from requests.exceptions import RequestException, HTTPError
 
 from .constants import RABBITMQ_URI_PATTERN, RECONNECT_TRIES, RECONNECT_WAIT
 from .consumer import RMQConsumer
@@ -42,7 +45,7 @@ def __connect_rabbitmq_client(
 
 def connect_rabbitmq_consumer(logger: Logger, rmq_data: Dict) -> RMQConsumer:
     rmq_consumer = __connect_rabbitmq_client(logger=logger, client_type="consumer", rmq_data=rmq_data)
-    logger.info(f"Successfully connected RMQConsumer")
+    logger.info("Successfully connected RMQConsumer")
     return rmq_consumer
 
 
@@ -68,12 +71,6 @@ def check_if_queue_exists(logger: Logger, rmq_data: Dict, processor_name: str) -
 
 
 def create_message_queues(logger: Logger, rmq_publisher: RMQPublisher, queue_names: List[str]) -> None:
-    # TODO: Reconsider and refactor this.
-    # Added ocrd-dummy by default if not available for the integration tests.
-    # A proper Processing Worker / Processor Server registration endpoint is needed on the Processing Server side
-    if "ocrd-dummy" not in queue_names:
-        queue_names.append("ocrd-dummy")
-
     for queue_name in queue_names:
         # The existence/validity of the worker.name is not tested.
         # Even if an ocr-d processor does not exist, the queue is created
@@ -81,6 +78,26 @@
         rmq_publisher.create_queue(queue_name=queue_name)
 
 
+def get_message_queues(logger: Logger, rmq_data: Dict) -> List:
+    try:
+        response = get(
+            f"http://{rmq_data['host']}:{15672}/api/queues",
+            auth=HTTPBasicAuth(rmq_data["username"], rmq_data["password"])
+        )
+        response.raise_for_status()
+        queues = response.json()
+        return [queue['name'] for queue in queues]
+    except HTTPError:
+        logger.warn(
+            f"Error requesting all queue-names from rabbitmq. Status code: {response.status_code}. "
+            f"Response-Text: {response.text}"
+        )
+        return []
+    except RequestException as e:
+        logger.warn(f"Error querying RabbitMQ API: {e}")
+        return []
+
+
 def verify_and_parse_mq_uri(rabbitmq_address: str):
     """
     Check the full list of available parameters in the docs here:
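The new get_message_queues helper is what list_processors on the Processing Server now delegates to: the set of known processors is simply the set of queue names reported by RabbitMQ's management HTTP API. Note that the management port 15672 is hard-coded, so the rabbitmq_management plugin must be enabled and reachable on that port. A usage sketch with assumed host and credentials:

# Sketch with assumed broker address and credentials; returns the names of all
# queues on the broker (in practice, one queue per deployed Processing Worker).
from logging import getLogger
from ocrd_network.rabbitmq_utils import get_message_queues

rmq_data = {"host": "localhost", "username": "admin", "password": "admin"}
queue_names = get_message_queues(getLogger("example"), rmq_data)
print(queue_names)  # e.g. ['ocrd-dummy', 'ocrd-tesserocr-recognize']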
ocrd_network/runtime_data/__init__.py +1 -2

@@ -5,10 +5,9 @@ __all__ = [
     "DataNetworkAgent",
     "DataRabbitMQ",
     "DataProcessingWorker",
-    "DataProcessorServer"
 ]
 
 from .deployer import Deployer
 from .hosts import DataHost
-from .network_agents import DataNetworkAgent, DataProcessingWorker, DataProcessorServer
+from .network_agents import DataNetworkAgent, DataProcessingWorker
 from .network_services import DataMongoDB, DataRabbitMQ