ocrd 3.5.1__py3-none-any.whl → 3.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. ocrd/cli/__init__.py +8 -6
  2. ocrd/cli/bashlib.py +8 -114
  3. ocrd/cli/network.py +0 -2
  4. ocrd/cli/ocrd_tool.py +26 -4
  5. ocrd/cli/process.py +1 -0
  6. ocrd/cli/resmgr.py +0 -1
  7. ocrd/cli/validate.py +32 -13
  8. ocrd/cli/workspace.py +125 -52
  9. ocrd/cli/zip.py +13 -4
  10. ocrd/decorators/__init__.py +28 -52
  11. ocrd/decorators/loglevel_option.py +4 -0
  12. ocrd/decorators/mets_find_options.py +2 -1
  13. ocrd/decorators/ocrd_cli_options.py +3 -7
  14. ocrd/decorators/parameter_option.py +12 -11
  15. ocrd/mets_server.py +11 -15
  16. ocrd/processor/base.py +88 -71
  17. ocrd/processor/builtin/dummy_processor.py +7 -4
  18. ocrd/processor/builtin/filter_processor.py +3 -2
  19. ocrd/processor/helpers.py +5 -6
  20. ocrd/processor/ocrd_page_result.py +7 -5
  21. ocrd/resolver.py +42 -32
  22. ocrd/task_sequence.py +11 -4
  23. ocrd/workspace.py +64 -54
  24. ocrd/workspace_backup.py +3 -0
  25. ocrd/workspace_bagger.py +15 -8
  26. {ocrd-3.5.1.dist-info → ocrd-3.7.0.dist-info}/METADATA +2 -8
  27. ocrd-3.7.0.dist-info/RECORD +123 -0
  28. ocrd_modelfactory/__init__.py +4 -2
  29. ocrd_models/constants.py +18 -1
  30. ocrd_models/ocrd_agent.py +1 -1
  31. ocrd_models/ocrd_exif.py +7 -3
  32. ocrd_models/ocrd_file.py +24 -19
  33. ocrd_models/ocrd_mets.py +90 -67
  34. ocrd_models/ocrd_page.py +17 -13
  35. ocrd_models/ocrd_xml_base.py +1 -0
  36. ocrd_models/report.py +2 -1
  37. ocrd_models/utils.py +4 -3
  38. ocrd_models/xpath_functions.py +3 -1
  39. ocrd_network/__init__.py +1 -2
  40. ocrd_network/cli/__init__.py +0 -2
  41. ocrd_network/cli/client.py +122 -50
  42. ocrd_network/cli/processing_server.py +1 -2
  43. ocrd_network/client.py +2 -2
  44. ocrd_network/client_utils.py +30 -13
  45. ocrd_network/constants.py +1 -6
  46. ocrd_network/database.py +3 -3
  47. ocrd_network/logging_utils.py +2 -7
  48. ocrd_network/models/__init__.py +0 -2
  49. ocrd_network/models/job.py +31 -33
  50. ocrd_network/models/messages.py +3 -2
  51. ocrd_network/models/workspace.py +5 -5
  52. ocrd_network/process_helpers.py +54 -17
  53. ocrd_network/processing_server.py +63 -114
  54. ocrd_network/processing_worker.py +6 -5
  55. ocrd_network/rabbitmq_utils/__init__.py +2 -0
  56. ocrd_network/rabbitmq_utils/helpers.py +24 -7
  57. ocrd_network/runtime_data/__init__.py +1 -2
  58. ocrd_network/runtime_data/deployer.py +12 -85
  59. ocrd_network/runtime_data/hosts.py +61 -130
  60. ocrd_network/runtime_data/network_agents.py +7 -31
  61. ocrd_network/runtime_data/network_services.py +1 -1
  62. ocrd_network/server_cache.py +1 -1
  63. ocrd_network/server_utils.py +13 -52
  64. ocrd_network/utils.py +1 -0
  65. ocrd_utils/__init__.py +4 -4
  66. ocrd_utils/config.py +86 -76
  67. ocrd_utils/deprecate.py +3 -0
  68. ocrd_utils/image.py +51 -23
  69. ocrd_utils/introspect.py +8 -3
  70. ocrd_utils/logging.py +15 -7
  71. ocrd_utils/os.py +17 -4
  72. ocrd_utils/str.py +32 -16
  73. ocrd_validators/json_validator.py +4 -1
  74. ocrd_validators/ocrd_tool_validator.py +2 -1
  75. ocrd_validators/ocrd_zip_validator.py +5 -4
  76. ocrd_validators/page_validator.py +21 -9
  77. ocrd_validators/parameter_validator.py +3 -2
  78. ocrd_validators/processing_server_config.schema.yml +1 -33
  79. ocrd_validators/resource_list_validator.py +3 -1
  80. ocrd_validators/workspace_validator.py +30 -20
  81. ocrd_validators/xsd_mets_validator.py +2 -1
  82. ocrd_validators/xsd_page_validator.py +2 -1
  83. ocrd_validators/xsd_validator.py +4 -2
  84. ocrd/cli/log.py +0 -51
  85. ocrd/lib.bash +0 -317
  86. ocrd-3.5.1.dist-info/RECORD +0 -128
  87. ocrd_network/cli/processor_server.py +0 -31
  88. ocrd_network/models/ocrd_tool.py +0 -12
  89. ocrd_network/processor_server.py +0 -255
  90. {ocrd-3.5.1.dist-info → ocrd-3.7.0.dist-info}/LICENSE +0 -0
  91. {ocrd-3.5.1.dist-info → ocrd-3.7.0.dist-info}/WHEEL +0 -0
  92. {ocrd-3.5.1.dist-info → ocrd-3.7.0.dist-info}/entry_points.txt +0 -0
  93. {ocrd-3.5.1.dist-info → ocrd-3.7.0.dist-info}/top_level.txt +0 -0
@@ -30,7 +30,7 @@ def ocrd_cli_wrap_processor(
30
30
  working_dir=None,
31
31
  dump_json=False,
32
32
  dump_module_dir=False,
33
- help=False, # pylint: disable=redefined-builtin
33
+ help=False, # pylint: disable=redefined-builtin
34
34
  profile=False,
35
35
  profile_file=None,
36
36
  version=False,
@@ -41,7 +41,6 @@ def ocrd_cli_wrap_processor(
41
41
  list_resources=False,
42
42
  # ocrd_network params start #
43
43
  subcommand=None,
44
- address=None,
45
44
  queue=None,
46
45
  log_filename=None,
47
46
  database=None,
@@ -88,9 +87,8 @@ def ocrd_cli_wrap_processor(
88
87
  if list_resources:
89
88
  processor.list_resources()
90
89
  sys.exit()
91
- if subcommand or address or queue or database:
92
- # Used for checking/starting network agents for the WebAPI architecture
93
- check_and_run_network_agent(processorClass, subcommand, address, database, queue)
90
+ if subcommand == "worker" or queue or database:
91
+ check_and_run_processing_worker(processorClass, database, queue)
94
92
 
95
93
  if 'parameter' in kwargs:
96
94
  # Disambiguate parameter file/literal, and resolve file
@@ -110,10 +108,10 @@ def ocrd_cli_wrap_processor(
110
108
  if not kwargs.get('input_file_grp', None):
111
109
  raise ValueError('-I/--input-file-grp is required')
112
110
  if 'output_file_grp' not in kwargs:
113
- raise ValueError('-O/--output-file-grp is required') # actually, it may be None
111
+ raise ValueError('-O/--output-file-grp is required') # actually, it may be None
114
112
  resolver = Resolver()
115
113
  working_dir, mets, _, mets_server_url = \
116
- resolver.resolve_mets_arguments(working_dir, mets, None, mets_server_url)
114
+ resolver.resolve_mets_arguments(working_dir, mets, None, mets_server_url)
117
115
  workspace = resolver.workspace_from_url(mets, working_dir, mets_server_url=mets_server_url)
118
116
  page_id = kwargs.get('page_id')
119
117
  if debug:
@@ -122,7 +120,10 @@ def ocrd_cli_wrap_processor(
122
120
  config.OCRD_EXISTING_OUTPUT = 'ABORT'
123
121
  if overwrite:
124
122
  config.OCRD_EXISTING_OUTPUT = 'OVERWRITE'
125
- report = WorkspaceValidator.check_file_grp(workspace, kwargs['input_file_grp'], '' if overwrite else kwargs['output_file_grp'], page_id)
123
+ report = WorkspaceValidator.check_file_grp(workspace,
124
+ kwargs['input_file_grp'],
125
+ '' if overwrite else kwargs['output_file_grp'],
126
+ page_id)
126
127
  if not report.is_valid:
127
128
  raise Exception("Invalid input/output file grps:\n\t%s" % '\n\t'.join(report.errors))
128
129
  # Set up profiling behavior from environment variables/flags
@@ -138,6 +139,7 @@ def ocrd_cli_wrap_processor(
138
139
  print("Profiling...")
139
140
  pr = cProfile.Profile()
140
141
  pr.enable()
142
+
141
143
  def goexit():
142
144
  pr.disable()
143
145
  print("Profiling completed")
@@ -146,6 +148,7 @@ def ocrd_cli_wrap_processor(
146
148
  s = io.StringIO()
147
149
  pstats.Stats(pr, stream=s).sort_stats("cumulative").print_stats()
148
150
  print(s.getvalue())
151
+
149
152
  atexit.register(goexit)
150
153
  if log_filename:
151
154
  log_ctx = redirect_stderr_and_stdout_to_file(log_filename)
@@ -155,53 +158,26 @@ def ocrd_cli_wrap_processor(
155
158
  run_processor(processorClass, mets_url=mets, workspace=workspace, **kwargs)
156
159
 
157
160
 
158
- def check_and_run_network_agent(ProcessorClass, subcommand: str, address: str, database: str, queue: str):
159
- """
161
+ def check_and_run_processing_worker(ProcessorClass, database: str, queue: str):
162
+ """ Check/start Processing Worker for the WebAPI architecture
160
163
  """
161
- from ocrd_network import ProcessingWorker, ProcessorServer, AgentType
162
- SUBCOMMANDS = [AgentType.PROCESSING_WORKER, AgentType.PROCESSOR_SERVER]
163
-
164
- if not subcommand:
165
- raise ValueError(f"Subcommand options --address --queue and --database are only valid for subcommands: {SUBCOMMANDS}")
166
- if subcommand not in SUBCOMMANDS:
167
- raise ValueError(f"SUBCOMMAND can only be one of {SUBCOMMANDS}")
164
+ from ocrd_network import ProcessingWorker
168
165
 
169
166
  if not database:
170
- raise ValueError(f"Option '--database' is invalid for subcommand {subcommand}")
171
-
172
- if subcommand == AgentType.PROCESSOR_SERVER:
173
- if not address:
174
- raise ValueError(f"Option '--address' required for subcommand {subcommand}")
175
- if queue:
176
- raise ValueError(f"Option '--queue' invalid for subcommand {subcommand}")
177
- if subcommand == AgentType.PROCESSING_WORKER:
178
- if address:
179
- raise ValueError(f"Option '--address' invalid for subcommand {subcommand}")
180
- if not queue:
181
- raise ValueError(f"Option '--queue' required for subcommand {subcommand}")
167
+ raise ValueError("Option '--database' is required for the Processing Worker")
168
+ if not queue:
169
+ raise ValueError("Option '--queue' is required for the Processing Worker")
182
170
 
183
171
  processor = ProcessorClass(workspace=None)
184
- if subcommand == AgentType.PROCESSING_WORKER:
185
- processing_worker = ProcessingWorker(
186
- rabbitmq_addr=queue,
187
- mongodb_addr=database,
188
- processor_name=processor.ocrd_tool['executable'],
189
- ocrd_tool=processor.ocrd_tool,
190
- processor_class=ProcessorClass,
191
- )
192
- # The RMQConsumer is initialized and a connection to the RabbitMQ is performed
193
- processing_worker.connect_consumer()
194
- # Start consuming from the queue with name `processor_name`
195
- processing_worker.start_consuming()
196
- elif subcommand == AgentType.PROCESSOR_SERVER:
197
- # TODO: Better validate that inside the ProcessorServer itself
198
- host, port = address.split(':')
199
- processor_server = ProcessorServer(
200
- mongodb_addr=database,
201
- processor_name=processor.ocrd_tool['executable'],
202
- processor_class=ProcessorClass,
203
- )
204
- processor_server.run_server(host=host, port=int(port))
205
- else:
206
- raise ValueError(f"Unknown network agent type, must be one of: {SUBCOMMANDS}")
172
+ processing_worker = ProcessingWorker(
173
+ rabbitmq_addr=queue,
174
+ mongodb_addr=database,
175
+ processor_name=processor.ocrd_tool['executable'],
176
+ ocrd_tool=processor.ocrd_tool,
177
+ processor_class=ProcessorClass,
178
+ )
179
+ # The RMQConsumer is initialized and a connection to the RabbitMQ is performed
180
+ processing_worker.connect_consumer()
181
+ # Start consuming from the queue with name `processor_name`
182
+ processing_worker.start_consuming()
207
183
  sys.exit(0)
@@ -1,14 +1,17 @@
1
1
  import click
2
2
  from ocrd_utils.logging import setOverrideLogLevel
3
3
 
4
+
4
5
  __all__ = ['ocrd_loglevel']
5
6
 
7
+
6
8
  def _setOverrideLogLevel(ctx, param, value): # pylint: disable=unused-argument
7
9
  if value is None: # Explicitly test for None because logging.DEBUG == 0
8
10
  return
9
11
  setOverrideLogLevel(value)
10
12
  return value
11
13
 
14
+
12
15
  loglevel_option = click.option('-l', '--log-level', help="Log level",
13
16
  type=click.Choice([
14
17
  'OFF', 'ERROR', 'WARN',
@@ -16,6 +19,7 @@ loglevel_option = click.option('-l', '--log-level', help="Log level",
16
19
  ]),
17
20
  default=None, callback=_setOverrideLogLevel)
18
21
 
22
+
19
23
  def ocrd_loglevel(f):
20
24
  """
21
25
  Add an option '--log-level' to set the log level.
@@ -1,7 +1,8 @@
1
1
  from click import option
2
2
 
3
+
3
4
  def mets_find_options(f):
4
- for opt in [
5
+ for opt in [
5
6
  option('-G', '--file-grp', help="fileGrp USE", metavar='FILTER'),
6
7
  option('-m', '--mimetype', help="Media type to look for", metavar='FILTER'),
7
8
  option('-g', '--page-id', help="Page ID", metavar='FILTER'),
@@ -1,12 +1,10 @@
1
1
  import click
2
- from click import option, Path, group, command, argument
2
+ from click import option, Path, argument
3
3
  from ocrd_utils import DEFAULT_METS_BASENAME
4
- from ocrd_network import AgentType
5
4
  from .parameter_option import parameter_option, parameter_override_option
6
5
  from .loglevel_option import loglevel_option
7
6
  from ocrd_network import (
8
7
  DatabaseParamType,
9
- ServerAddressParamType,
10
8
  QueueServerParamType
11
9
  )
12
10
 
@@ -40,7 +38,6 @@ def ocrd_cli_options(f):
40
38
  parameter_override_option,
41
39
  loglevel_option,
42
40
  option('--log-filename', default=None),
43
- option('--address', type=ServerAddressParamType()),
44
41
  option('--queue', type=QueueServerParamType()),
45
42
  option('--database', type=DatabaseParamType()),
46
43
  option('-R', '--resolve-resource'),
@@ -50,13 +47,12 @@ def ocrd_cli_options(f):
50
47
  option('-D', '--dump-module-dir', is_flag=True, default=False),
51
48
  option('-h', '--help', is_flag=True, default=False),
52
49
  option('-V', '--version', is_flag=True, default=False),
53
- # Subcommand, only used for 'worker'/'server'. Cannot be handled in
50
+ # Subcommand, only used for 'worker'. Cannot be handled in
54
51
  # click because processors use the @command decorator and even if they
55
52
  # were using `group`, you cannot combine have a command with
56
53
  # subcommands. So we have to work around that by creating a
57
54
  # pseudo-subcommand handled in ocrd_cli_wrap_processor
58
- argument('subcommand', nargs=1, required=False,
59
- type=click.Choice(list(map(str, AgentType)))),
55
+ argument('subcommand', nargs=1, required=False, type=click.Choice(["worker"])),
60
56
  ]
61
57
  for param in params:
62
58
  param(f)
@@ -7,17 +7,18 @@ def _handle_param_option(ctx, param, value):
7
7
  from ocrd_utils import parse_json_string_or_file
8
8
  return parse_json_string_or_file(*list(value))
9
9
 
10
+
10
11
  parameter_option = option('-p', '--parameter',
11
- help="Parameters, either JSON string or path to JSON file",
12
- multiple=True,
13
- default=[],
14
- # now handled in ocrd_cli_wrap_processor to resolve processor preset files
15
- # callback=_handle_param_option
16
- callback=lambda ctx, param, kv: list(kv))
12
+ help="Parameters, either JSON string or path to JSON file",
13
+ multiple=True,
14
+ default=[],
15
+ # now handled in ocrd_cli_wrap_processor to resolve processor preset files
16
+ # callback=_handle_param_option
17
+ callback=lambda ctx, param, kv: list(kv))
17
18
 
18
19
  parameter_override_option = option('-P', '--parameter-override',
19
- help="Parameter override",
20
- nargs=2,
21
- multiple=True,
22
- callback=lambda ctx, param, kv: kv)
23
- # callback=lambda ctx, param, kv: {kv[0]: kv[1]})
20
+ help="Parameter override",
21
+ nargs=2,
22
+ multiple=True,
23
+ # callback=lambda ctx, param, kv: {kv[0]: kv[1]})
24
+ callback=lambda ctx, param, kv: kv)
ocrd/mets_server.py CHANGED
@@ -3,12 +3,12 @@
3
3
  """
4
4
  import os
5
5
  import re
6
- from os import _exit, chmod
6
+ from os import chmod
7
7
  import signal
8
8
  from typing import Dict, Optional, Union, List, Tuple
9
9
  from time import sleep
10
10
  from pathlib import Path
11
- from subprocess import Popen, run as subprocess_run
11
+ from subprocess import Popen
12
12
  from urllib.parse import urlparse
13
13
  import socket
14
14
  import atexit
@@ -258,12 +258,12 @@ class ClientSideOcrdMets:
258
258
 
259
259
  def add_agent(self, **kwargs):
260
260
  if not self.multiplexing_mode:
261
- return self.session.request("POST", f"{self.url}/agent", json=OcrdAgentModel.create(**kwargs).dict())
261
+ return self.session.request("POST", f"{self.url}/agent", json=OcrdAgentModel.create(**kwargs).model_dump())
262
262
  else:
263
263
  self.session.request(
264
264
  "POST",
265
265
  self.url,
266
- json=MpxReq.add_agent(self.ws_dir_path, OcrdAgentModel.create(**kwargs).dict())
266
+ json=MpxReq.add_agent(self.ws_dir_path, OcrdAgentModel.create(**kwargs).model_dump())
267
267
  ).json()
268
268
  return OcrdAgentModel.create(**kwargs)
269
269
 
@@ -305,7 +305,7 @@ class ClientSideOcrdMets:
305
305
  mimetype=mimetype, url=url, local_filename=local_filename
306
306
  )
307
307
  # add force+ignore
308
- kwargs = {**kwargs, **data.dict()}
308
+ kwargs = {**kwargs, **data.model_dump()}
309
309
 
310
310
  if not self.multiplexing_mode:
311
311
  r = self.session.request("POST", f"{self.url}/file", data=kwargs)
@@ -424,7 +424,7 @@ class OcrdMetsServer:
424
424
  # Wait for the mets server to start
425
425
  sleep(2)
426
426
  if sub_process.poll():
427
- raise RuntimeError(f"Mets server starting failed. See {log_file} for errors")
427
+ raise RuntimeError(f"Starting METS Server failed. See {log_file} for errors")
428
428
  return sub_process.pid
429
429
 
430
430
  @staticmethod
@@ -433,12 +433,12 @@ class OcrdMetsServer:
433
433
  sleep(3)
434
434
  try:
435
435
  os.kill(mets_server_pid, signal.SIGKILL)
436
- except ProcessLookupError as e:
436
+ except ProcessLookupError:
437
437
  pass
438
438
 
439
439
  def shutdown(self):
440
440
  pid = os.getpid()
441
- self.log.info(f"Shutdown method of mets server[{pid}] invoked, sending SIGTERM signal.")
441
+ self.log.info(f"Shutdown method of METS Server[{pid}] invoked, sending SIGTERM signal.")
442
442
  os.kill(pid, signal.SIGTERM)
443
443
  if self.is_uds:
444
444
  if Path(self.url).exists():
@@ -446,7 +446,7 @@ class OcrdMetsServer:
446
446
  Path(self.url).unlink()
447
447
 
448
448
  def startup(self):
449
- self.log.info(f"Configuring the Mets Server")
449
+ self.log.info("Configuring the METS Server")
450
450
 
451
451
  workspace = self.workspace
452
452
 
@@ -516,10 +516,6 @@ class OcrdMetsServer:
516
516
  self.log.debug(f"GET /physical_pages -> {response}")
517
517
  return response
518
518
 
519
- @app.get(path='/physical_pages', response_model=OcrdPageListModel)
520
- async def physical_pages():
521
- return {'physical_pages': workspace.mets.physical_pages}
522
-
523
519
  @app.get(path='/file_groups', response_model=OcrdFileGroupListModel)
524
520
  async def file_groups():
525
521
  response = {'file_groups': workspace.mets.file_groups}
@@ -534,7 +530,7 @@ class OcrdMetsServer:
534
530
 
535
531
  @app.post(path='/agent', response_model=OcrdAgentModel)
536
532
  async def add_agent(agent: OcrdAgentModel):
537
- kwargs = agent.dict()
533
+ kwargs = agent.model_dump()
538
534
  kwargs['_type'] = kwargs.pop('type')
539
535
  workspace.mets.add_agent(**kwargs)
540
536
  response = agent
@@ -579,7 +575,7 @@ class OcrdMetsServer:
579
575
  local_filename=local_filename
580
576
  )
581
577
  # Add to workspace
582
- kwargs = file_resource.dict()
578
+ kwargs = file_resource.model_dump()
583
579
  workspace.add_file(**kwargs, force=force)
584
580
  response = file_resource
585
581
  self.log.debug(f"POST /file -> {response.__dict__}")