ocrd 3.9.0__py3-none-any.whl → 3.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ocrd/processor/base.py CHANGED
@@ -25,12 +25,13 @@ import tarfile
25
25
  import io
26
26
  from collections import defaultdict
27
27
  from frozendict import frozendict
28
- # concurrent.futures is buggy in py38,
29
- # this is where the fixes came from:
30
- from loky import Future, ProcessPoolExecutor
28
+ # concurrent.futures cannot timeout-kill workers
29
+ from pebble import ProcessFuture as Future, ProcessPool as ProcessPoolExecutor
30
+ from concurrent.futures import TimeoutError
31
31
  import multiprocessing as mp
32
- from multiprocessing.pool import ThreadPool
32
+ import threading
33
33
 
34
+ from cysignals import alarm
34
35
  from click import wrap_text
35
36
  from deprecated import deprecated
36
37
  from requests import HTTPError
@@ -113,6 +114,13 @@ class MissingInputFile(ValueError):
113
114
  super().__init__(self.message)
114
115
 
115
116
 
117
+ class IncompleteProcessorImplementation(NotImplementedError):
118
+ """
119
+ An exception signifying the Processor subclass is incomplete,
120
+ because either :py:meth:`Processor.process_page_pcgts()` or
121
+ :py:meth:`Processor.process()` was not overridden.
122
+ """
123
+
116
124
  class DummyFuture:
117
125
  """
118
126
  Mimics some of `concurrent.futures.Future` but runs immediately.
@@ -134,12 +142,18 @@ class DummyExecutor:
134
142
  def __init__(self, initializer=None, initargs=(), **kwargs):
135
143
  initializer(*initargs)
136
144
 
137
- def shutdown(self, **kwargs):
145
+ def stop(self):
138
146
  # allow gc to catch processor instance (unless cached)
139
147
  _page_worker_set_ctxt(None, None)
140
148
 
141
- def submit(self, fn, *args, **kwargs) -> DummyFuture:
142
- return DummyFuture(fn, *args, **kwargs)
149
+ def join(self, **kwargs):
150
+ pass
151
+
152
+ def schedule(self, fn, args=None, kwargs=None, timeout=None) -> DummyFuture:
153
+ args = args or []
154
+ kwargs = kwargs or {}
155
+ timeout = timeout or 0
156
+ return DummyFuture(fn, *args, **kwargs, timeout=timeout)
143
157
 
144
158
 
145
159
  TFuture = Union[DummyFuture, Future]
@@ -490,7 +504,7 @@ class Processor():
490
504
  (This contains the main functionality and needs to be
491
505
  overridden by subclasses.)
492
506
  """
493
- raise NotImplementedError()
507
+ raise IncompleteProcessorImplementation()
494
508
 
495
509
  def process_workspace(self, workspace: Workspace) -> None:
496
510
  """
@@ -525,7 +539,7 @@ class Processor():
525
539
  self._base_logger.info("limiting page timeout from %d to %d sec", max_seconds, self.max_page_seconds)
526
540
  max_seconds = self.max_page_seconds
527
541
 
528
- if max_workers > 1:
542
+ if isinstance(workspace.mets, ClientSideOcrdMets):
529
543
  executor_cls = ProcessPoolExecutor
530
544
  log_queue = mp.get_context('fork').Queue()
531
545
  else:
@@ -539,7 +553,8 @@ class Processor():
539
553
  initializer=_page_worker_set_ctxt,
540
554
  initargs=(self, log_queue),
541
555
  )
542
- if max_workers > 1:
556
+ if isinstance(workspace.mets, ClientSideOcrdMets):
557
+ assert executor.active # ensure pre-forking
543
558
  # forward messages from log queue (in subprocesses) to all root handlers
544
559
  log_listener = logging.handlers.QueueListener(log_queue, *logging.root.handlers,
545
560
  respect_handler_level=True)
@@ -550,21 +565,22 @@ class Processor():
550
565
  tasks = self.process_workspace_submit_tasks(executor, max_seconds)
551
566
  stats = self.process_workspace_handle_tasks(tasks)
552
567
  finally:
553
- executor.shutdown(kill_workers=True, wait=False)
568
+ executor.stop()
569
+ executor.join(timeout=3.0) # raises TimeoutError
554
570
  self._base_logger.debug("stopped executor %s after %d tasks", str(executor), len(tasks) if tasks else -1)
555
- if max_workers > 1:
571
+ if isinstance(workspace.mets, ClientSideOcrdMets):
556
572
  # can cause deadlock:
557
573
  #log_listener.stop()
558
574
  # not much better:
559
575
  #log_listener.enqueue_sentinel()
560
576
  pass
561
577
 
562
- except NotImplementedError:
578
+ except IncompleteProcessorImplementation:
563
579
  # fall back to deprecated method
564
580
  try:
565
581
  self.process()
566
582
  except Exception as err:
567
- # suppress the NotImplementedError context
583
+ # suppress the IncompleteProcessorImplementation context
568
584
  raise err from None
569
585
 
570
586
  def process_workspace_submit_tasks(self, executor: TExecutor, max_seconds: int) -> Dict[
@@ -640,7 +656,7 @@ class Processor():
640
656
  self._base_logger.warning(f"failed downloading file {input_file} for page {page_id}")
641
657
  # process page
642
658
  #executor.submit(self.process_page_file, *input_files)
643
- return executor.submit(_page_worker, max_seconds, *input_files), page_id, input_files
659
+ return executor.schedule(_page_worker, args=input_files, timeout=max_seconds), page_id, input_files
644
660
 
645
661
  def process_workspace_handle_tasks(self, tasks: Dict[TFuture, Tuple[str, List[Optional[OcrdFileType]]]]) -> Tuple[
646
662
  int, int, Dict[str, int], int]:
@@ -726,11 +742,12 @@ class Processor():
726
742
  # but does not stop the running process/thread, and executor itself
727
743
  # offers nothing to that effect:
728
744
  # task.result(timeout=max_seconds or None)
729
- # so we instead applied the timeout within the worker function
745
+ # so we instead passed the timeout to the submit (schedule) function
730
746
  task.result()
747
+ self._base_logger.debug("page worker completed for page %s", page_id)
731
748
  return True
732
- except NotImplementedError:
733
- # exclude NotImplementedError, so we can try process() below
749
+ except IncompleteProcessorImplementation:
750
+ # pass this through, so we can try process() below
734
751
  raise
735
752
  # handle input failures separately
736
753
  except FileExistsError as err:
@@ -744,7 +761,9 @@ class Processor():
744
761
  except KeyboardInterrupt:
745
762
  raise
746
763
  # broad coverage of output failures (including TimeoutError)
747
- except Exception as err:
764
+ except (Exception, TimeoutError) as err:
765
+ if isinstance(err, TimeoutError):
766
+ self._base_logger.debug("page worker timed out for page %s", page_id)
748
767
  # FIXME: add re-usable/actionable logging
749
768
  if config.OCRD_MISSING_OUTPUT == 'ABORT':
750
769
  self._base_logger.error(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}")
@@ -896,7 +915,7 @@ class Processor():
896
915
  (This contains the main functionality and must be overridden by subclasses,
897
916
  unless it does not get called by some overridden :py:meth:`.process_page_file`.)
898
917
  """
899
- raise NotImplementedError()
918
+ raise IncompleteProcessorImplementation()
900
919
 
901
920
  def add_metadata(self, pcgts: OcrdPage) -> None:
902
921
  """
@@ -1182,22 +1201,33 @@ def _page_worker_set_ctxt(processor, log_queue):
1182
1201
  logging.root.handlers = [logging.handlers.QueueHandler(log_queue)]
1183
1202
 
1184
1203
 
1185
- def _page_worker(timeout, *input_files):
1204
+ def _page_worker(*input_files, timeout=0):
1186
1205
  """
1187
1206
  Wraps a `Processor.process_page_file` call as payload (call target)
1188
- of the ProcessPoolExecutor workers, but also enforces the given timeout.
1207
+ of the ProcessPoolExecutor workers.
1189
1208
  """
1209
+ #_page_worker_processor.process_page_file(*input_files)
1190
1210
  page_id = next((file.pageId for file in input_files
1191
1211
  if hasattr(file, 'pageId')), "")
1192
- pool = ThreadPool(processes=1)
1212
+ if timeout:
1213
+ if threading.current_thread() is not threading.main_thread():
1214
+ # does not work outside of main thread
1215
+ # (because the exception/interrupt goes there):
1216
+ raise ValueError("cannot apply page worker timeout outside main thread")
1217
+ # based on setitimer() / SIGALRM - only available on Unix+Cygwin
1218
+ # (but we need to interrupt even when in syscalls
1219
+ # and cannot rely on Timer threads, because
1220
+ # processor implementations might not work with threads).
1221
+ alarm.alarm(timeout)
1193
1222
  try:
1194
- #_page_worker_processor.process_page_file(*input_files)
1195
- async_result = pool.apply_async(_page_worker_processor.process_page_file, input_files)
1196
- async_result.get(timeout or None)
1223
+ _page_worker_processor.process_page_file(*input_files)
1197
1224
  _page_worker_processor.logger.debug("page worker completed for page %s", page_id)
1198
- except mp.TimeoutError:
1225
+ except alarm.AlarmInterrupt:
1199
1226
  _page_worker_processor.logger.debug("page worker timed out for page %s", page_id)
1200
- raise
1227
+ raise TimeoutError
1228
+ finally:
1229
+ if timeout:
1230
+ alarm.cancel_alarm()
1201
1231
 
1202
1232
 
1203
1233
  def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None):
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.4
2
2
  Name: ocrd
3
- Version: 3.9.0
3
+ Version: 3.10.0
4
4
  Summary: OCR-D framework
5
5
  Author-email: Konstantin Baierer <unixprog@gmail.com>
6
6
  License: Apache License 2.0
@@ -12,9 +12,10 @@ Requires-Python: >=3.8
12
12
  Description-Content-Type: text/markdown
13
13
  License-File: LICENSE
14
14
  Requires-Dist: atomicwrites>=1.3.0
15
- Requires-Dist: beanie~=1.7
15
+ Requires-Dist: beanie>=2.0.1
16
16
  Requires-Dist: click>=7
17
17
  Requires-Dist: cryptography<43.0.0
18
+ Requires-Dist: cysignals
18
19
  Requires-Dist: Deprecated==1.2.0
19
20
  Requires-Dist: docker>=7.1.0
20
21
  Requires-Dist: elementpath
@@ -25,29 +26,31 @@ Requires-Dist: frozendict>=2.4.0
25
26
  Requires-Dist: gitpython
26
27
  Requires-Dist: gdown
27
28
  Requires-Dist: httpx>=0.22.0
29
+ Requires-Dist: importlib_metadata; python_version < "3.8"
30
+ Requires-Dist: importlib_resources; python_version < "3.10"
28
31
  Requires-Dist: jsonschema>=4
29
- Requires-Dist: loky
32
+ Requires-Dist: pebble
30
33
  Requires-Dist: lxml
31
34
  Requires-Dist: memory-profiler>=0.58.0
32
35
  Requires-Dist: numpy
33
36
  Requires-Dist: ocrd-fork-bagit>=1.8.1.post2
34
- Requires-Dist: ocrd-fork-bagit-profile>=1.3.0.post1
37
+ Requires-Dist: ocrd-fork-bagit_profile>=1.3.0.post1
35
38
  Requires-Dist: opencv-python-headless
36
39
  Requires-Dist: paramiko
37
40
  Requires-Dist: pika>=1.2.0
38
41
  Requires-Dist: Pillow>=7.2.0
39
42
  Requires-Dist: pydantic>=2.0.0
43
+ Requires-Dist: pymongo>=4.15.5
40
44
  Requires-Dist: python-magic
41
45
  Requires-Dist: python-multipart
42
46
  Requires-Dist: pyyaml
43
47
  Requires-Dist: requests
44
- Requires-Dist: requests-unixsocket2
45
- Requires-Dist: uvicorn
46
- Requires-Dist: uvicorn>=0.17.6
47
- Requires-Dist: importlib-resources; python_version < "3.10"
48
- Requires-Dist: importlib-metadata; python_version < "3.8"
48
+ Requires-Dist: requests_unixsocket2
49
49
  Requires-Dist: shapely<2.0.2; python_version < "3.9"
50
50
  Requires-Dist: shapely>=2; python_version >= "3.9"
51
+ Requires-Dist: uvicorn
52
+ Requires-Dist: uvicorn>=0.17.6
53
+ Dynamic: license-file
51
54
 
52
55
  # OCR-D/core
53
56
 
@@ -23,7 +23,7 @@ ocrd/decorators/mets_find_options.py,sha256=8fiSdk-415o6-iBPB2T9He_v52qE8cTj3cCn
23
23
  ocrd/decorators/ocrd_cli_options.py,sha256=Bemkq3V3QkOI3nNqGzphaNW7gjU9vNN-M5F2DvxvioM,2479
24
24
  ocrd/decorators/parameter_option.py,sha256=TnCIcV9L5oAnI1Ew2TyFzo5FAwiIzWl2pn8oaD9jfEU,1056
25
25
  ocrd/processor/__init__.py,sha256=39ymNwYRdc-b_OJzzKmWCvo2ga3KdsGSYDHE1Hzkn_w,274
26
- ocrd/processor/base.py,sha256=yHwxd4ZkHLPuFgqQmOeDhMWAdCnHY_ptOjiSWj-FZqI,60600
26
+ ocrd/processor/base.py,sha256=0SbFhLYUC8VMPZdCd_Y329514IuOz1X1jrCbuuX8Kwg,62074
27
27
  ocrd/processor/helpers.py,sha256=4lR_QvZsxvh7f8_uK9YzdHP5-hvFU4qqYM_Cu_k41KI,10937
28
28
  ocrd/processor/ocrd_page_result.py,sha256=hHV1TlKhKFN848cUCqR31v2R3HH4HEoeyGXqUc2DLkY,2945
29
29
  ocrd/processor/builtin/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -38,6 +38,7 @@ ocrd/processor/builtin/param_command_transkribus-to-prima.json,sha256=AvPNNS5uBm
38
38
  ocrd/processor/builtin/shell_processor.py,sha256=aWsB_m7o4ypG1DBAE0sNMFnaw9ptONqchLLl06KgTEo,5888
39
39
  ocrd/processor/builtin/dummy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
40
40
  ocrd/processor/builtin/dummy/ocrd-tool.json,sha256=t_M3HABw7k_Ufi1L9Mr4t3LSCRnu0HH8-fvEs3u2PQY,3487
41
+ ocrd-3.10.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
41
42
  ocrd_modelfactory/__init__.py,sha256=sjAwPwDzetvPHdV6nPquHtMdFUBYRmo1P-VKER9YCWM,4404
42
43
  ocrd_models/__init__.py,sha256=A0aj0mOraNb-xfiUueACdoaqISnp0qH-F49nTJg2vCs,380
43
44
  ocrd_models/constants.py,sha256=R7-jOGabFd8HP0qRWfTMk0RcUmdwN-jhmDVbUW_QfU4,6961
@@ -56,7 +57,7 @@ ocrd_network/__init__.py,sha256=NWlSgXi7z45ow37AmITxfCB1d-L39rO8ttyxNJ-z8G0,376
56
57
  ocrd_network/client.py,sha256=hi13uDUYC5t7xHtZEUYwNBAZOvovWaScfCtFSORVg7Q,3224
57
58
  ocrd_network/client_utils.py,sha256=d5UE0MdDJxsYxIQemKcoUuALOiPJ8Cew8bjgsg9d71w,5709
58
59
  ocrd_network/constants.py,sha256=mUjpkZDYPdRZmOeC0jyzQkuLuWrODLFzlrAHkguKWGg,1942
59
- ocrd_network/database.py,sha256=-SddvaMLKn0pjdONyvWmjxfPJd6viedAIp6Lj1sU1Zs,10705
60
+ ocrd_network/database.py,sha256=p2vaFgVzhA1l8CTY9vTMT6mOjzHSJK1tE1dkPrUUcp8,10670
60
61
  ocrd_network/logging_utils.py,sha256=hXwS46FzY_HTh92DgnxTuARxj8C18bOBmFKVrvBlUgc,2409
61
62
  ocrd_network/param_validators.py,sha256=Jl1VwiPPKJ50k-xEHLdvW-1QDOkJHCiMz4k9Ipqm-Uc,1489
62
63
  ocrd_network/process_helpers.py,sha256=t2qltUpRefzLwdSGsiUEOGYO4Pz2OH7arpgjmCAeXMU,3086
@@ -68,7 +69,7 @@ ocrd_network/server_utils.py,sha256=Lxby62gHvrSbHgpWXvyZGdsWajp2TFzyxjHdMZWBESk,
68
69
  ocrd_network/tcp_to_uds_mets_proxy.py,sha256=yRW-O6ihd31gf7xqQBIBb_ZQQgqisMyOdRI216ehq_A,3160
69
70
  ocrd_network/utils.py,sha256=yE-nV_sv171tPp7weIFOxYw6HJlxvGBmrS8b1rIHS7c,6760
70
71
  ocrd_network/cli/__init__.py,sha256=VBjjXcn-2O5gerqE6UdNfS-EkVFEVPQFHylsn8F9kfY,317
71
- ocrd_network/cli/client.py,sha256=aZbUqPSQtUcCk-4zz-qNwRTGy42-KvzGk44L2_FVR4k,10357
72
+ ocrd_network/cli/client.py,sha256=WoLt1NZAOtHeECegUBcop8K2_D0S8khrLjFZhV_38ww,10551
72
73
  ocrd_network/cli/processing_server.py,sha256=NsuI0f9h4KDwe39YugmHo5cJ_29chcLLQ7DThKfPO7s,770
73
74
  ocrd_network/cli/processing_worker.py,sha256=ZuaCkbKV_WKJV7cGOjZ6RLrjjppymnwNCiznFMlclAg,1897
74
75
  ocrd_network/cli/resmgr_server.py,sha256=sc0VX_RehTbg8Qp7ht_DvVqsrdL5b9Zw3bBgWcAD13A,826
@@ -112,7 +113,7 @@ ocrd_validators/ocrd_network_message_validator.py,sha256=oafNWOjieBmTHFfYeCtyFFp
112
113
  ocrd_validators/ocrd_tool.schema.yml,sha256=fDNr-QdEOBtYbz8aHmjdOUirPBKr3vfLUDtC88gu75U,10231
113
114
  ocrd_validators/ocrd_tool_validator.py,sha256=0DWuyyOSbdbrrQ5kEfWZv_qp5rSmLzmFMUKcPGfCBgM,749
114
115
  ocrd_validators/ocrd_zip_validator.py,sha256=t-cYIZ5llZSQ2EspFzm0m-FajkLRfAFTISmXe27wMtA,3720
115
- ocrd_validators/page.xsd,sha256=abQ8C3gRLPMFm8lH62aTCfvTIWI23TpgEDcaW9YCt7I,85770
116
+ ocrd_validators/page.xsd,sha256=dhWXObYIvME8kWb1pY9RXtSR8wJOjR7xyXeFbDgArA8,87357
116
117
  ocrd_validators/page_validator.py,sha256=-xPlQb0WRv_wkpGdFFYRJFBoW2if3IYztu-7zeUKs_0,21813
117
118
  ocrd_validators/parameter_validator.py,sha256=kZes2sl35iEjcewvyeW2aSXJC9tbMDLqVurH2IOYChU,1366
118
119
  ocrd_validators/processing_server_config.schema.yml,sha256=s-cFCKxSNdxbtbuOhmATDrL1W-12CjhXq1n4PIAC_r8,4417
@@ -123,9 +124,8 @@ ocrd_validators/xlink.xsd,sha256=8fW7YAMWXN2PbB_MMvj9H5ZeFoEBDzuYBtlGC8_6ijw,318
123
124
  ocrd_validators/xsd_mets_validator.py,sha256=YgiuNtwNDtn3LuvdFFscnmsGREF_wQ4wtA76yE2Iljw,469
124
125
  ocrd_validators/xsd_page_validator.py,sha256=ggt-nmaz-DDyAPwm3ZMVvtChuV2BJ2ZEEbWpePL9vTk,469
125
126
  ocrd_validators/xsd_validator.py,sha256=ahJo_oVvTK_JB0Cu4CkMC8l_gbzsyW91AxGtelMjqrg,2115
126
- ocrd-3.9.0.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
127
- ocrd-3.9.0.dist-info/METADATA,sha256=sKR-ODMcThMWizUZ_1duc0bj4oruD0RPnc5z7-AJly4,11396
128
- ocrd-3.9.0.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
129
- ocrd-3.9.0.dist-info/entry_points.txt,sha256=CI-NoDR1BYmsuAsJmPAn4NrN9guzdedHGUbC8QSmdGs,266
130
- ocrd-3.9.0.dist-info/top_level.txt,sha256=pUgiN42t4KXC5rvpi6V8atza31XP4SCznXpXlVlvomM,75
131
- ocrd-3.9.0.dist-info/RECORD,,
127
+ ocrd-3.10.0.dist-info/METADATA,sha256=E17aYmDFptx9m4iJnCetIQnpLQJvAHK73YkiuRrWXCo,11479
128
+ ocrd-3.10.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
129
+ ocrd-3.10.0.dist-info/entry_points.txt,sha256=CI-NoDR1BYmsuAsJmPAn4NrN9guzdedHGUbC8QSmdGs,266
130
+ ocrd-3.10.0.dist-info/top_level.txt,sha256=pUgiN42t4KXC5rvpi6V8atza31XP4SCznXpXlVlvomM,75
131
+ ocrd-3.10.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.3.2)
2
+ Generator: setuptools (80.9.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -212,13 +212,17 @@ def workflow_cli():
212
212
  @workflow_cli.command('check-status')
213
213
  @click.option('--address', type=URL, help=ADDRESS_HELP)
214
214
  @click.option('-j', '--workflow-job-id', required=True)
215
- def check_workflow_job_status(address: Optional[str], workflow_job_id: str):
215
+ @click.option('-v', '--verbose', default=False, is_flag=True)
216
+ def check_workflow_job_status(address: Optional[str], workflow_job_id: str, verbose: bool = False):
216
217
  """
217
218
  Check the status of a previously submitted workflow job.
218
219
  """
219
220
  client = Client(server_addr_processing=address)
220
221
  try:
221
- job_status = client.check_workflow_status_simple(workflow_job_id)
222
+ if verbose:
223
+ job_status = client.check_workflow_status(workflow_job_id)
224
+ else:
225
+ job_status = client.check_workflow_status_simple(workflow_job_id)
222
226
  except RequestException as e:
223
227
  print(
224
228
  getattr(e, 'detail_message', str(e)),
ocrd_network/database.py CHANGED
@@ -14,9 +14,8 @@ database (runs in docker) currently has no volume set.
14
14
  """
15
15
  from beanie import init_beanie
16
16
  from beanie.operators import In
17
- from motor.motor_asyncio import AsyncIOMotorClient
18
17
  from pathlib import Path
19
- from pymongo import MongoClient, uri_parser as mongo_uri_parser
18
+ from pymongo import AsyncMongoClient, MongoClient, uri_parser as mongo_uri_parser
20
19
  from re import sub as re_sub
21
20
  from typing import List
22
21
  from uuid import uuid4
@@ -26,7 +25,7 @@ from .utils import call_sync
26
25
 
27
26
 
28
27
  async def initiate_database(db_url: str, db_name: str = 'ocrd'):
29
- client = AsyncIOMotorClient(db_url)
28
+ client = AsyncMongoClient(db_url)
30
29
  await init_beanie(
31
30
  database=client.get_default_database(default=db_name),
32
31
  document_models=[DBProcessorJob, DBWorkflowJob, DBWorkspace, DBWorkflowScript]
ocrd_validators/page.xsd CHANGED
@@ -1321,9 +1321,16 @@
1321
1321
  <annotation>
1322
1322
  <documentation>
1323
1323
  Definition of the reading order within the page.
1324
- To express a reading order between elements
1325
- they have to be included in an OrderedGroup.
1326
- Groups may contain further groups.
1324
+ To express a reading order between regions,
1325
+ they have to be referenced by an OrderedGroup.
1326
+ To express a non-sequential relationship (while still
1327
+ referencing them as part of the overall structural tree),
1328
+ they have to be referenced by an UnorderedGroup.
1329
+ Groups may contain further groups, just as regions may
1330
+ contain further regions.
1331
+ Regions may be referenced only once by any ReadingOrder
1332
+ element. (That is, the @regionRef attributes define an injective
1333
+ mapping from the ReadingOrder tree to the Region tree.)
1327
1334
  </documentation>
1328
1335
  </annotation>
1329
1336
  <choice minOccurs="1" maxOccurs="1">
@@ -1336,26 +1343,26 @@
1336
1343
  </annotation>
1337
1344
  </attribute>
1338
1345
  </complexType>
1339
- <complexType name="RegionRefIndexedType">
1346
+ <complexType name="RegionRefType">
1340
1347
  <annotation>
1341
- <documentation>Numbered region</documentation>
1348
+ <documentation>Region reference in an UnorderedGroup(Indexed)</documentation>
1342
1349
  </annotation>
1343
- <attribute name="index" type="int" use="required">
1344
- <annotation>
1345
- <documentation>Position (order number) of this item within the current hierarchy level.</documentation>
1346
- </annotation>
1347
- </attribute>
1348
1350
  <attribute name="regionRef" type="IDREF" use="required"/>
1349
1351
  </complexType>
1350
- <complexType name="OrderedGroupIndexedType">
1352
+ <complexType name="OrderedGroupType">
1351
1353
  <annotation>
1352
1354
  <documentation>
1353
- Indexed group containing ordered elements
1355
+ Group containing index-ordered elements within an UnorderedGroup(Indexed).
1356
+ Elements must be sorted ascending strictly monotonically according to their
1357
+ @index, regardless of their type. (That is, the index of each two consecutive
1358
+ OrderedGroupIndexed, UnorderedGroupIndexed or RegionRefIndexed elements
1359
+ must increase by at least one.)
1354
1360
  </documentation>
1355
1361
  </annotation>
1356
1362
  <sequence>
1357
1363
  <element name="UserDefined" type="pc:UserDefinedType"
1358
- minOccurs="0" maxOccurs="1"/>
1364
+ minOccurs="0" maxOccurs="1">
1365
+ </element>
1359
1366
  <element name="Labels" type="pc:LabelsType"
1360
1367
  minOccurs="0" maxOccurs="unbounded">
1361
1368
  <annotation>
@@ -1378,23 +1385,15 @@
1378
1385
  </documentation>
1379
1386
  </annotation>
1380
1387
  </attribute>
1381
- <attribute name="index" type="int" use="required">
1382
- <annotation>
1383
- <documentation>
1384
- Position (order number) of this item within the
1385
- current hierarchy level.
1386
- </documentation>
1387
- </annotation>
1388
- </attribute>
1389
1388
  <attribute name="caption" type="string"/>
1390
1389
  <attribute name="type" type="pc:GroupTypeSimpleType"/>
1391
1390
  <attribute name="continuation" type="boolean">
1392
1391
  <annotation>
1393
1392
  <documentation>
1394
- Is this group a continuation of another group (from
1395
- previous column or page, for example)?
1393
+ Is this group a continuation of another group
1394
+ (from previous column or page, for example)?
1396
1395
  </documentation>
1397
- </annotation>
1396
+ </annotation>
1398
1397
  </attribute>
1399
1398
  <attribute name="custom" type="string">
1400
1399
  <annotation>
@@ -1403,10 +1402,13 @@
1403
1402
  </attribute>
1404
1403
  <attribute name="comments" type="string"/>
1405
1404
  </complexType>
1406
- <complexType name="UnorderedGroupIndexedType">
1405
+ <complexType name="UnorderedGroupType">
1407
1406
  <annotation>
1408
1407
  <documentation>
1409
- Indexed group containing unordered elements
1408
+ Group containing unordered elements within an UnorderedGroup(Indexed).
1409
+ Elements need not be sorted, and may be mixed by type. (That is,
1410
+ the sequence of OrderedGroup, UnorderedGroup or RegionRef elements
1411
+ may be ordered arbitrarily.)
1410
1412
  </documentation>
1411
1413
  </annotation>
1412
1414
  <sequence>
@@ -1435,14 +1437,6 @@
1435
1437
  </documentation>
1436
1438
  </annotation>
1437
1439
  </attribute>
1438
- <attribute name="index" type="int" use="required">
1439
- <annotation>
1440
- <documentation>
1441
- Position (order number) of this item within the
1442
- current hierarchy level.
1443
- </documentation>
1444
- </annotation>
1445
- </attribute>
1446
1440
  <attribute name="caption" type="string"/>
1447
1441
  <attribute name="type" type="pc:GroupTypeSimpleType"/>
1448
1442
  <attribute name="continuation" type="boolean">
@@ -1460,19 +1454,30 @@
1460
1454
  </attribute>
1461
1455
  <attribute name="comments" type="string"/>
1462
1456
  </complexType>
1463
- <complexType name="RegionRefType">
1457
+ <complexType name="RegionRefIndexedType">
1458
+ <annotation>
1459
+ <documentation>Region reference within an OrderedGroup(Indexed)</documentation>
1460
+ </annotation>
1461
+ <attribute name="index" type="int" use="required">
1462
+ <annotation>
1463
+ <documentation>Position (order number) of this item within the current hierarchy level.</documentation>
1464
+ </annotation>
1465
+ </attribute>
1464
1466
  <attribute name="regionRef" type="IDREF" use="required"/>
1465
1467
  </complexType>
1466
- <complexType name="OrderedGroupType">
1468
+ <complexType name="OrderedGroupIndexedType">
1467
1469
  <annotation>
1468
1470
  <documentation>
1469
- Numbered group (contains ordered elements)
1471
+ Group containing index-ordered elements within an OrderedGroup(Indexed).
1472
+ Elements must be sorted ascending strictly monotonically according to their
1473
+ @index, regardless of their type. (That is, the index of each two consecutive
1474
+ OrderedGroupIndexed, UnorderedGroupIndexed or RegionRefIndexed elements
1475
+ must increase by at least one.)
1470
1476
  </documentation>
1471
1477
  </annotation>
1472
1478
  <sequence>
1473
1479
  <element name="UserDefined" type="pc:UserDefinedType"
1474
- minOccurs="0" maxOccurs="1">
1475
- </element>
1480
+ minOccurs="0" maxOccurs="1"/>
1476
1481
  <element name="Labels" type="pc:LabelsType"
1477
1482
  minOccurs="0" maxOccurs="unbounded">
1478
1483
  <annotation>
@@ -1495,15 +1500,23 @@
1495
1500
  </documentation>
1496
1501
  </annotation>
1497
1502
  </attribute>
1503
+ <attribute name="index" type="int" use="required">
1504
+ <annotation>
1505
+ <documentation>
1506
+ Position (order number) of this item within the
1507
+ current hierarchy level.
1508
+ </documentation>
1509
+ </annotation>
1510
+ </attribute>
1498
1511
  <attribute name="caption" type="string"/>
1499
1512
  <attribute name="type" type="pc:GroupTypeSimpleType"/>
1500
1513
  <attribute name="continuation" type="boolean">
1501
1514
  <annotation>
1502
1515
  <documentation>
1503
- Is this group a continuation of another group
1504
- (from previous column or page, for example)?
1516
+ Is this group a continuation of another group (from
1517
+ previous column or page, for example)?
1505
1518
  </documentation>
1506
- </annotation>
1519
+ </annotation>
1507
1520
  </attribute>
1508
1521
  <attribute name="custom" type="string">
1509
1522
  <annotation>
@@ -1512,10 +1525,13 @@
1512
1525
  </attribute>
1513
1526
  <attribute name="comments" type="string"/>
1514
1527
  </complexType>
1515
- <complexType name="UnorderedGroupType">
1528
+ <complexType name="UnorderedGroupIndexedType">
1516
1529
  <annotation>
1517
1530
  <documentation>
1518
- Numbered group (contains unordered elements)
1531
+ Group containing unordered elements within an OrderedGroup(Indexed).
1532
+ Elements need not be sorted, and may be mixed by type. (That is,
1533
+ the sequence of OrderedGroup, UnorderedGroup or RegionRef elements
1534
+ may be ordered arbitrarily.)
1519
1535
  </documentation>
1520
1536
  </annotation>
1521
1537
  <sequence>
@@ -1544,6 +1560,14 @@
1544
1560
  </documentation>
1545
1561
  </annotation>
1546
1562
  </attribute>
1563
+ <attribute name="index" type="int" use="required">
1564
+ <annotation>
1565
+ <documentation>
1566
+ Position (order number) of this item within the
1567
+ current hierarchy level.
1568
+ </documentation>
1569
+ </annotation>
1570
+ </attribute>
1547
1571
  <attribute name="caption" type="string"/>
1548
1572
  <attribute name="type" type="pc:GroupTypeSimpleType"/>
1549
1573
  <attribute name="continuation" type="boolean">
@@ -2325,6 +2349,7 @@
2325
2349
  <element name="TableRegion" type="pc:TableRegionType"/>
2326
2350
  <element name="ChartRegion" type="pc:ChartRegionType"/>
2327
2351
  <element name="SeparatorRegion" type="pc:SeparatorRegionType"/>
2352
+ <element name="MapRegion" type="pc:MapRegionType"/>
2328
2353
  <element name="MathsRegion" type="pc:MathsRegionType"/>
2329
2354
  <element name="ChemRegion" type="pc:ChemRegionType"/>
2330
2355
  <element name="MusicRegion" type="pc:MusicRegionType"/>