ocrd 3.5.1__py3-none-any.whl → 3.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93) hide show
  1. ocrd/cli/__init__.py +8 -6
  2. ocrd/cli/bashlib.py +8 -114
  3. ocrd/cli/network.py +0 -2
  4. ocrd/cli/ocrd_tool.py +26 -4
  5. ocrd/cli/process.py +1 -0
  6. ocrd/cli/resmgr.py +0 -1
  7. ocrd/cli/validate.py +32 -13
  8. ocrd/cli/workspace.py +125 -52
  9. ocrd/cli/zip.py +13 -4
  10. ocrd/decorators/__init__.py +28 -52
  11. ocrd/decorators/loglevel_option.py +4 -0
  12. ocrd/decorators/mets_find_options.py +2 -1
  13. ocrd/decorators/ocrd_cli_options.py +3 -7
  14. ocrd/decorators/parameter_option.py +12 -11
  15. ocrd/mets_server.py +11 -15
  16. ocrd/processor/base.py +88 -71
  17. ocrd/processor/builtin/dummy_processor.py +7 -4
  18. ocrd/processor/builtin/filter_processor.py +3 -2
  19. ocrd/processor/helpers.py +5 -6
  20. ocrd/processor/ocrd_page_result.py +7 -5
  21. ocrd/resolver.py +42 -32
  22. ocrd/task_sequence.py +11 -4
  23. ocrd/workspace.py +64 -54
  24. ocrd/workspace_backup.py +3 -0
  25. ocrd/workspace_bagger.py +15 -8
  26. {ocrd-3.5.1.dist-info → ocrd-3.7.0.dist-info}/METADATA +2 -8
  27. ocrd-3.7.0.dist-info/RECORD +123 -0
  28. ocrd_modelfactory/__init__.py +4 -2
  29. ocrd_models/constants.py +18 -1
  30. ocrd_models/ocrd_agent.py +1 -1
  31. ocrd_models/ocrd_exif.py +7 -3
  32. ocrd_models/ocrd_file.py +24 -19
  33. ocrd_models/ocrd_mets.py +90 -67
  34. ocrd_models/ocrd_page.py +17 -13
  35. ocrd_models/ocrd_xml_base.py +1 -0
  36. ocrd_models/report.py +2 -1
  37. ocrd_models/utils.py +4 -3
  38. ocrd_models/xpath_functions.py +3 -1
  39. ocrd_network/__init__.py +1 -2
  40. ocrd_network/cli/__init__.py +0 -2
  41. ocrd_network/cli/client.py +122 -50
  42. ocrd_network/cli/processing_server.py +1 -2
  43. ocrd_network/client.py +2 -2
  44. ocrd_network/client_utils.py +30 -13
  45. ocrd_network/constants.py +1 -6
  46. ocrd_network/database.py +3 -3
  47. ocrd_network/logging_utils.py +2 -7
  48. ocrd_network/models/__init__.py +0 -2
  49. ocrd_network/models/job.py +31 -33
  50. ocrd_network/models/messages.py +3 -2
  51. ocrd_network/models/workspace.py +5 -5
  52. ocrd_network/process_helpers.py +54 -17
  53. ocrd_network/processing_server.py +63 -114
  54. ocrd_network/processing_worker.py +6 -5
  55. ocrd_network/rabbitmq_utils/__init__.py +2 -0
  56. ocrd_network/rabbitmq_utils/helpers.py +24 -7
  57. ocrd_network/runtime_data/__init__.py +1 -2
  58. ocrd_network/runtime_data/deployer.py +12 -85
  59. ocrd_network/runtime_data/hosts.py +61 -130
  60. ocrd_network/runtime_data/network_agents.py +7 -31
  61. ocrd_network/runtime_data/network_services.py +1 -1
  62. ocrd_network/server_cache.py +1 -1
  63. ocrd_network/server_utils.py +13 -52
  64. ocrd_network/utils.py +1 -0
  65. ocrd_utils/__init__.py +4 -4
  66. ocrd_utils/config.py +86 -76
  67. ocrd_utils/deprecate.py +3 -0
  68. ocrd_utils/image.py +51 -23
  69. ocrd_utils/introspect.py +8 -3
  70. ocrd_utils/logging.py +15 -7
  71. ocrd_utils/os.py +17 -4
  72. ocrd_utils/str.py +32 -16
  73. ocrd_validators/json_validator.py +4 -1
  74. ocrd_validators/ocrd_tool_validator.py +2 -1
  75. ocrd_validators/ocrd_zip_validator.py +5 -4
  76. ocrd_validators/page_validator.py +21 -9
  77. ocrd_validators/parameter_validator.py +3 -2
  78. ocrd_validators/processing_server_config.schema.yml +1 -33
  79. ocrd_validators/resource_list_validator.py +3 -1
  80. ocrd_validators/workspace_validator.py +30 -20
  81. ocrd_validators/xsd_mets_validator.py +2 -1
  82. ocrd_validators/xsd_page_validator.py +2 -1
  83. ocrd_validators/xsd_validator.py +4 -2
  84. ocrd/cli/log.py +0 -51
  85. ocrd/lib.bash +0 -317
  86. ocrd-3.5.1.dist-info/RECORD +0 -128
  87. ocrd_network/cli/processor_server.py +0 -31
  88. ocrd_network/models/ocrd_tool.py +0 -12
  89. ocrd_network/processor_server.py +0 -255
  90. {ocrd-3.5.1.dist-info → ocrd-3.7.0.dist-info}/LICENSE +0 -0
  91. {ocrd-3.5.1.dist-info → ocrd-3.7.0.dist-info}/WHEEL +0 -0
  92. {ocrd-3.5.1.dist-info → ocrd-3.7.0.dist-info}/entry_points.txt +0 -0
  93. {ocrd-3.5.1.dist-info → ocrd-3.7.0.dist-info}/top_level.txt +0 -0
ocrd_models/ocrd_mets.py CHANGED
@@ -3,7 +3,6 @@ API to METS
3
3
  """
4
4
  from datetime import datetime
5
5
  import re
6
- from lxml import etree as ET
7
6
  from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Union
8
7
 
9
8
  from ocrd_utils import (
@@ -37,44 +36,45 @@ from .constants import (
37
36
  METS_DIV_ATTRIBUTE_REGEX_PATTERN,
38
37
  )
39
38
 
40
- from .ocrd_xml_base import OcrdXmlDocument, ET # type: ignore
39
+ from .ocrd_xml_base import OcrdXmlDocument, ET # type: ignore
41
40
  from .ocrd_file import OcrdFile
42
41
  from .ocrd_agent import OcrdAgent
43
42
 
44
43
  REGEX_PREFIX_LEN = len(REGEX_PREFIX)
45
44
 
45
+
46
46
  class OcrdMets(OcrdXmlDocument):
47
47
  """
48
48
  API to a single METS file
49
49
  """
50
- _cache_flag : bool
50
+ _cache_flag: bool
51
51
  # Cache for the physical pages (mets:div) - two nested dictionaries
52
52
  # The outer dictionary's key: attribute type
53
53
  # The outer dictionary's value: inner dictionary
54
54
  # The inner dictionary's key: attribute value (str)
55
55
  # The inner dictionary's value: a 'div' object at some memory location
56
- _page_cache : Dict[METS_PAGE_DIV_ATTRIBUTE, Dict[str, ET._Element]]
56
+ _page_cache: Dict[METS_PAGE_DIV_ATTRIBUTE, Dict[str, ET._Element]]
57
57
  # Cache for the files (mets:file) - two nested dictionaries
58
58
  # The outer dictionary's Key: 'fileGrp.USE'
59
59
  # The outer dictionary's Value: Inner dictionary
60
60
  # The inner dictionary's Key: 'file.ID'
61
61
  # The inner dictionary's Value: a 'file' object at some memory location
62
- _file_cache : Dict[str, Dict[str, ET._Element]]
62
+ _file_cache: Dict[str, Dict[str, ET._Element]]
63
63
  # Cache for the file pointers (mets:fptr) - two nested dictionaries
64
64
  # The outer dictionary's Key: 'div.ID'
65
65
  # The outer dictionary's Value: Inner dictionary
66
66
  # The inner dictionary's Key: 'fptr.FILEID'
67
67
  # The inner dictionary's Value: a 'fptr' object at some memory location
68
- _fptr_cache : Dict[str, Dict[str, ET._Element]]
68
+ _fptr_cache: Dict[str, Dict[str, ET._Element]]
69
69
  # Cache for the logical structural divs (mets:div) - two nested dictionaries
70
70
  # The outer dictionary's key: attribute type
71
71
  # The outer dictionary's value: inner dictionary
72
72
  # The inner dictionary's key: attribute value (str)
73
73
  # The inner dictionary's value: a list of corresponding physical div.ID
74
- _struct_cache : Dict[METS_STRUCT_DIV_ATTRIBUTE, Dict[str, List[str]]]
74
+ _struct_cache: Dict[METS_STRUCT_DIV_ATTRIBUTE, Dict[str, List[str]]]
75
75
 
76
76
  @staticmethod
77
- def empty_mets(now : Optional[str] = None, cache_flag : bool = False):
77
+ def empty_mets(now: Optional[str] = None, cache_flag: bool = False):
78
78
  """
79
79
  Create an empty METS file from bundled template.
80
80
  """
@@ -94,11 +94,11 @@ class OcrdMets(OcrdXmlDocument):
94
94
  # then enable caching, if "false", disable caching, overriding the
95
95
  # kwarg to the constructor
96
96
  if config.is_set('OCRD_METS_CACHING'):
97
- getLogger('ocrd.models.ocrd_mets').debug('METS Caching %s because OCRD_METS_CACHING is %s',
98
- 'enabled' if config.OCRD_METS_CACHING else 'disabled', config.raw_value('OCRD_METS_CACHING'))
97
+ getLogger('ocrd.models.ocrd_mets').debug(
98
+ 'METS Caching %s because OCRD_METS_CACHING is %s',
99
+ 'enabled' if config.OCRD_METS_CACHING else 'disabled', config.raw_value('OCRD_METS_CACHING'))
99
100
  self._cache_flag = config.OCRD_METS_CACHING
100
101
 
101
-
102
102
  # If cache is enabled
103
103
  if self._cache_flag:
104
104
  self._initialize_caches()
@@ -109,7 +109,7 @@ class OcrdMets(OcrdXmlDocument):
109
109
  String representation
110
110
  """
111
111
  return 'OcrdMets[cached=%s,fileGrps=%s,files=%s]' % (
112
- self._cache_flag, self.file_groups, list(self.find_files()))
112
+ self._cache_flag, self.file_groups, list(self.find_files()))
113
113
 
114
114
  def _fill_caches(self) -> None:
115
115
  """
@@ -181,9 +181,9 @@ class OcrdMets(OcrdXmlDocument):
181
181
  def _initialize_caches(self) -> None:
182
182
  self._file_cache = {}
183
183
  # NOTE we can only guarantee uniqueness for @ID and @ORDER
184
- self._page_cache = {k : {} for k in METS_PAGE_DIV_ATTRIBUTE}
184
+ self._page_cache = {k: {} for k in METS_PAGE_DIV_ATTRIBUTE}
185
185
  self._fptr_cache = {}
186
- self._struct_cache = {k : {} for k in METS_STRUCT_DIV_ATTRIBUTE}
186
+ self._struct_cache = {k: {} for k in METS_STRUCT_DIV_ATTRIBUTE}
187
187
 
188
188
  def _refresh_caches(self) -> None:
189
189
  if self._cache_flag:
@@ -205,7 +205,7 @@ class OcrdMets(OcrdXmlDocument):
205
205
  return found.text
206
206
 
207
207
  @unique_identifier.setter
208
- def unique_identifier(self, purl : str) -> None:
208
+ def unique_identifier(self, purl: str) -> None:
209
209
  """
210
210
  Set the unique identifier by looking through ``mods:identifier``
211
211
  See `specs <https://ocr-d.de/en/spec/mets#unique-id-for-the-document-processed>`_ for details.
@@ -268,15 +268,15 @@ class OcrdMets(OcrdXmlDocument):
268
268
  # pylint: disable=multiple-statements
269
269
  def find_files(
270
270
  self,
271
- ID : Optional[str] = None,
272
- fileGrp : Optional[str] = None,
273
- pageId : Optional[str] = None,
274
- mimetype : Optional[str] = None,
275
- url : Optional[str] = None,
276
- local_filename : Optional[str] = None,
277
- local_only : bool = False,
278
- include_fileGrp : Optional[List[str]] = None,
279
- exclude_fileGrp : Optional[List[str]] = None,
271
+ ID: Optional[str] = None,
272
+ fileGrp: Optional[str] = None,
273
+ pageId: Optional[str] = None,
274
+ mimetype: Optional[str] = None,
275
+ url: Optional[str] = None,
276
+ local_filename: Optional[str] = None,
277
+ local_only: bool = False,
278
+ include_fileGrp: Optional[List[str]] = None,
279
+ exclude_fileGrp: Optional[List[str]] = None,
280
280
  ) -> Iterator[OcrdFile]:
281
281
  """
282
282
  Search ``mets:file`` entries in this METS document and yield results.
@@ -346,24 +346,30 @@ class OcrdMets(OcrdXmlDocument):
346
346
  for cand in candidates:
347
347
  if ID:
348
348
  if isinstance(ID, str):
349
- if not ID == cand.get('ID'): continue
349
+ if not ID == cand.get('ID'):
350
+ continue
350
351
  else:
351
- if not ID.fullmatch(cand.get('ID')): continue
352
+ if not ID.fullmatch(cand.get('ID')):
353
+ continue
352
354
 
353
355
  if pageId is not None and cand.get('ID') not in pageId_list:
354
356
  continue
355
357
 
356
358
  if not self._cache_flag and fileGrp:
357
359
  if isinstance(fileGrp, str):
358
- if cand.getparent().get('USE') != fileGrp: continue
360
+ if cand.getparent().get('USE') != fileGrp:
361
+ continue
359
362
  else:
360
- if not fileGrp.fullmatch(cand.getparent().get('USE')): continue
363
+ if not fileGrp.fullmatch(cand.getparent().get('USE')):
364
+ continue
361
365
 
362
366
  if mimetype:
363
367
  if isinstance(mimetype, str):
364
- if cand.get('MIMETYPE') != mimetype: continue
368
+ if cand.get('MIMETYPE') != mimetype:
369
+ continue
365
370
  else:
366
- if not mimetype.fullmatch(cand.get('MIMETYPE') or ''): continue
371
+ if not mimetype.fullmatch(cand.get('MIMETYPE') or ''):
372
+ continue
367
373
 
368
374
  if url:
369
375
  cand_locat = cand.find('mets:FLocat[@LOCTYPE="URL"]', namespaces=NS)
@@ -371,9 +377,11 @@ class OcrdMets(OcrdXmlDocument):
371
377
  continue
372
378
  cand_url = cand_locat.get('{%s}href' % NS['xlink'])
373
379
  if isinstance(url, str):
374
- if cand_url != url: continue
380
+ if cand_url != url:
381
+ continue
375
382
  else:
376
- if not url.fullmatch(cand_url): continue
383
+ if not url.fullmatch(cand_url):
384
+ continue
377
385
 
378
386
  if local_filename:
379
387
  cand_locat = cand.find('mets:FLocat[@LOCTYPE="OTHER"][@OTHERLOCTYPE="FILE"]', namespaces=NS)
@@ -381,9 +389,11 @@ class OcrdMets(OcrdXmlDocument):
381
389
  continue
382
390
  cand_local_filename = cand_locat.get('{%s}href' % NS['xlink'])
383
391
  if isinstance(local_filename, str):
384
- if cand_local_filename != local_filename: continue
392
+ if cand_local_filename != local_filename:
393
+ continue
385
394
  else:
386
- if not local_filename.fullmatch(cand_local_filename): continue
395
+ if not local_filename.fullmatch(cand_local_filename):
396
+ continue
387
397
 
388
398
  if local_only:
389
399
  # deprecation_warning("'local_only' is deprecated, use 'local_filename=\"//.+\"' instead")
@@ -435,7 +445,7 @@ class OcrdMets(OcrdXmlDocument):
435
445
  if self._cache_flag:
436
446
  self._file_cache[new] = self._file_cache.pop(old)
437
447
 
438
- def remove_file_group(self, USE: str, recursive : bool = False, force : bool = False) -> None:
448
+ def remove_file_group(self, USE: str, recursive: bool = False, force: bool = False) -> None:
439
449
  """
440
450
  Remove a ``mets:fileGrp`` (single fixed ``@USE`` or multiple regex ``@USE``)
441
451
  Arguments:
@@ -479,16 +489,16 @@ class OcrdMets(OcrdXmlDocument):
479
489
 
480
490
  if self._cache_flag:
481
491
  # Note: Since the files inside the group are removed
482
- # with the 'remove_one_file' method above,
492
+ # with the 'remove_one_file' method above,
483
493
  # we should not take care of that again.
484
494
  # We just remove the fileGrp.
485
495
  del self._file_cache[el_fileGrp.get('USE')]
486
496
 
487
497
  el_fileGrp.getparent().remove(el_fileGrp)
488
498
 
489
- def add_file(self, fileGrp : str, mimetype : Optional[str] = None, url : Optional[str] = None,
490
- ID : Optional[str] = None, pageId : Optional[str] = None, force : bool = False,
491
- local_filename : Optional[str] = None, ignore : bool = False, **kwargs) -> OcrdFile:
499
+ def add_file(self, fileGrp: str, mimetype: Optional[str] = None, url: Optional[str] = None,
500
+ ID: Optional[str] = None, pageId: Optional[str] = None, force: bool = False,
501
+ local_filename: Optional[str] = None, ignore: bool = False, **kwargs) -> OcrdFile:
492
502
  """
493
503
  Instantiate and add a new :py:class:`ocrd_models.ocrd_file.OcrdFile`.
494
504
  Arguments:
@@ -499,7 +509,8 @@ class OcrdMets(OcrdXmlDocument):
499
509
  ID (string): ``@ID`` of the ``mets:file`` to use
500
510
  pageId (string): ``@ID`` in the physical ``mets:structMap`` to link to
501
511
  force (boolean): Whether to add the file even if a ``mets:file`` with the same ``@ID`` already exists.
502
- ignore (boolean): Do not look for existing files at all. Shift responsibility for preventing errors from duplicate ID to the user.
512
+ ignore (boolean): Do not look for existing files at all.
513
+ (Shifts responsibility for preventing errors from duplicate ID to the user.)
503
514
  local_filename (string):
504
515
  """
505
516
  if not ID:
@@ -541,7 +552,7 @@ class OcrdMets(OcrdXmlDocument):
541
552
 
542
553
  return mets_file
543
554
 
544
- def remove_file(self, *args, **kwargs) -> Union[List[OcrdFile],OcrdFile]:
555
+ def remove_file(self, *args, **kwargs) -> Union[List[OcrdFile], OcrdFile]:
545
556
  """
546
557
  Delete each ``ocrd:file`` matching the query. Same arguments as :py:meth:`find_files`
547
558
  """
@@ -559,12 +570,14 @@ class OcrdMets(OcrdXmlDocument):
559
570
  return []
560
571
  raise FileNotFoundError("File not found: %s %s" % (args, kwargs))
561
572
 
562
- def remove_one_file(self, ID : Union[str, OcrdFile], fileGrp : str = None) -> OcrdFile:
573
+ def remove_one_file(self, ID: Union[str, OcrdFile], fileGrp: str = None) -> OcrdFile:
563
574
  """
564
575
  Delete an existing :py:class:`ocrd_models.ocrd_file.OcrdFile`.
565
576
  Arguments:
566
- ID (string|OcrdFile): ``@ID`` of the ``mets:file`` to delete Can also be an :py:class:`ocrd_models.ocrd_file.OcrdFile` to avoid search via ``ID``.
567
- fileGrp (string): ``@USE`` of the ``mets:fileGrp`` containing the ``mets:file``. Used only for optimization.
577
+ ID (string|OcrdFile): ``@ID`` of the ``mets:file`` to delete.
578
+ (Can also be an :py:class:`ocrd_models.ocrd_file.OcrdFile` to avoid search via ``ID``.)
579
+ fileGrp (string): ``@USE`` of the ``mets:fileGrp`` containing the ``mets:file``.
580
+ (Used only for optimization.)
568
581
  Returns:
569
582
  The old :py:class:`ocrd_models.ocrd_file.OcrdFile` reference.
570
583
  """
@@ -629,8 +642,8 @@ class OcrdMets(OcrdXmlDocument):
629
642
  'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/@ID',
630
643
  namespaces=NS)]
631
644
 
632
- def get_physical_pages(self, for_fileIds : Optional[List[str]] = None, for_pageIds : Optional[str] = None,
633
- return_divs : bool = False) -> List[Union[str, ET._Element]]:
645
+ def get_physical_pages(self, for_fileIds: Optional[List[str]] = None, for_pageIds: Optional[str] = None,
646
+ return_divs: bool = False) -> List[Union[str, ET._Element]]:
634
647
  """
635
648
  List all page IDs (the ``@ID`` of each physical ``mets:structMap`` ``mets:div``),
636
649
  optionally for a subset of ``mets:file`` ``@ID`` :py:attr:`for_fileIds`,
@@ -718,7 +731,7 @@ class OcrdMets(OcrdXmlDocument):
718
731
 
719
732
  if for_fileIds == []:
720
733
  return []
721
- assert for_fileIds # at this point we know for_fileIds is set, assert to convince pyright
734
+ assert for_fileIds # at this point we know for_fileIds is set, assert to convince pyright
722
735
  ret = [None] * len(for_fileIds)
723
736
  if self._cache_flag:
724
737
  for pageId, fptrdict in self._fptr_cache.items():
@@ -793,7 +806,6 @@ class OcrdMets(OcrdXmlDocument):
793
806
  val = struct_cache[attr].setdefault(str(el_div.get(attr.name)), list())
794
807
  val.extend(smlink_map.get(el_div.get('ID'), []))
795
808
  log.debug("found %d smLink entries for %d logical divs", len(el_smlink_list), len(el_struct_list))
796
- page_attr_patterns_matched = []
797
809
  for page in self._tree.getroot().xpath(
798
810
  'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]',
799
811
  namespaces=NS):
@@ -811,7 +823,7 @@ class OcrdMets(OcrdXmlDocument):
811
823
  METS_STRUCT_DIV_ATTRIBUTE.LABEL]):
812
824
  continue
813
825
  if cache_keys := [v for v in cache if pat.matches(v)]:
814
- pat.attr = [attr] # disambiguate next
826
+ pat.attr = [attr] # disambiguate next
815
827
  if isinstance(attr, METS_PAGE_DIV_ATTRIBUTE):
816
828
  ret.append(page)
817
829
  log.debug('physical match for %s on page %s', pat, page.get('ID'))
@@ -835,7 +847,7 @@ class OcrdMets(OcrdXmlDocument):
835
847
  pat.expr.remove(cache_key)
836
848
  if not pat.expr:
837
849
  patterns_exhausted.append(pat)
838
- break # no more attributes for this pattern
850
+ break # no more attributes for this pattern
839
851
  # keep matching in order to exhaust and consume pattern list
840
852
  #if page in ret:
841
853
  # break # no more patterns for this page
@@ -847,7 +859,7 @@ class OcrdMets(OcrdXmlDocument):
847
859
  raise ValueError(f"Patterns {unmatched} match none of the pages")
848
860
 
849
861
  ranges_without_start_match = []
850
- ranges_without_stop_match = []
862
+ # ranges_without_stop_match = []
851
863
  for pat in page_attr_patterns_copy:
852
864
  if isinstance(pat, METS_DIV_ATTRIBUTE_RANGE_PATTERN):
853
865
  # range expression, expanded to pattern list
@@ -865,8 +877,8 @@ class OcrdMets(OcrdXmlDocument):
865
877
  # raise ValueError(f"End of range patterns {ranges_without_stop_match} not matched - invalid range")
866
878
  return ret
867
879
 
868
- def set_physical_page_for_file(self, pageId : str, ocrd_file : OcrdFile,
869
- order : Optional[str] = None, orderlabel : Optional[str] = None) -> None:
880
+ def set_physical_page_for_file(self, pageId: str, ocrd_file: OcrdFile,
881
+ order: Optional[str] = None, orderlabel: Optional[str] = None) -> None:
870
882
  """
871
883
  Set the physical page ID (``@ID`` of the physical ``mets:structMap`` ``mets:div`` entry)
872
884
  corresponding to the ``mets:file`` :py:attr:`ocrd_file`, creating all structures if necessary.
@@ -887,7 +899,10 @@ class OcrdMets(OcrdXmlDocument):
887
899
  fptrs.append(fptrdict[ocrd_file.ID])
888
900
  else:
889
901
  fptrs = self._tree.getroot().findall(
890
- 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/mets:fptr[@FILEID="%s"]' %
902
+ 'mets:structMap[@TYPE="PHYSICAL"]/'
903
+ 'mets:div[@TYPE="physSequence"]/'
904
+ 'mets:div[@TYPE="page"]/'
905
+ 'mets:fptr[@FILEID="%s"]' %
891
906
  ocrd_file.ID, namespaces=NS)
892
907
 
893
908
  for el_fptr in fptrs:
@@ -923,7 +938,7 @@ class OcrdMets(OcrdXmlDocument):
923
938
  if self._cache_flag:
924
939
  # Create a new entry in the page cache
925
940
  self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID][pageId] = el_pagediv
926
- # Create a new entry in the fptr cache and
941
+ # Create a new entry in the fptr cache and
927
942
  # assign an empty dictionary to hold the fileids
928
943
  self._fptr_cache.setdefault(pageId, {})
929
944
 
@@ -934,7 +949,7 @@ class OcrdMets(OcrdXmlDocument):
934
949
  # Assign the ocrd fileID to the pageId in the cache
935
950
  self._fptr_cache[pageId].update({ocrd_file.ID: el_fptr})
936
951
 
937
- def update_physical_page_attributes(self, page_id : str, **kwargs) -> None:
952
+ def update_physical_page_attributes(self, page_id: str, **kwargs) -> None:
938
953
  invalid_keys = list(k for k in kwargs if k not in METS_PAGE_DIV_ATTRIBUTE.names())
939
954
  if invalid_keys:
940
955
  raise ValueError(f"Invalid attribute {invalid_keys}. Allowed values: {METS_PAGE_DIV_ATTRIBUTE.names()}")
@@ -950,7 +965,7 @@ class OcrdMets(OcrdXmlDocument):
950
965
  else:
951
966
  page_div.attrib[k] = v
952
967
 
953
- def get_physical_page_for_file(self, ocrd_file : OcrdFile) -> Optional[str]:
968
+ def get_physical_page_for_file(self, ocrd_file: OcrdFile) -> Optional[str]:
954
969
  """
955
970
  Get the physical page ID (``@ID`` of the physical ``mets:structMap`` ``mets:div`` entry)
956
971
  corresponding to the ``mets:file`` :py:attr:`ocrd_file`.
@@ -961,12 +976,15 @@ class OcrdMets(OcrdXmlDocument):
961
976
  return pageId
962
977
  else:
963
978
  ret = self._tree.getroot().find(
964
- 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/mets:fptr[@FILEID="%s"]' %
979
+ 'mets:structMap[@TYPE="PHYSICAL"]/'
980
+ 'mets:div[@TYPE="physSequence"]/'
981
+ 'mets:div[@TYPE="page"]/'
982
+ 'mets:fptr[@FILEID="%s"]' %
965
983
  ocrd_file.ID, namespaces=NS)
966
984
  if ret is not None:
967
985
  return ret.getparent().get('ID')
968
986
 
969
- def remove_physical_page(self, ID : str) -> None:
987
+ def remove_physical_page(self, ID: str) -> None:
970
988
  """
971
989
  Delete page (physical ``mets:structMap`` ``mets:div`` entry ``@ID``) :py:attr:`ID`.
972
990
  """
@@ -987,9 +1005,11 @@ class OcrdMets(OcrdXmlDocument):
987
1005
  del self._page_cache[attr][mets_div_attrib[attr.name]]
988
1006
  del self._fptr_cache[ID]
989
1007
 
990
- def remove_physical_page_fptr(self, fileId : str) -> List[str]:
1008
+ def remove_physical_page_fptr(self, fileId: str) -> List[str]:
991
1009
  """
992
- Delete all ``mets:fptr[@FILEID = fileId]`` to ``mets:file[@ID == fileId]`` for :py:attr:`fileId` from all ``mets:div`` entries in the physical ``mets:structMap``.
1010
+ Delete all ``mets:fptr[@FILEID = fileId]`` to ``mets:file[@ID == fileId]``
1011
+ for :py:attr:`fileId` from all ``mets:div`` entries in the physical ``mets:structMap``.
1012
+
993
1013
  Returns:
994
1014
  List of pageIds that mets:fptrs were deleted from
995
1015
  """
@@ -1006,7 +1026,10 @@ class OcrdMets(OcrdXmlDocument):
1006
1026
  mets_fptrs.append(fptrdict[fileId])
1007
1027
  else:
1008
1028
  mets_fptrs = self._tree.getroot().xpath(
1009
- 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/mets:fptr[@FILEID="%s"]' % fileId,
1029
+ 'mets:structMap[@TYPE="PHYSICAL"]/'
1030
+ 'mets:div[@TYPE="physSequence"]/'
1031
+ 'mets:div[@TYPE="page"]/'
1032
+ 'mets:fptr[@FILEID="%s"]' % fileId,
1010
1033
  namespaces=NS)
1011
1034
  ret = []
1012
1035
  for mets_fptr in mets_fptrs:
@@ -1029,11 +1052,11 @@ class OcrdMets(OcrdXmlDocument):
1029
1052
  return {div.get('ID'): (div.get('ORDER', None), div.get('ORDERLABEL', None), div.get('LABEL', None))
1030
1053
  for div in divs}
1031
1054
 
1032
- def merge(self, other_mets, force : bool = False,
1033
- fileGrp_mapping : Optional[Dict[str, str]] = None,
1034
- fileId_mapping : Optional[Dict[str, str]] = None,
1035
- pageId_mapping : Optional[Dict[str, str]] = None,
1036
- after_add_cb : Optional[Callable[[OcrdFile], Any]] = None, **kwargs) -> None:
1055
+ def merge(self, other_mets, force: bool = False,
1056
+ fileGrp_mapping: Optional[Dict[str, str]] = None,
1057
+ fileId_mapping: Optional[Dict[str, str]] = None,
1058
+ pageId_mapping: Optional[Dict[str, str]] = None,
1059
+ after_add_cb: Optional[Callable[[OcrdFile], Any]] = None, **kwargs) -> None:
1037
1060
  """
1038
1061
  Add all files from other_mets.
1039
1062
  Accepts the same kwargs as :py:func:`find_files`
ocrd_models/ocrd_page.py CHANGED
@@ -179,6 +179,7 @@ parseString.__doc__ = (
179
179
  """
180
180
  )
181
181
 
182
+
182
183
  class OcrdPage():
183
184
  """
184
185
  Proxy object for :py:class:`ocrd_models.PcGtsType` (i.e. PRImA PAGE-XML
@@ -188,10 +189,10 @@ class OcrdPage():
188
189
  """
189
190
  def __init__(
190
191
  self,
191
- pcgts : PcGtsType,
192
- etree : ET._Element,
193
- mapping : Dict[str, ET._Element],
194
- revmap : Dict[ET._Element, Any],
192
+ pcgts: PcGtsType,
193
+ etree: ET._Element,
194
+ mapping: Dict[str, ET._Element],
195
+ revmap: Dict[ET._Element, Any],
195
196
  ):
196
197
  self._pcgts = pcgts
197
198
  self.etree = etree
@@ -214,8 +215,10 @@ class OcrdPage():
214
215
  def __getattr__(self, name):
215
216
  return getattr(self._pcgts, name)
216
217
 
218
+
217
219
  OcrdPageType = Union[OcrdPage, PcGtsType]
218
220
 
221
+
219
222
  def to_xml(el, skip_declaration=False) -> str:
220
223
  """
221
224
  Serialize ``pc:PcGts`` document as string.
@@ -229,15 +232,16 @@ def to_xml(el, skip_declaration=False) -> str:
229
232
  name = 'PcGts'
230
233
  sio = StringIO()
231
234
  el.export(
232
- outfile=sio,
233
- level=0,
234
- name_=name,
235
- namespaceprefix_='pc:',
236
- namespacedef_='xmlns:pc="%s" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="%s %s/pagecontent.xsd"' % (
237
- NAMESPACES['page'],
238
- NAMESPACES['page'],
239
- NAMESPACES['page']
240
- ))
235
+ outfile=sio,
236
+ level=0,
237
+ name_=name,
238
+ namespaceprefix_='pc:',
239
+ namespacedef_='xmlns:pc="%s" ' % NAMESPACES['page'] +
240
+ 'xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" ' +
241
+ 'xsi:schemaLocation="%s %s/pagecontent.xsd"' % (
242
+ NAMESPACES['page'],
243
+ NAMESPACES['page']
244
+ ))
241
245
  ret = sio.getvalue()
242
246
  if not skip_declaration:
243
247
  ret = '<?xml version="1.0" encoding="UTF-8"?>\n' + ret
@@ -11,6 +11,7 @@ from .utils import xmllint_format
11
11
  for curie, url in NAMESPACES.items():
12
12
  ET.register_namespace(curie, url)
13
13
 
14
+
14
15
  class OcrdXmlDocument():
15
16
  """
16
17
  Base class for XML documents loaded from either content or filename.
ocrd_models/report.py CHANGED
@@ -7,7 +7,8 @@ __all__ = ['ValidationReport']
7
7
  # -------------------------------------------------
8
8
  #
9
9
 
10
- class ValidationReport(object):
10
+
11
+ class ValidationReport():
11
12
  """
12
13
  Container of notices, warnings and errors about a workspace.
13
14
  """
ocrd_models/utils.py CHANGED
@@ -13,6 +13,7 @@ __all__ = [
13
13
  'extract_mets_from_oai_content'
14
14
  ]
15
15
 
16
+
16
17
  def xmllint_format(xml):
17
18
  """
18
19
  Pretty-print XML like ``xmllint`` does.
@@ -25,6 +26,7 @@ def xmllint_format(xml):
25
26
  return ('%s\n%s' % ('<?xml version="1.0" encoding="UTF-8"?>',
26
27
  ET.tostring(document, pretty_print=True, encoding='UTF-8').decode('utf-8'))).encode('utf-8')
27
28
 
29
+
28
30
  def handle_oai_response(response):
29
31
  """
30
32
  In case of a valid OAI-Response, extract first METS-Entry-Data
@@ -62,9 +64,8 @@ def extract_mets_from_oai_content(data, preamble='<?xml version="1.0" encoding="
62
64
  mets_root_el = xml_root.find('.//{%s}mets' % NS['mets'])
63
65
  if mets_root_el is not None:
64
66
  new_tree = ET.ElementTree(mets_root_el)
65
- xml_formatted = ET.tostring(new_tree,
66
- pretty_print=True,
67
- encoding='UTF-8').decode('UTF-8')
67
+ xml_formatted = ET.tostring(
68
+ new_tree, pretty_print=True, encoding='UTF-8').decode('UTF-8')
68
69
  formatted_content = '{}\n{}'.format(preamble, xml_formatted)
69
70
  return formatted_content.encode('UTF-8').replace(b'\n', b'\r\n')
70
71
 
@@ -2,10 +2,12 @@ from ocrd_utils import xywh_from_points
2
2
 
3
3
  pc_functions = []
4
4
 
5
+
5
6
  def _export(func):
6
7
  pc_functions.append(func)
7
8
  return func
8
9
 
10
+
9
11
  @_export
10
12
  def pc_pixelarea(nodes):
11
13
  """
@@ -24,6 +26,7 @@ def pc_pixelarea(nodes):
24
26
  area += xywh['w'] * xywh['h']
25
27
  return area
26
28
 
29
+
27
30
  @_export
28
31
  def pc_textequiv(nodes):
29
32
  """
@@ -48,4 +51,3 @@ def pc_textequiv(nodes):
48
51
  continue
49
52
  text += str(string.text)
50
53
  return text
51
-
ocrd_network/__init__.py CHANGED
@@ -1,7 +1,6 @@
1
1
  from .client import Client
2
- from .constants import AgentType, JobState
2
+ from .constants import JobState
3
3
  from .processing_server import ProcessingServer
4
4
  from .processing_worker import ProcessingWorker
5
- from .processor_server import ProcessorServer
6
5
  from .param_validators import DatabaseParamType, ServerAddressParamType, QueueServerParamType
7
6
  from .server_cache import CacheLockedPages, CacheProcessingRequests
@@ -1,11 +1,9 @@
1
1
  from .client import client_cli
2
2
  from .processing_server import processing_server_cli
3
3
  from .processing_worker import processing_worker_cli
4
- from .processor_server import processor_server_cli
5
4
 
6
5
  __all__ = [
7
6
  'client_cli',
8
7
  'processing_server_cli',
9
8
  'processing_worker_cli',
10
- 'processor_server_cli'
11
9
  ]