ocrd 3.5.0__py3-none-any.whl → 3.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. ocrd/cli/__init__.py +6 -2
  2. ocrd/cli/bashlib.py +7 -2
  3. ocrd/cli/log.py +7 -2
  4. ocrd/cli/network.py +0 -2
  5. ocrd/cli/ocrd_tool.py +26 -4
  6. ocrd/cli/process.py +1 -0
  7. ocrd/cli/resmgr.py +0 -1
  8. ocrd/cli/validate.py +32 -13
  9. ocrd/cli/workspace.py +125 -52
  10. ocrd/cli/zip.py +13 -4
  11. ocrd/decorators/__init__.py +28 -52
  12. ocrd/decorators/loglevel_option.py +4 -0
  13. ocrd/decorators/mets_find_options.py +2 -1
  14. ocrd/decorators/ocrd_cli_options.py +3 -7
  15. ocrd/decorators/parameter_option.py +12 -11
  16. ocrd/lib.bash +6 -13
  17. ocrd/mets_server.py +6 -10
  18. ocrd/processor/base.py +88 -71
  19. ocrd/processor/builtin/dummy_processor.py +7 -4
  20. ocrd/processor/builtin/filter_processor.py +3 -2
  21. ocrd/processor/helpers.py +5 -6
  22. ocrd/processor/ocrd_page_result.py +7 -5
  23. ocrd/resolver.py +42 -32
  24. ocrd/task_sequence.py +11 -4
  25. ocrd/workspace.py +64 -54
  26. ocrd/workspace_backup.py +3 -0
  27. ocrd/workspace_bagger.py +15 -8
  28. {ocrd-3.5.0.dist-info → ocrd-3.6.0.dist-info}/METADATA +3 -2
  29. ocrd-3.6.0.dist-info/RECORD +125 -0
  30. ocrd_modelfactory/__init__.py +4 -2
  31. ocrd_models/constants.py +18 -1
  32. ocrd_models/ocrd_agent.py +1 -1
  33. ocrd_models/ocrd_exif.py +7 -3
  34. ocrd_models/ocrd_file.py +24 -19
  35. ocrd_models/ocrd_mets.py +90 -67
  36. ocrd_models/ocrd_page.py +17 -13
  37. ocrd_models/ocrd_xml_base.py +1 -0
  38. ocrd_models/report.py +2 -1
  39. ocrd_models/utils.py +4 -3
  40. ocrd_models/xpath_functions.py +3 -1
  41. ocrd_network/__init__.py +1 -2
  42. ocrd_network/cli/__init__.py +0 -2
  43. ocrd_network/cli/client.py +122 -50
  44. ocrd_network/cli/processing_server.py +1 -2
  45. ocrd_network/client.py +2 -2
  46. ocrd_network/client_utils.py +30 -13
  47. ocrd_network/constants.py +1 -6
  48. ocrd_network/database.py +3 -3
  49. ocrd_network/logging_utils.py +2 -7
  50. ocrd_network/models/__init__.py +0 -2
  51. ocrd_network/models/job.py +2 -5
  52. ocrd_network/models/workspace.py +1 -1
  53. ocrd_network/process_helpers.py +54 -17
  54. ocrd_network/processing_server.py +63 -114
  55. ocrd_network/processing_worker.py +6 -5
  56. ocrd_network/rabbitmq_utils/__init__.py +2 -0
  57. ocrd_network/rabbitmq_utils/helpers.py +24 -7
  58. ocrd_network/runtime_data/__init__.py +1 -2
  59. ocrd_network/runtime_data/deployer.py +12 -85
  60. ocrd_network/runtime_data/hosts.py +61 -130
  61. ocrd_network/runtime_data/network_agents.py +7 -31
  62. ocrd_network/runtime_data/network_services.py +1 -1
  63. ocrd_network/server_cache.py +1 -1
  64. ocrd_network/server_utils.py +13 -52
  65. ocrd_network/utils.py +1 -0
  66. ocrd_utils/__init__.py +4 -4
  67. ocrd_utils/config.py +86 -76
  68. ocrd_utils/deprecate.py +3 -0
  69. ocrd_utils/image.py +51 -23
  70. ocrd_utils/introspect.py +8 -3
  71. ocrd_utils/logging.py +12 -7
  72. ocrd_utils/os.py +16 -3
  73. ocrd_utils/str.py +32 -16
  74. ocrd_validators/json_validator.py +4 -1
  75. ocrd_validators/ocrd_tool_validator.py +2 -1
  76. ocrd_validators/ocrd_zip_validator.py +5 -4
  77. ocrd_validators/page_validator.py +21 -9
  78. ocrd_validators/parameter_validator.py +3 -2
  79. ocrd_validators/processing_server_config.schema.yml +1 -33
  80. ocrd_validators/resource_list_validator.py +3 -1
  81. ocrd_validators/workspace_validator.py +30 -20
  82. ocrd_validators/xsd_mets_validator.py +2 -1
  83. ocrd_validators/xsd_page_validator.py +2 -1
  84. ocrd_validators/xsd_validator.py +4 -2
  85. ocrd-3.5.0.dist-info/RECORD +0 -128
  86. ocrd_network/cli/processor_server.py +0 -31
  87. ocrd_network/models/ocrd_tool.py +0 -12
  88. ocrd_network/processor_server.py +0 -255
  89. {ocrd-3.5.0.dist-info → ocrd-3.6.0.dist-info}/LICENSE +0 -0
  90. {ocrd-3.5.0.dist-info → ocrd-3.6.0.dist-info}/WHEEL +0 -0
  91. {ocrd-3.5.0.dist-info → ocrd-3.6.0.dist-info}/entry_points.txt +0 -0
  92. {ocrd-3.5.0.dist-info → ocrd-3.6.0.dist-info}/top_level.txt +0 -0
ocrd_utils/str.py CHANGED
@@ -37,13 +37,14 @@ def assert_file_grp_cardinality(grps, n, msg=None):
37
37
  if isinstance(grps, str):
38
38
  grps = grps.split(',')
39
39
  assert len(grps) == n, \
40
- "Expected exactly %d output file group%s%s, but '%s' has %d" % (
41
- n,
42
- '' if n == 1 else 's',
43
- ' (%s)' % msg if msg else '',
44
- grps,
45
- len(grps)
46
- )
40
+ "Expected exactly %d output file group%s%s, but '%s' has %d" % (
41
+ n,
42
+ '' if n == 1 else 's',
43
+ ' (%s)' % msg if msg else '',
44
+ grps,
45
+ len(grps)
46
+ )
47
+
47
48
 
48
49
  def concat_padded(base, *args):
49
50
  """
@@ -54,18 +55,20 @@ def concat_padded(base, *args):
54
55
  if is_string(n):
55
56
  ret = "%s_%s" % (ret, n)
56
57
  else:
57
- ret = "%s_%04i" % (ret, n)
58
+ ret = "%s_%04i" % (ret, n)
58
59
  return ret
59
60
 
61
+
60
62
  def remove_non_path_from_url(url):
61
63
  """
62
64
  Remove everything from URL after path.
63
65
  """
64
- url = url.split('?', 1)[0] # query
65
- url = url.split('#', 1)[0] # fragment identifier
66
- url = re.sub(r"/+$", "", url) # trailing slashes
66
+ url = url.split('?', 1)[0] # query
67
+ url = url.split('#', 1)[0] # fragment identifier
68
+ url = re.sub(r"/+$", "", url) # trailing slashes
67
69
  return url
68
70
 
71
+
69
72
  def make_file_id(ocrd_file, output_file_grp):
70
73
  """
71
74
  Derive a new file ID for an output file from an existing input file ``ocrd_file``
@@ -101,9 +104,12 @@ def make_file_id(ocrd_file, output_file_grp):
101
104
  ret = output_file_grp + '_' + ocrd_file.ID
102
105
  return make_xml_id(ret)
103
106
 
107
+
104
108
  def make_xml_id(idstr: str) -> str:
105
109
  """
106
- Turn ``idstr`` into a valid ``xml:id`` literal by replacing ``:`` with ``_``, removing everything non-alphanumeric, ``.`` and ``-`` and prepending `id_` if ``idstr`` starts with a number.
110
+ Turn ``idstr`` into a valid ``xml:id`` literal by replacing ``:`` with ``_``,
111
+ removing everything non-alphanumeric, ``.`` and ``-`` and prepending `id_`
112
+ if ``idstr`` starts with a number.
107
113
  """
108
114
  ret = idstr
109
115
  if not REGEX_FILE_ID.fullmatch(ret):
@@ -113,6 +119,7 @@ def make_xml_id(idstr: str) -> str:
113
119
  ret = re.sub(r'[^\w.-]', r'', ret)
114
120
  return ret
115
121
 
122
+
116
123
  def nth_url_segment(url, n=-1):
117
124
  """
118
125
  Return the last /-delimited segment of a URL-like string
@@ -127,6 +134,7 @@ def nth_url_segment(url, n=-1):
127
134
  except IndexError:
128
135
  return ''
129
136
 
137
+
130
138
  def get_local_filename(url, start=None):
131
139
  """
132
140
  Return local filename, optionally relative to ``start``
@@ -150,12 +158,14 @@ def get_local_filename(url, start=None):
150
158
  url = url[len(start):]
151
159
  return url
152
160
 
161
+
153
162
  def is_local_filename(url):
154
163
  """
155
164
  Whether a url is a local filename.
156
165
  """
157
166
  # deprecation_warning("Deprecated so we spot inconsistent URL/file handling")
158
- return url.startswith('file://') or not('://' in url)
167
+ return url.startswith('file://') or '://' not in url
168
+
159
169
 
160
170
  def is_string(val):
161
171
  """
@@ -171,6 +181,7 @@ def parse_json_file_with_comments(val):
171
181
  with open(val, 'r', encoding='utf-8') as inputf:
172
182
  return parse_json_string_with_comments(inputf.read())
173
183
 
184
+
174
185
  def parse_json_string_with_comments(val):
175
186
  """
176
187
  Parse a string of JSON interspersed with #-prefixed full-line comments
@@ -178,6 +189,7 @@ def parse_json_string_with_comments(val):
178
189
  jsonstr = re.sub(r'^\s*#.*$', '', val, flags=re.MULTILINE)
179
190
  return json.loads(jsonstr)
180
191
 
192
+
181
193
  def parse_json_string_or_file(*values, resolve_preset_file=None): # pylint: disable=unused-argument
182
194
  """
183
195
  Parse a string as either the path to a JSON object or a literal JSON object.
@@ -208,6 +220,7 @@ def parse_json_string_or_file(*values, resolve_preset_file=None): # pylint: d
208
220
  ret = {**ret, **value_parsed}
209
221
  return ret
210
222
 
223
+
211
224
  def safe_filename(url):
212
225
  """
213
226
  Sanitize input to be safely used as the basename of a local file.
@@ -218,7 +231,8 @@ def safe_filename(url):
218
231
  # print('safe filename: %s -> %s' % (url, ret))
219
232
  return ret
220
233
 
221
- def generate_range(start : str, end : str) -> List[str]:
234
+
235
+ def generate_range(start: str, end: str) -> List[str]:
222
236
  """
223
237
  Generate a list of strings by incrementing the number part of ``start`` until including ``end``.
224
238
  """
@@ -228,7 +242,8 @@ def generate_range(start : str, end : str) -> List[str]:
228
242
  except IndexError:
229
243
  raise ValueError("Range '%s..%s': could not find numeric part" % (start, end))
230
244
  if start[:-len(start_num)] != end[:-len(end_num)]:
231
- raise ValueError(f"Range '{start}..{end}' differ in their non-numeric part: '{start[:-len(start_num)]}' != '{end[:-len(end_num)]}'")
245
+ raise ValueError(f"Range '{start}..{end}' differ in their non-numeric part: "
246
+ f"'{start[:-len(start_num)]}' != '{end[:-len(end_num)]}'")
232
247
  if start_num == end_num:
233
248
  warn("Range '%s..%s': evaluates to the same number")
234
249
  for i in range(int(start_num), int(end_num) + 1):
@@ -261,7 +276,8 @@ def partition_list(lst, chunks, chunk_index=None):
261
276
  return [ret[chunk_index]]
262
277
  return ret
263
278
 
264
- def sparkline(values : List[int]) -> str:
279
+
280
+ def sparkline(values: List[int]) -> str:
265
281
  """
266
282
  Render a list of points with block characters
267
283
  """
@@ -7,9 +7,11 @@ from jsonschema import Draft201909Validator, ValidationError, validators # pylin
7
7
 
8
8
  from ocrd_models import ValidationReport
9
9
 
10
+
10
11
  class JsonSchemaDeprecationWarning(ValidationError):
11
12
  pass
12
13
 
14
+
13
15
  # http://python-jsonschema.readthedocs.io/en/latest/faq/
14
16
  def extend_with_default(validator_class):
15
17
  """
@@ -34,6 +36,7 @@ def extend_with_default(validator_class):
34
36
 
35
37
  DefaultValidatingDraft20199Validator = extend_with_default(Draft201909Validator)
36
38
 
39
+
37
40
  #
38
41
  # -------------------------------------------------
39
42
  #
@@ -54,7 +57,7 @@ class JsonValidator():
54
57
  """
55
58
  if isinstance(obj, str):
56
59
  obj = json.loads(obj)
57
- return JsonValidator(schema)._validate(obj) # pylint: disable=protected-access
60
+ return JsonValidator(schema)._validate(obj) # pylint: disable=protected-access
58
61
 
59
62
  def __init__(self, schema, validator_class=Draft201909Validator):
60
63
  """
@@ -6,6 +6,7 @@ See `specs <https://ocr-d.de/en/spec/ocrd_tool>`_.
6
6
  from .constants import OCRD_TOOL_SCHEMA
7
7
  from .json_validator import DefaultValidatingDraft20199Validator, JsonValidator
8
8
 
9
+
9
10
  #
10
11
  # -------------------------------------------------
11
12
  #
@@ -20,7 +21,7 @@ class OcrdToolValidator(JsonValidator):
20
21
  """
21
22
  Validate against ``ocrd-tool.json`` schema.
22
23
  """
23
- return OcrdToolValidator(schema)._validate(obj) # pylint: disable=protected-access
24
+ return OcrdToolValidator(schema)._validate(obj) # pylint: disable=protected-access
24
25
 
25
26
  def __init__(self, schema):
26
27
  super().__init__(schema, validator_class=DefaultValidatingDraft20199Validator)
@@ -8,12 +8,13 @@ from shutil import rmtree
8
8
 
9
9
  from ocrd_utils import getLogger, unzip_file_to_dir
10
10
 
11
- from bagit import Bag, BagValidationError # pylint: disable=no-name-in-module
12
- from bagit_profile import Profile, ProfileValidationError # pylint: disable=no-name-in-module
11
+ from bagit import Bag, BagValidationError # pylint: disable=no-name-in-module
12
+ from bagit_profile import Profile, ProfileValidationError # pylint: disable=no-name-in-module
13
13
 
14
14
  from .constants import OCRD_BAGIT_PROFILE, OCRD_BAGIT_PROFILE_URL, TMP_BAGIT_PREFIX
15
15
  from ocrd_models import ValidationReport
16
16
 
17
+
17
18
  #
18
19
  # -------------------------------------------------
19
20
  #
@@ -58,7 +59,8 @@ class OcrdZipValidator():
58
59
  # for d in e.details:
59
60
  # log = getLogger('ocrd.ocrd_zip_validator')
60
61
  # if isinstance(d, ChecksumMismatch):
61
- # log.error("Validation Error: expected %s to have %s checksum of %s but found %s", d.path, d.algorithm, d.expected, d.found)
62
+ # log.error("Validation Error: expected %s to have %s checksum of %s but found %s",
63
+ # d.path, d.algorithm, d.expected, d.found)
62
64
  # else:
63
65
  # log.error("Validation Error: %s", d)
64
66
  if failed:
@@ -89,7 +91,6 @@ class OcrdZipValidator():
89
91
  bagdir = mkdtemp(prefix=TMP_BAGIT_PREFIX)
90
92
  unzip_file_to_dir(self.path_to_zip, bagdir)
91
93
 
92
-
93
94
  try:
94
95
  bag = Bag(bagdir)
95
96
  self._validate_profile(bag)
@@ -119,6 +119,7 @@ class ConsistencyError(Exception):
119
119
  f"INCONSISTENCY in {tag} ID '{ID}' of file '{file_id}': "
120
120
  f"text results '{actual}' != concatenated '{expected}'")
121
121
 
122
+
122
123
  class CoordinateConsistencyError(Exception):
123
124
  """
124
125
  Exception representing a consistency error in coordinate confinement across levels of a PAGE-XML.
@@ -145,6 +146,7 @@ class CoordinateConsistencyError(Exception):
145
146
  f"INCONSISTENCY in {tag} ID '{ID}' of '{file_id}': "
146
147
  f"coords '{inner}' not within parent coords '{outer}'")
147
148
 
149
+
148
150
  class CoordinateValidityError(Exception):
149
151
  """
150
152
  Exception representing a validity error of an element's coordinates in PAGE-XML.
@@ -169,12 +171,14 @@ class CoordinateValidityError(Exception):
169
171
  super().__init__(
170
172
  f"INVALIDITY in {tag} ID '{ID}' of '{file_id}': coords '{points}' - {reason}")
171
173
 
174
+
172
175
  def compare_without_whitespace(a, b):
173
176
  """
174
177
  Compare two strings, ignoring all whitespace.
175
178
  """
176
179
  return re.sub('\\s+', '', a) == re.sub('\\s+', '', b)
177
180
 
181
+
178
182
  def page_get_reading_order(ro, rogroup):
179
183
  """
180
184
  Add all elements from the given reading order group to the given dictionary.
@@ -197,6 +201,7 @@ def page_get_reading_order(ro, rogroup):
197
201
  if not isinstance(elem, (RegionRefType, RegionRefIndexedType)):
198
202
  page_get_reading_order(ro, elem)
199
203
 
204
+
200
205
  def make_poly(polygon_points):
201
206
  """Instantiate a Polygon from a list of point pairs, or return an error string"""
202
207
  if len(polygon_points) < 4:
@@ -212,6 +217,7 @@ def make_poly(polygon_points):
212
217
  return 'is negative'
213
218
  return poly
214
219
 
220
+
215
221
  def make_line(line_points):
216
222
  """Instantiate a LineString from a list of point pairs, or return an error string"""
217
223
  if len(line_points) < 2:
@@ -225,6 +231,7 @@ def make_line(line_points):
225
231
  return 'is negative'
226
232
  return line
227
233
 
234
+
228
235
  @deprecated_alias(strictness='page_textequiv_consistency')
229
236
  @deprecated_alias(strategy='page_textequiv_strategy')
230
237
  def validate_consistency(node, page_textequiv_consistency, page_textequiv_strategy,
@@ -239,7 +246,7 @@ def validate_consistency(node, page_textequiv_consistency, page_textequiv_strate
239
246
  if isinstance(node, (PcGtsType, OcrdPage)):
240
247
  # top-level (start recursion)
241
248
  node_id = node.get_pcGtsId()
242
- node = node.get_Page() # has no .id
249
+ node = node.get_Page() # has no .id
243
250
  if not readingOrder:
244
251
  readingOrder = {}
245
252
  ro = node.get_ReadingOrder()
@@ -247,13 +254,13 @@ def validate_consistency(node, page_textequiv_consistency, page_textequiv_strate
247
254
  page_get_reading_order(readingOrder, ro.get_OrderedGroup() or ro.get_UnorderedGroup())
248
255
  if not joinRelations:
249
256
  joinRelations = []
250
- relations = node.get_Relations() # get RelationsType
257
+ relations = node.get_Relations() # get RelationsType
251
258
  if relations:
252
- relations = relations.get_Relation() # get list of RelationType
259
+ relations = relations.get_Relation() # get list of RelationType
253
260
  else:
254
261
  relations = []
255
262
  for relation in relations:
256
- if relation.get_type() == 'join': # ignore 'link' type here
263
+ if relation.get_type() == 'join': # ignore 'link' type here
257
264
  joinRelations.append((relation.get_SourceRegionRef().get_regionRef(),
258
265
  relation.get_TargetRegionRef().get_regionRef()))
259
266
  elif isinstance(node, GlyphType):
@@ -277,7 +284,7 @@ def validate_consistency(node, page_textequiv_consistency, page_textequiv_strate
277
284
  parent_points, node_poly))
278
285
  log.debug("Invalid coords of %s %s", tag, node_id)
279
286
  consistent = False
280
- node_poly = None # don't use in further comparisons
287
+ node_poly = None # don't use in further comparisons
281
288
  else:
282
289
  node_poly = None
283
290
  for class_, getterLO, getterRD in _ORDER[1:]:
@@ -314,7 +321,7 @@ def validate_consistency(node, page_textequiv_consistency, page_textequiv_strate
314
321
  # report.add_error(CoordinateValidityError(child_tag, child.id, file_id, child_points))
315
322
  # log.debug("Invalid coords of %s %s", child_tag, child.id)
316
323
  # consistent = False
317
- pass # already reported in recursive call above
324
+ pass # already reported in recursive call above
318
325
  elif not child_poly.within(node_poly.buffer(PARENT_SLACK)):
319
326
  # TODO: automatic repair?
320
327
  report.add_error(CoordinateConsistencyError(child_tag, child.id, file_id,
@@ -344,13 +351,14 @@ def validate_consistency(node, page_textequiv_consistency, page_textequiv_strate
344
351
  if page_textequiv_consistency == 'fix':
345
352
  log.debug("Repaired text of %s %s", tag, node_id)
346
353
  set_text(node, concatenated, page_textequiv_strategy)
347
- elif (page_textequiv_consistency == 'strict' # or 'lax' but...
354
+ elif (page_textequiv_consistency == 'strict' # or 'lax' but...
348
355
  or not compare_without_whitespace(concatenated, text_results)):
349
356
  log.debug("Inconsistent text of %s %s", tag, node_id)
350
357
  report.add_error(ConsistencyError(tag, node_id, file_id,
351
358
  text_results, concatenated))
352
359
  return consistent
353
360
 
361
+
354
362
  def concatenate(nodes, concatenate_with, page_textequiv_strategy, joins=None):
355
363
  """
356
364
  Concatenate nodes textually according to https://ocr-d.github.io/page#consistency-of-text-results-on-different-levels
@@ -367,6 +375,7 @@ def concatenate(nodes, concatenate_with, page_textequiv_strategy, joins=None):
367
375
  result += get_text(next_node, page_textequiv_strategy)
368
376
  return result.strip()
369
377
 
378
+
370
379
  def get_text(node, page_textequiv_strategy='first'):
371
380
  """
372
381
  Get the first or most confident among text results (depending on ``page_textequiv_strategy``).
@@ -399,6 +408,7 @@ def get_text(node, page_textequiv_strategy='first'):
399
408
  # fall back to first element
400
409
  return textEquivs[0].get_Unicode().strip()
401
410
 
411
+
402
412
  def set_text(node, text, page_textequiv_strategy):
403
413
  """
404
414
  Set the first or most confident among text results (depending on ``page_textequiv_strategy``).
@@ -410,7 +420,7 @@ def set_text(node, text, page_textequiv_strategy):
410
420
  text = text.strip()
411
421
  textEquivs = node.get_TextEquiv()
412
422
  if not textEquivs:
413
- node.add_TextEquiv(TextEquivType(Unicode=text)) # or index=0 ?
423
+ node.add_TextEquiv(TextEquivType(Unicode=text)) # or index=0 ?
414
424
  elif page_textequiv_strategy == 'best':
415
425
  if len(textEquivs) > 1:
416
426
  textEquivsSorted = sorted([x for x in textEquivs if x.conf],
@@ -432,6 +442,7 @@ def set_text(node, text, page_textequiv_strategy):
432
442
  # fall back to first element
433
443
  textEquivs[0].set_Unicode(text)
434
444
 
445
+
435
446
  class PageValidator():
436
447
  """
437
448
  Validator for `OcrdPage <../ocrd_models/ocrd_models.ocrd_page.html>`.
@@ -477,5 +488,6 @@ class PageValidator():
477
488
  raise ValueError("page_textequiv_consistency level %s not implemented" % page_textequiv_consistency)
478
489
  report = ValidationReport()
479
490
  log.info("Validating input file '%s'", file_id)
480
- validate_consistency(page, page_textequiv_consistency, page_textequiv_strategy, check_baseline, check_coords, report, file_id)
491
+ validate_consistency(page, page_textequiv_consistency, page_textequiv_strategy, check_baseline, check_coords,
492
+ report, file_id)
481
493
  return report
@@ -3,6 +3,7 @@ Validate parameters against ocrd-tool.json.
3
3
  """
4
4
  from .json_validator import DefaultValidatingDraft20199Validator, JsonValidator
5
5
 
6
+
6
7
  #
7
8
  # -------------------------------------------------
8
9
  #
@@ -12,7 +13,7 @@ class ParameterValidator(JsonValidator):
12
13
  JsonValidator validating parameters against ocrd-tool.json.
13
14
  """
14
15
 
15
- def validate(self, *args, **kwargs): # pylint: disable=arguments-differ
16
+ def validate(self, *args, **kwargs): # pylint: disable=arguments-differ
16
17
  """
17
18
  Validate a parameter dict against a parameter schema from an ocrd-tool.json
18
19
 
@@ -39,7 +40,7 @@ class ParameterValidator(JsonValidator):
39
40
  if 'required' in p[n]:
40
41
  if p[n]['required']:
41
42
  required.append(n)
42
- del(p[n]['required'])
43
+ del p[n]['required']
43
44
  super().__init__({
44
45
  "type": "object",
45
46
  "required": required,
@@ -68,16 +68,12 @@ properties:
68
68
  required:
69
69
  - address
70
70
  - username
71
+ - workers
71
72
  oneOf:
72
73
  - required:
73
74
  - password
74
75
  - required:
75
76
  - path_to_privkey
76
- anyOf:
77
- - required:
78
- - workers
79
- - required:
80
- - servers
81
77
  properties:
82
78
  address:
83
79
  description: The IP address or domain name of the target machine
@@ -118,34 +114,6 @@ properties:
118
114
  - native
119
115
  - docker
120
116
  default: native
121
- servers:
122
- description: List of processor servers that will be deployed
123
- type: array
124
- minItems: 1
125
- items:
126
- type: object
127
- additionalProperties: false
128
- required:
129
- - name
130
- - port
131
- properties:
132
- name:
133
- description: Name of the processor
134
- type: string
135
- pattern: "^ocrd-.*$"
136
- examples:
137
- - ocrd-cis-ocropy-binarize
138
- - ocrd-olena-binarize
139
- deploy_type:
140
- description: Should the processor server be deployed natively or with Docker
141
- type: string
142
- enum:
143
- - native
144
- - docker
145
- default: native
146
- port:
147
- description: The port number to be deployed on the host
148
- $ref: "#/$defs/port"
149
117
 
150
118
  $defs:
151
119
  address:
@@ -6,6 +6,7 @@ See `specs <https://ocr-d.de/en/spec/cli#processor-resources>`_.
6
6
  from .constants import RESOURCE_LIST_SCHEMA
7
7
  from .json_validator import DefaultValidatingDraft20199Validator, JsonValidator
8
8
 
9
+
9
10
  #
10
11
  # -------------------------------------------------
11
12
  #
@@ -22,4 +23,5 @@ class OcrdResourceListValidator(JsonValidator):
22
23
  """
23
24
  if schema is None:
24
25
  schema = RESOURCE_LIST_SCHEMA
25
- return JsonValidator(schema, validator_class=DefaultValidatingDraft20199Validator)._validate(obj) # pylint: disable=protected-access
26
+ validator = JsonValidator(schema, validator_class=DefaultValidatingDraft20199Validator)
27
+ return validator._validate(obj) # pylint: disable=protected-access
@@ -15,6 +15,7 @@ from .page_validator import PageValidator
15
15
  from .xsd_page_validator import XsdPageValidator
16
16
  from .xsd_mets_validator import XsdMetsValidator
17
17
 
18
+
18
19
  #
19
20
  # -------------------------------------------------
20
21
  #
@@ -57,7 +58,8 @@ class WorkspaceValidator():
57
58
  if page_id:
58
59
  for one_page_id in page_id:
59
60
  if next(workspace.mets.find_files(fileGrp=grp, pageId=one_page_id), None):
60
- report.add_error("Output fileGrp[@USE='%s'] already contains output for page %s" % (grp, one_page_id))
61
+ report.add_error("Output fileGrp[@USE='%s'] already contains output for page %s" % (
62
+ grp, one_page_id))
61
63
  else:
62
64
  report.add_error("Output fileGrp[@USE='%s'] already in METS!" % grp)
63
65
  return report
@@ -121,10 +123,10 @@ class WorkspaceValidator():
121
123
  resolver (:class:`ocrd.Resolver`): Resolver
122
124
  mets_url (string): URL of the METS file
123
125
  src_dir (string, None): Directory containing mets file
124
- skip (list): Validation checks to omit. One or more of
126
+ skip (list): Validation checks to omit. One or more of
125
127
  'mets_unique_identifier',
126
128
  'mets_files', 'pixel_density', 'dimension', 'url',
127
- 'multipage', 'page', 'page_xsd', 'mets_xsd',
129
+ 'multipage', 'page', 'page_xsd', 'mets_xsd',
128
130
  'mets_fileid_page_pcgtsid'
129
131
  download (boolean): Whether to download remote file references
130
132
  temporarily during validation (like a processor would)
@@ -133,7 +135,7 @@ class WorkspaceValidator():
133
135
  report (:class:`ValidationReport`) Report on the validity
134
136
  """
135
137
  validator = WorkspaceValidator(*args, **kwargs)
136
- return validator._validate() # pylint: disable=protected-access
138
+ return validator._validate() # pylint: disable=protected-access
137
139
 
138
140
  def _validate(self):
139
141
  """
@@ -141,7 +143,7 @@ class WorkspaceValidator():
141
143
  """
142
144
  try:
143
145
  self._resolve_workspace()
144
- except Exception as e: # pylint: disable=broad-except
146
+ except Exception as e: # pylint: disable=broad-except
145
147
  self.log.warning("Failed to instantiate workspace: %s", e)
146
148
  self.report.add_error(f"Failed to instantiate workspace: {e}")
147
149
  return self.report
@@ -159,7 +161,7 @@ class WorkspaceValidator():
159
161
  self._validate_mets_xsd()
160
162
  if self.page_checks:
161
163
  self._validate_page()
162
- except Exception: # pylint: disable=broad-except
164
+ except Exception: # pylint: disable=broad-except
163
165
  self.report.add_error(f"Validation aborted with exception: {format_exc()}")
164
166
  return self.report
165
167
 
@@ -216,9 +218,11 @@ class WorkspaceValidator():
216
218
  page = page_from_file(f).get_Page()
217
219
  _, _, exif = self.workspace.image_from_page(page, f.pageId)
218
220
  if page.imageHeight != exif.height:
219
- self.report.add_error(f"PAGE '{f.ID}': @imageHeight != image's actual height ({page.imageHeight} != {exif.height})")
221
+ self.report.add_error(f"PAGE '{f.ID}': @imageHeight != image's actual height "
222
+ f"({page.imageHeight} != {exif.height})")
220
223
  if page.imageWidth != exif.width:
221
- self.report.add_error(f"PAGE '{f.ID}': @imageWidth != image's actual width ({page.imageWidth} != {exif.width})")
224
+ self.report.add_error(f"PAGE '{f.ID}': @imageWidth != image's actual width "
225
+ f"({page.imageWidth} != {exif.width})")
222
226
 
223
227
  def _validate_multipage(self):
224
228
  """
@@ -237,7 +241,8 @@ class WorkspaceValidator():
237
241
  if exif.n_frames > 1:
238
242
  self.report.add_error(f"Image '{f.ID}': More than 1 frame: {exif.n_frames}")
239
243
  except FileNotFoundError:
240
- self.report.add_error(f"Image '{f.ID}': Could not retrieve (local_filename='{f.local_filename}', url='{f.url}')")
244
+ self.report.add_error(f"Image '{f.ID}': Could not retrieve "
245
+ f"(local_filename='{f.local_filename}', url='{f.url}')")
241
246
  return
242
247
 
243
248
  def _validate_pixel_density(self):
@@ -293,10 +298,11 @@ class WorkspaceValidator():
293
298
  except StopIteration:
294
299
  self.report.add_error("No files")
295
300
  for f in self.mets.find_files(**self.find_kwargs):
296
- if f._el.get('GROUPID'): # pylint: disable=protected-access
301
+ if f._el.get('GROUPID'): # pylint: disable=protected-access
297
302
  self.report.add_notice(f"File '{f.ID}' has GROUPID attribute - document might need an update")
298
303
  if not (f.url or f.local_filename):
299
- self.report.add_error(f"File '{f.ID}' has neither mets:Flocat[@LOCTYPE='URL']/@xlink:href nor mets:FLocat[@LOCTYPE='OTHER'][@OTHERLOCTYPE='FILE']/xlink:href")
304
+ self.report.add_error(f"File '{f.ID}' has neither mets:Flocat[@LOCTYPE='URL']/@xlink:href "
305
+ "nor mets:FLocat[@LOCTYPE='OTHER'][@OTHERLOCTYPE='FILE']/xlink:href")
300
306
  continue
301
307
  if f.url and 'url' not in self.skip:
302
308
  if re.match(r'^file:/[^/]', f.url):
@@ -322,19 +328,22 @@ class WorkspaceValidator():
322
328
  for err in XsdPageValidator.validate(Path(f.local_filename)).errors:
323
329
  self.report.add_error("%s: %s" % (f.ID, err))
324
330
  if 'page' in self.page_checks:
325
- page_report = PageValidator.validate(ocrd_file=f,
326
- page_textequiv_consistency=self.page_strictness,
327
- check_coords=self.page_coordinate_consistency in ['poly', 'both'],
328
- check_baseline=self.page_coordinate_consistency in ['baseline', 'both'])
331
+ page_report = PageValidator.validate(
332
+ ocrd_file=f,
333
+ page_textequiv_consistency=self.page_strictness,
334
+ check_coords=self.page_coordinate_consistency in ['poly', 'both'],
335
+ check_baseline=self.page_coordinate_consistency in ['baseline', 'both'])
329
336
  self.report.merge_report(page_report)
330
337
  pcgts = page_from_file(f)
331
338
  page = pcgts.get_Page()
332
339
  if 'dimension' in self.page_checks:
333
340
  img = self.workspace._resolve_image_as_pil(page.imageFilename)
334
341
  if page.imageHeight != img.height:
335
- self.report.add_error(f"PAGE '{f.ID}': @imageHeight != image's actual height ({page.imageHeight} != {img.height})")
342
+ self.report.add_error(f"PAGE '{f.ID}': @imageHeight != image's actual height "
343
+ f"({page.imageHeight} != {img.height})")
336
344
  if page.imageWidth != img.width:
337
- self.report.add_error(f"PAGE '{f.ID}': @imageWidth != image's actual width ({page.imageWidth} != {img.width})")
345
+ self.report.add_error(f"PAGE '{f.ID}': @imageWidth != image's actual width "
346
+ f"({page.imageWidth} != {img.width})")
338
347
  if 'imagefilename' in self.page_checks:
339
348
  imageFilename = page.imageFilename
340
349
  if is_local_filename(imageFilename):
@@ -344,7 +353,8 @@ class WorkspaceValidator():
344
353
  if not self.mets.find_files(**kwargs):
345
354
  self.report.add_error(f"PAGE '{f.ID}': imageFilename '{imageFilename}' not found in METS")
346
355
  if is_local_filename(imageFilename) and not Path(imageFilename).exists():
347
- self.report.add_warning(f"PAGE '{f.ID}': imageFilename '{imageFilename}' points to non-existent local file")
356
+ self.report.add_warning(f"PAGE '{f.ID}': imageFilename '{imageFilename}' "
357
+ "points to non-existent local file")
348
358
  if 'alternativeimage_filename' in self.page_checks:
349
359
  for altimg in page.get_AllAlternativeImages():
350
360
  if is_local_filename(altimg.filename):
@@ -368,8 +378,8 @@ class WorkspaceValidator():
368
378
  self.report.add_error(f"PAGE '{f.ID}': {altimg.parent_object_.id} AlternativeImage "
369
379
  f"'{altimg.filename}' feature '{feature}' not standardized for PAGE")
370
380
  if 'mets_fileid_page_pcgtsid' in self.page_checks and pcgts.pcGtsId != f.ID:
371
- self.report.add_warning('pc:PcGts/@pcGtsId differs from mets:file/@ID: "%s" !== "%s"' % (pcgts.pcGtsId or '', f.ID or ''))
372
-
381
+ self.report.add_warning('pc:PcGts/@pcGtsId differs from mets:file/@ID: "%s" !== "%s"' % (
382
+ pcgts.pcGtsId or '', f.ID or ''))
373
383
 
374
384
  def _validate_page_xsd(self):
375
385
  """
@@ -1,6 +1,7 @@
1
1
  from .xsd_validator import XsdValidator
2
2
  from .constants import XSD_METS_URL
3
3
 
4
+
4
5
  class XsdMetsValidator(XsdValidator):
5
6
  """
6
7
  XML Schema validator.
@@ -14,4 +15,4 @@ class XsdMetsValidator(XsdValidator):
14
15
  Args:
15
16
  doc (etree.ElementTree|str|bytes):
16
17
  """
17
- return cls.instance(XSD_METS_URL)._validate(doc) # pylint: disable=protected-access
18
+ return cls.instance(XSD_METS_URL)._validate(doc) # pylint: disable=protected-access
@@ -1,6 +1,7 @@
1
1
  from .xsd_validator import XsdValidator
2
2
  from .constants import XSD_PAGE_URL
3
3
 
4
+
4
5
  class XsdPageValidator(XsdValidator):
5
6
  """
6
7
  XML Schema validator.
@@ -14,4 +15,4 @@ class XsdPageValidator(XsdValidator):
14
15
  Args:
15
16
  doc (etree.ElementTree|str|bytes):
16
17
  """
17
- return cls.instance(XSD_PAGE_URL)._validate(doc) # pylint: disable=protected-access
18
+ return cls.instance(XSD_PAGE_URL)._validate(doc) # pylint: disable=protected-access