ocrd 3.0.0a2__py3-none-any.whl → 3.0.0b2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. ocrd/cli/__init__.py +34 -26
  2. ocrd/cli/bashlib.py +32 -18
  3. ocrd/cli/ocrd_tool.py +7 -5
  4. ocrd/cli/workspace.py +10 -8
  5. ocrd/decorators/__init__.py +13 -7
  6. ocrd/decorators/ocrd_cli_options.py +1 -1
  7. ocrd/lib.bash +3 -0
  8. ocrd/mets_server.py +3 -4
  9. ocrd/processor/__init__.py +1 -1
  10. ocrd/processor/base.py +421 -98
  11. ocrd/processor/builtin/dummy_processor.py +4 -11
  12. ocrd/processor/helpers.py +24 -161
  13. ocrd/processor/ocrd_page_result.py +3 -3
  14. ocrd/resolver.py +0 -3
  15. ocrd/resource_manager.py +9 -5
  16. ocrd/workspace.py +10 -11
  17. ocrd/workspace_backup.py +1 -1
  18. {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/METADATA +32 -10
  19. {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/RECORD +49 -48
  20. {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/WHEEL +1 -1
  21. ocrd_modelfactory/__init__.py +1 -1
  22. ocrd_models/constants.py +0 -1
  23. ocrd_models/ocrd_exif.py +2 -2
  24. ocrd_models/ocrd_file.py +2 -2
  25. ocrd_models/ocrd_mets.py +22 -22
  26. ocrd_models/ocrd_page.py +0 -1
  27. ocrd_models/ocrd_xml_base.py +2 -2
  28. ocrd_network/cli/client.py +134 -30
  29. ocrd_network/client.py +53 -27
  30. ocrd_network/client_utils.py +101 -0
  31. ocrd_network/processing_server.py +1 -1
  32. ocrd_network/runtime_data/deployer.py +12 -3
  33. ocrd_network/server_utils.py +12 -10
  34. ocrd_utils/__init__.py +2 -0
  35. ocrd_utils/config.py +31 -2
  36. ocrd_utils/image.py +25 -25
  37. ocrd_utils/logging.py +20 -20
  38. ocrd_utils/os.py +4 -5
  39. ocrd_utils/str.py +10 -3
  40. ocrd_validators/json_validator.py +1 -3
  41. ocrd_validators/ocrd_tool_validator.py +2 -2
  42. ocrd_validators/page_validator.py +56 -56
  43. ocrd_validators/parameter_validator.py +2 -2
  44. ocrd_validators/resource_list_validator.py +4 -3
  45. ocrd_validators/workspace_validator.py +21 -21
  46. ocrd_validators/xsd_validator.py +1 -1
  47. {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/LICENSE +0 -0
  48. {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/entry_points.txt +0 -0
  49. {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/top_level.txt +0 -0
@@ -2,7 +2,6 @@
2
2
  Validating JSON-Schema
3
3
  """
4
4
  import json
5
- from warnings import warn
6
5
 
7
6
  from jsonschema import Draft201909Validator, ValidationError, validators # pylint: disable=import-error
8
7
 
@@ -28,8 +27,7 @@ def extend_with_default(validator_class):
28
27
  if subschema.get('deprecated', False) and instance.get(prop):
29
28
  yield JsonSchemaDeprecationWarning(f"Property {prop} has been deprecated, ocrd-tool.json should be updated.")
30
29
 
31
- for error in validate_properties(validator, properties, instance, schema):
32
- yield error
30
+ yield from validate_properties(validator, properties, instance, schema)
33
31
 
34
32
  return validators.extend(validator_class, {"properties": set_defaults_and_handle_deprecate})
35
33
 
@@ -22,5 +22,5 @@ class OcrdToolValidator(JsonValidator):
22
22
  """
23
23
  return OcrdToolValidator(schema)._validate(obj) # pylint: disable=protected-access
24
24
 
25
- def __init__(self, schema, validator_class=...):
26
- super().__init__(schema, DefaultValidatingDraft20199Validator)
25
+ def __init__(self, schema):
26
+ super().__init__(schema, validator_class=DefaultValidatingDraft20199Validator)
@@ -34,50 +34,50 @@ from ocrd_models import ValidationReport
34
34
 
35
35
  _HIERARCHY = [
36
36
  # page can contain different types of regions
37
- (PageType, 'get_AdvertRegion', None), # pylint: disable=bad-whitespace
38
- (PageType, 'get_ChartRegion', None), # pylint: disable=bad-whitespace
39
- (PageType, 'get_ChemRegion', None), # pylint: disable=bad-whitespace
40
- (PageType, 'get_CustomRegion', None), # pylint: disable=bad-whitespace
41
- (PageType, 'get_GraphicRegion', None), # pylint: disable=bad-whitespace
42
- (PageType, 'get_ImageRegion', None), # pylint: disable=bad-whitespace
43
- (PageType, 'get_LineDrawingRegion', None), # pylint: disable=bad-whitespace
44
- (PageType, 'get_MapRegion', None), # pylint: disable=bad-whitespace
45
- (PageType, 'get_MathsRegion', None), # pylint: disable=bad-whitespace
46
- (PageType, 'get_MusicRegion', None), # pylint: disable=bad-whitespace
47
- (PageType, 'get_NoiseRegion', None), # pylint: disable=bad-whitespace
48
- (PageType, 'get_SeparatorRegion', None), # pylint: disable=bad-whitespace
49
- (PageType, 'get_TableRegion', None), # pylint: disable=bad-whitespace
50
- (PageType, 'get_TextRegion', None), # pylint: disable=bad-whitespace
51
- (PageType, 'get_UnknownRegion', None), # pylint: disable=bad-whitespace
37
+ (PageType, 'get_AdvertRegion', None),
38
+ (PageType, 'get_ChartRegion', None),
39
+ (PageType, 'get_ChemRegion', None),
40
+ (PageType, 'get_CustomRegion', None),
41
+ (PageType, 'get_GraphicRegion', None),
42
+ (PageType, 'get_ImageRegion', None),
43
+ (PageType, 'get_LineDrawingRegion', None),
44
+ (PageType, 'get_MapRegion', None),
45
+ (PageType, 'get_MathsRegion', None),
46
+ (PageType, 'get_MusicRegion', None),
47
+ (PageType, 'get_NoiseRegion', None),
48
+ (PageType, 'get_SeparatorRegion', None),
49
+ (PageType, 'get_TableRegion', None),
50
+ (PageType, 'get_TextRegion', None),
51
+ (PageType, 'get_UnknownRegion', None),
52
52
  # all regions can be recursive
53
- (RegionType, 'get_AdvertRegion', None), # pylint: disable=bad-whitespace
54
- (RegionType, 'get_ChartRegion', None), # pylint: disable=bad-whitespace
55
- (RegionType, 'get_ChemRegion', None), # pylint: disable=bad-whitespace
56
- (RegionType, 'get_CustomRegion', None), # pylint: disable=bad-whitespace
57
- (RegionType, 'get_GraphicRegion', None), # pylint: disable=bad-whitespace
58
- (RegionType, 'get_ImageRegion', None), # pylint: disable=bad-whitespace
59
- (RegionType, 'get_LineDrawingRegion', None), # pylint: disable=bad-whitespace
60
- #(RegionType, 'get_MapRegion', None), # pylint: disable=bad-whitespace
61
- (RegionType, 'get_MathsRegion', None), # pylint: disable=bad-whitespace
62
- (RegionType, 'get_MusicRegion', None), # pylint: disable=bad-whitespace
63
- (RegionType, 'get_NoiseRegion', None), # pylint: disable=bad-whitespace
64
- (RegionType, 'get_SeparatorRegion', None), # pylint: disable=bad-whitespace
65
- (RegionType, 'get_TableRegion', None), # pylint: disable=bad-whitespace
66
- (RegionType, 'get_TextRegion', None), # pylint: disable=bad-whitespace
67
- (RegionType, 'get_UnknownRegion', None), # pylint: disable=bad-whitespace
53
+ (RegionType, 'get_AdvertRegion', None),
54
+ (RegionType, 'get_ChartRegion', None),
55
+ (RegionType, 'get_ChemRegion', None),
56
+ (RegionType, 'get_CustomRegion', None),
57
+ (RegionType, 'get_GraphicRegion', None),
58
+ (RegionType, 'get_ImageRegion', None),
59
+ (RegionType, 'get_LineDrawingRegion', None),
60
+ #(RegionType, 'get_MapRegion', None),
61
+ (RegionType, 'get_MathsRegion', None),
62
+ (RegionType, 'get_MusicRegion', None),
63
+ (RegionType, 'get_NoiseRegion', None),
64
+ (RegionType, 'get_SeparatorRegion', None),
65
+ (RegionType, 'get_TableRegion', None),
66
+ (RegionType, 'get_TextRegion', None),
67
+ (RegionType, 'get_UnknownRegion', None),
68
68
  # only TextRegion can contain TextLine
69
- (TextRegionType, 'get_TextLine', '\n'), # pylint: disable=bad-whitespace
70
- (TextLineType, 'get_Word', ' '), # pylint: disable=bad-whitespace
71
- (WordType, 'get_Glyph', ''), # pylint: disable=bad-whitespace
72
- (GlyphType, None, None), # pylint: disable=bad-whitespace
69
+ (TextRegionType, 'get_TextLine', '\n'),
70
+ (TextLineType, 'get_Word', ' '),
71
+ (WordType, 'get_Glyph', ''),
72
+ (GlyphType, None, None),
73
73
  ]
74
74
 
75
75
  _ORDER = [
76
76
  (None, TextLineOrderSimpleType.BOTTOMTOTOP, ReadingDirectionSimpleType.RIGHTTOLEFT),
77
- (PageType, 'get_textLineOrder', 'get_readingDirection'), # pylint: disable=bad-whitespace
78
- (TextRegionType, 'get_textLineOrder', 'get_readingDirection'), # pylint: disable=bad-whitespace
79
- (TextLineType, None, 'get_readingDirection'), # pylint: disable=bad-whitespace
80
- (WordType, None, 'get_readingDirection'), # pylint: disable=bad-whitespace
77
+ (PageType, 'get_textLineOrder', 'get_readingDirection'),
78
+ (TextRegionType, 'get_textLineOrder', 'get_readingDirection'),
79
+ (TextLineType, None, 'get_readingDirection'),
80
+ (WordType, None, 'get_readingDirection'),
81
81
  ]
82
82
 
83
83
  # The following parameters control how tolerant we are with respect to
@@ -115,9 +115,9 @@ class ConsistencyError(Exception):
115
115
  self.file_id = file_id
116
116
  self.actual = actual
117
117
  self.expected = expected
118
- super(ConsistencyError, self).__init__(
119
- "INCONSISTENCY in %s ID '%s' of file '%s': text results '%s' != concatenated '%s'" % (
120
- tag, ID, file_id, actual, expected))
118
+ super().__init__(
119
+ f"INCONSISTENCY in {tag} ID '{ID}' of file '{file_id}': "
120
+ f"text results '{actual}' != concatenated '{expected}'")
121
121
 
122
122
  class CoordinateConsistencyError(Exception):
123
123
  """
@@ -141,9 +141,9 @@ class CoordinateConsistencyError(Exception):
141
141
  self.file_id = file_id
142
142
  self.outer = outer
143
143
  self.inner = inner
144
- super(CoordinateConsistencyError, self).__init__(
145
- "INCONSISTENCY in %s ID '%s' of '%s': coords '%s' not within parent coords '%s'" % (
146
- tag, ID, file_id, inner, outer))
144
+ super().__init__(
145
+ f"INCONSISTENCY in {tag} ID '{ID}' of '{file_id}': "
146
+ f"coords '{inner}' not within parent coords '{outer}'")
147
147
 
148
148
  class CoordinateValidityError(Exception):
149
149
  """
@@ -166,9 +166,8 @@ class CoordinateValidityError(Exception):
166
166
  self.ID = ID
167
167
  self.file_id = file_id
168
168
  self.points = points
169
- super(CoordinateValidityError, self).__init__(
170
- "INVALIDITY in %s ID '%s' of '%s': coords '%s' - %s" % (
171
- tag, ID, file_id, points, reason))
169
+ super().__init__(
170
+ f"INVALIDITY in {tag} ID '{ID}' of '{file_id}': coords '{points}' - {reason}")
172
171
 
173
172
  def compare_without_whitespace(a, b):
174
173
  """
@@ -177,13 +176,14 @@ def compare_without_whitespace(a, b):
177
176
  return re.sub('\\s+', '', a) == re.sub('\\s+', '', b)
178
177
 
179
178
  def page_get_reading_order(ro, rogroup):
180
- """Add all elements from the given reading order group to the given dictionary.
181
-
179
+ """
180
+ Add all elements from the given reading order group to the given dictionary.
181
+
182
182
  Given a dict ``ro`` from layout element IDs to ReadingOrder element objects,
183
183
  and an object ``rogroup`` with additional ReadingOrder element objects,
184
184
  add all references to the dict, traversing the group recursively.
185
185
  """
186
- regionrefs = list()
186
+ regionrefs = []
187
187
  if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)):
188
188
  regionrefs = (rogroup.get_RegionRefIndexed() +
189
189
  rogroup.get_OrderedGroupIndexed() +
@@ -241,12 +241,12 @@ def validate_consistency(node, page_textequiv_consistency, page_textequiv_strate
241
241
  node_id = node.get_pcGtsId()
242
242
  node = node.get_Page() # has no .id
243
243
  if not readingOrder:
244
- readingOrder = dict()
244
+ readingOrder = {}
245
245
  ro = node.get_ReadingOrder()
246
246
  if ro:
247
247
  page_get_reading_order(readingOrder, ro.get_OrderedGroup() or ro.get_UnorderedGroup())
248
248
  if not joinRelations:
249
- joinRelations = list()
249
+ joinRelations = []
250
250
  relations = node.get_Relations() # get RelationsType
251
251
  if relations:
252
252
  relations = relations.get_Relation() # get list of RelationType
@@ -358,7 +358,7 @@ def concatenate(nodes, concatenate_with, page_textequiv_strategy, joins=None):
358
358
  if not nodes:
359
359
  return ''
360
360
  if not joins:
361
- joins = list()
361
+ joins = []
362
362
  result = get_text(nodes[0], page_textequiv_strategy)
363
363
  for node, next_node in zip(nodes, nodes[1:]):
364
364
  if (node.id, next_node.id) not in joins:
@@ -470,11 +470,11 @@ class PageValidator():
470
470
  page = parse(filename, silence=True)
471
471
  file_id = filename
472
472
  else:
473
- raise Exception("At least one of ocrd_page, ocrd_file or filename must be set")
473
+ raise ValueError("At least one of ocrd_page, ocrd_file or filename must be set")
474
474
  if page_textequiv_strategy not in ('first'):
475
- raise Exception("page_textequiv_strategy %s not implemented" % page_textequiv_strategy)
475
+ raise ValueError("page_textequiv_strategy %s not implemented" % page_textequiv_strategy)
476
476
  if page_textequiv_consistency not in ('strict', 'lax', 'fix', 'off'):
477
- raise Exception("page_textequiv_consistency level %s not implemented" % page_textequiv_consistency)
477
+ raise ValueError("page_textequiv_consistency level %s not implemented" % page_textequiv_consistency)
478
478
  report = ValidationReport()
479
479
  log.info("Validating input file '%s'", file_id)
480
480
  validate_consistency(page, page_textequiv_consistency, page_textequiv_strategy, check_baseline, check_coords, report, file_id)
@@ -20,7 +20,7 @@ class ParameterValidator(JsonValidator):
20
20
  obj (dict):
21
21
  schema (dict):
22
22
  """
23
- return super(ParameterValidator, self)._validate(*args, **kwargs)
23
+ return super()._validate(*args, **kwargs)
24
24
 
25
25
  def __init__(self, ocrd_tool):
26
26
  """
@@ -40,7 +40,7 @@ class ParameterValidator(JsonValidator):
40
40
  if p[n]['required']:
41
41
  required.append(n)
42
42
  del(p[n]['required'])
43
- super(ParameterValidator, self).__init__({
43
+ super().__init__({
44
44
  "type": "object",
45
45
  "required": required,
46
46
  "additionalProperties": False,
@@ -16,9 +16,10 @@ class OcrdResourceListValidator(JsonValidator):
16
16
  """
17
17
 
18
18
  @staticmethod
19
- def validate(obj, schema=RESOURCE_LIST_SCHEMA):
19
+ def validate(obj, schema=None):
20
20
  """
21
21
  Validate against ``resource_list.schema.yml`` schema.
22
22
  """
23
- return JsonValidator(schema, validator_class=DefaultValidatingDraft20199Validator)._validate(obj)
24
-
23
+ if schema is None:
24
+ schema = RESOURCE_LIST_SCHEMA
25
+ return JsonValidator(schema, validator_class=DefaultValidatingDraft20199Validator)._validate(obj) # pylint: disable=protected-access
@@ -103,7 +103,7 @@ class WorkspaceValidator():
103
103
  'page_xsd']
104
104
  if check not in self.skip]
105
105
 
106
- self.find_kwargs = dict(include_fileGrp=include_fileGrp, exclude_fileGrp=exclude_fileGrp)
106
+ self.find_kwargs = {"include_fileGrp": include_fileGrp, "exclude_fileGrp": exclude_fileGrp}
107
107
  self.src_dir = src_dir
108
108
  self.workspace = None
109
109
  self.mets = None
@@ -139,7 +139,7 @@ class WorkspaceValidator():
139
139
  self._resolve_workspace()
140
140
  except Exception as e: # pylint: disable=broad-except
141
141
  self.log.warning("Failed to instantiate workspace: %s", e)
142
- self.report.add_error("Failed to instantiate workspace: %s" % e)
142
+ self.report.add_error(f"Failed to instantiate workspace: {e}")
143
143
  return self.report
144
144
  with pushd_popd(self.workspace.directory):
145
145
  try:
@@ -158,7 +158,7 @@ class WorkspaceValidator():
158
158
  if self.page_checks:
159
159
  self._validate_page()
160
160
  except Exception: # pylint: disable=broad-except
161
- self.report.add_error("Validation aborted with exception: %s" % format_exc())
161
+ self.report.add_error(f"Validation aborted with exception: {format_exc()}")
162
162
  return self.report
163
163
 
164
164
  def _resolve_workspace(self):
@@ -193,9 +193,9 @@ class WorkspaceValidator():
193
193
  page = page_from_file(f).get_Page()
194
194
  imageFilename = page.imageFilename
195
195
  if not self.mets.find_files(url=imageFilename, **self.find_kwargs):
196
- self.report.add_error("PAGE-XML %s : imageFilename '%s' not found in METS" % (f.local_filename, imageFilename))
196
+ self.report.add_error(f"PAGE '{f.ID}': imageFilename '{imageFilename}' not found in METS")
197
197
  if is_local_filename(imageFilename) and not Path(imageFilename).exists():
198
- self.report.add_warning("PAGE-XML %s : imageFilename '%s' points to non-existent local file" % (f.local_filename, imageFilename))
198
+ self.report.add_warning(f"PAGE '{f.ID}': imageFilename '{imageFilename}' points to non-existent local file")
199
199
 
200
200
  def _validate_dimension(self):
201
201
  """
@@ -210,9 +210,9 @@ class WorkspaceValidator():
210
210
  page = page_from_file(f).get_Page()
211
211
  _, _, exif = self.workspace.image_from_page(page, f.pageId)
212
212
  if page.imageHeight != exif.height:
213
- self.report.add_error("PAGE '%s': @imageHeight != image's actual height (%s != %s)" % (f.ID, page.imageHeight, exif.height))
213
+ self.report.add_error(f"PAGE '{f.ID}': @imageHeight != image's actual height ({page.imageHeight} != {exif.height})")
214
214
  if page.imageWidth != exif.width:
215
- self.report.add_error("PAGE '%s': @imageWidth != image's actual width (%s != %s)" % (f.ID, page.imageWidth, exif.width))
215
+ self.report.add_error(f"PAGE '{f.ID}': @imageWidth != image's actual width ({page.imageWidth} != {exif.width})")
216
216
 
217
217
  def _validate_multipage(self):
218
218
  """
@@ -229,9 +229,9 @@ class WorkspaceValidator():
229
229
  try:
230
230
  exif = self.workspace.resolve_image_exif(f.local_filename)
231
231
  if exif.n_frames > 1:
232
- self.report.add_error("Image %s: More than 1 frame: %s" % (f.ID, exif.n_frames))
232
+ self.report.add_error(f"Image '{f.ID}': More than 1 frame: {exif.n_frames}")
233
233
  except FileNotFoundError:
234
- self.report.add_error("Image %s: Could not retrieve %s (local_filename=%s, url=%s)" % (f.ID, f.local_filename, f.url))
234
+ self.report.add_error(f"Image '{f.ID}': Could not retrieve (local_filename='{f.local_filename}', url='{f.url}')")
235
235
  return
236
236
 
237
237
  def _validate_pixel_density(self):
@@ -250,7 +250,7 @@ class WorkspaceValidator():
250
250
  for k in ['xResolution', 'yResolution']:
251
251
  v = exif.__dict__.get(k)
252
252
  if v is None or v <= 72:
253
- self.report.add_notice("Image %s: %s (%s pixels per %s) is suspiciously low" % (f.ID, k, v, exif.resolutionUnit))
253
+ self.report.add_notice(f"Image '{f.ID}': {k} ({v} pixels per {exif.resolutionUnit}) is suspiciously low")
254
254
 
255
255
  def _validate_mets_file_group_names(self):
256
256
  """
@@ -261,7 +261,7 @@ class WorkspaceValidator():
261
261
  self.log.debug('_validate_mets_file_group_names')
262
262
  for fileGrp in self.mets.file_groups:
263
263
  if not fileGrp.startswith(FILE_GROUP_PREFIX):
264
- self.report.add_notice("fileGrp USE does not begin with '%s': %s" % (FILE_GROUP_PREFIX, fileGrp))
264
+ self.report.add_notice(f"fileGrp USE '{fileGrp}' does not begin with '{FILE_GROUP_PREFIX}'")
265
265
  else:
266
266
  # OCR-D-FOO-BAR -> ('FOO', 'BAR')
267
267
  # \____/\_/ \_/
@@ -273,9 +273,9 @@ class WorkspaceValidator():
273
273
  if '-' in category:
274
274
  category, name = category.split('-', 1)
275
275
  if category not in FILE_GROUP_CATEGORIES:
276
- self.report.add_notice("Unspecified USE category '%s' in fileGrp '%s'" % (category, fileGrp))
276
+ self.report.add_notice(f"Unspecified USE category '{category}' in fileGrp '{fileGrp}'")
277
277
  if name is not None and not re.match(r'^[A-Z0-9-]{3,}$', name):
278
- self.report.add_notice("Invalid USE name '%s' in fileGrp '%s'" % (name, fileGrp))
278
+ self.report.add_notice(f"Invalid USE name '{name}' in fileGrp '{fileGrp}'")
279
279
 
280
280
  def _validate_mets_files(self):
281
281
  """
@@ -288,16 +288,16 @@ class WorkspaceValidator():
288
288
  self.report.add_error("No files")
289
289
  for f in self.mets.find_files(**self.find_kwargs):
290
290
  if f._el.get('GROUPID'): # pylint: disable=protected-access
291
- self.report.add_notice("File '%s' has GROUPID attribute - document might need an update" % f.ID)
291
+ self.report.add_notice(f"File '{f.ID}' has GROUPID attribute - document might need an update")
292
292
  if not (f.url or f.local_filename):
293
- self.report.add_error("File '%s' has neither mets:Flocat[@LOCTYPE='URL']/@xlink:href nor mets:FLocat[@LOCTYPE='OTHER'][@OTHERLOCTYPE='FILE']/xlink:href" % f.ID)
293
+ self.report.add_error(f"File '{f.ID}' has neither mets:Flocat[@LOCTYPE='URL']/@xlink:href nor mets:FLocat[@LOCTYPE='OTHER'][@OTHERLOCTYPE='FILE']/xlink:href")
294
294
  continue
295
295
  if f.url and 'url' not in self.skip:
296
296
  if re.match(r'^file:/[^/]', f.url):
297
- self.report.add_error("File '%s' has an invalid (Java-specific) file URL '%s'" % (f.ID, f.url))
297
+ self.report.add_error(f"File '{f.ID}' has an invalid (Java-specific) file URL '{f.url}'")
298
298
  scheme = f.url[0:f.url.index(':')]
299
299
  if scheme not in ('http', 'https', 'file'):
300
- self.report.add_warning("File '%s' has non-HTTP, non-file URL '%s'" % (f.ID, f.url))
300
+ self.report.add_warning(f"File '{f.ID}' has non-HTTP, non-file URL '{f.url}'")
301
301
 
302
302
  def _validate_page(self):
303
303
  """
@@ -323,15 +323,15 @@ class WorkspaceValidator():
323
323
  if 'dimension' in self.page_checks:
324
324
  _, _, exif = self.workspace.image_from_page(page, f.pageId)
325
325
  if page.imageHeight != exif.height:
326
- self.report.add_error("PAGE '%s': @imageHeight != image's actual height (%s != %s)" % (f.ID, page.imageHeight, exif.height))
326
+ self.report.add_error(f"PAGE '{f.ID}': @imageHeight != image's actual height ({page.imageHeight} != {exif.height})")
327
327
  if page.imageWidth != exif.width:
328
- self.report.add_error("PAGE '%s': @imageWidth != image's actual width (%s != %s)" % (f.ID, page.imageWidth, exif.width))
328
+ self.report.add_error(f"PAGE '{f.ID}': @imageWidth != image's actual width ({page.imageWidth} != {exif.width})")
329
329
  if 'imagefilename' in self.page_checks:
330
330
  imageFilename = page.imageFilename
331
331
  if not self.mets.find_files(url=imageFilename):
332
- self.report.add_error("PAGE-XML %s : imageFilename '%s' not found in METS" % (f.url, imageFilename))
332
+ self.report.add_error(f"PAGE '{f.ID}': imageFilename '{imageFilename}' not found in METS")
333
333
  if is_local_filename(imageFilename) and not Path(imageFilename).exists():
334
- self.report.add_warning("PAGE-XML %s : imageFilename '%s' points to non-existent local file" % (f.url, imageFilename))
334
+ self.report.add_warning(f"PAGE '{f.ID}': imageFilename '{imageFilename}' points to non-existent local file")
335
335
  if 'mets_fileid_page_pcgtsid' in self.page_checks and pcgts.pcGtsId != f.ID:
336
336
  self.report.add_warning('pc:PcGts/@pcGtsId differs from mets:file/@ID: "%s" !== "%s"' % (pcgts.pcGtsId or '', f.ID or ''))
337
337
 
@@ -45,7 +45,7 @@ class XsdValidator():
45
45
  schema_url (str): URI of XML schema to validate against.
46
46
  """
47
47
  if schema_url not in XSD_PATHS:
48
- raise Exception('XML schema not bundled with OCR-D: %s' % schema_url)
48
+ raise ValueError('XML schema not bundled with OCR-D: %s' % schema_url)
49
49
  with open(XSD_PATHS[schema_url], 'r') as f:
50
50
  xmlschema_doc = ET.parse(f)
51
51
  self._xmlschema = ET.XMLSchema(xmlschema_doc)