ocrd 3.0.0a2__py3-none-any.whl → 3.0.0b2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ocrd/cli/__init__.py +34 -26
- ocrd/cli/bashlib.py +32 -18
- ocrd/cli/ocrd_tool.py +7 -5
- ocrd/cli/workspace.py +10 -8
- ocrd/decorators/__init__.py +13 -7
- ocrd/decorators/ocrd_cli_options.py +1 -1
- ocrd/lib.bash +3 -0
- ocrd/mets_server.py +3 -4
- ocrd/processor/__init__.py +1 -1
- ocrd/processor/base.py +421 -98
- ocrd/processor/builtin/dummy_processor.py +4 -11
- ocrd/processor/helpers.py +24 -161
- ocrd/processor/ocrd_page_result.py +3 -3
- ocrd/resolver.py +0 -3
- ocrd/resource_manager.py +9 -5
- ocrd/workspace.py +10 -11
- ocrd/workspace_backup.py +1 -1
- {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/METADATA +32 -10
- {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/RECORD +49 -48
- {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/WHEEL +1 -1
- ocrd_modelfactory/__init__.py +1 -1
- ocrd_models/constants.py +0 -1
- ocrd_models/ocrd_exif.py +2 -2
- ocrd_models/ocrd_file.py +2 -2
- ocrd_models/ocrd_mets.py +22 -22
- ocrd_models/ocrd_page.py +0 -1
- ocrd_models/ocrd_xml_base.py +2 -2
- ocrd_network/cli/client.py +134 -30
- ocrd_network/client.py +53 -27
- ocrd_network/client_utils.py +101 -0
- ocrd_network/processing_server.py +1 -1
- ocrd_network/runtime_data/deployer.py +12 -3
- ocrd_network/server_utils.py +12 -10
- ocrd_utils/__init__.py +2 -0
- ocrd_utils/config.py +31 -2
- ocrd_utils/image.py +25 -25
- ocrd_utils/logging.py +20 -20
- ocrd_utils/os.py +4 -5
- ocrd_utils/str.py +10 -3
- ocrd_validators/json_validator.py +1 -3
- ocrd_validators/ocrd_tool_validator.py +2 -2
- ocrd_validators/page_validator.py +56 -56
- ocrd_validators/parameter_validator.py +2 -2
- ocrd_validators/resource_list_validator.py +4 -3
- ocrd_validators/workspace_validator.py +21 -21
- ocrd_validators/xsd_validator.py +1 -1
- {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/LICENSE +0 -0
- {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/entry_points.txt +0 -0
- {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/top_level.txt +0 -0
ocrd_validators/json_validator.py CHANGED

@@ -2,7 +2,6 @@
 Validating JSON-Schema
 """
 import json
-from warnings import warn
 
 from jsonschema import Draft201909Validator, ValidationError, validators # pylint: disable=import-error
 
@@ -28,8 +27,7 @@ def extend_with_default(validator_class):
             if subschema.get('deprecated', False) and instance.get(prop):
                 yield JsonSchemaDeprecationWarning(f"Property {prop} has been deprecated, ocrd-tool.json should be updated.")
 
-        for error in validate_properties(validator, properties, instance, schema):
-            yield error
+        yield from validate_properties(validator, properties, instance, schema)
 
     return validators.extend(validator_class, {"properties": set_defaults_and_handle_deprecate})
 
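For context, the hunk above follows the standard jsonschema extension pattern: an extended "properties" check yields deprecation notices and then delegates to the stock validator via `yield from`. Below is a minimal, self-contained sketch of that pattern, illustrative only and not the ocrd_validators implementation; `DEMO_SCHEMA`, `extend_with_deprecation` and the warning class body are assumptions.

```python
# Sketch of the pattern exercised in the hunk above (not ocrd_validators itself).
from jsonschema import Draft201909Validator, ValidationError, validators


class JsonSchemaDeprecationWarning(ValidationError):
    """Collected when a deprecated property is used (name as in the hunk)."""


def extend_with_deprecation(validator_class):
    validate_properties = validator_class.VALIDATORS["properties"]

    def check_properties(validator, properties, instance, schema):
        for prop, subschema in properties.items():
            if subschema.get('deprecated', False) and instance.get(prop):
                yield JsonSchemaDeprecationWarning(f"Property {prop} has been deprecated")
        # delegate the actual structural validation to the original checker
        yield from validate_properties(validator, properties, instance, schema)

    return validators.extend(validator_class, {"properties": check_properties})


DEMO_SCHEMA = {  # hypothetical schema, only for illustration
    "type": "object",
    "properties": {"old_param": {"type": "string", "deprecated": True}},
}

if __name__ == "__main__":
    validator = extend_with_deprecation(Draft201909Validator)(DEMO_SCHEMA)
    for error in validator.iter_errors({"old_param": "x"}):
        print(type(error).__name__, error.message)
```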
ocrd_validators/ocrd_tool_validator.py CHANGED

@@ -22,5 +22,5 @@ class OcrdToolValidator(JsonValidator):
         """
         return OcrdToolValidator(schema)._validate(obj) # pylint: disable=protected-access
 
-    def __init__(self, schema
-        super().__init__(schema, DefaultValidatingDraft20199Validator)
+    def __init__(self, schema):
+        super().__init__(schema, validator_class=DefaultValidatingDraft20199Validator)
ocrd_validators/page_validator.py CHANGED

@@ -34,50 +34,50 @@ from ocrd_models import ValidationReport
 
 _HIERARCHY = [
     # page can contain different types of regions
-    (PageType, 'get_AdvertRegion', None),
-    (PageType, 'get_ChartRegion', None),
-    (PageType, 'get_ChemRegion', None),
-    (PageType, 'get_CustomRegion', None),
-    (PageType, 'get_GraphicRegion', None),
-    (PageType, 'get_ImageRegion', None),
-    (PageType, 'get_LineDrawingRegion', None),
-    (PageType, 'get_MapRegion', None),
-    (PageType, 'get_MathsRegion', None),
-    (PageType, 'get_MusicRegion', None),
-    (PageType, 'get_NoiseRegion', None),
-    (PageType, 'get_SeparatorRegion', None),
-    (PageType, 'get_TableRegion', None),
-    (PageType, 'get_TextRegion', None),
-    (PageType, 'get_UnknownRegion', None),
+    (PageType, 'get_AdvertRegion', None),
+    (PageType, 'get_ChartRegion', None),
+    (PageType, 'get_ChemRegion', None),
+    (PageType, 'get_CustomRegion', None),
+    (PageType, 'get_GraphicRegion', None),
+    (PageType, 'get_ImageRegion', None),
+    (PageType, 'get_LineDrawingRegion', None),
+    (PageType, 'get_MapRegion', None),
+    (PageType, 'get_MathsRegion', None),
+    (PageType, 'get_MusicRegion', None),
+    (PageType, 'get_NoiseRegion', None),
+    (PageType, 'get_SeparatorRegion', None),
+    (PageType, 'get_TableRegion', None),
+    (PageType, 'get_TextRegion', None),
+    (PageType, 'get_UnknownRegion', None),
     # all regions can be recursive
-    (RegionType, 'get_AdvertRegion', None),
-    (RegionType, 'get_ChartRegion', None),
-    (RegionType, 'get_ChemRegion', None),
-    (RegionType, 'get_CustomRegion', None),
-    (RegionType, 'get_GraphicRegion', None),
-    (RegionType, 'get_ImageRegion', None),
-    (RegionType, 'get_LineDrawingRegion', None),
-    #(RegionType, 'get_MapRegion', None),
-    (RegionType, 'get_MathsRegion', None),
-    (RegionType, 'get_MusicRegion', None),
-    (RegionType, 'get_NoiseRegion', None),
-    (RegionType, 'get_SeparatorRegion', None),
-    (RegionType, 'get_TableRegion', None),
-    (RegionType, 'get_TextRegion', None),
-    (RegionType, 'get_UnknownRegion', None),
+    (RegionType, 'get_AdvertRegion', None),
+    (RegionType, 'get_ChartRegion', None),
+    (RegionType, 'get_ChemRegion', None),
+    (RegionType, 'get_CustomRegion', None),
+    (RegionType, 'get_GraphicRegion', None),
+    (RegionType, 'get_ImageRegion', None),
+    (RegionType, 'get_LineDrawingRegion', None),
+    #(RegionType, 'get_MapRegion', None),
+    (RegionType, 'get_MathsRegion', None),
+    (RegionType, 'get_MusicRegion', None),
+    (RegionType, 'get_NoiseRegion', None),
+    (RegionType, 'get_SeparatorRegion', None),
+    (RegionType, 'get_TableRegion', None),
+    (RegionType, 'get_TextRegion', None),
+    (RegionType, 'get_UnknownRegion', None),
     # only TextRegion can contain TextLine
-    (TextRegionType, 'get_TextLine', '\n'),
-    (TextLineType, 'get_Word', ' '),
-    (WordType, 'get_Glyph', ''),
-    (GlyphType, None, None),
+    (TextRegionType, 'get_TextLine', '\n'),
+    (TextLineType, 'get_Word', ' '),
+    (WordType, 'get_Glyph', ''),
+    (GlyphType, None, None),
 ]
 
 _ORDER = [
     (None, TextLineOrderSimpleType.BOTTOMTOTOP, ReadingDirectionSimpleType.RIGHTTOLEFT),
-    (PageType, 'get_textLineOrder', 'get_readingDirection'),
-    (TextRegionType, 'get_textLineOrder', 'get_readingDirection'),
-    (TextLineType, None, 'get_readingDirection'),
-    (WordType, None, 'get_readingDirection'),
+    (PageType, 'get_textLineOrder', 'get_readingDirection'),
+    (TextRegionType, 'get_textLineOrder', 'get_readingDirection'),
+    (TextLineType, None, 'get_readingDirection'),
+    (WordType, None, 'get_readingDirection'),
 ]
 
 # The following parameters control how tolerant we are with respect to
@@ -115,9 +115,9 @@ class ConsistencyError(Exception):
         self.file_id = file_id
         self.actual = actual
         self.expected = expected
-        super(
-            "INCONSISTENCY in
-
+        super().__init__(
+            f"INCONSISTENCY in {tag} ID '{ID}' of file '{file_id}': "
+            f"text results '{actual}' != concatenated '{expected}'")
 
 class CoordinateConsistencyError(Exception):
     """
@@ -141,9 +141,9 @@ class CoordinateConsistencyError(Exception):
         self.file_id = file_id
         self.outer = outer
         self.inner = inner
-        super(
-            "INCONSISTENCY in
-
+        super().__init__(
+            f"INCONSISTENCY in {tag} ID '{ID}' of '{file_id}': "
+            f"coords '{inner}' not within parent coords '{outer}'")
 
 class CoordinateValidityError(Exception):
     """
@@ -166,9 +166,8 @@ class CoordinateValidityError(Exception):
         self.ID = ID
         self.file_id = file_id
         self.points = points
-        super(
-            "INVALIDITY in
-            tag, ID, file_id, points, reason))
+        super().__init__(
+            f"INVALIDITY in {tag} ID '{ID}' of '{file_id}': coords '{points}' - {reason}")
 
 def compare_without_whitespace(a, b):
     """
@@ -177,13 +176,14 @@ def compare_without_whitespace(a, b):
     return re.sub('\\s+', '', a) == re.sub('\\s+', '', b)
 
 def page_get_reading_order(ro, rogroup):
-    """
-
+    """
+    Add all elements from the given reading order group to the given dictionary.
+
     Given a dict ``ro`` from layout element IDs to ReadingOrder element objects,
     and an object ``rogroup`` with additional ReadingOrder element objects,
     add all references to the dict, traversing the group recursively.
     """
-    regionrefs =
+    regionrefs = []
     if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)):
         regionrefs = (rogroup.get_RegionRefIndexed() +
                       rogroup.get_OrderedGroupIndexed() +
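The docstring added above describes a recursive walk over a (possibly nested) ReadingOrder group, recording every region reference in a dict. A minimal sketch of the same traversal shape, using plain dictionaries as stand-ins for the PAGE-XML generateDS types (`OrderedGroupType`, `RegionRefIndexedType`, ...); the names and structure here are illustrative assumptions.

```python
# Illustrative sketch only: nested dicts stand in for PAGE-XML reading-order types.
def reading_order_to_dict(ro, rogroup):
    """Collect every regionRef in `rogroup` (recursively) into the dict `ro`."""
    for member in rogroup.get("members", []):
        if "regionRef" in member:
            # leaf: remember which group object referenced this region
            ro[member["regionRef"]] = rogroup
        if "members" in member:
            # nested OrderedGroup/UnorderedGroup: recurse
            reading_order_to_dict(ro, member)


if __name__ == "__main__":
    demo_group = {  # hypothetical nested reading order
        "members": [
            {"regionRef": "r1"},
            {"members": [{"regionRef": "r2"}, {"regionRef": "r3"}]},
        ],
    }
    order = {}
    reading_order_to_dict(order, demo_group)
    print(sorted(order))  # ['r1', 'r2', 'r3']
```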
@@ -241,12 +241,12 @@ def validate_consistency(node, page_textequiv_consistency, page_textequiv_strate
     node_id = node.get_pcGtsId()
     node = node.get_Page() # has no .id
     if not readingOrder:
-        readingOrder =
+        readingOrder = {}
     ro = node.get_ReadingOrder()
     if ro:
         page_get_reading_order(readingOrder, ro.get_OrderedGroup() or ro.get_UnorderedGroup())
     if not joinRelations:
-        joinRelations =
+        joinRelations = []
     relations = node.get_Relations() # get RelationsType
     if relations:
         relations = relations.get_Relation() # get list of RelationType
@@ -358,7 +358,7 @@ def concatenate(nodes, concatenate_with, page_textequiv_strategy, joins=None):
     if not nodes:
         return ''
     if not joins:
-        joins =
+        joins = []
     result = get_text(nodes[0], page_textequiv_strategy)
     for node, next_node in zip(nodes, nodes[1:]):
         if (node.id, next_node.id) not in joins:
@@ -470,11 +470,11 @@ class PageValidator():
             page = parse(filename, silence=True)
             file_id = filename
         else:
-            raise
+            raise ValueError("At least one of ocrd_page, ocrd_file or filename must be set")
         if page_textequiv_strategy not in ('first'):
-            raise
+            raise ValueError("page_textequiv_strategy %s not implemented" % page_textequiv_strategy)
         if page_textequiv_consistency not in ('strict', 'lax', 'fix', 'off'):
-            raise
+            raise ValueError("page_textequiv_consistency level %s not implemented" % page_textequiv_consistency)
         report = ValidationReport()
         log.info("Validating input file '%s'", file_id)
         validate_consistency(page, page_textequiv_consistency, page_textequiv_strategy, check_baseline, check_coords, report, file_id)
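The hunk above replaces bare `raise` statements with explicit `ValueError`s when `PageValidator.validate` is called without a page source or with an unsupported strategy/consistency level. A hedged usage sketch follows, assuming the keyword names visible in the hunk (`filename`, `page_textequiv_strategy`, `page_textequiv_consistency`); the exact signature and report attributes may differ.

```python
# Sketch under assumptions: keyword names taken from the hunk above; the path
# is a placeholder and the report attributes (errors/warnings) are assumed.
from ocrd_validators import PageValidator

try:
    report = PageValidator.validate(
        filename='OCR-D-OCR/FILE_0001.xml',      # hypothetical PAGE-XML path
        page_textequiv_strategy='first',         # only 'first' is implemented
        page_textequiv_consistency='strict')     # 'strict' | 'lax' | 'fix' | 'off'
    print(report.errors, report.warnings)
except ValueError as err:
    # raised (as of this version) for missing input or unsupported levels
    print("bad arguments:", err)
```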
ocrd_validators/parameter_validator.py CHANGED

@@ -20,7 +20,7 @@ class ParameterValidator(JsonValidator):
             obj (dict):
             schema (dict):
         """
-        return super(
+        return super()._validate(*args, **kwargs)
 
     def __init__(self, ocrd_tool):
         """
@@ -40,7 +40,7 @@ class ParameterValidator(JsonValidator):
             if p[n]['required']:
                 required.append(n)
                 del(p[n]['required'])
-        super(
+        super().__init__({
             "type": "object",
             "required": required,
             "additionalProperties": False,
ocrd_validators/resource_list_validator.py CHANGED

@@ -16,9 +16,10 @@ class OcrdResourceListValidator(JsonValidator):
     """
 
     @staticmethod
-    def validate(obj, schema=
+    def validate(obj, schema=None):
         """
         Validate against ``resource_list.schema.yml`` schema.
         """
-
-
+        if schema is None:
+            schema = RESOURCE_LIST_SCHEMA
+        return JsonValidator(schema, validator_class=DefaultValidatingDraft20199Validator)._validate(obj) # pylint: disable=protected-access
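Several hunks in this release follow the same idiom: the `schema=None` default here, and `readingOrder = {}`, `joinRelations = []`, `joins = []` in page_validator.py. The optional argument defaults to `None` (or stays falsy) and the fresh container is created inside the function, so a single shared mutable default is never reused across calls. A minimal illustration with made-up names:

```python
# Minimal illustration of the idiom used in the hunks above: never put a
# mutable object in the signature's default; create it per call instead.
def collect(item, bucket=None):
    if bucket is None:
        bucket = []          # fresh list on every call
    bucket.append(item)
    return bucket

# Each call without an explicit bucket gets its own list:
assert collect('a') == ['a']
assert collect('b') == ['b']     # would be ['a', 'b'] with `bucket=[]` as default
```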
ocrd_validators/workspace_validator.py CHANGED

@@ -103,7 +103,7 @@ class WorkspaceValidator():
             'page_xsd']
             if check not in self.skip]
 
-        self.find_kwargs =
+        self.find_kwargs = {"include_fileGrp": include_fileGrp, "exclude_fileGrp": exclude_fileGrp}
         self.src_dir = src_dir
         self.workspace = None
         self.mets = None
@@ -139,7 +139,7 @@ class WorkspaceValidator():
             self._resolve_workspace()
         except Exception as e: # pylint: disable=broad-except
             self.log.warning("Failed to instantiate workspace: %s", e)
-            self.report.add_error("Failed to instantiate workspace:
+            self.report.add_error(f"Failed to instantiate workspace: {e}")
             return self.report
         with pushd_popd(self.workspace.directory):
             try:
@@ -158,7 +158,7 @@ class WorkspaceValidator():
                 if self.page_checks:
                     self._validate_page()
             except Exception: # pylint: disable=broad-except
-                self.report.add_error("Validation aborted with exception:
+                self.report.add_error(f"Validation aborted with exception: {format_exc()}")
         return self.report
 
     def _resolve_workspace(self):
@@ -193,9 +193,9 @@ class WorkspaceValidator():
             page = page_from_file(f).get_Page()
             imageFilename = page.imageFilename
             if not self.mets.find_files(url=imageFilename, **self.find_kwargs):
-                self.report.add_error("PAGE
+                self.report.add_error(f"PAGE '{f.ID}': imageFilename '{imageFilename}' not found in METS")
             if is_local_filename(imageFilename) and not Path(imageFilename).exists():
-                self.report.add_warning("PAGE
+                self.report.add_warning(f"PAGE '{f.ID}': imageFilename '{imageFilename}' points to non-existent local file")
 
     def _validate_dimension(self):
         """
@@ -210,9 +210,9 @@ class WorkspaceValidator():
             page = page_from_file(f).get_Page()
             _, _, exif = self.workspace.image_from_page(page, f.pageId)
             if page.imageHeight != exif.height:
-                self.report.add_error("PAGE '
+                self.report.add_error(f"PAGE '{f.ID}': @imageHeight != image's actual height ({page.imageHeight} != {exif.height})")
             if page.imageWidth != exif.width:
-                self.report.add_error("PAGE '
+                self.report.add_error(f"PAGE '{f.ID}': @imageWidth != image's actual width ({page.imageWidth} != {exif.width})")
 
     def _validate_multipage(self):
         """
@@ -229,9 +229,9 @@ class WorkspaceValidator():
             try:
                 exif = self.workspace.resolve_image_exif(f.local_filename)
                 if exif.n_frames > 1:
-                    self.report.add_error("Image
+                    self.report.add_error(f"Image '{f.ID}': More than 1 frame: {exif.n_frames}")
             except FileNotFoundError:
-                self.report.add_error("Image
+                self.report.add_error(f"Image '{f.ID}': Could not retrieve (local_filename='{f.local_filename}', url='{f.url}')")
                 return
 
     def _validate_pixel_density(self):
@@ -250,7 +250,7 @@ class WorkspaceValidator():
             for k in ['xResolution', 'yResolution']:
                 v = exif.__dict__.get(k)
                 if v is None or v <= 72:
-                    self.report.add_notice("Image
+                    self.report.add_notice(f"Image '{f.ID}': {k} ({v} pixels per {exif.resolutionUnit}) is suspiciously low")
 
     def _validate_mets_file_group_names(self):
         """
@@ -261,7 +261,7 @@ class WorkspaceValidator():
         self.log.debug('_validate_mets_file_group_names')
         for fileGrp in self.mets.file_groups:
             if not fileGrp.startswith(FILE_GROUP_PREFIX):
-                self.report.add_notice("fileGrp USE does not begin with '
+                self.report.add_notice(f"fileGrp USE '{fileGrp}' does not begin with '{FILE_GROUP_PREFIX}'")
             else:
                 # OCR-D-FOO-BAR -> ('FOO', 'BAR')
                 # \____/\_/ \_/
@@ -273,9 +273,9 @@ class WorkspaceValidator():
                 if '-' in category:
                     category, name = category.split('-', 1)
                 if category not in FILE_GROUP_CATEGORIES:
-                    self.report.add_notice("Unspecified USE category '
+                    self.report.add_notice(f"Unspecified USE category '{category}' in fileGrp '{fileGrp}'")
                 if name is not None and not re.match(r'^[A-Z0-9-]{3,}$', name):
-                    self.report.add_notice("Invalid USE name '
+                    self.report.add_notice(f"Invalid USE name '{name}' in fileGrp '{fileGrp}'")
 
     def _validate_mets_files(self):
         """
@@ -288,16 +288,16 @@ class WorkspaceValidator():
             self.report.add_error("No files")
         for f in self.mets.find_files(**self.find_kwargs):
             if f._el.get('GROUPID'): # pylint: disable=protected-access
-                self.report.add_notice("File '
+                self.report.add_notice(f"File '{f.ID}' has GROUPID attribute - document might need an update")
             if not (f.url or f.local_filename):
-                self.report.add_error("File '
+                self.report.add_error(f"File '{f.ID}' has neither mets:Flocat[@LOCTYPE='URL']/@xlink:href nor mets:FLocat[@LOCTYPE='OTHER'][@OTHERLOCTYPE='FILE']/xlink:href")
                 continue
             if f.url and 'url' not in self.skip:
                 if re.match(r'^file:/[^/]', f.url):
-                    self.report.add_error("File '
+                    self.report.add_error(f"File '{f.ID}' has an invalid (Java-specific) file URL '{f.url}'")
                 scheme = f.url[0:f.url.index(':')]
                 if scheme not in ('http', 'https', 'file'):
-                    self.report.add_warning("File '
+                    self.report.add_warning(f"File '{f.ID}' has non-HTTP, non-file URL '{f.url}'")
 
     def _validate_page(self):
         """
@@ -323,15 +323,15 @@ class WorkspaceValidator():
             if 'dimension' in self.page_checks:
                 _, _, exif = self.workspace.image_from_page(page, f.pageId)
                 if page.imageHeight != exif.height:
-                    self.report.add_error("PAGE '
+                    self.report.add_error(f"PAGE '{f.ID}': @imageHeight != image's actual height ({page.imageHeight} != {exif.height})")
                 if page.imageWidth != exif.width:
-                    self.report.add_error("PAGE '
+                    self.report.add_error(f"PAGE '{f.ID}': @imageWidth != image's actual width ({page.imageWidth} != {exif.width})")
             if 'imagefilename' in self.page_checks:
                 imageFilename = page.imageFilename
                 if not self.mets.find_files(url=imageFilename):
-                    self.report.add_error("PAGE
+                    self.report.add_error(f"PAGE '{f.ID}': imageFilename '{imageFilename}' not found in METS")
                 if is_local_filename(imageFilename) and not Path(imageFilename).exists():
-                    self.report.add_warning("PAGE
+                    self.report.add_warning(f"PAGE '{f.ID}': imageFilename '{imageFilename}' points to non-existent local file")
             if 'mets_fileid_page_pcgtsid' in self.page_checks and pcgts.pcGtsId != f.ID:
                 self.report.add_warning('pc:PcGts/@pcGtsId differs from mets:file/@ID: "%s" !== "%s"' % (pcgts.pcGtsId or '', f.ID or ''))
 
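The first workspace_validator.py hunk stores the fileGrp include/exclude filters once in `self.find_kwargs`, and the later hunks forward them unchanged to `self.mets.find_files(**self.find_kwargs)`. A small sketch of that kwargs-forwarding pattern with stand-in classes (not the ocrd_models METS implementation):

```python
# Sketch of the kwargs-forwarding pattern from the hunks above; DemoMets and
# DemoValidator are stand-ins, not ocrd_models/ocrd_validators classes.
class DemoMets:
    def __init__(self, files):
        self._files = files  # list of (fileGrp, url) pairs

    def find_files(self, include_fileGrp=None, exclude_fileGrp=None, url=None):
        for grp, file_url in self._files:
            if include_fileGrp and grp not in include_fileGrp:
                continue
            if exclude_fileGrp and grp in exclude_fileGrp:
                continue
            if url and file_url != url:
                continue
            yield (grp, file_url)


class DemoValidator:
    def __init__(self, mets, include_fileGrp=None, exclude_fileGrp=None):
        # store the filters once ...
        self.find_kwargs = {"include_fileGrp": include_fileGrp, "exclude_fileGrp": exclude_fileGrp}
        self.mets = mets

    def image_referenced(self, image_url):
        # ... and forward them to every lookup, as the validator does
        return any(self.mets.find_files(url=image_url, **self.find_kwargs))


mets = DemoMets([("OCR-D-IMG", "img/0001.tif"), ("OCR-D-OCR", "ocr/0001.xml")])
v = DemoValidator(mets, include_fileGrp=["OCR-D-IMG"])
print(v.image_referenced("img/0001.tif"))   # True
print(v.image_referenced("ocr/0001.xml"))   # False (filtered out by fileGrp)
```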
ocrd_validators/xsd_validator.py CHANGED

@@ -45,7 +45,7 @@ class XsdValidator():
             schema_url (str): URI of XML schema to validate against.
         """
         if schema_url not in XSD_PATHS:
-            raise
+            raise ValueError('XML schema not bundled with OCR-D: %s' % schema_url)
         with open(XSD_PATHS[schema_url], 'r') as f:
             xmlschema_doc = ET.parse(f)
             self._xmlschema = ET.XMLSchema(xmlschema_doc)
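The hunk above makes the constructor raise an explicit `ValueError` for schema URLs that are not bundled; the surrounding context shows the underlying lxml pattern (parse the XSD, build an `XMLSchema` object). A minimal standalone sketch of that lxml usage, with placeholder paths and a hypothetical `XSD_PATHS` map:

```python
# Minimal lxml sketch of the pattern visible in the hunk (not XsdValidator itself);
# the paths here are placeholders.
from lxml import etree as ET

XSD_PATHS = {'mets': '/path/to/mets.xsd'}      # hypothetical bundled-schema map

def load_schema(schema_url):
    if schema_url not in XSD_PATHS:
        raise ValueError('XML schema not bundled: %s' % schema_url)
    with open(XSD_PATHS[schema_url], 'r') as f:
        return ET.XMLSchema(ET.parse(f))

# Typical use: validate a document and inspect the error log on failure.
# schema = load_schema('mets')
# doc = ET.parse('mets.xml')
# if not schema.validate(doc):
#     for err in schema.error_log:
#         print(err.line, err.message)
```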
File without changes
|
|
File without changes
|
|
File without changes
|