ocrd 3.5.1__py3-none-any.whl → 3.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ocrd/cli/__init__.py +8 -6
- ocrd/cli/bashlib.py +8 -114
- ocrd/cli/network.py +0 -2
- ocrd/cli/ocrd_tool.py +26 -4
- ocrd/cli/process.py +1 -0
- ocrd/cli/resmgr.py +0 -1
- ocrd/cli/validate.py +32 -13
- ocrd/cli/workspace.py +125 -52
- ocrd/cli/zip.py +13 -4
- ocrd/decorators/__init__.py +28 -52
- ocrd/decorators/loglevel_option.py +4 -0
- ocrd/decorators/mets_find_options.py +2 -1
- ocrd/decorators/ocrd_cli_options.py +3 -7
- ocrd/decorators/parameter_option.py +12 -11
- ocrd/mets_server.py +11 -15
- ocrd/processor/base.py +88 -71
- ocrd/processor/builtin/dummy_processor.py +7 -4
- ocrd/processor/builtin/filter_processor.py +3 -2
- ocrd/processor/helpers.py +5 -6
- ocrd/processor/ocrd_page_result.py +7 -5
- ocrd/resolver.py +42 -32
- ocrd/task_sequence.py +11 -4
- ocrd/workspace.py +64 -54
- ocrd/workspace_backup.py +3 -0
- ocrd/workspace_bagger.py +15 -8
- {ocrd-3.5.1.dist-info → ocrd-3.7.0.dist-info}/METADATA +2 -8
- ocrd-3.7.0.dist-info/RECORD +123 -0
- ocrd_modelfactory/__init__.py +4 -2
- ocrd_models/constants.py +18 -1
- ocrd_models/ocrd_agent.py +1 -1
- ocrd_models/ocrd_exif.py +7 -3
- ocrd_models/ocrd_file.py +24 -19
- ocrd_models/ocrd_mets.py +90 -67
- ocrd_models/ocrd_page.py +17 -13
- ocrd_models/ocrd_xml_base.py +1 -0
- ocrd_models/report.py +2 -1
- ocrd_models/utils.py +4 -3
- ocrd_models/xpath_functions.py +3 -1
- ocrd_network/__init__.py +1 -2
- ocrd_network/cli/__init__.py +0 -2
- ocrd_network/cli/client.py +122 -50
- ocrd_network/cli/processing_server.py +1 -2
- ocrd_network/client.py +2 -2
- ocrd_network/client_utils.py +30 -13
- ocrd_network/constants.py +1 -6
- ocrd_network/database.py +3 -3
- ocrd_network/logging_utils.py +2 -7
- ocrd_network/models/__init__.py +0 -2
- ocrd_network/models/job.py +31 -33
- ocrd_network/models/messages.py +3 -2
- ocrd_network/models/workspace.py +5 -5
- ocrd_network/process_helpers.py +54 -17
- ocrd_network/processing_server.py +63 -114
- ocrd_network/processing_worker.py +6 -5
- ocrd_network/rabbitmq_utils/__init__.py +2 -0
- ocrd_network/rabbitmq_utils/helpers.py +24 -7
- ocrd_network/runtime_data/__init__.py +1 -2
- ocrd_network/runtime_data/deployer.py +12 -85
- ocrd_network/runtime_data/hosts.py +61 -130
- ocrd_network/runtime_data/network_agents.py +7 -31
- ocrd_network/runtime_data/network_services.py +1 -1
- ocrd_network/server_cache.py +1 -1
- ocrd_network/server_utils.py +13 -52
- ocrd_network/utils.py +1 -0
- ocrd_utils/__init__.py +4 -4
- ocrd_utils/config.py +86 -76
- ocrd_utils/deprecate.py +3 -0
- ocrd_utils/image.py +51 -23
- ocrd_utils/introspect.py +8 -3
- ocrd_utils/logging.py +15 -7
- ocrd_utils/os.py +17 -4
- ocrd_utils/str.py +32 -16
- ocrd_validators/json_validator.py +4 -1
- ocrd_validators/ocrd_tool_validator.py +2 -1
- ocrd_validators/ocrd_zip_validator.py +5 -4
- ocrd_validators/page_validator.py +21 -9
- ocrd_validators/parameter_validator.py +3 -2
- ocrd_validators/processing_server_config.schema.yml +1 -33
- ocrd_validators/resource_list_validator.py +3 -1
- ocrd_validators/workspace_validator.py +30 -20
- ocrd_validators/xsd_mets_validator.py +2 -1
- ocrd_validators/xsd_page_validator.py +2 -1
- ocrd_validators/xsd_validator.py +4 -2
- ocrd/cli/log.py +0 -51
- ocrd/lib.bash +0 -317
- ocrd-3.5.1.dist-info/RECORD +0 -128
- ocrd_network/cli/processor_server.py +0 -31
- ocrd_network/models/ocrd_tool.py +0 -12
- ocrd_network/processor_server.py +0 -255
- {ocrd-3.5.1.dist-info → ocrd-3.7.0.dist-info}/LICENSE +0 -0
- {ocrd-3.5.1.dist-info → ocrd-3.7.0.dist-info}/WHEEL +0 -0
- {ocrd-3.5.1.dist-info → ocrd-3.7.0.dist-info}/entry_points.txt +0 -0
- {ocrd-3.5.1.dist-info → ocrd-3.7.0.dist-info}/top_level.txt +0 -0
ocrd_utils/str.py
CHANGED
|
@@ -37,13 +37,14 @@ def assert_file_grp_cardinality(grps, n, msg=None):
|
|
|
37
37
|
if isinstance(grps, str):
|
|
38
38
|
grps = grps.split(',')
|
|
39
39
|
assert len(grps) == n, \
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
40
|
+
"Expected exactly %d output file group%s%s, but '%s' has %d" % (
|
|
41
|
+
n,
|
|
42
|
+
'' if n == 1 else 's',
|
|
43
|
+
' (%s)' % msg if msg else '',
|
|
44
|
+
grps,
|
|
45
|
+
len(grps)
|
|
46
|
+
)
|
|
47
|
+
|
|
47
48
|
|
|
48
49
|
def concat_padded(base, *args):
|
|
49
50
|
"""
|
|
@@ -54,18 +55,20 @@ def concat_padded(base, *args):
|
|
|
54
55
|
if is_string(n):
|
|
55
56
|
ret = "%s_%s" % (ret, n)
|
|
56
57
|
else:
|
|
57
|
-
ret = "%s_%04i"
|
|
58
|
+
ret = "%s_%04i" % (ret, n)
|
|
58
59
|
return ret
|
|
59
60
|
|
|
61
|
+
|
|
60
62
|
def remove_non_path_from_url(url):
|
|
61
63
|
"""
|
|
62
64
|
Remove everything from URL after path.
|
|
63
65
|
"""
|
|
64
|
-
url = url.split('?', 1)[0]
|
|
65
|
-
url = url.split('#', 1)[0]
|
|
66
|
-
url = re.sub(r"/+$", "", url)
|
|
66
|
+
url = url.split('?', 1)[0] # query
|
|
67
|
+
url = url.split('#', 1)[0] # fragment identifier
|
|
68
|
+
url = re.sub(r"/+$", "", url) # trailing slashes
|
|
67
69
|
return url
|
|
68
70
|
|
|
71
|
+
|
|
69
72
|
def make_file_id(ocrd_file, output_file_grp):
|
|
70
73
|
"""
|
|
71
74
|
Derive a new file ID for an output file from an existing input file ``ocrd_file``
|
|
@@ -101,9 +104,12 @@ def make_file_id(ocrd_file, output_file_grp):
|
|
|
101
104
|
ret = output_file_grp + '_' + ocrd_file.ID
|
|
102
105
|
return make_xml_id(ret)
|
|
103
106
|
|
|
107
|
+
|
|
104
108
|
def make_xml_id(idstr: str) -> str:
|
|
105
109
|
"""
|
|
106
|
-
Turn ``idstr`` into a valid ``xml:id`` literal by replacing ``:`` with ``_``,
|
|
110
|
+
Turn ``idstr`` into a valid ``xml:id`` literal by replacing ``:`` with ``_``,
|
|
111
|
+
removing everything non-alphanumeric, ``.`` and ``-`` and prepending `id_`
|
|
112
|
+
if ``idstr`` starts with a number.
|
|
107
113
|
"""
|
|
108
114
|
ret = idstr
|
|
109
115
|
if not REGEX_FILE_ID.fullmatch(ret):
|
|
@@ -113,6 +119,7 @@ def make_xml_id(idstr: str) -> str:
|
|
|
113
119
|
ret = re.sub(r'[^\w.-]', r'', ret)
|
|
114
120
|
return ret
|
|
115
121
|
|
|
122
|
+
|
|
116
123
|
def nth_url_segment(url, n=-1):
|
|
117
124
|
"""
|
|
118
125
|
Return the last /-delimited segment of a URL-like string
|
|
@@ -127,6 +134,7 @@ def nth_url_segment(url, n=-1):
|
|
|
127
134
|
except IndexError:
|
|
128
135
|
return ''
|
|
129
136
|
|
|
137
|
+
|
|
130
138
|
def get_local_filename(url, start=None):
|
|
131
139
|
"""
|
|
132
140
|
Return local filename, optionally relative to ``start``
|
|
@@ -150,12 +158,14 @@ def get_local_filename(url, start=None):
|
|
|
150
158
|
url = url[len(start):]
|
|
151
159
|
return url
|
|
152
160
|
|
|
161
|
+
|
|
153
162
|
def is_local_filename(url):
|
|
154
163
|
"""
|
|
155
164
|
Whether a url is a local filename.
|
|
156
165
|
"""
|
|
157
166
|
# deprecation_warning("Deprecated so we spot inconsistent URL/file handling")
|
|
158
|
-
return url.startswith('file://') or
|
|
167
|
+
return url.startswith('file://') or '://' not in url
|
|
168
|
+
|
|
159
169
|
|
|
160
170
|
def is_string(val):
|
|
161
171
|
"""
|
|
@@ -171,6 +181,7 @@ def parse_json_file_with_comments(val):
|
|
|
171
181
|
with open(val, 'r', encoding='utf-8') as inputf:
|
|
172
182
|
return parse_json_string_with_comments(inputf.read())
|
|
173
183
|
|
|
184
|
+
|
|
174
185
|
def parse_json_string_with_comments(val):
|
|
175
186
|
"""
|
|
176
187
|
Parse a string of JSON interspersed with #-prefixed full-line comments
|
|
@@ -178,6 +189,7 @@ def parse_json_string_with_comments(val):
|
|
|
178
189
|
jsonstr = re.sub(r'^\s*#.*$', '', val, flags=re.MULTILINE)
|
|
179
190
|
return json.loads(jsonstr)
|
|
180
191
|
|
|
192
|
+
|
|
181
193
|
def parse_json_string_or_file(*values, resolve_preset_file=None): # pylint: disable=unused-argument
|
|
182
194
|
"""
|
|
183
195
|
Parse a string as either the path to a JSON object or a literal JSON object.
|
|
@@ -208,6 +220,7 @@ def parse_json_string_or_file(*values, resolve_preset_file=None): # pylint: d
|
|
|
208
220
|
ret = {**ret, **value_parsed}
|
|
209
221
|
return ret
|
|
210
222
|
|
|
223
|
+
|
|
211
224
|
def safe_filename(url):
|
|
212
225
|
"""
|
|
213
226
|
Sanitize input to be safely used as the basename of a local file.
|
|
@@ -218,7 +231,8 @@ def safe_filename(url):
|
|
|
218
231
|
# print('safe filename: %s -> %s' % (url, ret))
|
|
219
232
|
return ret
|
|
220
233
|
|
|
221
|
-
|
|
234
|
+
|
|
235
|
+
def generate_range(start: str, end: str) -> List[str]:
|
|
222
236
|
"""
|
|
223
237
|
Generate a list of strings by incrementing the number part of ``start`` until including ``end``.
|
|
224
238
|
"""
|
|
@@ -228,7 +242,8 @@ def generate_range(start : str, end : str) -> List[str]:
|
|
|
228
242
|
except IndexError:
|
|
229
243
|
raise ValueError("Range '%s..%s': could not find numeric part" % (start, end))
|
|
230
244
|
if start[:-len(start_num)] != end[:-len(end_num)]:
|
|
231
|
-
raise ValueError(f"Range '{start}..{end}' differ in their non-numeric part:
|
|
245
|
+
raise ValueError(f"Range '{start}..{end}' differ in their non-numeric part: "
|
|
246
|
+
f"'{start[:-len(start_num)]}' != '{end[:-len(end_num)]}'")
|
|
232
247
|
if start_num == end_num:
|
|
233
248
|
warn("Range '%s..%s': evaluates to the same number")
|
|
234
249
|
for i in range(int(start_num), int(end_num) + 1):
|
|
@@ -261,7 +276,8 @@ def partition_list(lst, chunks, chunk_index=None):
|
|
|
261
276
|
return [ret[chunk_index]]
|
|
262
277
|
return ret
|
|
263
278
|
|
|
264
|
-
|
|
279
|
+
|
|
280
|
+
def sparkline(values: List[int]) -> str:
|
|
265
281
|
"""
|
|
266
282
|
Render a list of points with block characters
|
|
267
283
|
"""
|
|
@@ -7,9 +7,11 @@ from jsonschema import Draft201909Validator, ValidationError, validators # pylin
|
|
|
7
7
|
|
|
8
8
|
from ocrd_models import ValidationReport
|
|
9
9
|
|
|
10
|
+
|
|
10
11
|
class JsonSchemaDeprecationWarning(ValidationError):
|
|
11
12
|
pass
|
|
12
13
|
|
|
14
|
+
|
|
13
15
|
# http://python-jsonschema.readthedocs.io/en/latest/faq/
|
|
14
16
|
def extend_with_default(validator_class):
|
|
15
17
|
"""
|
|
@@ -34,6 +36,7 @@ def extend_with_default(validator_class):
|
|
|
34
36
|
|
|
35
37
|
DefaultValidatingDraft20199Validator = extend_with_default(Draft201909Validator)
|
|
36
38
|
|
|
39
|
+
|
|
37
40
|
#
|
|
38
41
|
# -------------------------------------------------
|
|
39
42
|
#
|
|
@@ -54,7 +57,7 @@ class JsonValidator():
|
|
|
54
57
|
"""
|
|
55
58
|
if isinstance(obj, str):
|
|
56
59
|
obj = json.loads(obj)
|
|
57
|
-
return JsonValidator(schema)._validate(obj)
|
|
60
|
+
return JsonValidator(schema)._validate(obj) # pylint: disable=protected-access
|
|
58
61
|
|
|
59
62
|
def __init__(self, schema, validator_class=Draft201909Validator):
|
|
60
63
|
"""
|
|
@@ -6,6 +6,7 @@ See `specs <https://ocr-d.de/en/spec/ocrd_tool>`_.
|
|
|
6
6
|
from .constants import OCRD_TOOL_SCHEMA
|
|
7
7
|
from .json_validator import DefaultValidatingDraft20199Validator, JsonValidator
|
|
8
8
|
|
|
9
|
+
|
|
9
10
|
#
|
|
10
11
|
# -------------------------------------------------
|
|
11
12
|
#
|
|
@@ -20,7 +21,7 @@ class OcrdToolValidator(JsonValidator):
|
|
|
20
21
|
"""
|
|
21
22
|
Validate against ``ocrd-tool.json`` schema.
|
|
22
23
|
"""
|
|
23
|
-
return OcrdToolValidator(schema)._validate(obj)
|
|
24
|
+
return OcrdToolValidator(schema)._validate(obj) # pylint: disable=protected-access
|
|
24
25
|
|
|
25
26
|
def __init__(self, schema):
|
|
26
27
|
super().__init__(schema, validator_class=DefaultValidatingDraft20199Validator)
|
|
@@ -8,12 +8,13 @@ from shutil import rmtree
|
|
|
8
8
|
|
|
9
9
|
from ocrd_utils import getLogger, unzip_file_to_dir
|
|
10
10
|
|
|
11
|
-
from bagit import Bag, BagValidationError
|
|
12
|
-
from bagit_profile import Profile, ProfileValidationError
|
|
11
|
+
from bagit import Bag, BagValidationError # pylint: disable=no-name-in-module
|
|
12
|
+
from bagit_profile import Profile, ProfileValidationError # pylint: disable=no-name-in-module
|
|
13
13
|
|
|
14
14
|
from .constants import OCRD_BAGIT_PROFILE, OCRD_BAGIT_PROFILE_URL, TMP_BAGIT_PREFIX
|
|
15
15
|
from ocrd_models import ValidationReport
|
|
16
16
|
|
|
17
|
+
|
|
17
18
|
#
|
|
18
19
|
# -------------------------------------------------
|
|
19
20
|
#
|
|
@@ -58,7 +59,8 @@ class OcrdZipValidator():
|
|
|
58
59
|
# for d in e.details:
|
|
59
60
|
# log = getLogger('ocrd.ocrd_zip_validator')
|
|
60
61
|
# if isinstance(d, ChecksumMismatch):
|
|
61
|
-
# log.error("Validation Error: expected %s to have %s checksum of %s but found %s",
|
|
62
|
+
# log.error("Validation Error: expected %s to have %s checksum of %s but found %s",
|
|
63
|
+
# d.path, d.algorithm, d.expected, d.found)
|
|
62
64
|
# else:
|
|
63
65
|
# log.error("Validation Error: %s", d)
|
|
64
66
|
if failed:
|
|
@@ -89,7 +91,6 @@ class OcrdZipValidator():
|
|
|
89
91
|
bagdir = mkdtemp(prefix=TMP_BAGIT_PREFIX)
|
|
90
92
|
unzip_file_to_dir(self.path_to_zip, bagdir)
|
|
91
93
|
|
|
92
|
-
|
|
93
94
|
try:
|
|
94
95
|
bag = Bag(bagdir)
|
|
95
96
|
self._validate_profile(bag)
|
|
@@ -119,6 +119,7 @@ class ConsistencyError(Exception):
|
|
|
119
119
|
f"INCONSISTENCY in {tag} ID '{ID}' of file '{file_id}': "
|
|
120
120
|
f"text results '{actual}' != concatenated '{expected}'")
|
|
121
121
|
|
|
122
|
+
|
|
122
123
|
class CoordinateConsistencyError(Exception):
|
|
123
124
|
"""
|
|
124
125
|
Exception representing a consistency error in coordinate confinement across levels of a PAGE-XML.
|
|
@@ -145,6 +146,7 @@ class CoordinateConsistencyError(Exception):
|
|
|
145
146
|
f"INCONSISTENCY in {tag} ID '{ID}' of '{file_id}': "
|
|
146
147
|
f"coords '{inner}' not within parent coords '{outer}'")
|
|
147
148
|
|
|
149
|
+
|
|
148
150
|
class CoordinateValidityError(Exception):
|
|
149
151
|
"""
|
|
150
152
|
Exception representing a validity error of an element's coordinates in PAGE-XML.
|
|
@@ -169,12 +171,14 @@ class CoordinateValidityError(Exception):
|
|
|
169
171
|
super().__init__(
|
|
170
172
|
f"INVALIDITY in {tag} ID '{ID}' of '{file_id}': coords '{points}' - {reason}")
|
|
171
173
|
|
|
174
|
+
|
|
172
175
|
def compare_without_whitespace(a, b):
|
|
173
176
|
"""
|
|
174
177
|
Compare two strings, ignoring all whitespace.
|
|
175
178
|
"""
|
|
176
179
|
return re.sub('\\s+', '', a) == re.sub('\\s+', '', b)
|
|
177
180
|
|
|
181
|
+
|
|
178
182
|
def page_get_reading_order(ro, rogroup):
|
|
179
183
|
"""
|
|
180
184
|
Add all elements from the given reading order group to the given dictionary.
|
|
@@ -197,6 +201,7 @@ def page_get_reading_order(ro, rogroup):
|
|
|
197
201
|
if not isinstance(elem, (RegionRefType, RegionRefIndexedType)):
|
|
198
202
|
page_get_reading_order(ro, elem)
|
|
199
203
|
|
|
204
|
+
|
|
200
205
|
def make_poly(polygon_points):
|
|
201
206
|
"""Instantiate a Polygon from a list of point pairs, or return an error string"""
|
|
202
207
|
if len(polygon_points) < 4:
|
|
@@ -212,6 +217,7 @@ def make_poly(polygon_points):
|
|
|
212
217
|
return 'is negative'
|
|
213
218
|
return poly
|
|
214
219
|
|
|
220
|
+
|
|
215
221
|
def make_line(line_points):
|
|
216
222
|
"""Instantiate a LineString from a list of point pairs, or return an error string"""
|
|
217
223
|
if len(line_points) < 2:
|
|
@@ -225,6 +231,7 @@ def make_line(line_points):
|
|
|
225
231
|
return 'is negative'
|
|
226
232
|
return line
|
|
227
233
|
|
|
234
|
+
|
|
228
235
|
@deprecated_alias(strictness='page_textequiv_consistency')
|
|
229
236
|
@deprecated_alias(strategy='page_textequiv_strategy')
|
|
230
237
|
def validate_consistency(node, page_textequiv_consistency, page_textequiv_strategy,
|
|
@@ -239,7 +246,7 @@ def validate_consistency(node, page_textequiv_consistency, page_textequiv_strate
|
|
|
239
246
|
if isinstance(node, (PcGtsType, OcrdPage)):
|
|
240
247
|
# top-level (start recursion)
|
|
241
248
|
node_id = node.get_pcGtsId()
|
|
242
|
-
node = node.get_Page()
|
|
249
|
+
node = node.get_Page() # has no .id
|
|
243
250
|
if not readingOrder:
|
|
244
251
|
readingOrder = {}
|
|
245
252
|
ro = node.get_ReadingOrder()
|
|
@@ -247,13 +254,13 @@ def validate_consistency(node, page_textequiv_consistency, page_textequiv_strate
|
|
|
247
254
|
page_get_reading_order(readingOrder, ro.get_OrderedGroup() or ro.get_UnorderedGroup())
|
|
248
255
|
if not joinRelations:
|
|
249
256
|
joinRelations = []
|
|
250
|
-
relations = node.get_Relations()
|
|
257
|
+
relations = node.get_Relations() # get RelationsType
|
|
251
258
|
if relations:
|
|
252
|
-
relations = relations.get_Relation()
|
|
259
|
+
relations = relations.get_Relation() # get list of RelationType
|
|
253
260
|
else:
|
|
254
261
|
relations = []
|
|
255
262
|
for relation in relations:
|
|
256
|
-
if relation.get_type() == 'join':
|
|
263
|
+
if relation.get_type() == 'join': # ignore 'link' type here
|
|
257
264
|
joinRelations.append((relation.get_SourceRegionRef().get_regionRef(),
|
|
258
265
|
relation.get_TargetRegionRef().get_regionRef()))
|
|
259
266
|
elif isinstance(node, GlyphType):
|
|
@@ -277,7 +284,7 @@ def validate_consistency(node, page_textequiv_consistency, page_textequiv_strate
|
|
|
277
284
|
parent_points, node_poly))
|
|
278
285
|
log.debug("Invalid coords of %s %s", tag, node_id)
|
|
279
286
|
consistent = False
|
|
280
|
-
node_poly = None
|
|
287
|
+
node_poly = None # don't use in further comparisons
|
|
281
288
|
else:
|
|
282
289
|
node_poly = None
|
|
283
290
|
for class_, getterLO, getterRD in _ORDER[1:]:
|
|
@@ -314,7 +321,7 @@ def validate_consistency(node, page_textequiv_consistency, page_textequiv_strate
|
|
|
314
321
|
# report.add_error(CoordinateValidityError(child_tag, child.id, file_id, child_points))
|
|
315
322
|
# log.debug("Invalid coords of %s %s", child_tag, child.id)
|
|
316
323
|
# consistent = False
|
|
317
|
-
pass
|
|
324
|
+
pass # already reported in recursive call above
|
|
318
325
|
elif not child_poly.within(node_poly.buffer(PARENT_SLACK)):
|
|
319
326
|
# TODO: automatic repair?
|
|
320
327
|
report.add_error(CoordinateConsistencyError(child_tag, child.id, file_id,
|
|
@@ -344,13 +351,14 @@ def validate_consistency(node, page_textequiv_consistency, page_textequiv_strate
|
|
|
344
351
|
if page_textequiv_consistency == 'fix':
|
|
345
352
|
log.debug("Repaired text of %s %s", tag, node_id)
|
|
346
353
|
set_text(node, concatenated, page_textequiv_strategy)
|
|
347
|
-
elif (page_textequiv_consistency == 'strict'
|
|
354
|
+
elif (page_textequiv_consistency == 'strict' # or 'lax' but...
|
|
348
355
|
or not compare_without_whitespace(concatenated, text_results)):
|
|
349
356
|
log.debug("Inconsistent text of %s %s", tag, node_id)
|
|
350
357
|
report.add_error(ConsistencyError(tag, node_id, file_id,
|
|
351
358
|
text_results, concatenated))
|
|
352
359
|
return consistent
|
|
353
360
|
|
|
361
|
+
|
|
354
362
|
def concatenate(nodes, concatenate_with, page_textequiv_strategy, joins=None):
|
|
355
363
|
"""
|
|
356
364
|
Concatenate nodes textually according to https://ocr-d.github.io/page#consistency-of-text-results-on-different-levels
|
|
@@ -367,6 +375,7 @@ def concatenate(nodes, concatenate_with, page_textequiv_strategy, joins=None):
|
|
|
367
375
|
result += get_text(next_node, page_textequiv_strategy)
|
|
368
376
|
return result.strip()
|
|
369
377
|
|
|
378
|
+
|
|
370
379
|
def get_text(node, page_textequiv_strategy='first'):
|
|
371
380
|
"""
|
|
372
381
|
Get the first or most confident among text results (depending on ``page_textequiv_strategy``).
|
|
@@ -399,6 +408,7 @@ def get_text(node, page_textequiv_strategy='first'):
|
|
|
399
408
|
# fall back to first element
|
|
400
409
|
return textEquivs[0].get_Unicode().strip()
|
|
401
410
|
|
|
411
|
+
|
|
402
412
|
def set_text(node, text, page_textequiv_strategy):
|
|
403
413
|
"""
|
|
404
414
|
Set the first or most confident among text results (depending on ``page_textequiv_strategy``).
|
|
@@ -410,7 +420,7 @@ def set_text(node, text, page_textequiv_strategy):
|
|
|
410
420
|
text = text.strip()
|
|
411
421
|
textEquivs = node.get_TextEquiv()
|
|
412
422
|
if not textEquivs:
|
|
413
|
-
node.add_TextEquiv(TextEquivType(Unicode=text))
|
|
423
|
+
node.add_TextEquiv(TextEquivType(Unicode=text)) # or index=0 ?
|
|
414
424
|
elif page_textequiv_strategy == 'best':
|
|
415
425
|
if len(textEquivs) > 1:
|
|
416
426
|
textEquivsSorted = sorted([x for x in textEquivs if x.conf],
|
|
@@ -432,6 +442,7 @@ def set_text(node, text, page_textequiv_strategy):
|
|
|
432
442
|
# fall back to first element
|
|
433
443
|
textEquivs[0].set_Unicode(text)
|
|
434
444
|
|
|
445
|
+
|
|
435
446
|
class PageValidator():
|
|
436
447
|
"""
|
|
437
448
|
Validator for `OcrdPage <../ocrd_models/ocrd_models.ocrd_page.html>`.
|
|
@@ -477,5 +488,6 @@ class PageValidator():
|
|
|
477
488
|
raise ValueError("page_textequiv_consistency level %s not implemented" % page_textequiv_consistency)
|
|
478
489
|
report = ValidationReport()
|
|
479
490
|
log.info("Validating input file '%s'", file_id)
|
|
480
|
-
validate_consistency(page, page_textequiv_consistency, page_textequiv_strategy, check_baseline, check_coords,
|
|
491
|
+
validate_consistency(page, page_textequiv_consistency, page_textequiv_strategy, check_baseline, check_coords,
|
|
492
|
+
report, file_id)
|
|
481
493
|
return report
|
|
@@ -3,6 +3,7 @@ Validate parameters against ocrd-tool.json.
|
|
|
3
3
|
"""
|
|
4
4
|
from .json_validator import DefaultValidatingDraft20199Validator, JsonValidator
|
|
5
5
|
|
|
6
|
+
|
|
6
7
|
#
|
|
7
8
|
# -------------------------------------------------
|
|
8
9
|
#
|
|
@@ -12,7 +13,7 @@ class ParameterValidator(JsonValidator):
|
|
|
12
13
|
JsonValidator validating parametersagains ocrd-tool.json.
|
|
13
14
|
"""
|
|
14
15
|
|
|
15
|
-
def validate(self, *args, **kwargs):
|
|
16
|
+
def validate(self, *args, **kwargs): # pylint: disable=arguments-differ
|
|
16
17
|
"""
|
|
17
18
|
Validate a parameter dict against a parameter schema from an ocrd-tool.json
|
|
18
19
|
|
|
@@ -39,7 +40,7 @@ class ParameterValidator(JsonValidator):
|
|
|
39
40
|
if 'required' in p[n]:
|
|
40
41
|
if p[n]['required']:
|
|
41
42
|
required.append(n)
|
|
42
|
-
del
|
|
43
|
+
del p[n]['required']
|
|
43
44
|
super().__init__({
|
|
44
45
|
"type": "object",
|
|
45
46
|
"required": required,
|
|
@@ -68,16 +68,12 @@ properties:
|
|
|
68
68
|
required:
|
|
69
69
|
- address
|
|
70
70
|
- username
|
|
71
|
+
- workers
|
|
71
72
|
oneOf:
|
|
72
73
|
- required:
|
|
73
74
|
- password
|
|
74
75
|
- required:
|
|
75
76
|
- path_to_privkey
|
|
76
|
-
anyOf:
|
|
77
|
-
- required:
|
|
78
|
-
- workers
|
|
79
|
-
- required:
|
|
80
|
-
- servers
|
|
81
77
|
properties:
|
|
82
78
|
address:
|
|
83
79
|
description: The IP address or domain name of the target machine
|
|
@@ -118,34 +114,6 @@ properties:
|
|
|
118
114
|
- native
|
|
119
115
|
- docker
|
|
120
116
|
default: native
|
|
121
|
-
servers:
|
|
122
|
-
description: List of processor servers that will be deployed
|
|
123
|
-
type: array
|
|
124
|
-
minItems: 1
|
|
125
|
-
items:
|
|
126
|
-
type: object
|
|
127
|
-
additionalProperties: false
|
|
128
|
-
required:
|
|
129
|
-
- name
|
|
130
|
-
- port
|
|
131
|
-
properties:
|
|
132
|
-
name:
|
|
133
|
-
description: Name of the processor
|
|
134
|
-
type: string
|
|
135
|
-
pattern: "^ocrd-.*$"
|
|
136
|
-
examples:
|
|
137
|
-
- ocrd-cis-ocropy-binarize
|
|
138
|
-
- ocrd-olena-binarize
|
|
139
|
-
deploy_type:
|
|
140
|
-
description: Should the processor server be deployed natively or with Docker
|
|
141
|
-
type: string
|
|
142
|
-
enum:
|
|
143
|
-
- native
|
|
144
|
-
- docker
|
|
145
|
-
default: native
|
|
146
|
-
port:
|
|
147
|
-
description: The port number to be deployed on the host
|
|
148
|
-
$ref: "#/$defs/port"
|
|
149
117
|
|
|
150
118
|
$defs:
|
|
151
119
|
address:
|
|
@@ -6,6 +6,7 @@ See `specs <https://ocr-d.de/en/spec/cli#processor-resources>`_.
|
|
|
6
6
|
from .constants import RESOURCE_LIST_SCHEMA
|
|
7
7
|
from .json_validator import DefaultValidatingDraft20199Validator, JsonValidator
|
|
8
8
|
|
|
9
|
+
|
|
9
10
|
#
|
|
10
11
|
# -------------------------------------------------
|
|
11
12
|
#
|
|
@@ -22,4 +23,5 @@ class OcrdResourceListValidator(JsonValidator):
|
|
|
22
23
|
"""
|
|
23
24
|
if schema is None:
|
|
24
25
|
schema = RESOURCE_LIST_SCHEMA
|
|
25
|
-
|
|
26
|
+
validator = JsonValidator(schema, validator_class=DefaultValidatingDraft20199Validator)
|
|
27
|
+
return validator._validate(obj) # pylint: disable=protected-access
|
|
@@ -15,6 +15,7 @@ from .page_validator import PageValidator
|
|
|
15
15
|
from .xsd_page_validator import XsdPageValidator
|
|
16
16
|
from .xsd_mets_validator import XsdMetsValidator
|
|
17
17
|
|
|
18
|
+
|
|
18
19
|
#
|
|
19
20
|
# -------------------------------------------------
|
|
20
21
|
#
|
|
@@ -57,7 +58,8 @@ class WorkspaceValidator():
|
|
|
57
58
|
if page_id:
|
|
58
59
|
for one_page_id in page_id:
|
|
59
60
|
if next(workspace.mets.find_files(fileGrp=grp, pageId=one_page_id), None):
|
|
60
|
-
report.add_error("Output fileGrp[@USE='%s'] already contains output for page %s" % (
|
|
61
|
+
report.add_error("Output fileGrp[@USE='%s'] already contains output for page %s" % (
|
|
62
|
+
grp, one_page_id))
|
|
61
63
|
else:
|
|
62
64
|
report.add_error("Output fileGrp[@USE='%s'] already in METS!" % grp)
|
|
63
65
|
return report
|
|
@@ -121,10 +123,10 @@ class WorkspaceValidator():
|
|
|
121
123
|
resolver (:class:`ocrd.Resolver`): Resolver
|
|
122
124
|
mets_url (string): URL of the METS file
|
|
123
125
|
src_dir (string, None): Directory containing mets file
|
|
124
|
-
skip (list): Validation checks to omit. One or more of
|
|
126
|
+
skip (list): Validation checks to omit. One or more of
|
|
125
127
|
'mets_unique_identifier',
|
|
126
128
|
'mets_files', 'pixel_density', 'dimension', 'url',
|
|
127
|
-
'multipage', 'page', 'page_xsd', 'mets_xsd',
|
|
129
|
+
'multipage', 'page', 'page_xsd', 'mets_xsd',
|
|
128
130
|
'mets_fileid_page_pcgtsid'
|
|
129
131
|
download (boolean): Whether to download remote file references
|
|
130
132
|
temporarily during validation (like a processor would)
|
|
@@ -133,7 +135,7 @@ class WorkspaceValidator():
|
|
|
133
135
|
report (:class:`ValidationReport`) Report on the validity
|
|
134
136
|
"""
|
|
135
137
|
validator = WorkspaceValidator(*args, **kwargs)
|
|
136
|
-
return validator._validate()
|
|
138
|
+
return validator._validate() # pylint: disable=protected-access
|
|
137
139
|
|
|
138
140
|
def _validate(self):
|
|
139
141
|
"""
|
|
@@ -141,7 +143,7 @@ class WorkspaceValidator():
|
|
|
141
143
|
"""
|
|
142
144
|
try:
|
|
143
145
|
self._resolve_workspace()
|
|
144
|
-
except Exception as e:
|
|
146
|
+
except Exception as e: # pylint: disable=broad-except
|
|
145
147
|
self.log.warning("Failed to instantiate workspace: %s", e)
|
|
146
148
|
self.report.add_error(f"Failed to instantiate workspace: {e}")
|
|
147
149
|
return self.report
|
|
@@ -159,7 +161,7 @@ class WorkspaceValidator():
|
|
|
159
161
|
self._validate_mets_xsd()
|
|
160
162
|
if self.page_checks:
|
|
161
163
|
self._validate_page()
|
|
162
|
-
except Exception:
|
|
164
|
+
except Exception: # pylint: disable=broad-except
|
|
163
165
|
self.report.add_error(f"Validation aborted with exception: {format_exc()}")
|
|
164
166
|
return self.report
|
|
165
167
|
|
|
@@ -216,9 +218,11 @@ class WorkspaceValidator():
|
|
|
216
218
|
page = page_from_file(f).get_Page()
|
|
217
219
|
_, _, exif = self.workspace.image_from_page(page, f.pageId)
|
|
218
220
|
if page.imageHeight != exif.height:
|
|
219
|
-
self.report.add_error(f"PAGE '{f.ID}': @imageHeight != image's actual height
|
|
221
|
+
self.report.add_error(f"PAGE '{f.ID}': @imageHeight != image's actual height "
|
|
222
|
+
f"({page.imageHeight} != {exif.height})")
|
|
220
223
|
if page.imageWidth != exif.width:
|
|
221
|
-
self.report.add_error(f"PAGE '{f.ID}': @imageWidth != image's actual width
|
|
224
|
+
self.report.add_error(f"PAGE '{f.ID}': @imageWidth != image's actual width "
|
|
225
|
+
f"({page.imageWidth} != {exif.width})")
|
|
222
226
|
|
|
223
227
|
def _validate_multipage(self):
|
|
224
228
|
"""
|
|
@@ -237,7 +241,8 @@ class WorkspaceValidator():
|
|
|
237
241
|
if exif.n_frames > 1:
|
|
238
242
|
self.report.add_error(f"Image '{f.ID}': More than 1 frame: {exif.n_frames}")
|
|
239
243
|
except FileNotFoundError:
|
|
240
|
-
self.report.add_error(f"Image '{f.ID}': Could not retrieve
|
|
244
|
+
self.report.add_error(f"Image '{f.ID}': Could not retrieve "
|
|
245
|
+
f"(local_filename='{f.local_filename}', url='{f.url}')")
|
|
241
246
|
return
|
|
242
247
|
|
|
243
248
|
def _validate_pixel_density(self):
|
|
@@ -293,10 +298,11 @@ class WorkspaceValidator():
|
|
|
293
298
|
except StopIteration:
|
|
294
299
|
self.report.add_error("No files")
|
|
295
300
|
for f in self.mets.find_files(**self.find_kwargs):
|
|
296
|
-
if f._el.get('GROUPID'):
|
|
301
|
+
if f._el.get('GROUPID'): # pylint: disable=protected-access
|
|
297
302
|
self.report.add_notice(f"File '{f.ID}' has GROUPID attribute - document might need an update")
|
|
298
303
|
if not (f.url or f.local_filename):
|
|
299
|
-
self.report.add_error(f"File '{f.ID}' has neither mets:Flocat[@LOCTYPE='URL']/@xlink:href
|
|
304
|
+
self.report.add_error(f"File '{f.ID}' has neither mets:Flocat[@LOCTYPE='URL']/@xlink:href "
|
|
305
|
+
"nor mets:FLocat[@LOCTYPE='OTHER'][@OTHERLOCTYPE='FILE']/xlink:href")
|
|
300
306
|
continue
|
|
301
307
|
if f.url and 'url' not in self.skip:
|
|
302
308
|
if re.match(r'^file:/[^/]', f.url):
|
|
@@ -322,19 +328,22 @@ class WorkspaceValidator():
|
|
|
322
328
|
for err in XsdPageValidator.validate(Path(f.local_filename)).errors:
|
|
323
329
|
self.report.add_error("%s: %s" % (f.ID, err))
|
|
324
330
|
if 'page' in self.page_checks:
|
|
325
|
-
page_report = PageValidator.validate(
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
331
|
+
page_report = PageValidator.validate(
|
|
332
|
+
ocrd_file=f,
|
|
333
|
+
page_textequiv_consistency=self.page_strictness,
|
|
334
|
+
check_coords=self.page_coordinate_consistency in ['poly', 'both'],
|
|
335
|
+
check_baseline=self.page_coordinate_consistency in ['baseline', 'both'])
|
|
329
336
|
self.report.merge_report(page_report)
|
|
330
337
|
pcgts = page_from_file(f)
|
|
331
338
|
page = pcgts.get_Page()
|
|
332
339
|
if 'dimension' in self.page_checks:
|
|
333
340
|
img = self.workspace._resolve_image_as_pil(page.imageFilename)
|
|
334
341
|
if page.imageHeight != img.height:
|
|
335
|
-
self.report.add_error(f"PAGE '{f.ID}': @imageHeight != image's actual height
|
|
342
|
+
self.report.add_error(f"PAGE '{f.ID}': @imageHeight != image's actual height "
|
|
343
|
+
f"({page.imageHeight} != {img.height})")
|
|
336
344
|
if page.imageWidth != img.width:
|
|
337
|
-
self.report.add_error(f"PAGE '{f.ID}': @imageWidth != image's actual width
|
|
345
|
+
self.report.add_error(f"PAGE '{f.ID}': @imageWidth != image's actual width "
|
|
346
|
+
f"({page.imageWidth} != {img.width})")
|
|
338
347
|
if 'imagefilename' in self.page_checks:
|
|
339
348
|
imageFilename = page.imageFilename
|
|
340
349
|
if is_local_filename(imageFilename):
|
|
@@ -344,7 +353,8 @@ class WorkspaceValidator():
|
|
|
344
353
|
if not self.mets.find_files(**kwargs):
|
|
345
354
|
self.report.add_error(f"PAGE '{f.ID}': imageFilename '{imageFilename}' not found in METS")
|
|
346
355
|
if is_local_filename(imageFilename) and not Path(imageFilename).exists():
|
|
347
|
-
self.report.add_warning(f"PAGE '{f.ID}': imageFilename '{imageFilename}'
|
|
356
|
+
self.report.add_warning(f"PAGE '{f.ID}': imageFilename '{imageFilename}' "
|
|
357
|
+
"points to non-existent local file")
|
|
348
358
|
if 'alternativeimage_filename' in self.page_checks:
|
|
349
359
|
for altimg in page.get_AllAlternativeImages():
|
|
350
360
|
if is_local_filename(altimg.filename):
|
|
@@ -368,8 +378,8 @@ class WorkspaceValidator():
|
|
|
368
378
|
self.report.add_error(f"PAGE '{f.ID}': {altimg.parent_object_.id} AlternativeImage "
|
|
369
379
|
f"'{altimg.filename}' feature '{feature}' not standardized for PAGE")
|
|
370
380
|
if 'mets_fileid_page_pcgtsid' in self.page_checks and pcgts.pcGtsId != f.ID:
|
|
371
|
-
self.report.add_warning('pc:PcGts/@pcGtsId differs from mets:file/@ID: "%s" !== "%s"' % (
|
|
372
|
-
|
|
381
|
+
self.report.add_warning('pc:PcGts/@pcGtsId differs from mets:file/@ID: "%s" !== "%s"' % (
|
|
382
|
+
pcgts.pcGtsId or '', f.ID or ''))
|
|
373
383
|
|
|
374
384
|
def _validate_page_xsd(self):
|
|
375
385
|
"""
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from .xsd_validator import XsdValidator
|
|
2
2
|
from .constants import XSD_METS_URL
|
|
3
3
|
|
|
4
|
+
|
|
4
5
|
class XsdMetsValidator(XsdValidator):
|
|
5
6
|
"""
|
|
6
7
|
XML Schema validator.
|
|
@@ -14,4 +15,4 @@ class XsdMetsValidator(XsdValidator):
|
|
|
14
15
|
Args:
|
|
15
16
|
doc (etree.ElementTree|str|bytes):
|
|
16
17
|
"""
|
|
17
|
-
return cls.instance(XSD_METS_URL)._validate(doc)
|
|
18
|
+
return cls.instance(XSD_METS_URL)._validate(doc) # pylint: disable=protected-access
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from .xsd_validator import XsdValidator
|
|
2
2
|
from .constants import XSD_PAGE_URL
|
|
3
3
|
|
|
4
|
+
|
|
4
5
|
class XsdPageValidator(XsdValidator):
|
|
5
6
|
"""
|
|
6
7
|
XML Schema validator.
|
|
@@ -14,4 +15,4 @@ class XsdPageValidator(XsdValidator):
|
|
|
14
15
|
Args:
|
|
15
16
|
doc (etree.ElementTree|str|bytes):
|
|
16
17
|
"""
|
|
17
|
-
return cls.instance(XSD_PAGE_URL)._validate(doc)
|
|
18
|
+
return cls.instance(XSD_PAGE_URL)._validate(doc) # pylint: disable=protected-access
|