ocrd 3.1.1__py3-none-any.whl → 3.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ocrd/cli/network.py +2 -0
- ocrd/cli/resmgr.py +24 -61
- ocrd/cli/workspace.py +2 -2
- ocrd/processor/base.py +15 -18
- ocrd/resource_manager.py +199 -116
- {ocrd-3.1.1.dist-info → ocrd-3.2.0.dist-info}/METADATA +2 -2
- {ocrd-3.1.1.dist-info → ocrd-3.2.0.dist-info}/RECORD +25 -23
- {ocrd-3.1.1.dist-info → ocrd-3.2.0.dist-info}/WHEEL +1 -1
- ocrd_models/constants.py +16 -0
- ocrd_network/__init__.py +1 -0
- ocrd_network/cli/__init__.py +3 -1
- ocrd_network/cli/resmgr_server.py +23 -0
- ocrd_network/constants.py +3 -0
- ocrd_network/logging_utils.py +5 -0
- ocrd_network/resource_manager_server.py +178 -0
- ocrd_network/runtime_data/hosts.py +47 -56
- ocrd_network/runtime_data/network_agents.py +26 -3
- ocrd_utils/__init__.py +2 -0
- ocrd_utils/constants.py +5 -0
- ocrd_utils/os.py +130 -52
- ocrd_validators/ocrd_tool.schema.yml +7 -4
- ocrd_validators/workspace_validator.py +45 -10
- {ocrd-3.1.1.dist-info → ocrd-3.2.0.dist-info}/LICENSE +0 -0
- {ocrd-3.1.1.dist-info → ocrd-3.2.0.dist-info}/entry_points.txt +0 -0
- {ocrd-3.1.1.dist-info → ocrd-3.2.0.dist-info}/top_level.txt +0 -0
ocrd_utils/os.py
CHANGED
|
@@ -8,6 +8,7 @@ __all__ = [
|
|
|
8
8
|
'get_ocrd_tool_json',
|
|
9
9
|
'get_moduledir',
|
|
10
10
|
'get_processor_resource_types',
|
|
11
|
+
'get_env_locations',
|
|
11
12
|
'guess_media_type',
|
|
12
13
|
'pushd_popd',
|
|
13
14
|
'unzip_file_to_dir',
|
|
@@ -15,28 +16,30 @@ __all__ = [
|
|
|
15
16
|
'redirect_stderr_and_stdout_to_file',
|
|
16
17
|
]
|
|
17
18
|
|
|
19
|
+
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
|
|
18
20
|
from tempfile import TemporaryDirectory, gettempdir
|
|
19
21
|
from functools import lru_cache
|
|
20
22
|
from contextlib import contextmanager, redirect_stderr, redirect_stdout
|
|
21
23
|
from shutil import which
|
|
22
24
|
from json import loads
|
|
23
25
|
from json.decoder import JSONDecodeError
|
|
24
|
-
from os import getcwd, chdir, stat, chmod, umask, environ
|
|
26
|
+
from os import getcwd, chdir, stat, chmod, umask, environ, PathLike
|
|
25
27
|
from pathlib import Path
|
|
26
28
|
from os.path import abspath as abspath_, join
|
|
27
29
|
from zipfile import ZipFile
|
|
28
30
|
from subprocess import run, PIPE
|
|
29
31
|
from mimetypes import guess_type as mimetypes_guess
|
|
30
32
|
from filetype import guess as filetype_guess
|
|
33
|
+
from fnmatch import filter as apply_glob
|
|
31
34
|
|
|
32
35
|
from atomicwrites import atomic_write as atomic_write_, AtomicWriter
|
|
33
36
|
|
|
34
|
-
from .constants import EXT_TO_MIME
|
|
37
|
+
from .constants import EXT_TO_MIME, MIME_TO_EXT, RESOURCE_LOCATIONS, RESOURCES_DIR_SYSTEM
|
|
35
38
|
from .config import config
|
|
36
39
|
from .logging import getLogger
|
|
37
40
|
from .introspect import resource_string
|
|
38
41
|
|
|
39
|
-
def abspath(url):
|
|
42
|
+
def abspath(url : str) -> str:
|
|
40
43
|
"""
|
|
41
44
|
Get a full path to a file or file URL
|
|
42
45
|
|
|
@@ -47,7 +50,7 @@ def abspath(url):
|
|
|
47
50
|
return abspath_(url)
|
|
48
51
|
|
|
49
52
|
@contextmanager
|
|
50
|
-
def pushd_popd(newcwd=None, tempdir=False):
|
|
53
|
+
def pushd_popd(newcwd : Union[str, PathLike] = None, tempdir : bool = False) -> Iterator[PathLike]:
|
|
51
54
|
if newcwd and tempdir:
|
|
52
55
|
raise Exception("pushd_popd can accept either newcwd or tempdir, not both")
|
|
53
56
|
try:
|
|
@@ -67,7 +70,7 @@ def pushd_popd(newcwd=None, tempdir=False):
|
|
|
67
70
|
finally:
|
|
68
71
|
chdir(oldcwd)
|
|
69
72
|
|
|
70
|
-
def unzip_file_to_dir(path_to_zip, output_directory):
|
|
73
|
+
def unzip_file_to_dir(path_to_zip : Union[str, PathLike], output_directory : str) -> None:
|
|
71
74
|
"""
|
|
72
75
|
Extract a ZIP archive to a directory
|
|
73
76
|
"""
|
|
@@ -75,7 +78,7 @@ def unzip_file_to_dir(path_to_zip, output_directory):
|
|
|
75
78
|
z.extractall(output_directory)
|
|
76
79
|
|
|
77
80
|
@lru_cache()
|
|
78
|
-
def get_ocrd_tool_json(executable):
|
|
81
|
+
def get_ocrd_tool_json(executable : str) -> Dict[str, Any]:
|
|
79
82
|
"""
|
|
80
83
|
Get the ``ocrd-tool`` description of ``executable``.
|
|
81
84
|
"""
|
|
@@ -90,11 +93,11 @@ def get_ocrd_tool_json(executable):
|
|
|
90
93
|
except (JSONDecodeError, OSError) as e:
|
|
91
94
|
getLogger('ocrd.utils.get_ocrd_tool_json').error(f'{executable} --dump-json produced invalid JSON: {e}')
|
|
92
95
|
if 'resource_locations' not in ocrd_tool:
|
|
93
|
-
ocrd_tool['resource_locations'] =
|
|
96
|
+
ocrd_tool['resource_locations'] = RESOURCE_LOCATIONS
|
|
94
97
|
return ocrd_tool
|
|
95
98
|
|
|
96
99
|
@lru_cache()
|
|
97
|
-
def get_moduledir(executable):
|
|
100
|
+
def get_moduledir(executable : str) -> str:
|
|
98
101
|
moduledir = None
|
|
99
102
|
try:
|
|
100
103
|
ocrd_all_moduledir = loads(resource_string('ocrd', 'ocrd-all-module-dir.json'))
|
|
@@ -106,57 +109,80 @@ def get_moduledir(executable):
|
|
|
106
109
|
getLogger('ocrd.utils.get_moduledir').error(f'{executable} --dump-module-dir failed: {e}')
|
|
107
110
|
return moduledir
|
|
108
111
|
|
|
109
|
-
def
|
|
112
|
+
def get_env_locations(executable: str) -> List[str]:
    """
    Get the extra resource directories configured for ``executable``
    via its ``<EXECUTABLE>_PATH`` environment variable.

    The variable name is the executable name in upper case with dashes
    replaced by underscores, followed by ``_PATH`` (e.g. ``OCRD_DUMMY_PATH``
    for ``ocrd-dummy``). Its value is a colon-separated list of directories.

    Empty entries (empty variable, leading/trailing/doubled colons) are
    dropped: an empty directory name would silently resolve to the current
    working directory as soon as it is joined with a file name or wrapped
    in ``Path``.

    Args:
        executable: name of the processor executable

    Returns:
        the configured directories in their original order, or an empty
        list if the variable is unset or contains no non-empty entry
    """
    processor_path_var = '%s_PATH' % executable.replace('-', '_').upper()
    return [location
            for location in environ.get(processor_path_var, '').split(':')
            if location]
|
|
117
|
+
|
|
118
|
+
def list_resource_candidates(executable : str, fname : str, cwd : Optional[str] = None, moduled : Optional[str] = None, xdg_data_home : Optional[str] = None) -> List[str]:
    """
    Enumerate, in lookup order, the paths where resource ``fname`` of
    processor ``executable`` may reside, following
    https://ocr-d.de/en/spec/ocrd_tool#file-parameters

    The order is: ``cwd`` (the current working directory if not given),
    every directory returned by ``get_env_locations``, the per-processor
    ``ocrd-resources`` directory below ``xdg_data_home``
    (``config.XDG_DATA_HOME`` if not given), the per-processor directory
    below ``RESOURCES_DIR_SYSTEM``, and finally ``moduled`` if given.

    Paths are only constructed here, not checked for existence.
    """
    workdir = getcwd() if cwd is None else cwd
    data_home = xdg_data_home or config.XDG_DATA_HOME
    env_candidates = [join(env_dir, fname)
                      for env_dir in get_env_locations(executable)]
    # the module directory is optional and always comes last
    module_candidates = [join(moduled, fname)] if moduled else []
    return [
        join(workdir, fname),
        *env_candidates,
        join(data_home, 'ocrd-resources', executable, fname),
        join(RESOURCES_DIR_SYSTEM, executable, fname),
        *module_candidates,
    ]
|
|
125
135
|
|
|
126
|
-
def list_all_resources(executable, moduled=None, xdg_data_home=None):
|
|
136
|
+
def list_all_resources(executable : str, ocrd_tool : Optional[Dict[str, Any]] = None, moduled : Optional[str] = None, xdg_data_home : Optional[str] = None) -> List[str]:
|
|
127
137
|
"""
|
|
128
138
|
List all processor resources in the filesystem according to
|
|
129
|
-
https://ocr-d.de/en/spec/ocrd_tool#
|
|
139
|
+
https://ocr-d.de/en/spec/ocrd_tool#resource-parameters
|
|
130
140
|
"""
|
|
131
|
-
|
|
141
|
+
xdg_data_home = xdg_data_home or config.XDG_DATA_HOME
|
|
142
|
+
if ocrd_tool is None:
|
|
143
|
+
ocrd_tool = get_ocrd_tool_json(executable)
|
|
144
|
+
# processor we're looking for might not be installed, hence the fallbacks
|
|
132
145
|
try:
|
|
133
|
-
|
|
134
|
-
except
|
|
135
|
-
|
|
146
|
+
mimetypes = get_processor_resource_types(executable, ocrd_tool=ocrd_tool)
|
|
147
|
+
except KeyError:
|
|
148
|
+
mimetypes = ['*/*']
|
|
149
|
+
try:
|
|
150
|
+
resource_locations = ocrd_tool['resource_locations']
|
|
151
|
+
except KeyError:
|
|
136
152
|
# Assume the default
|
|
137
|
-
resource_locations =
|
|
138
|
-
|
|
139
|
-
|
|
153
|
+
resource_locations = RESOURCE_LOCATIONS
|
|
154
|
+
try:
|
|
155
|
+
# fixme: if resources_list contains directories, their "suffix" will interfere
|
|
156
|
+
# (e.g. dirname without dot means we falsely match files without suffix)
|
|
157
|
+
resource_suffixes = [Path(res['name']).suffix
|
|
158
|
+
for res in ocrd_tool['resources']]
|
|
159
|
+
except KeyError:
|
|
160
|
+
resource_suffixes = []
|
|
161
|
+
logger = getLogger('ocrd.utils.list_all_resources')
|
|
162
|
+
candidates = []
|
|
163
|
+
# cwd would list too many false positives:
|
|
140
164
|
# if 'cwd' in resource_locations:
|
|
141
|
-
#
|
|
142
|
-
#
|
|
143
|
-
#
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
165
|
+
# cwddir = Path.cwd()
|
|
166
|
+
# candidates.append(cwddir.itertree())
|
|
167
|
+
# but we do not use this anyway:
|
|
168
|
+
# relative paths are tried w.r.t. CWD
|
|
169
|
+
# prior to list_all_resources resolution.
|
|
170
|
+
for processor_path in get_env_locations(executable):
|
|
171
|
+
processor_path = Path(processor_path)
|
|
172
|
+
if processor_path.is_dir():
|
|
173
|
+
candidates += processor_path.iterdir()
|
|
149
174
|
if 'data' in resource_locations:
|
|
150
175
|
datadir = Path(xdg_data_home, 'ocrd-resources', executable)
|
|
151
176
|
if datadir.is_dir():
|
|
152
177
|
candidates += datadir.iterdir()
|
|
153
178
|
if 'system' in resource_locations:
|
|
154
|
-
systemdir = Path(
|
|
179
|
+
systemdir = Path(RESOURCES_DIR_SYSTEM, executable)
|
|
155
180
|
if systemdir.is_dir():
|
|
156
181
|
candidates += systemdir.iterdir()
|
|
157
182
|
if 'module' in resource_locations and moduled:
|
|
158
183
|
# recurse fully
|
|
159
|
-
|
|
184
|
+
moduled = Path(moduled)
|
|
185
|
+
for resource in moduled.iterdir():
|
|
160
186
|
if resource.is_dir():
|
|
161
187
|
continue
|
|
162
188
|
if any(resource.match(pattern) for pattern in
|
|
@@ -164,17 +190,66 @@ def list_all_resources(executable, moduled=None, xdg_data_home=None):
|
|
|
164
190
|
# code and data; `is_resource()` only singles out
|
|
165
191
|
# files over directories; but we want data files only
|
|
166
192
|
# todo: more code and cache exclusion patterns!
|
|
167
|
-
['*.py', '*.py[cod]', '*~', '
|
|
193
|
+
['*.py', '*.py[cod]', '*~', '.*.swp', '*.swo',
|
|
194
|
+
'__pycache__/*', '*.egg-info/*', '*.egg',
|
|
195
|
+
'copyright.txt', 'LICENSE*', 'README.md', 'MANIFEST',
|
|
196
|
+
'TAGS', '.DS_Store',
|
|
197
|
+
# C extensions
|
|
198
|
+
'*.so',
|
|
199
|
+
# translations
|
|
200
|
+
'*.mo', '*.pot',
|
|
201
|
+
'*.log', '*.orig', '*.BAK',
|
|
202
|
+
'.git/*',
|
|
203
|
+
# our stuff
|
|
204
|
+
'ocrd-tool.json',
|
|
168
205
|
'environment.pickle', 'resource_list.yml', 'lib.bash']):
|
|
206
|
+
logger.debug("ignoring module candidate '%s'", resource)
|
|
169
207
|
continue
|
|
170
208
|
candidates.append(resource)
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
209
|
+
if mimetypes != ['*/*']:
|
|
210
|
+
logger.debug("matching candidates for %s by content-type %s", executable, str(mimetypes))
|
|
211
|
+
def valid_resource_type(path):
|
|
212
|
+
if '*/*' in mimetypes:
|
|
213
|
+
return True
|
|
214
|
+
if path.is_dir():
|
|
215
|
+
if not 'text/directory' in mimetypes:
|
|
216
|
+
logger.debug("ignoring directory candidate '%s'", path)
|
|
217
|
+
return False
|
|
218
|
+
if path.name in ['.git']:
|
|
219
|
+
logger.debug("ignoring directory candidate '%s'", path)
|
|
220
|
+
return False
|
|
221
|
+
return True
|
|
222
|
+
if not path.is_file():
|
|
223
|
+
logger.warning("ignoring non-file, non-directory candidate '%s'", path)
|
|
224
|
+
return False
|
|
225
|
+
res_mimetype = guess_media_type(path, fallback='')
|
|
226
|
+
if res_mimetype == 'application/json':
|
|
227
|
+
# always accept, regardless of configured mimetypes:
|
|
228
|
+
# needed for distributing or sharing parameter preset files
|
|
229
|
+
return True
|
|
230
|
+
if ['text/directory'] == mimetypes:
|
|
231
|
+
logger.debug("ignoring non-directory candidate '%s'", path)
|
|
232
|
+
return False
|
|
233
|
+
if 'application/octet-stream' in mimetypes:
|
|
234
|
+
# catch-all type - do not enforce anything
|
|
235
|
+
return True
|
|
236
|
+
if path.suffix in resource_suffixes:
|
|
237
|
+
return True
|
|
238
|
+
if any(path.suffix == MIME_TO_EXT.get(mime, None)
|
|
239
|
+
for mime in mimetypes):
|
|
240
|
+
return True
|
|
241
|
+
if not res_mimetype:
|
|
242
|
+
logger.warning("cannot determine content type of candidate '%s'", path)
|
|
243
|
+
return True
|
|
244
|
+
if any(apply_glob([res_mimetype], mime)
|
|
245
|
+
for mime in mimetypes):
|
|
246
|
+
return True
|
|
247
|
+
logger.debug("ignoring %s candidate '%s'", res_mimetype, path)
|
|
248
|
+
return False
|
|
249
|
+
candidates = sorted(filter(valid_resource_type, candidates))
|
|
250
|
+
return map(str, candidates)
|
|
176
251
|
|
|
177
|
-
def get_processor_resource_types(executable, ocrd_tool=None):
|
|
252
|
+
def get_processor_resource_types(executable : str, ocrd_tool : Optional[Dict[str, Any]] = None) -> List[str]:
|
|
178
253
|
"""
|
|
179
254
|
Determine what type of resource parameters a processor needs.
|
|
180
255
|
|
|
@@ -186,13 +261,16 @@ def get_processor_resource_types(executable, ocrd_tool=None):
|
|
|
186
261
|
if not which(executable):
|
|
187
262
|
return ['*/*']
|
|
188
263
|
ocrd_tool = get_ocrd_tool_json(executable)
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
264
|
+
mime_types = [mime
|
|
265
|
+
for param in ocrd_tool.get('parameters', {}).values()
|
|
266
|
+
if param['type'] == 'string' and param.get('format', '') == 'uri' and 'content-type' in param
|
|
267
|
+
for mime in param['content-type'].split(',')]
|
|
268
|
+
if not len(mime_types):
|
|
269
|
+
# None of the parameters for this processor are resources
|
|
270
|
+
# (or the parameters' resource types are not properly declared,)
|
|
271
|
+
# so output both directories and files
|
|
193
272
|
return ['*/*']
|
|
194
|
-
return
|
|
195
|
-
if 'content-type' in p]
|
|
273
|
+
return mime_types
|
|
196
274
|
|
|
197
275
|
# ht @pabs3
|
|
198
276
|
# https://github.com/untitaker/python-atomicwrites/issues/42
|
|
@@ -211,12 +289,12 @@ class AtomicWriterPerms(AtomicWriter):
|
|
|
211
289
|
return f
|
|
212
290
|
|
|
213
291
|
@contextmanager
|
|
214
|
-
def atomic_write(fpath):
|
|
292
|
+
def atomic_write(fpath : str) -> Iterator[str]:
|
|
215
293
|
with atomic_write_(fpath, writer_cls=AtomicWriterPerms, overwrite=True) as f:
|
|
216
294
|
yield f
|
|
217
295
|
|
|
218
296
|
|
|
219
|
-
def is_file_in_directory(directory, file):
|
|
297
|
+
def is_file_in_directory(directory : Union[str, PathLike], file : Union[str, PathLike]) -> bool:
|
|
220
298
|
"""
|
|
221
299
|
Return True if ``file`` is in ``directory`` (by checking that all components of ``directory`` are in ``file.parts``)
|
|
222
300
|
"""
|
|
@@ -224,7 +302,7 @@ def is_file_in_directory(directory, file):
|
|
|
224
302
|
file = Path(file)
|
|
225
303
|
return list(file.parts)[:len(directory.parts)] == list(directory.parts)
|
|
226
304
|
|
|
227
|
-
def itertree(path):
|
|
305
|
+
def itertree(path : Union[str, PathLike]) -> PathLike:
|
|
228
306
|
"""
|
|
229
307
|
Generate a list of paths by recursively enumerating ``path``
|
|
230
308
|
"""
|
|
@@ -235,14 +313,14 @@ def itertree(path):
|
|
|
235
313
|
yield from itertree(subpath)
|
|
236
314
|
yield path
|
|
237
315
|
|
|
238
|
-
def directory_size(path):
|
|
316
|
+
def directory_size(path : Union[str, PathLike]) -> int:
|
|
239
317
|
"""
|
|
240
318
|
Calculates size of all files in directory ``path``
|
|
241
319
|
"""
|
|
242
320
|
path = Path(path)
|
|
243
321
|
return sum(f.stat().st_size for f in path.glob('**/*') if f.is_file())
|
|
244
322
|
|
|
245
|
-
def guess_media_type(input_file : str, fallback : str = None, application_xml : str = 'application/xml'):
|
|
323
|
+
def guess_media_type(input_file : str, fallback : Optional[str] = None, application_xml : str = 'application/xml') -> str:
|
|
246
324
|
"""
|
|
247
325
|
Guess the media type of a file path
|
|
248
326
|
"""
|
|
@@ -254,7 +332,7 @@ def guess_media_type(input_file : str, fallback : str = None, application_xml :
|
|
|
254
332
|
if mimetype is None:
|
|
255
333
|
mimetype = EXT_TO_MIME.get(''.join(Path(input_file).suffixes), fallback)
|
|
256
334
|
if mimetype is None:
|
|
257
|
-
raise ValueError("Could not determine MIME type of input_file
|
|
335
|
+
raise ValueError("Could not determine MIME type of input_file '%s'", str(input_file))
|
|
258
336
|
if mimetype == 'application/xml':
|
|
259
337
|
mimetype = application_xml
|
|
260
338
|
return mimetype
|
|
@@ -142,18 +142,21 @@ properties:
|
|
|
142
142
|
description: List the allowed values if a fixed list.
|
|
143
143
|
content-type:
|
|
144
144
|
type: string
|
|
145
|
-
default: 'application/octet-stream'
|
|
146
145
|
description: >
|
|
146
|
+
If parameter is reference to file (type=string format=uri):
|
|
147
147
|
The media type of resources this processor expects for
|
|
148
148
|
this parameter. Most processors use files for resources
|
|
149
149
|
(e.g. `*.traineddata` for `ocrd-tesserocr-recognize`)
|
|
150
150
|
while others use directories of files (e.g. `default` for
|
|
151
|
-
`ocrd-eynollah-segment`).
|
|
152
|
-
directories, it must set
|
|
151
|
+
`ocrd-eynollah-segment`).
|
|
152
|
+
If a parameter requires directories, it must set this to
|
|
153
153
|
`text/directory`.
|
|
154
154
|
cacheable:
|
|
155
155
|
type: boolean
|
|
156
|
-
description:
|
|
156
|
+
description: >
|
|
157
|
+
If parameter is reference to file (type=string format=uri):
|
|
158
|
+
Whether the file should be cached, e.g. because it is large
|
|
159
|
+
and won't change.
|
|
157
160
|
default: false
|
|
158
161
|
description:
|
|
159
162
|
description: Concise description of what the tool does
|
|
@@ -7,6 +7,7 @@ from pathlib import Path
|
|
|
7
7
|
|
|
8
8
|
from ocrd_utils import getLogger, MIMETYPE_PAGE, pushd_popd, is_local_filename, DEFAULT_METS_BASENAME
|
|
9
9
|
from ocrd_models import ValidationReport
|
|
10
|
+
from ocrd_models.constants import PAGE_ALTIMG_FEATURES
|
|
10
11
|
from ocrd_modelfactory import page_from_file
|
|
11
12
|
|
|
12
13
|
from .constants import FILE_GROUP_CATEGORIES, FILE_GROUP_PREFIX
|
|
@@ -98,6 +99,9 @@ class WorkspaceValidator():
|
|
|
98
99
|
self.page_coordinate_consistency = page_coordinate_consistency
|
|
99
100
|
# there will be more options to come
|
|
100
101
|
self.page_checks = [check for check in ['mets_fileid_page_pcgtsid',
|
|
102
|
+
'imagefilename',
|
|
103
|
+
'alternativeimage_filename',
|
|
104
|
+
'alternativeimage_comments',
|
|
101
105
|
'dimension',
|
|
102
106
|
'page',
|
|
103
107
|
'page_xsd']
|
|
@@ -118,7 +122,7 @@ class WorkspaceValidator():
|
|
|
118
122
|
mets_url (string): URL of the METS file
|
|
119
123
|
src_dir (string, None): Directory containing mets file
|
|
120
124
|
skip (list): Validation checks to omit. One or more of
|
|
121
|
-
'mets_unique_identifier',
|
|
125
|
+
'mets_unique_identifier',
|
|
122
126
|
'mets_files', 'pixel_density', 'dimension', 'url',
|
|
123
127
|
'multipage', 'page', 'page_xsd', 'mets_xsd',
|
|
124
128
|
'mets_fileid_page_pcgtsid'
|
|
@@ -145,8 +149,6 @@ class WorkspaceValidator():
|
|
|
145
149
|
try:
|
|
146
150
|
if 'mets_unique_identifier' not in self.skip:
|
|
147
151
|
self._validate_mets_unique_identifier()
|
|
148
|
-
if 'mets_file_group_names' not in self.skip:
|
|
149
|
-
self._validate_mets_file_group_names()
|
|
150
152
|
if 'mets_files' not in self.skip:
|
|
151
153
|
self._validate_mets_files()
|
|
152
154
|
if 'pixel_density' not in self.skip:
|
|
@@ -192,7 +194,11 @@ class WorkspaceValidator():
|
|
|
192
194
|
self.workspace.download_file(f)
|
|
193
195
|
page = page_from_file(f).get_Page()
|
|
194
196
|
imageFilename = page.imageFilename
|
|
195
|
-
if
|
|
197
|
+
if is_local_filename(imageFilename):
|
|
198
|
+
kwargs = dict(local_filename=imageFilename, **self.find_kwargs)
|
|
199
|
+
else:
|
|
200
|
+
kwargs = dict(url=imageFilename, **self.find_kwargs)
|
|
201
|
+
if not self.mets.find_files(**kwargs):
|
|
196
202
|
self.report.add_error(f"PAGE '{f.ID}': imageFilename '{imageFilename}' not found in METS")
|
|
197
203
|
if is_local_filename(imageFilename) and not Path(imageFilename).exists():
|
|
198
204
|
self.report.add_warning(f"PAGE '{f.ID}': imageFilename '{imageFilename}' points to non-existent local file")
|
|
@@ -295,6 +301,9 @@ class WorkspaceValidator():
|
|
|
295
301
|
if f.url and 'url' not in self.skip:
|
|
296
302
|
if re.match(r'^file:/[^/]', f.url):
|
|
297
303
|
self.report.add_error(f"File '{f.ID}' has an invalid (Java-specific) file URL '{f.url}'")
|
|
304
|
+
elif ':' not in f.url:
|
|
305
|
+
self.report.add_error(f"File '{f.ID}' has an invalid (non-URI) file URL '{f.url}'")
|
|
306
|
+
continue
|
|
298
307
|
scheme = f.url[0:f.url.index(':')]
|
|
299
308
|
if scheme not in ('http', 'https', 'file'):
|
|
300
309
|
self.report.add_warning(f"File '{f.ID}' has non-HTTP, non-file URL '{f.url}'")
|
|
@@ -321,17 +330,43 @@ class WorkspaceValidator():
|
|
|
321
330
|
pcgts = page_from_file(f)
|
|
322
331
|
page = pcgts.get_Page()
|
|
323
332
|
if 'dimension' in self.page_checks:
|
|
324
|
-
|
|
325
|
-
if page.imageHeight !=
|
|
326
|
-
self.report.add_error(f"PAGE '{f.ID}': @imageHeight != image's actual height ({page.imageHeight} != {
|
|
327
|
-
if page.imageWidth !=
|
|
328
|
-
self.report.add_error(f"PAGE '{f.ID}': @imageWidth != image's actual width ({page.imageWidth} != {
|
|
333
|
+
img = self.workspace._resolve_image_as_pil(page.imageFilename)
|
|
334
|
+
if page.imageHeight != img.height:
|
|
335
|
+
self.report.add_error(f"PAGE '{f.ID}': @imageHeight != image's actual height ({page.imageHeight} != {img.height})")
|
|
336
|
+
if page.imageWidth != img.width:
|
|
337
|
+
self.report.add_error(f"PAGE '{f.ID}': @imageWidth != image's actual width ({page.imageWidth} != {img.width})")
|
|
329
338
|
if 'imagefilename' in self.page_checks:
|
|
330
339
|
imageFilename = page.imageFilename
|
|
331
|
-
if
|
|
340
|
+
if is_local_filename(imageFilename):
|
|
341
|
+
kwargs = dict(local_filename=imageFilename, **self.find_kwargs)
|
|
342
|
+
else:
|
|
343
|
+
kwargs = dict(url=imageFilename, **self.find_kwargs)
|
|
344
|
+
if not self.mets.find_files(**kwargs):
|
|
332
345
|
self.report.add_error(f"PAGE '{f.ID}': imageFilename '{imageFilename}' not found in METS")
|
|
333
346
|
if is_local_filename(imageFilename) and not Path(imageFilename).exists():
|
|
334
347
|
self.report.add_warning(f"PAGE '{f.ID}': imageFilename '{imageFilename}' points to non-existent local file")
|
|
348
|
+
if 'alternativeimage_filename' in self.page_checks:
|
|
349
|
+
for altimg in page.get_AllAlternativeImages():
|
|
350
|
+
if is_local_filename(altimg.filename):
|
|
351
|
+
kwargs = dict(local_filename=altimg.filename, **self.find_kwargs)
|
|
352
|
+
else:
|
|
353
|
+
kwargs = dict(url=altimg.filename, **self.find_kwargs)
|
|
354
|
+
if not self.mets.find_files(**kwargs):
|
|
355
|
+
self.report.add_error(f"PAGE '{f.ID}': {altimg.parent_object_.id} AlternativeImage "
|
|
356
|
+
f"'{altimg.filename}' not found in METS")
|
|
357
|
+
if is_local_filename(altimg.filename) and not Path(altimg.filename).exists():
|
|
358
|
+
self.report.add_warning(f"PAGE '{f.ID}': {altimg.parent_object_.id} AlternativeImage "
|
|
359
|
+
f"'{altimg.filename}' points to non-existent local file")
|
|
360
|
+
if 'alternativeimage_comments' in self.page_checks:
|
|
361
|
+
for altimg in page.get_AllAlternativeImages():
|
|
362
|
+
if altimg.comments is None:
|
|
363
|
+
self.report.add_error(f"PAGE '{f.ID}': {altimg.parent_object_.id} AlternativeImage "
|
|
364
|
+
f"'{altimg.filename}' features not specified in PAGE")
|
|
365
|
+
else:
|
|
366
|
+
for feature in altimg.comments.split(','):
|
|
367
|
+
if feature not in PAGE_ALTIMG_FEATURES:
|
|
368
|
+
self.report.add_error(f"PAGE '{f.ID}': {altimg.parent_object_.id} AlternativeImage "
|
|
369
|
+
f"'{altimg.filename}' feature '{feature}' not standardized for PAGE")
|
|
335
370
|
if 'mets_fileid_page_pcgtsid' in self.page_checks and pcgts.pcGtsId != f.ID:
|
|
336
371
|
self.report.add_warning('pc:PcGts/@pcGtsId differs from mets:file/@ID: "%s" !== "%s"' % (pcgts.pcGtsId or '', f.ID or ''))
|
|
337
372
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|