ocrd 3.5.1__py3-none-any.whl → 3.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ocrd/cli/__init__.py +8 -6
- ocrd/cli/bashlib.py +8 -114
- ocrd/cli/network.py +0 -2
- ocrd/cli/ocrd_tool.py +26 -4
- ocrd/cli/process.py +1 -0
- ocrd/cli/resmgr.py +0 -1
- ocrd/cli/validate.py +32 -13
- ocrd/cli/workspace.py +125 -52
- ocrd/cli/zip.py +13 -4
- ocrd/decorators/__init__.py +28 -52
- ocrd/decorators/loglevel_option.py +4 -0
- ocrd/decorators/mets_find_options.py +2 -1
- ocrd/decorators/ocrd_cli_options.py +3 -7
- ocrd/decorators/parameter_option.py +12 -11
- ocrd/mets_server.py +11 -15
- ocrd/processor/base.py +88 -71
- ocrd/processor/builtin/dummy_processor.py +7 -4
- ocrd/processor/builtin/filter_processor.py +3 -2
- ocrd/processor/helpers.py +5 -6
- ocrd/processor/ocrd_page_result.py +7 -5
- ocrd/resolver.py +42 -32
- ocrd/task_sequence.py +11 -4
- ocrd/workspace.py +64 -54
- ocrd/workspace_backup.py +3 -0
- ocrd/workspace_bagger.py +15 -8
- {ocrd-3.5.1.dist-info → ocrd-3.7.0.dist-info}/METADATA +2 -8
- ocrd-3.7.0.dist-info/RECORD +123 -0
- ocrd_modelfactory/__init__.py +4 -2
- ocrd_models/constants.py +18 -1
- ocrd_models/ocrd_agent.py +1 -1
- ocrd_models/ocrd_exif.py +7 -3
- ocrd_models/ocrd_file.py +24 -19
- ocrd_models/ocrd_mets.py +90 -67
- ocrd_models/ocrd_page.py +17 -13
- ocrd_models/ocrd_xml_base.py +1 -0
- ocrd_models/report.py +2 -1
- ocrd_models/utils.py +4 -3
- ocrd_models/xpath_functions.py +3 -1
- ocrd_network/__init__.py +1 -2
- ocrd_network/cli/__init__.py +0 -2
- ocrd_network/cli/client.py +122 -50
- ocrd_network/cli/processing_server.py +1 -2
- ocrd_network/client.py +2 -2
- ocrd_network/client_utils.py +30 -13
- ocrd_network/constants.py +1 -6
- ocrd_network/database.py +3 -3
- ocrd_network/logging_utils.py +2 -7
- ocrd_network/models/__init__.py +0 -2
- ocrd_network/models/job.py +31 -33
- ocrd_network/models/messages.py +3 -2
- ocrd_network/models/workspace.py +5 -5
- ocrd_network/process_helpers.py +54 -17
- ocrd_network/processing_server.py +63 -114
- ocrd_network/processing_worker.py +6 -5
- ocrd_network/rabbitmq_utils/__init__.py +2 -0
- ocrd_network/rabbitmq_utils/helpers.py +24 -7
- ocrd_network/runtime_data/__init__.py +1 -2
- ocrd_network/runtime_data/deployer.py +12 -85
- ocrd_network/runtime_data/hosts.py +61 -130
- ocrd_network/runtime_data/network_agents.py +7 -31
- ocrd_network/runtime_data/network_services.py +1 -1
- ocrd_network/server_cache.py +1 -1
- ocrd_network/server_utils.py +13 -52
- ocrd_network/utils.py +1 -0
- ocrd_utils/__init__.py +4 -4
- ocrd_utils/config.py +86 -76
- ocrd_utils/deprecate.py +3 -0
- ocrd_utils/image.py +51 -23
- ocrd_utils/introspect.py +8 -3
- ocrd_utils/logging.py +15 -7
- ocrd_utils/os.py +17 -4
- ocrd_utils/str.py +32 -16
- ocrd_validators/json_validator.py +4 -1
- ocrd_validators/ocrd_tool_validator.py +2 -1
- ocrd_validators/ocrd_zip_validator.py +5 -4
- ocrd_validators/page_validator.py +21 -9
- ocrd_validators/parameter_validator.py +3 -2
- ocrd_validators/processing_server_config.schema.yml +1 -33
- ocrd_validators/resource_list_validator.py +3 -1
- ocrd_validators/workspace_validator.py +30 -20
- ocrd_validators/xsd_mets_validator.py +2 -1
- ocrd_validators/xsd_page_validator.py +2 -1
- ocrd_validators/xsd_validator.py +4 -2
- ocrd/cli/log.py +0 -51
- ocrd/lib.bash +0 -317
- ocrd-3.5.1.dist-info/RECORD +0 -128
- ocrd_network/cli/processor_server.py +0 -31
- ocrd_network/models/ocrd_tool.py +0 -12
- ocrd_network/processor_server.py +0 -255
- {ocrd-3.5.1.dist-info → ocrd-3.7.0.dist-info}/LICENSE +0 -0
- {ocrd-3.5.1.dist-info → ocrd-3.7.0.dist-info}/WHEEL +0 -0
- {ocrd-3.5.1.dist-info → ocrd-3.7.0.dist-info}/entry_points.txt +0 -0
- {ocrd-3.5.1.dist-info → ocrd-3.7.0.dist-info}/top_level.txt +0 -0
ocrd/resolver.py
CHANGED
|
@@ -20,6 +20,7 @@ from ocrd.workspace import Workspace
|
|
|
20
20
|
from ocrd_models import OcrdMets
|
|
21
21
|
from ocrd_models.utils import handle_oai_response
|
|
22
22
|
|
|
23
|
+
|
|
23
24
|
class Resolver():
|
|
24
25
|
"""
|
|
25
26
|
Handle uploads, downloads, repository access, and manage temporary directories
|
|
@@ -31,11 +32,13 @@ class Resolver():
|
|
|
31
32
|
|
|
32
33
|
If ``url`` looks like a file path, check whether that exists.
|
|
33
34
|
If it does exist and is within ``directory`` already, return early.
|
|
34
|
-
If it does exist but is outside of ``directory
|
|
35
|
-
If ``url` does not appear to be a file path, try downloading via HTTP,
|
|
35
|
+
If it does exist but is outside of ``directory``, copy it.
|
|
36
|
+
If ``url`` does not appear to be a file path, try downloading via HTTP,
|
|
37
|
+
retrying ``retries`` times with timeout ``timeout`` between calls.
|
|
36
38
|
|
|
37
39
|
If ``basename`` is not given but ``subdir`` is, set ``basename`` to the last path segment of ``url``.
|
|
38
40
|
|
|
41
|
+
\b
|
|
39
42
|
If the target file already exists within ``directory``, behavior depends on ``if_exists``:
|
|
40
43
|
- ``skip`` (default): do nothing and return early. Note that this
|
|
41
44
|
- ``overwrite``: overwrite the existing file
|
|
@@ -56,11 +59,12 @@ class Resolver():
|
|
|
56
59
|
Returns:
|
|
57
60
|
Local filename string, *relative* to directory
|
|
58
61
|
"""
|
|
59
|
-
log = getLogger('ocrd.resolver.download_to_directory')
|
|
60
|
-
log.debug("directory=|%s| url=|%s| basename=|%s| if_exists=|%s| subdir=|%s|",
|
|
62
|
+
log = getLogger('ocrd.resolver.download_to_directory') # pylint: disable=redefined-outer-name
|
|
63
|
+
log.debug("directory=|%s| url=|%s| basename=|%s| if_exists=|%s| subdir=|%s|",
|
|
64
|
+
directory, url, basename, if_exists, subdir)
|
|
61
65
|
|
|
62
66
|
if not url:
|
|
63
|
-
raise ValueError(f"'url' must be a non-empty string, not '{url}'")
|
|
67
|
+
raise ValueError(f"'url' must be a non-empty string, not '{url}'") # actually Path also ok
|
|
64
68
|
if not directory:
|
|
65
69
|
raise ValueError(f"'directory' must be a non-empty string, not '{url}'") # actually Path would also work
|
|
66
70
|
|
|
@@ -123,25 +127,25 @@ class Resolver():
|
|
|
123
127
|
retries = Retry(total=retries or 0,
|
|
124
128
|
status_forcelist=[
|
|
125
129
|
# probably too wide (only transient failures):
|
|
126
|
-
408,
|
|
127
|
-
409,
|
|
128
|
-
412,
|
|
129
|
-
417,
|
|
130
|
-
423,
|
|
131
|
-
424,
|
|
132
|
-
425,
|
|
133
|
-
426,
|
|
134
|
-
428,
|
|
135
|
-
429,
|
|
136
|
-
440,
|
|
137
|
-
500,
|
|
138
|
-
503,
|
|
139
|
-
504,
|
|
140
|
-
509,
|
|
141
|
-
529,
|
|
142
|
-
598,
|
|
143
|
-
599,
|
|
144
|
-
|
|
130
|
+
408, # Request Timeout
|
|
131
|
+
409, # Conflict
|
|
132
|
+
412, # Precondition Failed
|
|
133
|
+
417, # Expectation Failed
|
|
134
|
+
423, # Locked
|
|
135
|
+
424, # Failed Dependency
|
|
136
|
+
425, # Too Early
|
|
137
|
+
426, # Upgrade Required
|
|
138
|
+
428, # Precondition Required
|
|
139
|
+
429, # Too Many Requests
|
|
140
|
+
440, # Login Timeout
|
|
141
|
+
500, # Internal Server Error
|
|
142
|
+
503, # Service Unavailable
|
|
143
|
+
504, # Gateway Timeout
|
|
144
|
+
509, # Bandwidth Limit Exceeded
|
|
145
|
+
529, # Site Overloaded
|
|
146
|
+
598, # Proxy Read Timeout
|
|
147
|
+
599, # Proxy Connect Timeout
|
|
148
|
+
])
|
|
145
149
|
adapter = HTTPAdapter(max_retries=retries)
|
|
146
150
|
session.mount('http://', adapter)
|
|
147
151
|
session.mount('https://', adapter)
|
|
@@ -181,7 +185,7 @@ class Resolver():
|
|
|
181
185
|
the filesystem directly.
|
|
182
186
|
**kwargs (): Passed on to ``OcrdMets.find_files`` if download == True
|
|
183
187
|
|
|
184
|
-
Download (clone) :py:attr:`mets_url` to ``mets.xml`` in :py:attr:`dst_dir`, unless
|
|
188
|
+
Download (clone) :py:attr:`mets_url` to ``mets.xml`` in :py:attr:`dst_dir`, unless
|
|
185
189
|
the former is already local and the latter is ``none`` or already identical to its directory name.
|
|
186
190
|
|
|
187
191
|
Returns:
|
|
@@ -218,11 +222,13 @@ class Resolver():
|
|
|
218
222
|
Path(dst_dir).mkdir(parents=True, exist_ok=False)
|
|
219
223
|
dst_dir = str(Path(dst_dir).resolve())
|
|
220
224
|
|
|
221
|
-
log.debug("
|
|
222
|
-
|
|
223
|
-
self.download_to_directory(dst_dir, mets_url, basename=mets_basename,
|
|
225
|
+
log.debug("mets_basename='%s' mets_url='%s' src_baseurl='%s' dst_dir='%s'",
|
|
226
|
+
mets_basename, mets_url, src_baseurl, dst_dir)
|
|
227
|
+
self.download_to_directory(dst_dir, mets_url, basename=mets_basename,
|
|
228
|
+
if_exists='overwrite' if clobber_mets else 'raise')
|
|
224
229
|
|
|
225
|
-
workspace = Workspace(self, dst_dir,
|
|
230
|
+
workspace = Workspace(self, dst_dir,
|
|
231
|
+
mets_basename=mets_basename, baseurl=src_baseurl, mets_server_url=mets_server_url)
|
|
226
232
|
|
|
227
233
|
if download:
|
|
228
234
|
for f in workspace.mets.find_files(**kwargs):
|
|
@@ -273,7 +279,8 @@ class Resolver():
|
|
|
273
279
|
# if directory and mets_url and not mets_is_remote:
|
|
274
280
|
# raise ValueError("Use either --mets or --directory, not both")
|
|
275
281
|
|
|
276
|
-
# If --mets is a URL, a directory must be explicitly provided
|
|
282
|
+
# If --mets is a URL, a directory must be explicitly provided
|
|
283
|
+
# (not strictly necessary, but retained for legacy behavior)
|
|
277
284
|
if not directory and mets_is_remote:
|
|
278
285
|
raise ValueError("--mets is an http(s) URL but no --directory was given")
|
|
279
286
|
|
|
@@ -297,7 +304,7 @@ class Resolver():
|
|
|
297
304
|
elif not directory and mets_url:
|
|
298
305
|
mets_url = Path(mets_url).resolve()
|
|
299
306
|
directory = mets_url.parent
|
|
300
|
-
else:
|
|
307
|
+
else: # == directory and mets_url:
|
|
301
308
|
directory = Path(directory).resolve()
|
|
302
309
|
if not mets_is_remote:
|
|
303
310
|
# --mets is just a basename and --directory is set, so treat --mets as --mets-basename
|
|
@@ -306,10 +313,13 @@ class Resolver():
|
|
|
306
313
|
else:
|
|
307
314
|
mets_url = Path(mets_url).resolve()
|
|
308
315
|
if not is_file_in_directory(directory, mets_url):
|
|
309
|
-
raise ValueError("--mets '%s' has a directory part inconsistent with --directory '%s'" % (
|
|
316
|
+
raise ValueError("--mets '%s' has a directory part inconsistent with --directory '%s'" % (
|
|
317
|
+
mets_url, directory))
|
|
310
318
|
|
|
311
319
|
if mets_server_url and not mets_server_url.startswith('http://'):
|
|
312
320
|
# UDS socket
|
|
313
321
|
mets_server_url = str(Path(mets_server_url).resolve())
|
|
314
322
|
|
|
323
|
+
log.debug("directory='%s' mets_url='%s', mets_basename='%s', mets_server_url='%s'" % (
|
|
324
|
+
directory, str(mets_url), str(mets_basename), mets_server_url))
|
|
315
325
|
return str(Path(directory).resolve()), str(mets_url), str(mets_basename), mets_server_url
|
ocrd/task_sequence.py
CHANGED
|
@@ -9,6 +9,7 @@ from ocrd.resolver import Resolver
|
|
|
9
9
|
from ocrd_validators import ParameterValidator, WorkspaceValidator
|
|
10
10
|
from ocrd_models import ValidationReport
|
|
11
11
|
|
|
12
|
+
|
|
12
13
|
class ProcessorTask():
|
|
13
14
|
|
|
14
15
|
@classmethod
|
|
@@ -85,6 +86,7 @@ class ProcessorTask():
|
|
|
85
86
|
ret += " -p '%s'" % json.dumps(self.parameters)
|
|
86
87
|
return ret
|
|
87
88
|
|
|
89
|
+
|
|
88
90
|
def validate_tasks(tasks, workspace, page_id=None, overwrite=False):
|
|
89
91
|
report = ValidationReport()
|
|
90
92
|
prev_output_file_grps = workspace.mets.file_groups
|
|
@@ -93,14 +95,18 @@ def validate_tasks(tasks, workspace, page_id=None, overwrite=False):
|
|
|
93
95
|
first_task.validate()
|
|
94
96
|
|
|
95
97
|
# first task: check input/output file groups from METS
|
|
96
|
-
WorkspaceValidator.check_file_grp(workspace,
|
|
98
|
+
WorkspaceValidator.check_file_grp(workspace,
|
|
99
|
+
first_task.input_file_grps,
|
|
100
|
+
'' if overwrite else first_task.output_file_grps,
|
|
101
|
+
page_id,
|
|
102
|
+
report)
|
|
97
103
|
|
|
98
104
|
prev_output_file_grps += first_task.output_file_grps
|
|
99
105
|
for task in tasks[1:]:
|
|
100
106
|
task.validate()
|
|
101
107
|
# check either existing fileGrp or output-file group of previous task matches current input_file_group
|
|
102
108
|
for input_file_grp in task.input_file_grps:
|
|
103
|
-
if not
|
|
109
|
+
if input_file_grp not in prev_output_file_grps:
|
|
104
110
|
report.add_error("Input file group not contained in METS or produced by previous steps: %s" % input_file_grp)
|
|
105
111
|
if not overwrite:
|
|
106
112
|
WorkspaceValidator.check_file_grp(workspace, [], task.output_file_grps, page_id, report)
|
|
@@ -157,5 +163,6 @@ def run_tasks(mets, log_level, page_id, task_strs, overwrite=False, mets_server_
|
|
|
157
163
|
|
|
158
164
|
# check output file groups are in mets
|
|
159
165
|
for output_file_grp in task.output_file_grps:
|
|
160
|
-
if not
|
|
161
|
-
raise Exception("Invalid state: expected output file group '%s' not in METS
|
|
166
|
+
if output_file_grp not in workspace.mets.file_groups:
|
|
167
|
+
raise Exception("Invalid state: expected output file group '%s' not in METS "
|
|
168
|
+
"(despite processor success)" % output_file_grp)
|
ocrd/workspace.py
CHANGED
|
@@ -28,16 +28,13 @@ from ocrd_utils import (
|
|
|
28
28
|
scale_coordinates,
|
|
29
29
|
shift_coordinates,
|
|
30
30
|
rotate_coordinates,
|
|
31
|
-
transform_coordinates,
|
|
32
31
|
transpose_coordinates,
|
|
33
32
|
crop_image,
|
|
34
33
|
rotate_image,
|
|
35
34
|
transpose_image,
|
|
36
35
|
bbox_from_polygon,
|
|
37
|
-
polygon_from_points,
|
|
38
36
|
xywh_from_bbox,
|
|
39
37
|
pushd_popd,
|
|
40
|
-
is_local_filename,
|
|
41
38
|
deprecated_alias,
|
|
42
39
|
DEFAULT_METS_BASENAME,
|
|
43
40
|
MIME_TO_EXT,
|
|
@@ -51,6 +48,7 @@ from .mets_server import ClientSideOcrdMets
|
|
|
51
48
|
|
|
52
49
|
__all__ = ['Workspace']
|
|
53
50
|
|
|
51
|
+
|
|
54
52
|
@contextmanager
|
|
55
53
|
def download_temporary_file(url):
|
|
56
54
|
with NamedTemporaryFile(prefix='ocrd-download-') as f:
|
|
@@ -82,7 +80,7 @@ class Workspace():
|
|
|
82
80
|
self,
|
|
83
81
|
resolver,
|
|
84
82
|
directory,
|
|
85
|
-
mets
|
|
83
|
+
mets: Optional[Union[OcrdMets, ClientSideOcrdMets]] = None,
|
|
86
84
|
mets_basename=DEFAULT_METS_BASENAME,
|
|
87
85
|
automatic_backup=False,
|
|
88
86
|
baseurl=None,
|
|
@@ -96,8 +94,9 @@ class Workspace():
|
|
|
96
94
|
if self.is_remote:
|
|
97
95
|
mets = ClientSideOcrdMets(mets_server_url, self.directory)
|
|
98
96
|
if mets.workspace_path != self.directory:
|
|
99
|
-
raise ValueError(
|
|
100
|
-
|
|
97
|
+
raise ValueError(
|
|
98
|
+
f"METS server {mets_server_url} workspace directory '{mets.workspace_path}' differs "
|
|
99
|
+
f"from local workspace directory '{self.directory}'. These are not the same workspaces.")
|
|
101
100
|
else:
|
|
102
101
|
mets = OcrdMets(filename=self.mets_target)
|
|
103
102
|
self.mets = mets
|
|
@@ -148,7 +147,7 @@ class Workspace():
|
|
|
148
147
|
if not copy_files:
|
|
149
148
|
fpath_src = Path(other_workspace.directory).resolve()
|
|
150
149
|
fpath_dst = Path(self.directory).resolve()
|
|
151
|
-
dstprefix = fpath_src.relative_to(fpath_dst)
|
|
150
|
+
dstprefix = fpath_src.relative_to(fpath_dst) # raises ValueError if not a subpath
|
|
152
151
|
f.local_filename = dstprefix / f.local_filename
|
|
153
152
|
return
|
|
154
153
|
fpath_src = Path(other_workspace.directory, f.local_filename)
|
|
@@ -171,7 +170,6 @@ class Workspace():
|
|
|
171
170
|
|
|
172
171
|
self.mets.merge(other_workspace.mets, after_add_cb=after_add_cb, **kwargs)
|
|
173
172
|
|
|
174
|
-
|
|
175
173
|
@deprecated(version='1.0.0', reason="Use workspace.download_file")
|
|
176
174
|
def download_url(self, url, **kwargs):
|
|
177
175
|
"""
|
|
@@ -199,19 +197,23 @@ class Workspace():
|
|
|
199
197
|
file_path = Path(f.local_filename).absolute()
|
|
200
198
|
if file_path.exists():
|
|
201
199
|
try:
|
|
202
|
-
file_path.relative_to(Path(self.directory).resolve())
|
|
200
|
+
file_path.relative_to(Path(self.directory).resolve()) # raises ValueError if not relative
|
|
203
201
|
# If the f.local_filename exists and is within self.directory, nothing to do
|
|
204
202
|
log.debug(f"'local_filename' {f.local_filename} already within {self.directory} - nothing to do")
|
|
205
203
|
except ValueError:
|
|
206
204
|
# f.local_filename exists, but not within self.directory, copy it
|
|
207
|
-
log.debug("Copying 'local_filename' %s to workspace directory %s" % (
|
|
208
|
-
|
|
205
|
+
log.debug("Copying 'local_filename' %s to workspace directory %s" % (
|
|
206
|
+
f.local_filename, self.directory))
|
|
207
|
+
f.local_filename = self.resolver.download_to_directory(self.directory, f.local_filename,
|
|
208
|
+
subdir=f.fileGrp)
|
|
209
209
|
return f
|
|
210
210
|
if f.url:
|
|
211
|
-
log.debug("OcrdFile has 'local_filename' but it doesn't resolve -
|
|
211
|
+
log.debug("OcrdFile has 'local_filename' but it doesn't resolve - "
|
|
212
|
+
"trying to download from 'url' %s", f.url)
|
|
212
213
|
url = f.url
|
|
213
214
|
elif self.baseurl:
|
|
214
|
-
log.debug("OcrdFile has 'local_filename' but it doesn't resolve, and no 'url' -
|
|
215
|
+
log.debug("OcrdFile has 'local_filename' but it doesn't resolve, and no 'url' - "
|
|
216
|
+
"trying 'baseurl' %s with 'local_filename' %s",
|
|
215
217
|
self.baseurl, f.local_filename)
|
|
216
218
|
url = '%s/%s' % (self.baseurl, f.local_filename)
|
|
217
219
|
else:
|
|
@@ -223,7 +225,8 @@ class Workspace():
|
|
|
223
225
|
if f.url:
|
|
224
226
|
# If f.url is set, download the file to the workspace
|
|
225
227
|
basename = '%s%s' % (f.ID, MIME_TO_EXT.get(f.mimetype, '')) if f.ID else f.basename
|
|
226
|
-
f.local_filename = self.resolver.download_to_directory(self.directory, f.url,
|
|
228
|
+
f.local_filename = self.resolver.download_to_directory(self.directory, f.url,
|
|
229
|
+
subdir=f.fileGrp, basename=basename)
|
|
227
230
|
return f
|
|
228
231
|
# If neither f.local_filename nor f.url is set, fail
|
|
229
232
|
raise ValueError(f"OcrdFile {f} has neither 'url' nor 'local_filename', so cannot be downloaded")
|
|
@@ -281,7 +284,8 @@ class Workspace():
|
|
|
281
284
|
if not force:
|
|
282
285
|
raise e
|
|
283
286
|
|
|
284
|
-
def remove_file_group(self, USE, recursive=False, force=False, keep_files=False,
|
|
287
|
+
def remove_file_group(self, USE, recursive=False, force=False, keep_files=False,
|
|
288
|
+
page_recursive=False, page_same_group=False):
|
|
285
289
|
"""
|
|
286
290
|
Remove a METS `fileGrp`.
|
|
287
291
|
|
|
@@ -302,7 +306,8 @@ class Workspace():
|
|
|
302
306
|
file_dirs = []
|
|
303
307
|
if recursive:
|
|
304
308
|
for f in self.mets.find_files(fileGrp=USE):
|
|
305
|
-
self.remove_file(
|
|
309
|
+
self.remove_file(
|
|
310
|
+
f, force=force, keep_file=keep_files, page_recursive=page_recursive, page_same_group=page_same_group)
|
|
306
311
|
if f.local_filename:
|
|
307
312
|
f_dir = path.dirname(f.local_filename)
|
|
308
313
|
if f_dir:
|
|
@@ -320,7 +325,6 @@ class Workspace():
|
|
|
320
325
|
if Path(file_dir).is_dir() and not listdir(file_dir):
|
|
321
326
|
Path(file_dir).rmdir()
|
|
322
327
|
|
|
323
|
-
|
|
324
328
|
def rename_file_group(self, old, new):
|
|
325
329
|
"""
|
|
326
330
|
Rename a METS `fileGrp`.
|
|
@@ -361,7 +365,8 @@ class Workspace():
|
|
|
361
365
|
new_id = sub(r'^%s' % old, r'%s' % new, mets_file.ID)
|
|
362
366
|
try:
|
|
363
367
|
next(self.mets.find_files(ID=new_id))
|
|
364
|
-
log.warning("ID %s already exists, not changing ID while renaming %s -> %s" % (
|
|
368
|
+
log.warning("ID %s already exists, not changing ID while renaming %s -> %s" % (
|
|
369
|
+
new_id, old_local_filename, new_local_filename))
|
|
365
370
|
except StopIteration:
|
|
366
371
|
mets_file.ID = new_id
|
|
367
372
|
# change file paths in PAGE-XML imageFilename and filename attributes
|
|
@@ -378,7 +383,8 @@ class Workspace():
|
|
|
378
383
|
for old_local_filename, new_local_filename in local_filename_replacements.items():
|
|
379
384
|
if ai.filename == old_local_filename:
|
|
380
385
|
changed = True
|
|
381
|
-
log.debug("Rename pc:Page/../AlternativeImage: %s -> %s" % (
|
|
386
|
+
log.debug("Rename pc:Page/../AlternativeImage: %s -> %s" % (
|
|
387
|
+
old_local_filename, new_local_filename))
|
|
382
388
|
ai.filename = new_local_filename
|
|
383
389
|
if changed:
|
|
384
390
|
log.debug("PAGE-XML changed, writing %s" % (page_file.local_filename))
|
|
@@ -502,7 +508,7 @@ class Workspace():
|
|
|
502
508
|
def _resolve_image_as_pil(self, image_url, coords=None):
|
|
503
509
|
log = getLogger('ocrd.workspace._resolve_image_as_pil')
|
|
504
510
|
pil_image = self._apply_mets_file(image_url, Image.open)
|
|
505
|
-
pil_image.load()
|
|
511
|
+
pil_image.load() # alloc and give up the FD
|
|
506
512
|
|
|
507
513
|
# Pillow does not properly support higher color depths
|
|
508
514
|
# (e.g. 16-bit or 32-bit or floating point grayscale),
|
|
@@ -544,7 +550,7 @@ class Workspace():
|
|
|
544
550
|
|
|
545
551
|
# FIXME: remove or replace this by (image_from_polygon+) crop_image ...
|
|
546
552
|
log.debug("Converting PIL to OpenCV: %s", image_url)
|
|
547
|
-
color_conversion = COLOR_GRAY2BGR if pil_image.mode in ('1', 'L') else
|
|
553
|
+
color_conversion = COLOR_GRAY2BGR if pil_image.mode in ('1', 'L') else COLOR_RGB2BGR
|
|
548
554
|
pil_as_np_array = np.array(pil_image).astype('uint8') if pil_image.mode == '1' else np.array(pil_image)
|
|
549
555
|
cv2_image = cvtColor(pil_as_np_array, color_conversion)
|
|
550
556
|
|
|
@@ -659,8 +665,8 @@ class Workspace():
|
|
|
659
665
|
orientation = (page_coords['angle'] + 45) % 360
|
|
660
666
|
orientation = orientation - (orientation % 90)
|
|
661
667
|
skew = (page_coords['angle'] % 360) - orientation
|
|
662
|
-
skew = 180 - (180 - skew) % 360
|
|
663
|
-
page_coords['angle'] = 0
|
|
668
|
+
skew = 180 - (180 - skew) % 360 # map to [-45,45]
|
|
669
|
+
page_coords['angle'] = 0 # nothing applied yet (depends on filters)
|
|
664
670
|
log.debug("page '%s' has %s orientation=%d skew=%.2f",
|
|
665
671
|
page_id, "border," if border else "", orientation, skew)
|
|
666
672
|
if page_image_info.resolution != 1:
|
|
@@ -696,7 +702,7 @@ class Workspace():
|
|
|
696
702
|
for feature in feature_selector.split(',') if feature) and
|
|
697
703
|
not any(feature in featureset
|
|
698
704
|
for feature in feature_filter.split(',') if feature) and
|
|
699
|
-
len(featureset.difference(auto_features)) >=
|
|
705
|
+
len(featureset.difference(auto_features)) >=
|
|
700
706
|
len(best_features.difference(auto_features))):
|
|
701
707
|
best_features = featureset
|
|
702
708
|
best_image = alternative_image
|
|
@@ -705,7 +711,7 @@ class Workspace():
|
|
|
705
711
|
alternative_images.index(best_image) + 1,
|
|
706
712
|
best_features, page_id)
|
|
707
713
|
page_image = self._resolve_image_as_pil(best_image.get_filename())
|
|
708
|
-
page_coords['features'] = best_image.get_comments()
|
|
714
|
+
page_coords['features'] = best_image.get_comments() # including duplicates
|
|
709
715
|
|
|
710
716
|
# adjust the coord transformation to the steps applied on the image,
|
|
711
717
|
# and apply steps on the existing image in case it is missing there,
|
|
@@ -727,18 +733,18 @@ class Workspace():
|
|
|
727
733
|
for i, feature in enumerate(alternative_image_features +
|
|
728
734
|
(['cropped']
|
|
729
735
|
if (border and
|
|
730
|
-
|
|
731
|
-
|
|
736
|
+
'cropped' not in alternative_image_features and
|
|
737
|
+
'cropped' not in feature_filter.split(','))
|
|
732
738
|
else []) +
|
|
733
739
|
(['rotated-%d' % orientation]
|
|
734
740
|
if (orientation and
|
|
735
|
-
|
|
736
|
-
|
|
741
|
+
'rotated-%d' % orientation not in alternative_image_features and
|
|
742
|
+
'rotated-%d' % orientation not in feature_filter.split(','))
|
|
737
743
|
else []) +
|
|
738
744
|
(['deskewed']
|
|
739
745
|
if (skew and
|
|
740
|
-
|
|
741
|
-
|
|
746
|
+
'deskewed' not in alternative_image_features and
|
|
747
|
+
'deskewed' not in feature_filter.split(','))
|
|
742
748
|
else []) +
|
|
743
749
|
# not a feature to be added, but merely as a fallback position
|
|
744
750
|
# to always enter loop at i == len(alternative_image_features)
|
|
@@ -931,15 +937,15 @@ class Workspace():
|
|
|
931
937
|
orientation = (angle + 45) % 360
|
|
932
938
|
orientation = orientation - (orientation % 90)
|
|
933
939
|
skew = (angle % 360) - orientation
|
|
934
|
-
skew = 180 - (180 - skew) % 360
|
|
940
|
+
skew = 180 - (180 - skew) % 360 # map to [-45,45]
|
|
935
941
|
log.debug("segment '%s' has orientation=%d skew=%.2f",
|
|
936
942
|
segment.id, orientation, skew)
|
|
937
943
|
else:
|
|
938
944
|
orientation = 0
|
|
939
945
|
skew = 0
|
|
940
|
-
segment_coords['angle'] = parent_coords['angle']
|
|
946
|
+
segment_coords['angle'] = parent_coords['angle'] # nothing applied yet (depends on filters)
|
|
941
947
|
if 'DPI' in parent_coords:
|
|
942
|
-
segment_coords['DPI'] = parent_coords['DPI']
|
|
948
|
+
segment_coords['DPI'] = parent_coords['DPI'] # not rescaled yet
|
|
943
949
|
|
|
944
950
|
# initialize AlternativeImage@comments classes from parent, except
|
|
945
951
|
# for those operations that can apply on multiple hierarchy levels:
|
|
@@ -971,7 +977,7 @@ class Workspace():
|
|
|
971
977
|
for feature in feature_selector.split(',') if feature) and
|
|
972
978
|
not any(feature in featureset
|
|
973
979
|
for feature in feature_filter.split(',') if feature) and
|
|
974
|
-
len(featureset.difference(auto_features)) >=
|
|
980
|
+
len(featureset.difference(auto_features)) >=
|
|
975
981
|
len(best_features.difference(auto_features))):
|
|
976
982
|
best_features = featureset
|
|
977
983
|
best_image = alternative_image
|
|
@@ -980,7 +986,7 @@ class Workspace():
|
|
|
980
986
|
alternative_images.index(best_image) + 1,
|
|
981
987
|
best_features, segment.id)
|
|
982
988
|
segment_image = self._resolve_image_as_pil(alternative_image.get_filename())
|
|
983
|
-
segment_coords['features'] = best_image.get_comments()
|
|
989
|
+
segment_coords['features'] = best_image.get_comments() # including duplicates
|
|
984
990
|
|
|
985
991
|
alternative_image_features = segment_coords['features'].split(',')
|
|
986
992
|
for duplicate_feature in set([feature for feature in alternative_image_features
|
|
@@ -993,13 +999,13 @@ class Workspace():
|
|
|
993
999
|
for i, feature in enumerate(alternative_image_features +
|
|
994
1000
|
(['rotated-%d' % orientation]
|
|
995
1001
|
if (orientation and
|
|
996
|
-
|
|
997
|
-
|
|
1002
|
+
'rotated-%d' % orientation not in alternative_image_features and
|
|
1003
|
+
'rotated-%d' % orientation not in feature_filter.split(','))
|
|
998
1004
|
else []) +
|
|
999
1005
|
(['deskewed']
|
|
1000
1006
|
if (skew and
|
|
1001
|
-
|
|
1002
|
-
|
|
1007
|
+
'deskewed' not in alternative_image_features and
|
|
1008
|
+
'deskewed' not in feature_filter.split(','))
|
|
1003
1009
|
else []) +
|
|
1004
1010
|
# not a feature to be added, but merely as a fallback position
|
|
1005
1011
|
# to always enter loop at i == len(alternative_image_features)
|
|
@@ -1052,13 +1058,13 @@ class Workspace():
|
|
|
1052
1058
|
return segment_image, segment_coords
|
|
1053
1059
|
|
|
1054
1060
|
# pylint: disable=redefined-builtin
|
|
1055
|
-
def save_image_file(self, image
|
|
1056
|
-
file_id
|
|
1057
|
-
file_grp
|
|
1058
|
-
file_path
|
|
1059
|
-
page_id
|
|
1060
|
-
mimetype
|
|
1061
|
-
force
|
|
1061
|
+
def save_image_file(self, image: Image.Image,
|
|
1062
|
+
file_id: str,
|
|
1063
|
+
file_grp: str,
|
|
1064
|
+
file_path: Optional[str] = None,
|
|
1065
|
+
page_id: Optional[str] = None,
|
|
1066
|
+
mimetype: str = 'image/png',
|
|
1067
|
+
force: bool = False) -> str:
|
|
1062
1068
|
"""Store an image in the filesystem and reference it as new file in the METS.
|
|
1063
1069
|
|
|
1064
1070
|
Args:
|
|
@@ -1120,6 +1126,7 @@ class Workspace():
|
|
|
1120
1126
|
with pushd_popd(self.directory):
|
|
1121
1127
|
return self.mets.find_files(*args, **kwargs)
|
|
1122
1128
|
|
|
1129
|
+
|
|
1123
1130
|
def _crop(log, name, segment, parent_image, parent_coords, op='cropped', **kwargs):
|
|
1124
1131
|
segment_coords = parent_coords.copy()
|
|
1125
1132
|
# get polygon outline of segment relative to parent image:
|
|
@@ -1131,8 +1138,8 @@ def _crop(log, name, segment, parent_image, parent_coords, op='cropped', **kwarg
|
|
|
1131
1138
|
# also possibly different from size after rotation below/AlternativeImage):
|
|
1132
1139
|
segment_xywh = xywh_from_bbox(*segment_bbox)
|
|
1133
1140
|
# crop, if (still) necessary:
|
|
1134
|
-
if (not isinstance(segment, BorderType) or
|
|
1135
|
-
not
|
|
1141
|
+
if (not isinstance(segment, BorderType) or # always crop below page level
|
|
1142
|
+
op not in parent_coords['features']):
|
|
1136
1143
|
if op == 'recropped':
|
|
1137
1144
|
log.debug("Recropping %s", name)
|
|
1138
1145
|
elif isinstance(segment, BorderType):
|
|
@@ -1152,6 +1159,7 @@ def _crop(log, name, segment, parent_image, parent_coords, op='cropped', **kwarg
|
|
|
1152
1159
|
-segment_bbox[1]]))
|
|
1153
1160
|
return segment_image, segment_coords, segment_xywh
|
|
1154
1161
|
|
|
1162
|
+
|
|
1155
1163
|
def _reflect(log, name, orientation, segment_image, segment_coords, segment_xywh):
|
|
1156
1164
|
# Transpose in affine coordinate transform:
|
|
1157
1165
|
# (consistent with image transposition or AlternativeImage below)
|
|
@@ -1159,7 +1167,7 @@ def _reflect(log, name, orientation, segment_image, segment_coords, segment_xywh
|
|
|
1159
1167
|
90: Image.Transpose.ROTATE_90,
|
|
1160
1168
|
180: Image.Transpose.ROTATE_180,
|
|
1161
1169
|
270: Image.Transpose.ROTATE_270
|
|
1162
|
-
}.get(orientation)
|
|
1170
|
+
}.get(orientation) # no default
|
|
1163
1171
|
segment_coords['transform'] = transpose_coordinates(
|
|
1164
1172
|
segment_coords['transform'], transposition,
|
|
1165
1173
|
np.array([0.5 * segment_xywh['w'],
|
|
@@ -1174,6 +1182,7 @@ def _reflect(log, name, orientation, segment_image, segment_coords, segment_xywh
|
|
|
1174
1182
|
segment_coords['features'] += ',rotated-%d' % orientation
|
|
1175
1183
|
return segment_image, segment_coords, segment_xywh
|
|
1176
1184
|
|
|
1185
|
+
|
|
1177
1186
|
def _rotate(log, name, skew, segment, segment_image, segment_coords, segment_xywh, **kwargs):
|
|
1178
1187
|
# Rotate around center in affine coordinate transform:
|
|
1179
1188
|
# (consistent with image rotation or AlternativeImage below)
|
|
@@ -1185,12 +1194,12 @@ def _rotate(log, name, skew, segment, segment_image, segment_coords, segment_xyw
|
|
|
1185
1194
|
[segment_xywh['w'], segment_xywh['h']], skew)
|
|
1186
1195
|
segment_coords['angle'] += skew
|
|
1187
1196
|
# deskew, if (still) necessary:
|
|
1188
|
-
if
|
|
1197
|
+
if 'deskewed' not in segment_coords['features']:
|
|
1189
1198
|
log.debug("Rotating %s by %.2f°", name, skew)
|
|
1190
1199
|
segment_image = rotate_image(segment_image, skew, **kwargs)
|
|
1191
1200
|
segment_coords['features'] += ',deskewed'
|
|
1192
1201
|
if (segment and
|
|
1193
|
-
(not isinstance(segment, BorderType) or
|
|
1202
|
+
(not isinstance(segment, BorderType) or # always crop below page level
|
|
1194
1203
|
'cropped' in segment_coords['features'])):
|
|
1195
1204
|
# re-crop to new bbox (which may deviate
|
|
1196
1205
|
# if segment polygon was not a rectangle)
|
|
@@ -1198,7 +1207,7 @@ def _rotate(log, name, skew, segment, segment_image, segment_coords, segment_xyw
|
|
|
1198
1207
|
log, name, segment, segment_image, segment_coords,
|
|
1199
1208
|
op='recropped', **kwargs)
|
|
1200
1209
|
elif (segment and
|
|
1201
|
-
(not isinstance(segment, BorderType) or
|
|
1210
|
+
(not isinstance(segment, BorderType) or # always crop below page level
|
|
1202
1211
|
'cropped' in segment_coords['features'])):
|
|
1203
1212
|
# only shift coordinates as if re-cropping
|
|
1204
1213
|
segment_polygon = coordinates_of_segment(segment, segment_image, segment_coords)
|
|
@@ -1210,6 +1219,7 @@ def _rotate(log, name, skew, segment, segment_image, segment_coords, segment_xyw
|
|
|
1210
1219
|
-segment_bbox[1]]))
|
|
1211
1220
|
return segment_image, segment_coords, segment_xywh
|
|
1212
1221
|
|
|
1222
|
+
|
|
1213
1223
|
def _scale(log, name, factor, segment_image, segment_coords, segment_xywh, **kwargs):
|
|
1214
1224
|
# Resize linearly
|
|
1215
1225
|
segment_coords['transform'] = scale_coordinates(
|
|
@@ -1218,7 +1228,7 @@ def _scale(log, name, factor, segment_image, segment_coords, segment_xywh, **kwa
|
|
|
1218
1228
|
segment_xywh['w'] *= factor
|
|
1219
1229
|
segment_xywh['h'] *= factor
|
|
1220
1230
|
# resize, if (still) necessary
|
|
1221
|
-
if
|
|
1231
|
+
if 'scaled' not in segment_coords['features']:
|
|
1222
1232
|
log.debug("Scaling %s by %.2f", name, factor)
|
|
1223
1233
|
segment_coords['features'] += ',scaled'
|
|
1224
1234
|
# FIXME: validate factor against PAGE-XML attributes
|
ocrd/workspace_backup.py
CHANGED
|
@@ -10,9 +10,11 @@ from ocrd_utils import getLogger, atomic_write, DEFAULT_METS_BASENAME
|
|
|
10
10
|
|
|
11
11
|
from .constants import BACKUP_DIR
|
|
12
12
|
|
|
13
|
+
|
|
13
14
|
def _chksum(s):
|
|
14
15
|
return hashlib.sha256(s).hexdigest()
|
|
15
16
|
|
|
17
|
+
|
|
16
18
|
class WorkspaceBackup():
|
|
17
19
|
|
|
18
20
|
@classmethod
|
|
@@ -37,6 +39,7 @@ class WorkspaceBackup():
|
|
|
37
39
|
self.mets_xml.file_groups
|
|
38
40
|
)
|
|
39
41
|
|
|
42
|
+
|
|
40
43
|
class WorkspaceBackupManager():
|
|
41
44
|
"""
|
|
42
45
|
Manages backups of a workspace in a directory BACKUP_DIR
|
ocrd/workspace_bagger.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
from datetime import datetime
|
|
2
|
-
from os import makedirs,
|
|
2
|
+
from os import makedirs, walk
|
|
3
3
|
from os.path import join, isdir, basename as os_path_basename, exists, relpath
|
|
4
4
|
from pathlib import Path
|
|
5
5
|
from shutil import make_archive, rmtree, copyfile, move, copytree
|
|
@@ -7,7 +7,11 @@ from tempfile import mkdtemp, TemporaryDirectory
|
|
|
7
7
|
import re
|
|
8
8
|
import tempfile
|
|
9
9
|
import sys
|
|
10
|
-
from bagit import
|
|
10
|
+
from bagit import (
|
|
11
|
+
Bag,
|
|
12
|
+
make_manifests,
|
|
13
|
+
_load_tag_file, _make_tag_file, _make_tagmanifest_file, # pylint: disable=no-name-in-module
|
|
14
|
+
)
|
|
11
15
|
|
|
12
16
|
from ocrd_utils import (
|
|
13
17
|
pushd_popd,
|
|
@@ -25,10 +29,11 @@ from ocrd_models.ocrd_page import to_xml
|
|
|
25
29
|
|
|
26
30
|
from .workspace import Workspace
|
|
27
31
|
|
|
28
|
-
tempfile.tempdir = '/tmp'
|
|
32
|
+
tempfile.tempdir = '/tmp' # TODO hard-coded
|
|
29
33
|
|
|
30
34
|
BACKUPDIR = join('/tmp', TMP_BAGIT_PREFIX + 'backup')
|
|
31
35
|
|
|
36
|
+
|
|
32
37
|
class WorkspaceBagger():
|
|
33
38
|
"""
|
|
34
39
|
Serialize/De-serialize from OCRD-ZIP to workspace and back.
|
|
@@ -50,7 +55,7 @@ class WorkspaceBagger():
|
|
|
50
55
|
def _log_or_raise(self, msg):
|
|
51
56
|
log = getLogger('ocrd.workspace_bagger')
|
|
52
57
|
if self.strict:
|
|
53
|
-
raise
|
|
58
|
+
raise Exception(msg)
|
|
54
59
|
else:
|
|
55
60
|
log.info(msg)
|
|
56
61
|
|
|
@@ -112,10 +117,11 @@ class WorkspaceBagger():
|
|
|
112
117
|
log.info("New vs. old: %s" % changed_local_filenames)
|
|
113
118
|
return total_bytes, total_files
|
|
114
119
|
|
|
115
|
-
def _set_bag_info(self, bag, total_bytes, total_files, ocrd_identifier, ocrd_base_version_checksum,
|
|
120
|
+
def _set_bag_info(self, bag, total_bytes, total_files, ocrd_identifier, ocrd_base_version_checksum,
|
|
121
|
+
ocrd_mets=DEFAULT_METS_BASENAME):
|
|
116
122
|
bag.info['BagIt-Profile-Identifier'] = OCRD_BAGIT_PROFILE_URL
|
|
117
123
|
bag.info['Bag-Software-Agent'] = 'ocrd/core %s (bagit.py %s, bagit_profile %s) [cmdline: "%s"]' % (
|
|
118
|
-
VERSION,
|
|
124
|
+
VERSION, # TODO
|
|
119
125
|
dist_version('ocrd-fork-bagit'),
|
|
120
126
|
dist_version('ocrd-fork-bagit_profile'),
|
|
121
127
|
' '.join(sys.argv))
|
|
@@ -139,7 +145,7 @@ class WorkspaceBagger():
|
|
|
139
145
|
tag_files=None,
|
|
140
146
|
include_fileGrp=None,
|
|
141
147
|
exclude_fileGrp=None,
|
|
142
|
-
|
|
148
|
+
):
|
|
143
149
|
"""
|
|
144
150
|
Bag a workspace
|
|
145
151
|
|
|
@@ -178,7 +184,8 @@ class WorkspaceBagger():
|
|
|
178
184
|
f.write(BAGIT_TXT.encode('utf-8'))
|
|
179
185
|
|
|
180
186
|
# create manifests
|
|
181
|
-
total_bytes, total_files = self._bag_mets_files(workspace, bagdir, ocrd_mets, processes,
|
|
187
|
+
total_bytes, total_files = self._bag_mets_files(workspace, bagdir, ocrd_mets, processes,
|
|
188
|
+
include_fileGrp, exclude_fileGrp)
|
|
182
189
|
|
|
183
190
|
# create bag-info.txt
|
|
184
191
|
bag = Bag(bagdir)
|