ocrd 3.0.0a2__py3-none-any.whl → 3.0.0b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ocrd/cli/__init__.py +34 -26
- ocrd/cli/bashlib.py +32 -18
- ocrd/cli/ocrd_tool.py +7 -5
- ocrd/cli/workspace.py +10 -8
- ocrd/decorators/__init__.py +13 -7
- ocrd/lib.bash +2 -0
- ocrd/mets_server.py +2 -3
- ocrd/processor/base.py +163 -63
- ocrd/processor/builtin/dummy_processor.py +4 -11
- ocrd/processor/helpers.py +23 -17
- ocrd/processor/ocrd_page_result.py +3 -3
- ocrd/resolver.py +0 -3
- ocrd/resource_manager.py +9 -5
- ocrd/workspace.py +8 -9
- ocrd/workspace_backup.py +1 -1
- {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b1.dist-info}/METADATA +1 -1
- {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b1.dist-info}/RECORD +47 -46
- ocrd_modelfactory/__init__.py +1 -1
- ocrd_models/constants.py +0 -1
- ocrd_models/ocrd_exif.py +2 -2
- ocrd_models/ocrd_file.py +2 -2
- ocrd_models/ocrd_mets.py +22 -22
- ocrd_models/ocrd_page.py +0 -1
- ocrd_models/ocrd_xml_base.py +2 -2
- ocrd_network/cli/client.py +134 -30
- ocrd_network/client.py +53 -27
- ocrd_network/client_utils.py +101 -0
- ocrd_network/processing_server.py +1 -1
- ocrd_network/runtime_data/deployer.py +12 -3
- ocrd_network/server_utils.py +12 -10
- ocrd_utils/__init__.py +2 -0
- ocrd_utils/config.py +16 -2
- ocrd_utils/image.py +25 -25
- ocrd_utils/logging.py +17 -19
- ocrd_utils/os.py +4 -5
- ocrd_utils/str.py +10 -3
- ocrd_validators/json_validator.py +1 -3
- ocrd_validators/ocrd_tool_validator.py +2 -2
- ocrd_validators/page_validator.py +56 -56
- ocrd_validators/parameter_validator.py +2 -2
- ocrd_validators/resource_list_validator.py +4 -3
- ocrd_validators/workspace_validator.py +21 -21
- ocrd_validators/xsd_validator.py +1 -1
- {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b1.dist-info}/LICENSE +0 -0
- {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b1.dist-info}/WHEEL +0 -0
- {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b1.dist-info}/entry_points.txt +0 -0
- {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b1.dist-info}/top_level.txt +0 -0
ocrd/processor/base.py
CHANGED
|
@@ -9,6 +9,7 @@ __all__ = [
|
|
|
9
9
|
'run_processor'
|
|
10
10
|
]
|
|
11
11
|
|
|
12
|
+
from functools import cached_property
|
|
12
13
|
from os.path import exists, join
|
|
13
14
|
from shutil import copyfileobj
|
|
14
15
|
import json
|
|
@@ -20,7 +21,8 @@ import sys
|
|
|
20
21
|
import inspect
|
|
21
22
|
import tarfile
|
|
22
23
|
import io
|
|
23
|
-
|
|
24
|
+
import weakref
|
|
25
|
+
from frozendict import frozendict
|
|
24
26
|
from deprecated import deprecated
|
|
25
27
|
from requests import HTTPError
|
|
26
28
|
|
|
@@ -33,18 +35,25 @@ from ocrd_utils import (
|
|
|
33
35
|
MIME_TO_EXT,
|
|
34
36
|
config,
|
|
35
37
|
getLogger,
|
|
36
|
-
initLogging,
|
|
37
38
|
list_resource_candidates,
|
|
38
39
|
pushd_popd,
|
|
39
40
|
list_all_resources,
|
|
40
41
|
get_processor_resource_types,
|
|
41
42
|
resource_filename,
|
|
42
|
-
|
|
43
|
+
parse_json_file_with_comments,
|
|
43
44
|
make_file_id,
|
|
44
45
|
deprecation_warning
|
|
45
46
|
)
|
|
46
47
|
from ocrd_validators import ParameterValidator
|
|
47
|
-
from ocrd_models.ocrd_page import
|
|
48
|
+
from ocrd_models.ocrd_page import (
|
|
49
|
+
PageType,
|
|
50
|
+
AlternativeImageType,
|
|
51
|
+
MetadataItemType,
|
|
52
|
+
LabelType,
|
|
53
|
+
LabelsType,
|
|
54
|
+
OcrdPage,
|
|
55
|
+
to_xml,
|
|
56
|
+
)
|
|
48
57
|
from ocrd_modelfactory import page_from_file
|
|
49
58
|
from ocrd_validators.ocrd_tool_validator import OcrdToolValidator
|
|
50
59
|
|
|
@@ -94,54 +103,139 @@ class MissingInputFile(ValueError):
|
|
|
94
103
|
|
|
95
104
|
class Processor():
|
|
96
105
|
"""
|
|
97
|
-
A processor is a tool that implements the uniform OCR-D
|
|
98
|
-
for run-time data processing.
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
106
|
+
A processor is a tool that implements the uniform OCR-D
|
|
107
|
+
`command-line interface for run-time data processing <https://ocr-d.de/en/spec/cli>`_.
|
|
108
|
+
|
|
109
|
+
That is, it executes a single workflow step, or a combination of workflow steps,
|
|
110
|
+
on the workspace (represented by local METS). It reads input files for all or selected
|
|
111
|
+
physical pages of the input fileGrp(s), computes additional annotation, and writes output
|
|
112
|
+
files for them into the output fileGrp(s). It may take a number of optional or mandatory
|
|
113
|
+
parameters.
|
|
114
|
+
"""
|
|
115
|
+
|
|
116
|
+
max_instances : int = -1
|
|
117
|
+
"""
|
|
118
|
+
maximum number of cached instances (ignored if negative), to be applied on top of
|
|
119
|
+
:py:data:`~ocrd_utils.config.OCRD_MAX_PROCESSOR_CACHE` (i.e. whatever is smaller).
|
|
120
|
+
|
|
121
|
+
(Override this if you know how many instances fit into memory at once.)
|
|
103
122
|
"""
|
|
104
123
|
|
|
105
124
|
@property
|
|
125
|
+
def metadata_filename(self) -> str:
|
|
126
|
+
"""
|
|
127
|
+
Relative location of the ``ocrd-tool.json`` file inside the package.
|
|
128
|
+
|
|
129
|
+
Used by :py:data:`metadata_location`.
|
|
130
|
+
|
|
131
|
+
(Override if ``ocrd-tool.json`` is not in the root of the module,
|
|
132
|
+
e.g. ``namespace/ocrd-tool.json`` or ``data/ocrd-tool.json``).
|
|
133
|
+
"""
|
|
134
|
+
return 'ocrd-tool.json'
|
|
135
|
+
|
|
136
|
+
@cached_property
|
|
137
|
+
def metadata_location(self) -> Path:
|
|
138
|
+
"""
|
|
139
|
+
Absolute path of the ``ocrd-tool.json`` file as distributed with the package.
|
|
140
|
+
|
|
141
|
+
Used by :py:data:`metadata_rawdict`.
|
|
142
|
+
|
|
143
|
+
(Override if ``ocrd-tool.json`` is not distributed with the Python package.)
|
|
144
|
+
"""
|
|
145
|
+
return resource_filename(self.__module__.split('.')[0], self.metadata_filename)
|
|
146
|
+
|
|
147
|
+
@cached_property
|
|
148
|
+
def metadata_rawdict(self) -> dict:
|
|
149
|
+
"""
|
|
150
|
+
Raw (unvalidated, unexpanded) ``ocrd-tool.json`` dict contents of the package.
|
|
151
|
+
|
|
152
|
+
Used by :py:data:`metadata`.
|
|
153
|
+
|
|
154
|
+
(Override if ``ocrd-tool.json`` is not in a file.)
|
|
155
|
+
"""
|
|
156
|
+
return parse_json_file_with_comments(self.metadata_location)
|
|
157
|
+
|
|
158
|
+
@cached_property
|
|
106
159
|
def metadata(self) -> dict:
|
|
107
|
-
"""
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
160
|
+
"""
|
|
161
|
+
The ``ocrd-tool.json`` dict contents of the package, according to the OCR-D
|
|
162
|
+
`spec <https://ocr-d.de/en/spec/ocrd_tool>`_ for processor tools.
|
|
163
|
+
|
|
164
|
+
After deserialisation, it also gets validated against the
|
|
165
|
+
`schema <https://ocr-d.de/en/spec/ocrd_tool#definition>`_ with all defaults
|
|
166
|
+
expanded.
|
|
167
|
+
|
|
168
|
+
Used by :py:data:`ocrd_tool` and :py:data:`version`.
|
|
169
|
+
|
|
170
|
+
(Override if you want to provide metadata programmatically instead of a
|
|
171
|
+
JSON file.)
|
|
172
|
+
"""
|
|
173
|
+
metadata = self.metadata_rawdict
|
|
174
|
+
report = OcrdToolValidator.validate(metadata)
|
|
112
175
|
if not report.is_valid:
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
return self._metadata
|
|
176
|
+
self.logger.error(f"The ocrd-tool.json of this processor is {'problematic' if not report.errors else 'invalid'}:\n"
|
|
177
|
+
f"{report.to_xml()}.\nPlease open an issue at {metadata.get('git_url', 'the website')}.")
|
|
178
|
+
return metadata
|
|
117
179
|
|
|
118
|
-
@
|
|
180
|
+
@cached_property
|
|
119
181
|
def version(self) -> str:
|
|
120
|
-
"""
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
self._version = self.metadata['version']
|
|
124
|
-
return self._version
|
|
182
|
+
"""
|
|
183
|
+
The program version of the package.
|
|
184
|
+
Usually the ``version`` part of :py:data:`metadata`.
|
|
125
185
|
|
|
126
|
-
|
|
186
|
+
(Override if you do not want to use :py:data:`metadata` lookup
|
|
187
|
+
mechanism.)
|
|
188
|
+
"""
|
|
189
|
+
return self.metadata['version']
|
|
190
|
+
|
|
191
|
+
@cached_property
|
|
127
192
|
def executable(self) -> str:
|
|
128
|
-
"""
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
self._executable = os.path.basename(inspect.stack()[-1].filename)
|
|
132
|
-
return self._executable
|
|
193
|
+
"""
|
|
194
|
+
The executable name of this processor tool. Taken from the runtime
|
|
195
|
+
filename.
|
|
133
196
|
|
|
134
|
-
|
|
197
|
+
Used by :py:data:`ocrd_tool` for lookup in :py:data:`metadata`.
|
|
198
|
+
|
|
199
|
+
(Override if your entry-point name deviates from the ``executable``
|
|
200
|
+
name, or the processor gets instantiated from another runtime.)
|
|
201
|
+
"""
|
|
202
|
+
return os.path.basename(inspect.stack()[-1].filename)
|
|
203
|
+
|
|
204
|
+
@cached_property
|
|
135
205
|
def ocrd_tool(self) -> dict:
|
|
136
|
-
"""
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
206
|
+
"""
|
|
207
|
+
The ``ocrd-tool.json`` dict contents of this processor tool.
|
|
208
|
+
Usually the :py:data:`executable` key of the ``tools`` part
|
|
209
|
+
of :py:data:`metadata`.
|
|
210
|
+
|
|
211
|
+
(Override if you do not want to use :py:data:`metadata` lookup
|
|
212
|
+
mechanism.)
|
|
213
|
+
"""
|
|
214
|
+
return self.metadata['tools'][self.executable]
|
|
215
|
+
|
|
216
|
+
@property
|
|
217
|
+
def parameter(self) -> Optional[dict]:
|
|
218
|
+
"""the runtime parameter dict to be used by this processor"""
|
|
219
|
+
if hasattr(self, '_parameter'):
|
|
220
|
+
return self._parameter
|
|
221
|
+
return None
|
|
222
|
+
|
|
223
|
+
@parameter.setter
|
|
224
|
+
def parameter(self, parameter : dict) -> None:
|
|
225
|
+
if self.parameter is not None:
|
|
226
|
+
self.shutdown()
|
|
227
|
+
parameterValidator = ParameterValidator(self.ocrd_tool)
|
|
228
|
+
report = parameterValidator.validate(parameter)
|
|
229
|
+
if not report.is_valid:
|
|
230
|
+
raise ValueError(f'Invalid parameters:\n{report.to_xml()}')
|
|
231
|
+
# make parameter dict read-only
|
|
232
|
+
self._parameter = frozendict(parameter)
|
|
233
|
+
# (re-)run setup to load models etc
|
|
234
|
+
self.setup()
|
|
141
235
|
|
|
142
236
|
def __init__(
|
|
143
237
|
self,
|
|
144
|
-
# FIXME:
|
|
238
|
+
# FIXME: remove in favor of process_workspace(workspace)
|
|
145
239
|
workspace : Optional[Workspace],
|
|
146
240
|
ocrd_tool=None,
|
|
147
241
|
parameter=None,
|
|
@@ -204,19 +298,14 @@ class Processor():
|
|
|
204
298
|
"is deprecated - pass as argument to process_workspace instead")
|
|
205
299
|
self.page_id = page_id or None
|
|
206
300
|
self.download = download_files
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
parameterValidator = ParameterValidator(self.ocrd_tool)
|
|
210
|
-
|
|
211
|
-
report = parameterValidator.validate(parameter)
|
|
212
|
-
if not report.is_valid:
|
|
213
|
-
raise ValueError("Invalid parameters %s" % report.errors)
|
|
214
|
-
self.parameter = parameter
|
|
215
|
-
# NOTE: this is the logger to be used by processor implementations,
|
|
216
|
-
# `processor.base` default implementations should use
|
|
217
|
-
# :py:attr:`self._base_logger`
|
|
301
|
+
#: The logger to be used by processor implementations.
|
|
302
|
+
# `ocrd.processor.base` internals should use :py:attr:`self._base_logger`
|
|
218
303
|
self.logger = getLogger(f'ocrd.processor.{self.__class__.__name__}')
|
|
219
304
|
self._base_logger = getLogger('ocrd.processor.base')
|
|
305
|
+
if parameter is not None:
|
|
306
|
+
self.parameter = parameter
|
|
307
|
+
# ensure that shutdown gets called at destruction
|
|
308
|
+
self._finalizer = weakref.finalize(self, self.shutdown)
|
|
220
309
|
# workaround for deprecated#72 (@deprecated decorator does not work for subclasses):
|
|
221
310
|
setattr(self, 'process',
|
|
222
311
|
deprecated(version='3.0', reason='process() should be replaced with process_page() and process_workspace()')(getattr(self, 'process')))
|
|
@@ -254,14 +343,10 @@ class Processor():
|
|
|
254
343
|
assert len(grps) >= minimum, msg % (len(grps), str(spec))
|
|
255
344
|
if maximum > 0:
|
|
256
345
|
assert len(grps) <= maximum, msg % (len(grps), str(spec))
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
"Unexpected number of input file groups %d vs %s")
|
|
262
|
-
if 'output_file_grp_cardinality' in self.ocrd_tool:
|
|
263
|
-
assert_file_grp_cardinality(output_file_grps, self.ocrd_tool['output_file_grp_cardinality'],
|
|
264
|
-
"Unexpected number of output file groups %d vs %s")
|
|
346
|
+
assert_file_grp_cardinality(input_file_grps, self.ocrd_tool['input_file_grp_cardinality'],
|
|
347
|
+
"Unexpected number of input file groups %d vs %s")
|
|
348
|
+
assert_file_grp_cardinality(output_file_grps, self.ocrd_tool['output_file_grp_cardinality'],
|
|
349
|
+
"Unexpected number of output file groups %d vs %s")
|
|
265
350
|
for input_file_grp in input_file_grps:
|
|
266
351
|
assert input_file_grp in self.workspace.mets.file_groups
|
|
267
352
|
# keep this for backwards compatibility:
|
|
@@ -272,14 +357,12 @@ class Processor():
|
|
|
272
357
|
Print :py:attr:`ocrd_tool` on stdout.
|
|
273
358
|
"""
|
|
274
359
|
print(json.dumps(self.ocrd_tool, indent=True))
|
|
275
|
-
return
|
|
276
360
|
|
|
277
361
|
def dump_module_dir(self):
|
|
278
362
|
"""
|
|
279
363
|
Print :py:attr:`moduledir` on stdout.
|
|
280
364
|
"""
|
|
281
365
|
print(self.moduledir)
|
|
282
|
-
return
|
|
283
366
|
|
|
284
367
|
def list_resources(self):
|
|
285
368
|
"""
|
|
@@ -287,7 +370,6 @@ class Processor():
|
|
|
287
370
|
"""
|
|
288
371
|
for res in self.list_all_resources():
|
|
289
372
|
print(res)
|
|
290
|
-
return
|
|
291
373
|
|
|
292
374
|
def setup(self) -> None:
|
|
293
375
|
"""
|
|
@@ -299,6 +381,16 @@ class Processor():
|
|
|
299
381
|
"""
|
|
300
382
|
pass
|
|
301
383
|
|
|
384
|
+
def shutdown(self) -> None:
|
|
385
|
+
"""
|
|
386
|
+
Bring down the processor after data processing,
|
|
387
|
+
after to changing back from the workspace directory but
|
|
388
|
+
before exiting (or setting up with different parameters).
|
|
389
|
+
|
|
390
|
+
(Override this to unload models from memory etc.)
|
|
391
|
+
"""
|
|
392
|
+
pass
|
|
393
|
+
|
|
302
394
|
@deprecated(version='3.0', reason='process() should be replaced with process_page_pcgts() or process_page_file() or process_workspace()')
|
|
303
395
|
def process(self) -> None:
|
|
304
396
|
"""
|
|
@@ -439,7 +531,15 @@ class Processor():
|
|
|
439
531
|
for image_result in result.images:
|
|
440
532
|
image_file_id = f'{output_file_id}_{image_result.file_id_suffix}'
|
|
441
533
|
image_file_path = join(self.output_file_grp, f'{image_file_id}.png')
|
|
442
|
-
image_result.alternative_image
|
|
534
|
+
if isinstance(image_result.alternative_image, PageType):
|
|
535
|
+
image_result.alternative_image.set_imageFilename(image_file_path)
|
|
536
|
+
image_result.alternative_image.set_imageWidth(image_result.pil.width)
|
|
537
|
+
image_result.alternative_image.set_imageHeight(image_result.pil.height)
|
|
538
|
+
elif isinstance(image_result.alternative_image, AlternativeImageType):
|
|
539
|
+
image_result.alternative_image.set_filename(image_file_path)
|
|
540
|
+
else:
|
|
541
|
+
raise ValueError(f"process_page_pcgts returned an OcrdPageResultImage of unknown type "
|
|
542
|
+
f"{type(image_result.alternative_image)}")
|
|
443
543
|
self.workspace.save_image_file(
|
|
444
544
|
image_result.pil,
|
|
445
545
|
image_file_id,
|
|
@@ -668,7 +768,7 @@ class Processor():
|
|
|
668
768
|
# can actually be much more costly than traversing the ltree.
|
|
669
769
|
# This might depend on the number of pages vs number of fileGrps.
|
|
670
770
|
|
|
671
|
-
pages =
|
|
771
|
+
pages = {}
|
|
672
772
|
for i, ifg in enumerate(ifgs):
|
|
673
773
|
files_ = sorted(self.workspace.mets.find_all_files(
|
|
674
774
|
pageId=self.page_id, fileGrp=ifg, mimetype=mimetype),
|
|
@@ -723,7 +823,7 @@ class Processor():
|
|
|
723
823
|
if self.page_id and not any(pages):
|
|
724
824
|
self._base_logger.critical(f"Could not find any files for selected pageId {self.page_id}.\n"
|
|
725
825
|
f"compare '{self.page_id}' with the output of 'orcd workspace list-page'.")
|
|
726
|
-
ifts =
|
|
826
|
+
ifts = []
|
|
727
827
|
for page, ifiles in pages.items():
|
|
728
828
|
for i, ifg in enumerate(ifgs):
|
|
729
829
|
if not ifiles[i]:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# pylint: disable=missing-module-docstring,invalid-name
|
|
2
|
-
from os.path import join
|
|
3
|
-
from typing import Optional
|
|
2
|
+
from os.path import join
|
|
3
|
+
from typing import Optional
|
|
4
4
|
|
|
5
5
|
import click
|
|
6
6
|
|
|
@@ -10,7 +10,6 @@ from ocrd.processor.ocrd_page_result import OcrdPageResult
|
|
|
10
10
|
from ocrd_models.ocrd_file import OcrdFileType
|
|
11
11
|
from ocrd_models.ocrd_page import OcrdPage, to_xml
|
|
12
12
|
from ocrd_utils import (
|
|
13
|
-
getLogger,
|
|
14
13
|
make_file_id,
|
|
15
14
|
MIME_TO_EXT,
|
|
16
15
|
MIMETYPE_PAGE,
|
|
@@ -20,8 +19,6 @@ from ocrd_utils import (
|
|
|
20
19
|
)
|
|
21
20
|
from ocrd_modelfactory import page_from_file
|
|
22
21
|
|
|
23
|
-
OCRD_TOOL = parse_json_string_with_comments(resource_string(__package__ + '.dummy', 'ocrd-tool.json'))
|
|
24
|
-
|
|
25
22
|
class DummyProcessor(Processor):
|
|
26
23
|
"""
|
|
27
24
|
Bare-bones processor creates PAGE-XML and optionally copies file from input group to output group
|
|
@@ -76,17 +73,13 @@ class DummyProcessor(Processor):
|
|
|
76
73
|
super().process_page_file(input_file)
|
|
77
74
|
|
|
78
75
|
@property
|
|
79
|
-
def
|
|
80
|
-
return
|
|
76
|
+
def metadata_filename(self):
|
|
77
|
+
return 'processor/builtin/dummy/ocrd-tool.json'
|
|
81
78
|
|
|
82
79
|
@property
|
|
83
80
|
def executable(self):
|
|
84
81
|
return 'ocrd-dummy'
|
|
85
82
|
|
|
86
|
-
@property
|
|
87
|
-
def version(self):
|
|
88
|
-
return '0.0.3'
|
|
89
|
-
|
|
90
83
|
@click.command()
|
|
91
84
|
@ocrd_cli_options
|
|
92
85
|
def cli(*args, **kwargs):
|
ocrd/processor/helpers.py
CHANGED
|
@@ -1,13 +1,12 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Helper methods for running and documenting processors
|
|
3
3
|
"""
|
|
4
|
-
from os import chdir, getcwd
|
|
5
4
|
from time import perf_counter, process_time
|
|
6
5
|
from functools import lru_cache
|
|
7
6
|
import json
|
|
8
7
|
import inspect
|
|
9
8
|
from subprocess import run
|
|
10
|
-
from typing import List
|
|
9
|
+
from typing import List, Optional
|
|
11
10
|
|
|
12
11
|
from click import wrap_text
|
|
13
12
|
from ocrd.workspace import Workspace
|
|
@@ -39,10 +38,7 @@ def run_processor(
|
|
|
39
38
|
log_level=None,
|
|
40
39
|
input_file_grp=None,
|
|
41
40
|
output_file_grp=None,
|
|
42
|
-
show_resource=None,
|
|
43
|
-
list_resources=False,
|
|
44
41
|
parameter=None,
|
|
45
|
-
parameter_override=None,
|
|
46
42
|
working_dir=None,
|
|
47
43
|
mets_server_url=None,
|
|
48
44
|
instance_caching=False
|
|
@@ -84,7 +80,7 @@ def run_processor(
|
|
|
84
80
|
log.debug("Running processor %s", processorClass)
|
|
85
81
|
|
|
86
82
|
processor = get_processor(
|
|
87
|
-
|
|
83
|
+
processorClass,
|
|
88
84
|
parameter=parameter,
|
|
89
85
|
workspace=None,
|
|
90
86
|
page_id=page_id,
|
|
@@ -102,7 +98,7 @@ def run_processor(
|
|
|
102
98
|
t0_cpu = process_time()
|
|
103
99
|
if any(x in config.OCRD_PROFILE for x in ['RSS', 'PSS']):
|
|
104
100
|
backend = 'psutil_pss' if 'PSS' in config.OCRD_PROFILE else 'psutil'
|
|
105
|
-
from memory_profiler import memory_usage
|
|
101
|
+
from memory_profiler import memory_usage # pylint: disable=import-outside-toplevel
|
|
106
102
|
try:
|
|
107
103
|
mem_usage = memory_usage(proc=(processor.process_workspace, [workspace], {}),
|
|
108
104
|
# only run process once
|
|
@@ -212,7 +208,7 @@ def run_cli(
|
|
|
212
208
|
if not log_filename:
|
|
213
209
|
result = run(args, check=False)
|
|
214
210
|
else:
|
|
215
|
-
with open(log_filename, 'a') as file_desc:
|
|
211
|
+
with open(log_filename, 'a', encoding='utf-8') as file_desc:
|
|
216
212
|
result = run(args, check=False, stdout=file_desc, stderr=file_desc)
|
|
217
213
|
return result.returncode
|
|
218
214
|
|
|
@@ -359,9 +355,9 @@ Options:
|
|
|
359
355
|
pass
|
|
360
356
|
|
|
361
357
|
|
|
362
|
-
#
|
|
363
|
-
|
|
364
|
-
|
|
358
|
+
# not decorated here but at runtime (on first use)
|
|
359
|
+
#@freeze_args
|
|
360
|
+
#@lru_cache(maxsize=config.OCRD_MAX_PROCESSOR_CACHE)
|
|
365
361
|
def get_cached_processor(parameter: dict, processor_class):
|
|
366
362
|
"""
|
|
367
363
|
Call this function to get back an instance of a processor.
|
|
@@ -374,16 +370,13 @@ def get_cached_processor(parameter: dict, processor_class):
|
|
|
374
370
|
Otherwise, an instance of the `:py:class:~ocrd.Processor` is returned.
|
|
375
371
|
"""
|
|
376
372
|
if processor_class:
|
|
377
|
-
|
|
378
|
-
processor = processor_class(None, parameter=dict_params)
|
|
379
|
-
processor.setup()
|
|
373
|
+
processor = processor_class(None, parameter=dict(parameter))
|
|
380
374
|
return processor
|
|
381
375
|
return None
|
|
382
376
|
|
|
383
|
-
|
|
384
377
|
def get_processor(
|
|
385
378
|
processor_class,
|
|
386
|
-
parameter: dict,
|
|
379
|
+
parameter: Optional[dict] = None,
|
|
387
380
|
workspace: Workspace = None,
|
|
388
381
|
page_id: str = None,
|
|
389
382
|
input_file_grp: List[str] = None,
|
|
@@ -391,11 +384,24 @@ def get_processor(
|
|
|
391
384
|
instance_caching: bool = False,
|
|
392
385
|
):
|
|
393
386
|
if processor_class:
|
|
387
|
+
if parameter is None:
|
|
388
|
+
parameter = {}
|
|
394
389
|
if instance_caching:
|
|
390
|
+
global get_cached_processor
|
|
391
|
+
if not hasattr(get_cached_processor, '__wrapped__'):
|
|
392
|
+
# first call: wrap
|
|
393
|
+
if processor_class.max_instances < 0:
|
|
394
|
+
maxsize = config.OCRD_MAX_PROCESSOR_CACHE
|
|
395
|
+
else:
|
|
396
|
+
maxsize = min(config.OCRD_MAX_PROCESSOR_CACHE, processor_class.max_instances)
|
|
397
|
+
# wrapping in call cache
|
|
398
|
+
# wrapping dict into frozendict (from https://github.com/OCR-D/core/pull/884)
|
|
399
|
+
get_cached_processor = freeze_args(lru_cache(maxsize=maxsize)(get_cached_processor))
|
|
395
400
|
processor = get_cached_processor(parameter, processor_class)
|
|
396
401
|
else:
|
|
402
|
+
# avoid passing workspace already (deprecated chdir behaviour)
|
|
397
403
|
processor = processor_class(None, parameter=parameter)
|
|
398
|
-
|
|
404
|
+
# set current processing parameters
|
|
399
405
|
processor.workspace = workspace
|
|
400
406
|
processor.page_id = page_id
|
|
401
407
|
processor.input_file_grp = input_file_grp
|
|
@@ -1,15 +1,15 @@
|
|
|
1
1
|
from dataclasses import dataclass, field
|
|
2
|
-
from typing import List
|
|
2
|
+
from typing import List, Union
|
|
3
3
|
from ocrd_models.ocrd_page import OcrdPage
|
|
4
4
|
from PIL.Image import Image
|
|
5
5
|
|
|
6
|
-
from ocrd_models.ocrd_page_generateds import AlternativeImageType
|
|
6
|
+
from ocrd_models.ocrd_page_generateds import AlternativeImageType, PageType
|
|
7
7
|
|
|
8
8
|
@dataclass
|
|
9
9
|
class OcrdPageResultImage():
|
|
10
10
|
pil : Image
|
|
11
11
|
file_id_suffix : str
|
|
12
|
-
alternative_image : AlternativeImageType
|
|
12
|
+
alternative_image : Union[AlternativeImageType, PageType]
|
|
13
13
|
|
|
14
14
|
@dataclass
|
|
15
15
|
class OcrdPageResult():
|
ocrd/resolver.py
CHANGED
|
@@ -18,7 +18,6 @@ from ocrd_utils import (
|
|
|
18
18
|
)
|
|
19
19
|
from ocrd.workspace import Workspace
|
|
20
20
|
from ocrd_models import OcrdMets
|
|
21
|
-
from ocrd_models.constants import NAMESPACES as NS
|
|
22
21
|
from ocrd_models.utils import handle_oai_response
|
|
23
22
|
|
|
24
23
|
class Resolver():
|
|
@@ -310,5 +309,3 @@ class Resolver():
|
|
|
310
309
|
raise ValueError("--mets '%s' has a directory part inconsistent with --directory '%s'" % (mets_url, directory))
|
|
311
310
|
|
|
312
311
|
return str(Path(directory).resolve()), str(mets_url), str(mets_basename), mets_server_url
|
|
313
|
-
|
|
314
|
-
|
ocrd/resource_manager.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
from pathlib import Path
|
|
2
2
|
from os.path import join
|
|
3
|
-
from os import environ, listdir,
|
|
3
|
+
from os import environ, listdir, getcwd, unlink
|
|
4
4
|
from shutil import copytree, rmtree, copy
|
|
5
5
|
from fnmatch import filter as apply_glob
|
|
6
6
|
from datetime import datetime
|
|
@@ -13,14 +13,18 @@ from gdown.parse_url import parse_url as gparse_url
|
|
|
13
13
|
from gdown.download import get_url_from_gdrive_confirmation
|
|
14
14
|
from yaml import safe_load, safe_dump
|
|
15
15
|
|
|
16
|
+
# pylint: disable=wrong-import-position
|
|
17
|
+
|
|
16
18
|
# https://github.com/OCR-D/core/issues/867
|
|
17
19
|
# https://stackoverflow.com/questions/50900727/skip-converting-entities-while-loading-a-yaml-string-using-pyyaml
|
|
18
20
|
import yaml.constructor
|
|
19
|
-
yaml.constructor.SafeConstructor.yaml_constructors[
|
|
20
|
-
yaml.constructor.SafeConstructor.yaml_constructors[
|
|
21
|
+
yaml.constructor.SafeConstructor.yaml_constructors['tag:yaml.org,2002:timestamp'] = \
|
|
22
|
+
yaml.constructor.SafeConstructor.yaml_constructors['tag:yaml.org,2002:str']
|
|
23
|
+
|
|
24
|
+
# pylint: enable=wrong-import-position
|
|
21
25
|
|
|
22
26
|
from ocrd_validators import OcrdResourceListValidator
|
|
23
|
-
from ocrd_utils import getLogger, directory_size, get_moduledir,
|
|
27
|
+
from ocrd_utils import getLogger, directory_size, get_moduledir, guess_media_type, config
|
|
24
28
|
from ocrd_utils.os import get_processor_resource_types, list_all_resources, pushd_popd, get_ocrd_tool_json
|
|
25
29
|
from .constants import RESOURCE_LIST_FILENAME, RESOURCE_USER_LIST_COMMENT
|
|
26
30
|
|
|
@@ -248,7 +252,7 @@ class OcrdResourceManager:
|
|
|
248
252
|
if "Content-Disposition" not in r.headers:
|
|
249
253
|
url = get_url_from_gdrive_confirmation(r.text)
|
|
250
254
|
except RuntimeError as e:
|
|
251
|
-
log.warning("Cannot unwrap Google Drive URL: ", e)
|
|
255
|
+
log.warning("Cannot unwrap Google Drive URL: %s", e)
|
|
252
256
|
with open(filename, 'wb') as f:
|
|
253
257
|
with requests.get(url, stream=True) as r:
|
|
254
258
|
r.raise_for_status()
|
ocrd/workspace.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import io
|
|
2
2
|
from os import makedirs, unlink, listdir, path
|
|
3
3
|
from pathlib import Path
|
|
4
|
-
from shutil import
|
|
4
|
+
from shutil import copyfileobj
|
|
5
5
|
from re import sub
|
|
6
6
|
from tempfile import NamedTemporaryFile
|
|
7
7
|
from contextlib import contextmanager
|
|
@@ -43,7 +43,6 @@ from ocrd_utils import (
|
|
|
43
43
|
MIME_TO_PIL,
|
|
44
44
|
MIMETYPE_PAGE,
|
|
45
45
|
REGEX_PREFIX,
|
|
46
|
-
config
|
|
47
46
|
)
|
|
48
47
|
|
|
49
48
|
from .workspace_backup import WorkspaceBackupManager
|
|
@@ -111,7 +110,7 @@ class Workspace():
|
|
|
111
110
|
|
|
112
111
|
def __repr__(self):
|
|
113
112
|
return 'Workspace[remote=%s, directory=%s, baseurl=%s, file_groups=%s, files=%s]' % (
|
|
114
|
-
|
|
113
|
+
self.is_remote,
|
|
115
114
|
self.directory,
|
|
116
115
|
self.baseurl,
|
|
117
116
|
self.mets.file_groups,
|
|
@@ -648,7 +647,7 @@ class Workspace():
|
|
|
648
647
|
log = getLogger('ocrd.workspace.image_from_page')
|
|
649
648
|
page_image_info = self.resolve_image_exif(page.imageFilename)
|
|
650
649
|
page_image = self._resolve_image_as_pil(page.imageFilename)
|
|
651
|
-
page_coords =
|
|
650
|
+
page_coords = {}
|
|
652
651
|
# use identity as initial affine coordinate transform:
|
|
653
652
|
page_coords['transform'] = np.eye(3)
|
|
654
653
|
# interim bbox (updated with each change to the transform):
|
|
@@ -1091,7 +1090,7 @@ class Workspace():
|
|
|
1091
1090
|
The (absolute) path of the created file.
|
|
1092
1091
|
"""
|
|
1093
1092
|
log = getLogger('ocrd.workspace.save_image_file')
|
|
1094
|
-
saveargs =
|
|
1093
|
+
saveargs = {}
|
|
1095
1094
|
if 'dpi' in image.info:
|
|
1096
1095
|
saveargs['dpi'] = image.info['dpi']
|
|
1097
1096
|
image_bytes = io.BytesIO()
|
|
@@ -1168,9 +1167,9 @@ def _reflect(log, name, orientation, segment_image, segment_coords, segment_xywh
|
|
|
1168
1167
|
# Transpose in affine coordinate transform:
|
|
1169
1168
|
# (consistent with image transposition or AlternativeImage below)
|
|
1170
1169
|
transposition = {
|
|
1171
|
-
90: Image.ROTATE_90,
|
|
1172
|
-
180: Image.ROTATE_180,
|
|
1173
|
-
270: Image.ROTATE_270
|
|
1170
|
+
90: Image.Transpose.ROTATE_90,
|
|
1171
|
+
180: Image.Transpose.ROTATE_180,
|
|
1172
|
+
270: Image.Transpose.ROTATE_270
|
|
1174
1173
|
}.get(orientation) # no default
|
|
1175
1174
|
segment_coords['transform'] = transpose_coordinates(
|
|
1176
1175
|
segment_coords['transform'], transposition,
|
|
@@ -1238,5 +1237,5 @@ def _scale(log, name, factor, segment_image, segment_coords, segment_xywh, **kwa
|
|
|
1238
1237
|
segment_image = segment_image.resize((int(segment_image.width * factor),
|
|
1239
1238
|
int(segment_image.height * factor)),
|
|
1240
1239
|
# slowest, but highest quality:
|
|
1241
|
-
Image.BICUBIC)
|
|
1240
|
+
Image.Resampling.BICUBIC)
|
|
1242
1241
|
return segment_image, segment_coords, segment_xywh
|
ocrd/workspace_backup.py
CHANGED