ocrd 3.0.0a2__py3-none-any.whl → 3.0.0b2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ocrd/cli/__init__.py +34 -26
- ocrd/cli/bashlib.py +32 -18
- ocrd/cli/ocrd_tool.py +7 -5
- ocrd/cli/workspace.py +10 -8
- ocrd/decorators/__init__.py +13 -7
- ocrd/decorators/ocrd_cli_options.py +1 -1
- ocrd/lib.bash +3 -0
- ocrd/mets_server.py +3 -4
- ocrd/processor/__init__.py +1 -1
- ocrd/processor/base.py +421 -98
- ocrd/processor/builtin/dummy_processor.py +4 -11
- ocrd/processor/helpers.py +24 -161
- ocrd/processor/ocrd_page_result.py +3 -3
- ocrd/resolver.py +0 -3
- ocrd/resource_manager.py +9 -5
- ocrd/workspace.py +10 -11
- ocrd/workspace_backup.py +1 -1
- {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/METADATA +32 -10
- {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/RECORD +49 -48
- {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/WHEEL +1 -1
- ocrd_modelfactory/__init__.py +1 -1
- ocrd_models/constants.py +0 -1
- ocrd_models/ocrd_exif.py +2 -2
- ocrd_models/ocrd_file.py +2 -2
- ocrd_models/ocrd_mets.py +22 -22
- ocrd_models/ocrd_page.py +0 -1
- ocrd_models/ocrd_xml_base.py +2 -2
- ocrd_network/cli/client.py +134 -30
- ocrd_network/client.py +53 -27
- ocrd_network/client_utils.py +101 -0
- ocrd_network/processing_server.py +1 -1
- ocrd_network/runtime_data/deployer.py +12 -3
- ocrd_network/server_utils.py +12 -10
- ocrd_utils/__init__.py +2 -0
- ocrd_utils/config.py +31 -2
- ocrd_utils/image.py +25 -25
- ocrd_utils/logging.py +20 -20
- ocrd_utils/os.py +4 -5
- ocrd_utils/str.py +10 -3
- ocrd_validators/json_validator.py +1 -3
- ocrd_validators/ocrd_tool_validator.py +2 -2
- ocrd_validators/page_validator.py +56 -56
- ocrd_validators/parameter_validator.py +2 -2
- ocrd_validators/resource_list_validator.py +4 -3
- ocrd_validators/workspace_validator.py +21 -21
- ocrd_validators/xsd_validator.py +1 -1
- {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/LICENSE +0 -0
- {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/entry_points.txt +0 -0
- {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b2.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# pylint: disable=missing-module-docstring,invalid-name
|
|
2
|
-
from os.path import join
|
|
3
|
-
from typing import Optional
|
|
2
|
+
from os.path import join
|
|
3
|
+
from typing import Optional
|
|
4
4
|
|
|
5
5
|
import click
|
|
6
6
|
|
|
@@ -10,7 +10,6 @@ from ocrd.processor.ocrd_page_result import OcrdPageResult
|
|
|
10
10
|
from ocrd_models.ocrd_file import OcrdFileType
|
|
11
11
|
from ocrd_models.ocrd_page import OcrdPage, to_xml
|
|
12
12
|
from ocrd_utils import (
|
|
13
|
-
getLogger,
|
|
14
13
|
make_file_id,
|
|
15
14
|
MIME_TO_EXT,
|
|
16
15
|
MIMETYPE_PAGE,
|
|
@@ -20,8 +19,6 @@ from ocrd_utils import (
|
|
|
20
19
|
)
|
|
21
20
|
from ocrd_modelfactory import page_from_file
|
|
22
21
|
|
|
23
|
-
OCRD_TOOL = parse_json_string_with_comments(resource_string(__package__ + '.dummy', 'ocrd-tool.json'))
|
|
24
|
-
|
|
25
22
|
class DummyProcessor(Processor):
|
|
26
23
|
"""
|
|
27
24
|
Bare-bones processor creates PAGE-XML and optionally copies file from input group to output group
|
|
@@ -76,17 +73,13 @@ class DummyProcessor(Processor):
|
|
|
76
73
|
super().process_page_file(input_file)
|
|
77
74
|
|
|
78
75
|
@property
|
|
79
|
-
def
|
|
80
|
-
return
|
|
76
|
+
def metadata_filename(self):
|
|
77
|
+
return 'processor/builtin/dummy/ocrd-tool.json'
|
|
81
78
|
|
|
82
79
|
@property
|
|
83
80
|
def executable(self):
|
|
84
81
|
return 'ocrd-dummy'
|
|
85
82
|
|
|
86
|
-
@property
|
|
87
|
-
def version(self):
|
|
88
|
-
return '0.0.3'
|
|
89
|
-
|
|
90
83
|
@click.command()
|
|
91
84
|
@ocrd_cli_options
|
|
92
85
|
def cli(*args, **kwargs):
|
ocrd/processor/helpers.py
CHANGED
|
@@ -1,21 +1,18 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Helper methods for running and documenting processors
|
|
3
3
|
"""
|
|
4
|
-
from os import chdir, getcwd
|
|
5
4
|
from time import perf_counter, process_time
|
|
6
5
|
from functools import lru_cache
|
|
7
6
|
import json
|
|
8
7
|
import inspect
|
|
9
8
|
from subprocess import run
|
|
10
|
-
from typing import List
|
|
9
|
+
from typing import List, Optional
|
|
11
10
|
|
|
12
|
-
from
|
|
13
|
-
from ocrd.workspace import Workspace
|
|
11
|
+
from ..workspace import Workspace
|
|
14
12
|
from ocrd_utils import freeze_args, getLogger, config, setOverrideLogLevel, getLevelName, sparkline
|
|
15
13
|
|
|
16
14
|
|
|
17
15
|
__all__ = [
|
|
18
|
-
'generate_processor_help',
|
|
19
16
|
'run_cli',
|
|
20
17
|
'run_processor'
|
|
21
18
|
]
|
|
@@ -39,10 +36,7 @@ def run_processor(
|
|
|
39
36
|
log_level=None,
|
|
40
37
|
input_file_grp=None,
|
|
41
38
|
output_file_grp=None,
|
|
42
|
-
show_resource=None,
|
|
43
|
-
list_resources=False,
|
|
44
39
|
parameter=None,
|
|
45
|
-
parameter_override=None,
|
|
46
40
|
working_dir=None,
|
|
47
41
|
mets_server_url=None,
|
|
48
42
|
instance_caching=False
|
|
@@ -84,7 +78,7 @@ def run_processor(
|
|
|
84
78
|
log.debug("Running processor %s", processorClass)
|
|
85
79
|
|
|
86
80
|
processor = get_processor(
|
|
87
|
-
|
|
81
|
+
processorClass,
|
|
88
82
|
parameter=parameter,
|
|
89
83
|
workspace=None,
|
|
90
84
|
page_id=page_id,
|
|
@@ -102,7 +96,7 @@ def run_processor(
|
|
|
102
96
|
t0_cpu = process_time()
|
|
103
97
|
if any(x in config.OCRD_PROFILE for x in ['RSS', 'PSS']):
|
|
104
98
|
backend = 'psutil_pss' if 'PSS' in config.OCRD_PROFILE else 'psutil'
|
|
105
|
-
from memory_profiler import memory_usage
|
|
99
|
+
from memory_profiler import memory_usage # pylint: disable=import-outside-toplevel
|
|
106
100
|
try:
|
|
107
101
|
mem_usage = memory_usage(proc=(processor.process_workspace, [workspace], {}),
|
|
108
102
|
# only run process once
|
|
@@ -212,156 +206,15 @@ def run_cli(
|
|
|
212
206
|
if not log_filename:
|
|
213
207
|
result = run(args, check=False)
|
|
214
208
|
else:
|
|
215
|
-
with open(log_filename, 'a') as file_desc:
|
|
209
|
+
with open(log_filename, 'a', encoding='utf-8') as file_desc:
|
|
216
210
|
result = run(args, check=False, stdout=file_desc, stderr=file_desc)
|
|
217
211
|
return result.returncode
|
|
218
212
|
|
|
219
213
|
|
|
220
|
-
def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None):
|
|
221
|
-
"""Generate a string describing the full CLI of this processor including params.
|
|
222
|
-
|
|
223
|
-
Args:
|
|
224
|
-
ocrd_tool (dict): this processor's ``tools`` section of the module's ``ocrd-tool.json``
|
|
225
|
-
processor_instance (object, optional): the processor implementation
|
|
226
|
-
(for adding any module/class/function docstrings)
|
|
227
|
-
subcommand (string): 'worker' or 'server'
|
|
228
|
-
"""
|
|
229
|
-
doc_help = ''
|
|
230
|
-
if processor_instance:
|
|
231
|
-
module = inspect.getmodule(processor_instance)
|
|
232
|
-
if module and module.__doc__:
|
|
233
|
-
doc_help += '\n' + inspect.cleandoc(module.__doc__) + '\n'
|
|
234
|
-
if processor_instance.__doc__:
|
|
235
|
-
doc_help += '\n' + inspect.cleandoc(processor_instance.__doc__) + '\n'
|
|
236
|
-
if processor_instance.process_workspace.__doc__:
|
|
237
|
-
doc_help += '\n' + inspect.cleandoc(processor_instance.process_workspace.__doc__) + '\n'
|
|
238
|
-
if processor_instance.process.__doc__:
|
|
239
|
-
doc_help += '\n' + inspect.cleandoc(processor_instance.process.__doc__) + '\n'
|
|
240
|
-
if doc_help:
|
|
241
|
-
doc_help = '\n\n' + wrap_text(doc_help, width=72,
|
|
242
|
-
initial_indent=' > ',
|
|
243
|
-
subsequent_indent=' > ',
|
|
244
|
-
preserve_paragraphs=True)
|
|
245
|
-
subcommands = '''\
|
|
246
|
-
worker Start a processing worker rather than do local processing
|
|
247
|
-
server Start a processor server rather than do local processing
|
|
248
|
-
'''
|
|
249
|
-
|
|
250
|
-
processing_worker_options = '''\
|
|
251
|
-
--queue The RabbitMQ server address in format
|
|
252
|
-
"amqp://{user}:{pass}@{host}:{port}/{vhost}"
|
|
253
|
-
[amqp://admin:admin@localhost:5672]
|
|
254
|
-
--database The MongoDB server address in format
|
|
255
|
-
"mongodb://{host}:{port}"
|
|
256
|
-
[mongodb://localhost:27018]
|
|
257
|
-
--log-filename Filename to redirect STDOUT/STDERR to,
|
|
258
|
-
if specified.
|
|
259
|
-
'''
|
|
260
|
-
|
|
261
|
-
processing_server_options = '''\
|
|
262
|
-
--address The Processor server address in format
|
|
263
|
-
"{host}:{port}"
|
|
264
|
-
--database The MongoDB server address in format
|
|
265
|
-
"mongodb://{host}:{port}"
|
|
266
|
-
[mongodb://localhost:27018]
|
|
267
|
-
'''
|
|
268
|
-
|
|
269
|
-
processing_options = '''\
|
|
270
|
-
-m, --mets URL-PATH URL or file path of METS to process [./mets.xml]
|
|
271
|
-
-w, --working-dir PATH Working directory of local workspace [dirname(URL-PATH)]
|
|
272
|
-
-I, --input-file-grp USE File group(s) used as input
|
|
273
|
-
-O, --output-file-grp USE File group(s) used as output
|
|
274
|
-
-g, --page-id ID Physical page ID(s) to process instead of full document []
|
|
275
|
-
--overwrite Remove existing output pages/images
|
|
276
|
-
(with "--page-id", remove only those).
|
|
277
|
-
Short-hand for OCRD_EXISTING_OUTPUT=OVERWRITE
|
|
278
|
-
--debug Abort on any errors with full stack trace.
|
|
279
|
-
Short-hand for OCRD_MISSING_OUTPUT=ABORT
|
|
280
|
-
--profile Enable profiling
|
|
281
|
-
--profile-file PROF-PATH Write cProfile stats to PROF-PATH. Implies "--profile"
|
|
282
|
-
-p, --parameter JSON-PATH Parameters, either verbatim JSON string
|
|
283
|
-
or JSON file path
|
|
284
|
-
-P, --param-override KEY VAL Override a single JSON object key-value pair,
|
|
285
|
-
taking precedence over --parameter
|
|
286
|
-
-U, --mets-server-url URL URL of a METS Server for parallel incremental access to METS
|
|
287
|
-
If URL starts with http:// start an HTTP server there,
|
|
288
|
-
otherwise URL is a path to an on-demand-created unix socket
|
|
289
|
-
-l, --log-level [OFF|ERROR|WARN|INFO|DEBUG|TRACE]
|
|
290
|
-
Override log level globally [INFO]
|
|
291
|
-
'''
|
|
292
|
-
|
|
293
|
-
information_options = '''\
|
|
294
|
-
-C, --show-resource RESNAME Dump the content of processor resource RESNAME
|
|
295
|
-
-L, --list-resources List names of processor resources
|
|
296
|
-
-J, --dump-json Dump tool description as JSON
|
|
297
|
-
-D, --dump-module-dir Show the 'module' resource location path for this processor
|
|
298
|
-
-h, --help Show this message
|
|
299
|
-
-V, --version Show version
|
|
300
|
-
'''
|
|
301
|
-
|
|
302
|
-
parameter_help = ''
|
|
303
|
-
if 'parameters' not in ocrd_tool or not ocrd_tool['parameters']:
|
|
304
|
-
parameter_help = ' NONE\n'
|
|
305
|
-
else:
|
|
306
|
-
def wrap(s):
|
|
307
|
-
return wrap_text(s, initial_indent=' '*3,
|
|
308
|
-
subsequent_indent=' '*4,
|
|
309
|
-
width=72, preserve_paragraphs=True)
|
|
310
|
-
for param_name, param in ocrd_tool['parameters'].items():
|
|
311
|
-
parameter_help += wrap('"%s" [%s%s]' % (
|
|
312
|
-
param_name,
|
|
313
|
-
param['type'],
|
|
314
|
-
' - REQUIRED' if 'required' in param and param['required'] else
|
|
315
|
-
' - %s' % json.dumps(param['default']) if 'default' in param else ''))
|
|
316
|
-
parameter_help += '\n ' + wrap(param['description'])
|
|
317
|
-
if 'enum' in param:
|
|
318
|
-
parameter_help += '\n ' + wrap('Possible values: %s' % json.dumps(param['enum']))
|
|
319
|
-
parameter_help += "\n"
|
|
320
|
-
|
|
321
|
-
if not subcommand:
|
|
322
|
-
return f'''\
|
|
323
|
-
Usage: {ocrd_tool['executable']} [worker|server] [OPTIONS]
|
|
324
|
-
|
|
325
|
-
{ocrd_tool['description']}{doc_help}
|
|
326
|
-
|
|
327
|
-
Subcommands:
|
|
328
|
-
{subcommands}
|
|
329
|
-
Options for processing:
|
|
330
|
-
{processing_options}
|
|
331
|
-
Options for information:
|
|
332
|
-
{information_options}
|
|
333
|
-
Parameters:
|
|
334
|
-
{parameter_help}
|
|
335
|
-
'''
|
|
336
|
-
elif subcommand == 'worker':
|
|
337
|
-
return f'''\
|
|
338
|
-
Usage: {ocrd_tool['executable']} worker [OPTIONS]
|
|
339
|
-
|
|
340
|
-
Run {ocrd_tool['executable']} as a processing worker.
|
|
341
|
-
|
|
342
|
-
{ocrd_tool['description']}{doc_help}
|
|
343
|
-
|
|
344
|
-
Options:
|
|
345
|
-
{processing_worker_options}
|
|
346
|
-
'''
|
|
347
|
-
elif subcommand == 'server':
|
|
348
|
-
return f'''\
|
|
349
|
-
Usage: {ocrd_tool['executable']} server [OPTIONS]
|
|
350
|
-
|
|
351
|
-
Run {ocrd_tool['executable']} as a processor sever.
|
|
352
214
|
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
{processing_server_options}
|
|
357
|
-
'''
|
|
358
|
-
else:
|
|
359
|
-
pass
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
# Taken from https://github.com/OCR-D/core/pull/884
|
|
363
|
-
@freeze_args
|
|
364
|
-
@lru_cache(maxsize=config.OCRD_MAX_PROCESSOR_CACHE)
|
|
215
|
+
# not decorated here but at runtime (on first use)
|
|
216
|
+
#@freeze_args
|
|
217
|
+
#@lru_cache(maxsize=config.OCRD_MAX_PROCESSOR_CACHE)
|
|
365
218
|
def get_cached_processor(parameter: dict, processor_class):
|
|
366
219
|
"""
|
|
367
220
|
Call this function to get back an instance of a processor.
|
|
@@ -374,16 +227,13 @@ def get_cached_processor(parameter: dict, processor_class):
|
|
|
374
227
|
Otherwise, an instance of :py:class:`~ocrd.Processor` is returned.
|
|
375
228
|
"""
|
|
376
229
|
if processor_class:
|
|
377
|
-
|
|
378
|
-
processor = processor_class(None, parameter=dict_params)
|
|
379
|
-
processor.setup()
|
|
230
|
+
processor = processor_class(None, parameter=dict(parameter))
|
|
380
231
|
return processor
|
|
381
232
|
return None
|
|
382
233
|
|
|
383
|
-
|
|
384
234
|
def get_processor(
|
|
385
235
|
processor_class,
|
|
386
|
-
parameter: dict,
|
|
236
|
+
parameter: Optional[dict] = None,
|
|
387
237
|
workspace: Workspace = None,
|
|
388
238
|
page_id: str = None,
|
|
389
239
|
input_file_grp: List[str] = None,
|
|
@@ -391,11 +241,24 @@ def get_processor(
|
|
|
391
241
|
instance_caching: bool = False,
|
|
392
242
|
):
|
|
393
243
|
if processor_class:
|
|
244
|
+
if parameter is None:
|
|
245
|
+
parameter = {}
|
|
394
246
|
if instance_caching:
|
|
247
|
+
global get_cached_processor
|
|
248
|
+
if not hasattr(get_cached_processor, '__wrapped__'):
|
|
249
|
+
# first call: wrap
|
|
250
|
+
if processor_class.max_instances < 0:
|
|
251
|
+
maxsize = config.OCRD_MAX_PROCESSOR_CACHE
|
|
252
|
+
else:
|
|
253
|
+
maxsize = min(config.OCRD_MAX_PROCESSOR_CACHE, processor_class.max_instances)
|
|
254
|
+
# wrapping in call cache
|
|
255
|
+
# wrapping dict into frozendict (from https://github.com/OCR-D/core/pull/884)
|
|
256
|
+
get_cached_processor = freeze_args(lru_cache(maxsize=maxsize)(get_cached_processor))
|
|
395
257
|
processor = get_cached_processor(parameter, processor_class)
|
|
396
258
|
else:
|
|
259
|
+
# avoid passing workspace already (deprecated chdir behaviour)
|
|
397
260
|
processor = processor_class(None, parameter=parameter)
|
|
398
|
-
|
|
261
|
+
# set current processing parameters
|
|
399
262
|
processor.workspace = workspace
|
|
400
263
|
processor.page_id = page_id
|
|
401
264
|
processor.input_file_grp = input_file_grp
|
|
@@ -1,15 +1,15 @@
|
|
|
1
1
|
from dataclasses import dataclass, field
|
|
2
|
-
from typing import List
|
|
2
|
+
from typing import List, Union
|
|
3
3
|
from ocrd_models.ocrd_page import OcrdPage
|
|
4
4
|
from PIL.Image import Image
|
|
5
5
|
|
|
6
|
-
from ocrd_models.ocrd_page_generateds import AlternativeImageType
|
|
6
|
+
from ocrd_models.ocrd_page_generateds import AlternativeImageType, PageType
|
|
7
7
|
|
|
8
8
|
@dataclass
|
|
9
9
|
class OcrdPageResultImage():
|
|
10
10
|
pil : Image
|
|
11
11
|
file_id_suffix : str
|
|
12
|
-
alternative_image : AlternativeImageType
|
|
12
|
+
alternative_image : Union[AlternativeImageType, PageType]
|
|
13
13
|
|
|
14
14
|
@dataclass
|
|
15
15
|
class OcrdPageResult():
|
ocrd/resolver.py
CHANGED
|
@@ -18,7 +18,6 @@ from ocrd_utils import (
|
|
|
18
18
|
)
|
|
19
19
|
from ocrd.workspace import Workspace
|
|
20
20
|
from ocrd_models import OcrdMets
|
|
21
|
-
from ocrd_models.constants import NAMESPACES as NS
|
|
22
21
|
from ocrd_models.utils import handle_oai_response
|
|
23
22
|
|
|
24
23
|
class Resolver():
|
|
@@ -310,5 +309,3 @@ class Resolver():
|
|
|
310
309
|
raise ValueError("--mets '%s' has a directory part inconsistent with --directory '%s'" % (mets_url, directory))
|
|
311
310
|
|
|
312
311
|
return str(Path(directory).resolve()), str(mets_url), str(mets_basename), mets_server_url
|
|
313
|
-
|
|
314
|
-
|
ocrd/resource_manager.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
from pathlib import Path
|
|
2
2
|
from os.path import join
|
|
3
|
-
from os import environ, listdir,
|
|
3
|
+
from os import environ, listdir, getcwd, unlink
|
|
4
4
|
from shutil import copytree, rmtree, copy
|
|
5
5
|
from fnmatch import filter as apply_glob
|
|
6
6
|
from datetime import datetime
|
|
@@ -13,14 +13,18 @@ from gdown.parse_url import parse_url as gparse_url
|
|
|
13
13
|
from gdown.download import get_url_from_gdrive_confirmation
|
|
14
14
|
from yaml import safe_load, safe_dump
|
|
15
15
|
|
|
16
|
+
# pylint: disable=wrong-import-position
|
|
17
|
+
|
|
16
18
|
# https://github.com/OCR-D/core/issues/867
|
|
17
19
|
# https://stackoverflow.com/questions/50900727/skip-converting-entities-while-loading-a-yaml-string-using-pyyaml
|
|
18
20
|
import yaml.constructor
|
|
19
|
-
yaml.constructor.SafeConstructor.yaml_constructors[
|
|
20
|
-
yaml.constructor.SafeConstructor.yaml_constructors[
|
|
21
|
+
yaml.constructor.SafeConstructor.yaml_constructors['tag:yaml.org,2002:timestamp'] = \
|
|
22
|
+
yaml.constructor.SafeConstructor.yaml_constructors['tag:yaml.org,2002:str']
|
|
23
|
+
|
|
24
|
+
# pylint: enable=wrong-import-position
|
|
21
25
|
|
|
22
26
|
from ocrd_validators import OcrdResourceListValidator
|
|
23
|
-
from ocrd_utils import getLogger, directory_size, get_moduledir,
|
|
27
|
+
from ocrd_utils import getLogger, directory_size, get_moduledir, guess_media_type, config
|
|
24
28
|
from ocrd_utils.os import get_processor_resource_types, list_all_resources, pushd_popd, get_ocrd_tool_json
|
|
25
29
|
from .constants import RESOURCE_LIST_FILENAME, RESOURCE_USER_LIST_COMMENT
|
|
26
30
|
|
|
@@ -248,7 +252,7 @@ class OcrdResourceManager:
|
|
|
248
252
|
if "Content-Disposition" not in r.headers:
|
|
249
253
|
url = get_url_from_gdrive_confirmation(r.text)
|
|
250
254
|
except RuntimeError as e:
|
|
251
|
-
log.warning("Cannot unwrap Google Drive URL: ", e)
|
|
255
|
+
log.warning("Cannot unwrap Google Drive URL: %s", e)
|
|
252
256
|
with open(filename, 'wb') as f:
|
|
253
257
|
with requests.get(url, stream=True) as r:
|
|
254
258
|
r.raise_for_status()
|
ocrd/workspace.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import io
|
|
2
2
|
from os import makedirs, unlink, listdir, path
|
|
3
3
|
from pathlib import Path
|
|
4
|
-
from shutil import
|
|
4
|
+
from shutil import copyfileobj
|
|
5
5
|
from re import sub
|
|
6
6
|
from tempfile import NamedTemporaryFile
|
|
7
7
|
from contextlib import contextmanager
|
|
@@ -43,7 +43,6 @@ from ocrd_utils import (
|
|
|
43
43
|
MIME_TO_PIL,
|
|
44
44
|
MIMETYPE_PAGE,
|
|
45
45
|
REGEX_PREFIX,
|
|
46
|
-
config
|
|
47
46
|
)
|
|
48
47
|
|
|
49
48
|
from .workspace_backup import WorkspaceBackupManager
|
|
@@ -96,8 +95,8 @@ class Workspace():
|
|
|
96
95
|
if self.is_remote:
|
|
97
96
|
mets = ClientSideOcrdMets(mets_server_url, self.directory)
|
|
98
97
|
if mets.workspace_path != self.directory:
|
|
99
|
-
raise ValueError(f"METS server {mets_server_url} workspace directory {mets.workspace_path} differs "
|
|
100
|
-
f"from local workspace directory {self.directory}. These are not the same workspaces.")
|
|
98
|
+
raise ValueError(f"METS server {mets_server_url} workspace directory '{mets.workspace_path}' differs "
|
|
99
|
+
f"from local workspace directory '{self.directory}'. These are not the same workspaces.")
|
|
101
100
|
else:
|
|
102
101
|
mets = OcrdMets(filename=self.mets_target)
|
|
103
102
|
self.mets = mets
|
|
@@ -111,7 +110,7 @@ class Workspace():
|
|
|
111
110
|
|
|
112
111
|
def __repr__(self):
|
|
113
112
|
return 'Workspace[remote=%s, directory=%s, baseurl=%s, file_groups=%s, files=%s]' % (
|
|
114
|
-
|
|
113
|
+
self.is_remote,
|
|
115
114
|
self.directory,
|
|
116
115
|
self.baseurl,
|
|
117
116
|
self.mets.file_groups,
|
|
@@ -648,7 +647,7 @@ class Workspace():
|
|
|
648
647
|
log = getLogger('ocrd.workspace.image_from_page')
|
|
649
648
|
page_image_info = self.resolve_image_exif(page.imageFilename)
|
|
650
649
|
page_image = self._resolve_image_as_pil(page.imageFilename)
|
|
651
|
-
page_coords =
|
|
650
|
+
page_coords = {}
|
|
652
651
|
# use identity as initial affine coordinate transform:
|
|
653
652
|
page_coords['transform'] = np.eye(3)
|
|
654
653
|
# interim bbox (updated with each change to the transform):
|
|
@@ -1091,7 +1090,7 @@ class Workspace():
|
|
|
1091
1090
|
The (absolute) path of the created file.
|
|
1092
1091
|
"""
|
|
1093
1092
|
log = getLogger('ocrd.workspace.save_image_file')
|
|
1094
|
-
saveargs =
|
|
1093
|
+
saveargs = {}
|
|
1095
1094
|
if 'dpi' in image.info:
|
|
1096
1095
|
saveargs['dpi'] = image.info['dpi']
|
|
1097
1096
|
image_bytes = io.BytesIO()
|
|
@@ -1168,9 +1167,9 @@ def _reflect(log, name, orientation, segment_image, segment_coords, segment_xywh
|
|
|
1168
1167
|
# Transpose in affine coordinate transform:
|
|
1169
1168
|
# (consistent with image transposition or AlternativeImage below)
|
|
1170
1169
|
transposition = {
|
|
1171
|
-
90: Image.ROTATE_90,
|
|
1172
|
-
180: Image.ROTATE_180,
|
|
1173
|
-
270: Image.ROTATE_270
|
|
1170
|
+
90: Image.Transpose.ROTATE_90,
|
|
1171
|
+
180: Image.Transpose.ROTATE_180,
|
|
1172
|
+
270: Image.Transpose.ROTATE_270
|
|
1174
1173
|
}.get(orientation) # no default
|
|
1175
1174
|
segment_coords['transform'] = transpose_coordinates(
|
|
1176
1175
|
segment_coords['transform'], transposition,
|
|
@@ -1238,5 +1237,5 @@ def _scale(log, name, factor, segment_image, segment_coords, segment_xywh, **kwa
|
|
|
1238
1237
|
segment_image = segment_image.resize((int(segment_image.width * factor),
|
|
1239
1238
|
int(segment_image.height * factor)),
|
|
1240
1239
|
# slowest, but highest quality:
|
|
1241
|
-
Image.BICUBIC)
|
|
1240
|
+
Image.Resampling.BICUBIC)
|
|
1242
1241
|
return segment_image, segment_coords, segment_xywh
|
ocrd/workspace_backup.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: ocrd
|
|
3
|
-
Version: 3.0.
|
|
3
|
+
Version: 3.0.0b2
|
|
4
4
|
Summary: OCR-D framework
|
|
5
5
|
Author-email: Konstantin Baierer <unixprog@gmail.com>
|
|
6
6
|
License: Apache License 2.0
|
|
@@ -94,17 +94,12 @@ complete stack of OCR-D-related software.
|
|
|
94
94
|
|
|
95
95
|
The easiest way to install is via `pip`:
|
|
96
96
|
|
|
97
|
-
|
|
98
|
-
pip install ocrd
|
|
97
|
+
pip install ocrd
|
|
99
98
|
|
|
100
|
-
# or just the functionality you need, e.g.
|
|
101
|
-
|
|
102
|
-
pip install ocrd_modelfactory
|
|
103
|
-
```
|
|
104
99
|
|
|
105
100
|
All Python software released by [OCR-D](https://github.com/OCR-D) requires Python 3.8 or higher.
|
|
106
101
|
|
|
107
|
-
**NOTE** Some OCR-D
|
|
102
|
+
> **NOTE** Some OCR-D tools (or even test cases) _might_ exhibit unintended behavior if your environment has specific modifications, such as:
|
|
108
103
|
* using a custom build of [ImageMagick](https://github.com/ImageMagick/ImageMagick), whose format delegates are different from what OCR-D supposes
|
|
109
104
|
* custom Python logging configurations in your personal account
|
|
110
105
|
|
|
@@ -129,7 +124,6 @@ Almost all behaviour of the OCR-D/core software is configured via CLI options an
|
|
|
129
124
|
|
|
130
125
|
Some parts of the software are configured via environment variables:
|
|
131
126
|
|
|
132
|
-
* `OCRD_METS_CACHING`: If set to `true`, access to the METS file is cached, speeding in-memory search and modification.
|
|
133
127
|
* `OCRD_PROFILE`: This variable configures the built-in CPU and memory profiling. If empty, no profiling is done. Otherwise expected to contain any of the following tokens:
|
|
134
128
|
* `CPU`: Enable CPU profiling of processor runs
|
|
135
129
|
* `RSS`: Enable RSS memory profiling
|
|
@@ -142,18 +136,46 @@ Some parts of the software are configured via environment variables:
|
|
|
142
136
|
* `XDG_CONFIG_HOME`: Directory to look for `./ocrd/resources.yml` (i.e. `ocrd resmgr` user database) – defaults to `$HOME/.config`.
|
|
143
137
|
* `XDG_DATA_HOME`: Directory to look for `./ocrd-resources/*` (i.e. `ocrd resmgr` data location) – defaults to `$HOME/.local/share`.
|
|
144
138
|
|
|
145
|
-
* `OCRD_DOWNLOAD_RETRIES`: Number of times to retry failed attempts for downloads of workspace files.
|
|
139
|
+
* `OCRD_DOWNLOAD_RETRIES`: Number of times to retry failed attempts for downloads of resources or workspace files.
|
|
146
140
|
* `OCRD_DOWNLOAD_TIMEOUT`: Timeout in seconds for connecting or reading (comma-separated) when downloading.
|
|
147
141
|
|
|
142
|
+
* `OCRD_MISSING_INPUT`: How to deal with missing input files (for some fileGrp/pageId) during processing:
|
|
143
|
+
* `SKIP`: ignore and proceed with next page's input
|
|
144
|
+
* `ABORT`: throw `MissingInputFile` exception
|
|
145
|
+
|
|
146
|
+
* `OCRD_MISSING_OUTPUT`: How to deal with missing output files (for some fileGrp/pageId) during processing:
|
|
147
|
+
* `SKIP`: ignore and proceed processing next page
|
|
148
|
+
* `COPY`: fall back to copying input PAGE to output fileGrp for page
|
|
149
|
+
* `ABORT`: re-throw whatever caused processing to fail
|
|
150
|
+
|
|
151
|
+
* `OCRD_MAX_MISSING_OUTPUTS`: Maximal rate of skipped/fallback pages among all processed pages before aborting (decimal fraction, ignored if negative).
|
|
152
|
+
|
|
153
|
+
* `OCRD_EXISTING_OUTPUT`: How to deal with already existing output files (for some fileGrp/pageId) during processing:
|
|
154
|
+
* `SKIP`: ignore and proceed processing next page
|
|
155
|
+
* `OVERWRITE`: force writing result to output fileGrp for page
|
|
156
|
+
* `ABORT`: re-throw `FileExistsError` exception
|
|
157
|
+
|
|
158
|
+
|
|
148
159
|
* `OCRD_METS_CACHING`: Whether to enable in-memory storage of OcrdMets data structures for speedup during processing or workspace operations.
|
|
149
160
|
|
|
150
161
|
* `OCRD_MAX_PROCESSOR_CACHE`: Maximum number of processor instances (for each set of parameters) to be kept in memory (including loaded models) for processing workers or processor servers.
|
|
151
162
|
|
|
163
|
+
* `OCRD_MAX_PARALLEL_PAGES`: Maximum number of processor threads for page-parallel processing (within each Processor's selected page range, independent of the number of Processing Workers or Processor Servers). If set `>1`, then a METS Server must be used for METS synchronisation.
|
|
164
|
+
|
|
165
|
+
* `OCRD_PROCESSING_PAGE_TIMEOUT`: Timeout in seconds for processing a single page. If set to a value `>0` and the timeout is exceeded, the page is handled the same way as configured by `OCRD_MISSING_OUTPUT`.
|
|
166
|
+
|
|
152
167
|
* `OCRD_NETWORK_SERVER_ADDR_PROCESSING`: Default address of Processing Server to connect to (for `ocrd network client processing`).
|
|
153
168
|
* `OCRD_NETWORK_SERVER_ADDR_WORKFLOW`: Default address of Workflow Server to connect to (for `ocrd network client workflow`).
|
|
154
169
|
* `OCRD_NETWORK_SERVER_ADDR_WORKSPACE`: Default address of Workspace Server to connect to (for `ocrd network client workspace`).
|
|
155
170
|
* `OCRD_NETWORK_RABBITMQ_CLIENT_CONNECT_ATTEMPTS`: Number of attempts for a worker to create its queue. Helpful if the rabbitmq-server needs time to be fully started.
|
|
156
171
|
|
|
172
|
+
* `OCRD_NETWORK_CLIENT_POLLING_SLEEP`: How many seconds to sleep before trying `ocrd network client` again.
|
|
173
|
+
* `OCRD_NETWORK_CLIENT_POLLING_TIMEOUT`: Timeout for a blocking `ocrd network client` (in seconds).
|
|
174
|
+
|
|
175
|
+
* `OCRD_NETWORK_SOCKETS_ROOT_DIR`: The root directory where all mets server related socket files are created.
|
|
176
|
+
* `OCRD_NETWORK_LOGS_ROOT_DIR`: The root directory where all ocrd_network related file logs are stored.
|
|
177
|
+
|
|
178
|
+
|
|
157
179
|
|
|
158
180
|
## Packages
|
|
159
181
|
|