ocrd 3.0.0a2__py3-none-any.whl → 3.0.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. ocrd/cli/__init__.py +34 -26
  2. ocrd/cli/bashlib.py +32 -18
  3. ocrd/cli/ocrd_tool.py +7 -5
  4. ocrd/cli/workspace.py +10 -8
  5. ocrd/decorators/__init__.py +13 -7
  6. ocrd/lib.bash +2 -0
  7. ocrd/mets_server.py +2 -3
  8. ocrd/processor/base.py +163 -63
  9. ocrd/processor/builtin/dummy_processor.py +4 -11
  10. ocrd/processor/helpers.py +23 -17
  11. ocrd/processor/ocrd_page_result.py +3 -3
  12. ocrd/resolver.py +0 -3
  13. ocrd/resource_manager.py +9 -5
  14. ocrd/workspace.py +8 -9
  15. ocrd/workspace_backup.py +1 -1
  16. {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b1.dist-info}/METADATA +1 -1
  17. {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b1.dist-info}/RECORD +47 -46
  18. ocrd_modelfactory/__init__.py +1 -1
  19. ocrd_models/constants.py +0 -1
  20. ocrd_models/ocrd_exif.py +2 -2
  21. ocrd_models/ocrd_file.py +2 -2
  22. ocrd_models/ocrd_mets.py +22 -22
  23. ocrd_models/ocrd_page.py +0 -1
  24. ocrd_models/ocrd_xml_base.py +2 -2
  25. ocrd_network/cli/client.py +134 -30
  26. ocrd_network/client.py +53 -27
  27. ocrd_network/client_utils.py +101 -0
  28. ocrd_network/processing_server.py +1 -1
  29. ocrd_network/runtime_data/deployer.py +12 -3
  30. ocrd_network/server_utils.py +12 -10
  31. ocrd_utils/__init__.py +2 -0
  32. ocrd_utils/config.py +16 -2
  33. ocrd_utils/image.py +25 -25
  34. ocrd_utils/logging.py +17 -19
  35. ocrd_utils/os.py +4 -5
  36. ocrd_utils/str.py +10 -3
  37. ocrd_validators/json_validator.py +1 -3
  38. ocrd_validators/ocrd_tool_validator.py +2 -2
  39. ocrd_validators/page_validator.py +56 -56
  40. ocrd_validators/parameter_validator.py +2 -2
  41. ocrd_validators/resource_list_validator.py +4 -3
  42. ocrd_validators/workspace_validator.py +21 -21
  43. ocrd_validators/xsd_validator.py +1 -1
  44. {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b1.dist-info}/LICENSE +0 -0
  45. {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b1.dist-info}/WHEEL +0 -0
  46. {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b1.dist-info}/entry_points.txt +0 -0
  47. {ocrd-3.0.0a2.dist-info → ocrd-3.0.0b1.dist-info}/top_level.txt +0 -0
ocrd/processor/base.py CHANGED
@@ -9,6 +9,7 @@ __all__ = [
9
9
  'run_processor'
10
10
  ]
11
11
 
12
+ from functools import cached_property
12
13
  from os.path import exists, join
13
14
  from shutil import copyfileobj
14
15
  import json
@@ -20,7 +21,8 @@ import sys
20
21
  import inspect
21
22
  import tarfile
22
23
  import io
23
- from warnings import warn
24
+ import weakref
25
+ from frozendict import frozendict
24
26
  from deprecated import deprecated
25
27
  from requests import HTTPError
26
28
 
@@ -33,18 +35,25 @@ from ocrd_utils import (
33
35
  MIME_TO_EXT,
34
36
  config,
35
37
  getLogger,
36
- initLogging,
37
38
  list_resource_candidates,
38
39
  pushd_popd,
39
40
  list_all_resources,
40
41
  get_processor_resource_types,
41
42
  resource_filename,
42
- resource_string,
43
+ parse_json_file_with_comments,
43
44
  make_file_id,
44
45
  deprecation_warning
45
46
  )
46
47
  from ocrd_validators import ParameterValidator
47
- from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType, OcrdPage, to_xml
48
+ from ocrd_models.ocrd_page import (
49
+ PageType,
50
+ AlternativeImageType,
51
+ MetadataItemType,
52
+ LabelType,
53
+ LabelsType,
54
+ OcrdPage,
55
+ to_xml,
56
+ )
48
57
  from ocrd_modelfactory import page_from_file
49
58
  from ocrd_validators.ocrd_tool_validator import OcrdToolValidator
50
59
 
@@ -94,54 +103,139 @@ class MissingInputFile(ValueError):
94
103
 
95
104
  class Processor():
96
105
  """
97
- A processor is a tool that implements the uniform OCR-D command-line interface
98
- for run-time data processing. That is, it executes a single workflow step,
99
- or a combination of workflow steps, on the workspace (represented by local METS).
100
- It reads input files for all or requested physical pages of the input fileGrp(s),
101
- and writes output files for them into the output fileGrp(s). It may take
102
- a number of optional or mandatory parameters.
106
+ A processor is a tool that implements the uniform OCR-D
107
+ `command-line interface for run-time data processing <https://ocr-d.de/en/spec/cli>`_.
108
+
109
+ That is, it executes a single workflow step, or a combination of workflow steps,
110
+ on the workspace (represented by local METS). It reads input files for all or selected
111
+ physical pages of the input fileGrp(s), computes additional annotation, and writes output
112
+ files for them into the output fileGrp(s). It may take a number of optional or mandatory
113
+ parameters.
114
+ """
115
+
116
+ max_instances : int = -1
117
+ """
118
+ maximum number of cached instances (ignored if negative), to be applied on top of
119
+ :py:data:`~ocrd_utils.config.OCRD_MAX_PROCESSOR_CACHE` (i.e. whatever is smaller).
120
+
121
+ (Override this if you know how many instances fit into memory at once.)
103
122
  """
104
123
 
105
124
  @property
125
+ def metadata_filename(self) -> str:
126
+ """
127
+ Relative location of the ``ocrd-tool.json`` file inside the package.
128
+
129
+ Used by :py:data:`metadata_location`.
130
+
131
+ (Override if ``ocrd-tool.json`` is not in the root of the module,
132
+ e.g. ``namespace/ocrd-tool.json`` or ``data/ocrd-tool.json``).
133
+ """
134
+ return 'ocrd-tool.json'
135
+
136
+ @cached_property
137
+ def metadata_location(self) -> Path:
138
+ """
139
+ Absolute path of the ``ocrd-tool.json`` file as distributed with the package.
140
+
141
+ Used by :py:data:`metadata_rawdict`.
142
+
143
+ (Override if ``ocrd-tool.json`` is not distributed with the Python package.)
144
+ """
145
+ return resource_filename(self.__module__.split('.')[0], self.metadata_filename)
146
+
147
+ @cached_property
148
+ def metadata_rawdict(self) -> dict:
149
+ """
150
+ Raw (unvalidated, unexpanded) ``ocrd-tool.json`` dict contents of the package.
151
+
152
+ Used by :py:data:`metadata`.
153
+
154
+ (Override if ``ocrd-tool.json`` is not in a file.)
155
+ """
156
+ return parse_json_file_with_comments(self.metadata_location)
157
+
158
+ @cached_property
106
159
  def metadata(self) -> dict:
107
- """the ocrd-tool.json dict of the package"""
108
- if hasattr(self, '_metadata'):
109
- return self._metadata
110
- self._metadata = json.loads(resource_string(self.__module__.split('.')[0], 'ocrd-tool.json'))
111
- report = OcrdToolValidator.validate(self._metadata)
160
+ """
161
+ The ``ocrd-tool.json`` dict contents of the package, according to the OCR-D
162
+ `spec <https://ocr-d.de/en/spec/ocrd_tool>`_ for processor tools.
163
+
164
+ After deserialisation, it also gets validated against the
165
+ `schema <https://ocr-d.de/en/spec/ocrd_tool#definition>`_ with all defaults
166
+ expanded.
167
+
168
+ Used by :py:data:`ocrd_tool` and :py:data:`version`.
169
+
170
+ (Override if you want to provide metadata programmatically instead of a
171
+ JSON file.)
172
+ """
173
+ metadata = self.metadata_rawdict
174
+ report = OcrdToolValidator.validate(metadata)
112
175
  if not report.is_valid:
113
- # FIXME: remove when bertsky/core#10 is merged
114
- self.logger = getLogger(f'ocrd.processor.{self.__class__.__name__}')
115
- self.logger.error(f"The ocrd-tool.json of this processor is {'problematic' if not report.errors else 'invalid'}:\n{report.to_xml()}.\nPlease open an issue at {self._metadata['git_url']}.")
116
- return self._metadata
176
+ self.logger.error(f"The ocrd-tool.json of this processor is {'problematic' if not report.errors else 'invalid'}:\n"
177
+ f"{report.to_xml()}.\nPlease open an issue at {metadata.get('git_url', 'the website')}.")
178
+ return metadata
117
179
 
118
- @property
180
+ @cached_property
119
181
  def version(self) -> str:
120
- """the version of the package"""
121
- if hasattr(self, '_version'):
122
- return self._version
123
- self._version = self.metadata['version']
124
- return self._version
182
+ """
183
+ The program version of the package.
184
+ Usually the ``version`` part of :py:data:`metadata`.
125
185
 
126
- @property
186
+ (Override if you do not want to use :py:data:`metadata` lookup
187
+ mechanism.)
188
+ """
189
+ return self.metadata['version']
190
+
191
+ @cached_property
127
192
  def executable(self) -> str:
128
- """the executable name of this processor tool"""
129
- if hasattr(self, '_executable'):
130
- return self._executable
131
- self._executable = os.path.basename(inspect.stack()[-1].filename)
132
- return self._executable
193
+ """
194
+ The executable name of this processor tool. Taken from the runtime
195
+ filename.
133
196
 
134
- @property
197
+ Used by :py:data:`ocrd_tool` for lookup in :py:data:`metadata`.
198
+
199
+ (Override if your entry-point name deviates from the ``executable``
200
+ name, or the processor gets instantiated from another runtime.)
201
+ """
202
+ return os.path.basename(inspect.stack()[-1].filename)
203
+
204
+ @cached_property
135
205
  def ocrd_tool(self) -> dict:
136
- """the ocrd-tool.json dict of this processor tool"""
137
- if hasattr(self, '_ocrd_tool'):
138
- return self._ocrd_tool
139
- self._ocrd_tool = self.metadata['tools'][self.executable]
140
- return self._ocrd_tool
206
+ """
207
+ The ``ocrd-tool.json`` dict contents of this processor tool.
208
+ Usually the :py:data:`executable` key of the ``tools`` part
209
+ of :py:data:`metadata`.
210
+
211
+ (Override if you do not want to use :py:data:`metadata` lookup
212
+ mechanism.)
213
+ """
214
+ return self.metadata['tools'][self.executable]
215
+
216
+ @property
217
+ def parameter(self) -> Optional[dict]:
218
+ """the runtime parameter dict to be used by this processor"""
219
+ if hasattr(self, '_parameter'):
220
+ return self._parameter
221
+ return None
222
+
223
+ @parameter.setter
224
+ def parameter(self, parameter : dict) -> None:
225
+ if self.parameter is not None:
226
+ self.shutdown()
227
+ parameterValidator = ParameterValidator(self.ocrd_tool)
228
+ report = parameterValidator.validate(parameter)
229
+ if not report.is_valid:
230
+ raise ValueError(f'Invalid parameters:\n{report.to_xml()}')
231
+ # make parameter dict read-only
232
+ self._parameter = frozendict(parameter)
233
+ # (re-)run setup to load models etc
234
+ self.setup()
141
235
 
142
236
  def __init__(
143
237
  self,
144
- # FIXME: deprecate in favor of process_workspace(workspace)
238
+ # FIXME: remove in favor of process_workspace(workspace)
145
239
  workspace : Optional[Workspace],
146
240
  ocrd_tool=None,
147
241
  parameter=None,
@@ -204,19 +298,14 @@ class Processor():
204
298
  "is deprecated - pass as argument to process_workspace instead")
205
299
  self.page_id = page_id or None
206
300
  self.download = download_files
207
- if parameter is None:
208
- parameter = {}
209
- parameterValidator = ParameterValidator(self.ocrd_tool)
210
-
211
- report = parameterValidator.validate(parameter)
212
- if not report.is_valid:
213
- raise ValueError("Invalid parameters %s" % report.errors)
214
- self.parameter = parameter
215
- # NOTE: this is the logger to be used by processor implementations,
216
- # `processor.base` default implementations should use
217
- # :py:attr:`self._base_logger`
301
+ #: The logger to be used by processor implementations.
302
+ # `ocrd.processor.base` internals should use :py:attr:`self._base_logger`
218
303
  self.logger = getLogger(f'ocrd.processor.{self.__class__.__name__}')
219
304
  self._base_logger = getLogger('ocrd.processor.base')
305
+ if parameter is not None:
306
+ self.parameter = parameter
307
+ # ensure that shutdown gets called at destruction
308
+ self._finalizer = weakref.finalize(self, self.shutdown)
220
309
  # workaround for deprecated#72 (@deprecated decorator does not work for subclasses):
221
310
  setattr(self, 'process',
222
311
  deprecated(version='3.0', reason='process() should be replaced with process_page() and process_workspace()')(getattr(self, 'process')))
@@ -254,14 +343,10 @@ class Processor():
254
343
  assert len(grps) >= minimum, msg % (len(grps), str(spec))
255
344
  if maximum > 0:
256
345
  assert len(grps) <= maximum, msg % (len(grps), str(spec))
257
- # FIXME: maybe we should enforce the cardinality properties to be specified or apply default=1 here
258
- # (but we already have ocrd-tool validation, and these first need to be adopted by implementors)
259
- if 'input_file_grp_cardinality' in self.ocrd_tool:
260
- assert_file_grp_cardinality(input_file_grps, self.ocrd_tool['input_file_grp_cardinality'],
261
- "Unexpected number of input file groups %d vs %s")
262
- if 'output_file_grp_cardinality' in self.ocrd_tool:
263
- assert_file_grp_cardinality(output_file_grps, self.ocrd_tool['output_file_grp_cardinality'],
264
- "Unexpected number of output file groups %d vs %s")
346
+ assert_file_grp_cardinality(input_file_grps, self.ocrd_tool['input_file_grp_cardinality'],
347
+ "Unexpected number of input file groups %d vs %s")
348
+ assert_file_grp_cardinality(output_file_grps, self.ocrd_tool['output_file_grp_cardinality'],
349
+ "Unexpected number of output file groups %d vs %s")
265
350
  for input_file_grp in input_file_grps:
266
351
  assert input_file_grp in self.workspace.mets.file_groups
267
352
  # keep this for backwards compatibility:
@@ -272,14 +357,12 @@ class Processor():
272
357
  Print :py:attr:`ocrd_tool` on stdout.
273
358
  """
274
359
  print(json.dumps(self.ocrd_tool, indent=True))
275
- return
276
360
 
277
361
  def dump_module_dir(self):
278
362
  """
279
363
  Print :py:attr:`moduledir` on stdout.
280
364
  """
281
365
  print(self.moduledir)
282
- return
283
366
 
284
367
  def list_resources(self):
285
368
  """
@@ -287,7 +370,6 @@ class Processor():
287
370
  """
288
371
  for res in self.list_all_resources():
289
372
  print(res)
290
- return
291
373
 
292
374
  def setup(self) -> None:
293
375
  """
@@ -299,6 +381,16 @@ class Processor():
299
381
  """
300
382
  pass
301
383
 
384
+ def shutdown(self) -> None:
385
+ """
386
+ Bring down the processor after data processing,
387
+ after changing back from the workspace directory but
388
+ before exiting (or setting up with different parameters).
389
+
390
+ (Override this to unload models from memory etc.)
391
+ """
392
+ pass
393
+
302
394
  @deprecated(version='3.0', reason='process() should be replaced with process_page_pcgts() or process_page_file() or process_workspace()')
303
395
  def process(self) -> None:
304
396
  """
@@ -439,7 +531,15 @@ class Processor():
439
531
  for image_result in result.images:
440
532
  image_file_id = f'{output_file_id}_{image_result.file_id_suffix}'
441
533
  image_file_path = join(self.output_file_grp, f'{image_file_id}.png')
442
- image_result.alternative_image.set_filename(image_file_path)
534
+ if isinstance(image_result.alternative_image, PageType):
535
+ image_result.alternative_image.set_imageFilename(image_file_path)
536
+ image_result.alternative_image.set_imageWidth(image_result.pil.width)
537
+ image_result.alternative_image.set_imageHeight(image_result.pil.height)
538
+ elif isinstance(image_result.alternative_image, AlternativeImageType):
539
+ image_result.alternative_image.set_filename(image_file_path)
540
+ else:
541
+ raise ValueError(f"process_page_pcgts returned an OcrdPageResultImage of unknown type "
542
+ f"{type(image_result.alternative_image)}")
443
543
  self.workspace.save_image_file(
444
544
  image_result.pil,
445
545
  image_file_id,
@@ -668,7 +768,7 @@ class Processor():
668
768
  # can actually be much more costly than traversing the ltree.
669
769
  # This might depend on the number of pages vs number of fileGrps.
670
770
 
671
- pages = dict()
771
+ pages = {}
672
772
  for i, ifg in enumerate(ifgs):
673
773
  files_ = sorted(self.workspace.mets.find_all_files(
674
774
  pageId=self.page_id, fileGrp=ifg, mimetype=mimetype),
@@ -723,7 +823,7 @@ class Processor():
723
823
  if self.page_id and not any(pages):
724
824
  self._base_logger.critical(f"Could not find any files for selected pageId {self.page_id}.\n"
725
825
  f"compare '{self.page_id}' with the output of 'orcd workspace list-page'.")
726
- ifts = list()
826
+ ifts = []
727
827
  for page, ifiles in pages.items():
728
828
  for i, ifg in enumerate(ifgs):
729
829
  if not ifiles[i]:
ocrd/processor/builtin/dummy_processor.py CHANGED
@@ -1,6 +1,6 @@
1
1
  # pylint: disable=missing-module-docstring,invalid-name
2
- from os.path import join, basename
3
- from typing import Optional, Union
2
+ from os.path import join
3
+ from typing import Optional
4
4
 
5
5
  import click
6
6
 
@@ -10,7 +10,6 @@ from ocrd.processor.ocrd_page_result import OcrdPageResult
10
10
  from ocrd_models.ocrd_file import OcrdFileType
11
11
  from ocrd_models.ocrd_page import OcrdPage, to_xml
12
12
  from ocrd_utils import (
13
- getLogger,
14
13
  make_file_id,
15
14
  MIME_TO_EXT,
16
15
  MIMETYPE_PAGE,
@@ -20,8 +19,6 @@ from ocrd_utils import (
20
19
  )
21
20
  from ocrd_modelfactory import page_from_file
22
21
 
23
- OCRD_TOOL = parse_json_string_with_comments(resource_string(__package__ + '.dummy', 'ocrd-tool.json'))
24
-
25
22
  class DummyProcessor(Processor):
26
23
  """
27
24
  Bare-bones processor creates PAGE-XML and optionally copies file from input group to output group
@@ -76,17 +73,13 @@ class DummyProcessor(Processor):
76
73
  super().process_page_file(input_file)
77
74
 
78
75
  @property
79
- def metadata(self):
80
- return OCRD_TOOL
76
+ def metadata_filename(self):
77
+ return 'processor/builtin/dummy/ocrd-tool.json'
81
78
 
82
79
  @property
83
80
  def executable(self):
84
81
  return 'ocrd-dummy'
85
82
 
86
- @property
87
- def version(self):
88
- return '0.0.3'
89
-
90
83
  @click.command()
91
84
  @ocrd_cli_options
92
85
  def cli(*args, **kwargs):
ocrd/processor/helpers.py CHANGED
@@ -1,13 +1,12 @@
1
1
  """
2
2
  Helper methods for running and documenting processors
3
3
  """
4
- from os import chdir, getcwd
5
4
  from time import perf_counter, process_time
6
5
  from functools import lru_cache
7
6
  import json
8
7
  import inspect
9
8
  from subprocess import run
10
- from typing import List
9
+ from typing import List, Optional
11
10
 
12
11
  from click import wrap_text
13
12
  from ocrd.workspace import Workspace
@@ -39,10 +38,7 @@ def run_processor(
39
38
  log_level=None,
40
39
  input_file_grp=None,
41
40
  output_file_grp=None,
42
- show_resource=None,
43
- list_resources=False,
44
41
  parameter=None,
45
- parameter_override=None,
46
42
  working_dir=None,
47
43
  mets_server_url=None,
48
44
  instance_caching=False
@@ -84,7 +80,7 @@ def run_processor(
84
80
  log.debug("Running processor %s", processorClass)
85
81
 
86
82
  processor = get_processor(
87
- processor_class=processorClass,
83
+ processorClass,
88
84
  parameter=parameter,
89
85
  workspace=None,
90
86
  page_id=page_id,
@@ -102,7 +98,7 @@ def run_processor(
102
98
  t0_cpu = process_time()
103
99
  if any(x in config.OCRD_PROFILE for x in ['RSS', 'PSS']):
104
100
  backend = 'psutil_pss' if 'PSS' in config.OCRD_PROFILE else 'psutil'
105
- from memory_profiler import memory_usage
101
+ from memory_profiler import memory_usage # pylint: disable=import-outside-toplevel
106
102
  try:
107
103
  mem_usage = memory_usage(proc=(processor.process_workspace, [workspace], {}),
108
104
  # only run process once
@@ -212,7 +208,7 @@ def run_cli(
212
208
  if not log_filename:
213
209
  result = run(args, check=False)
214
210
  else:
215
- with open(log_filename, 'a') as file_desc:
211
+ with open(log_filename, 'a', encoding='utf-8') as file_desc:
216
212
  result = run(args, check=False, stdout=file_desc, stderr=file_desc)
217
213
  return result.returncode
218
214
 
@@ -359,9 +355,9 @@ Options:
359
355
  pass
360
356
 
361
357
 
362
- # Taken from https://github.com/OCR-D/core/pull/884
363
- @freeze_args
364
- @lru_cache(maxsize=config.OCRD_MAX_PROCESSOR_CACHE)
358
+ # not decorated here but at runtime (on first use)
359
+ #@freeze_args
360
+ #@lru_cache(maxsize=config.OCRD_MAX_PROCESSOR_CACHE)
365
361
  def get_cached_processor(parameter: dict, processor_class):
366
362
  """
367
363
  Call this function to get back an instance of a processor.
@@ -374,16 +370,13 @@ def get_cached_processor(parameter: dict, processor_class):
374
370
  Otherwise, an instance of the `:py:class:~ocrd.Processor` is returned.
375
371
  """
376
372
  if processor_class:
377
- dict_params = dict(parameter) if parameter else None
378
- processor = processor_class(None, parameter=dict_params)
379
- processor.setup()
373
+ processor = processor_class(None, parameter=dict(parameter))
380
374
  return processor
381
375
  return None
382
376
 
383
-
384
377
  def get_processor(
385
378
  processor_class,
386
- parameter: dict,
379
+ parameter: Optional[dict] = None,
387
380
  workspace: Workspace = None,
388
381
  page_id: str = None,
389
382
  input_file_grp: List[str] = None,
@@ -391,11 +384,24 @@ def get_processor(
391
384
  instance_caching: bool = False,
392
385
  ):
393
386
  if processor_class:
387
+ if parameter is None:
388
+ parameter = {}
394
389
  if instance_caching:
390
+ global get_cached_processor
391
+ if not hasattr(get_cached_processor, '__wrapped__'):
392
+ # first call: wrap
393
+ if processor_class.max_instances < 0:
394
+ maxsize = config.OCRD_MAX_PROCESSOR_CACHE
395
+ else:
396
+ maxsize = min(config.OCRD_MAX_PROCESSOR_CACHE, processor_class.max_instances)
397
+ # wrapping in call cache
398
+ # wrapping dict into frozendict (from https://github.com/OCR-D/core/pull/884)
399
+ get_cached_processor = freeze_args(lru_cache(maxsize=maxsize)(get_cached_processor))
395
400
  processor = get_cached_processor(parameter, processor_class)
396
401
  else:
402
+ # avoid passing workspace already (deprecated chdir behaviour)
397
403
  processor = processor_class(None, parameter=parameter)
398
- processor.setup()
404
+ # set current processing parameters
399
405
  processor.workspace = workspace
400
406
  processor.page_id = page_id
401
407
  processor.input_file_grp = input_file_grp
ocrd/processor/ocrd_page_result.py CHANGED
@@ -1,15 +1,15 @@
1
1
  from dataclasses import dataclass, field
2
- from typing import List
2
+ from typing import List, Union
3
3
  from ocrd_models.ocrd_page import OcrdPage
4
4
  from PIL.Image import Image
5
5
 
6
- from ocrd_models.ocrd_page_generateds import AlternativeImageType
6
+ from ocrd_models.ocrd_page_generateds import AlternativeImageType, PageType
7
7
 
8
8
  @dataclass
9
9
  class OcrdPageResultImage():
10
10
  pil : Image
11
11
  file_id_suffix : str
12
- alternative_image : AlternativeImageType
12
+ alternative_image : Union[AlternativeImageType, PageType]
13
13
 
14
14
  @dataclass
15
15
  class OcrdPageResult():
ocrd/resolver.py CHANGED
@@ -18,7 +18,6 @@ from ocrd_utils import (
18
18
  )
19
19
  from ocrd.workspace import Workspace
20
20
  from ocrd_models import OcrdMets
21
- from ocrd_models.constants import NAMESPACES as NS
22
21
  from ocrd_models.utils import handle_oai_response
23
22
 
24
23
  class Resolver():
@@ -310,5 +309,3 @@ class Resolver():
310
309
  raise ValueError("--mets '%s' has a directory part inconsistent with --directory '%s'" % (mets_url, directory))
311
310
 
312
311
  return str(Path(directory).resolve()), str(mets_url), str(mets_basename), mets_server_url
313
-
314
-
ocrd/resource_manager.py CHANGED
@@ -1,6 +1,6 @@
1
1
  from pathlib import Path
2
2
  from os.path import join
3
- from os import environ, listdir, makedirs, getcwd, path, unlink
3
+ from os import environ, listdir, getcwd, unlink
4
4
  from shutil import copytree, rmtree, copy
5
5
  from fnmatch import filter as apply_glob
6
6
  from datetime import datetime
@@ -13,14 +13,18 @@ from gdown.parse_url import parse_url as gparse_url
13
13
  from gdown.download import get_url_from_gdrive_confirmation
14
14
  from yaml import safe_load, safe_dump
15
15
 
16
+ # pylint: disable=wrong-import-position
17
+
16
18
  # https://github.com/OCR-D/core/issues/867
17
19
  # https://stackoverflow.com/questions/50900727/skip-converting-entities-while-loading-a-yaml-string-using-pyyaml
18
20
  import yaml.constructor
19
- yaml.constructor.SafeConstructor.yaml_constructors[u'tag:yaml.org,2002:timestamp'] = \
20
- yaml.constructor.SafeConstructor.yaml_constructors[u'tag:yaml.org,2002:str']
21
+ yaml.constructor.SafeConstructor.yaml_constructors['tag:yaml.org,2002:timestamp'] = \
22
+ yaml.constructor.SafeConstructor.yaml_constructors['tag:yaml.org,2002:str']
23
+
24
+ # pylint: enable=wrong-import-position
21
25
 
22
26
  from ocrd_validators import OcrdResourceListValidator
23
- from ocrd_utils import getLogger, directory_size, get_moduledir, EXT_TO_MIME, nth_url_segment, guess_media_type, config
27
+ from ocrd_utils import getLogger, directory_size, get_moduledir, guess_media_type, config
24
28
  from ocrd_utils.os import get_processor_resource_types, list_all_resources, pushd_popd, get_ocrd_tool_json
25
29
  from .constants import RESOURCE_LIST_FILENAME, RESOURCE_USER_LIST_COMMENT
26
30
 
@@ -248,7 +252,7 @@ class OcrdResourceManager:
248
252
  if "Content-Disposition" not in r.headers:
249
253
  url = get_url_from_gdrive_confirmation(r.text)
250
254
  except RuntimeError as e:
251
- log.warning("Cannot unwrap Google Drive URL: ", e)
255
+ log.warning("Cannot unwrap Google Drive URL: %s", e)
252
256
  with open(filename, 'wb') as f:
253
257
  with requests.get(url, stream=True) as r:
254
258
  r.raise_for_status()
ocrd/workspace.py CHANGED
@@ -1,7 +1,7 @@
1
1
  import io
2
2
  from os import makedirs, unlink, listdir, path
3
3
  from pathlib import Path
4
- from shutil import move, copyfileobj
4
+ from shutil import copyfileobj
5
5
  from re import sub
6
6
  from tempfile import NamedTemporaryFile
7
7
  from contextlib import contextmanager
@@ -43,7 +43,6 @@ from ocrd_utils import (
43
43
  MIME_TO_PIL,
44
44
  MIMETYPE_PAGE,
45
45
  REGEX_PREFIX,
46
- config
47
46
  )
48
47
 
49
48
  from .workspace_backup import WorkspaceBackupManager
@@ -111,7 +110,7 @@ class Workspace():
111
110
 
112
111
  def __repr__(self):
113
112
  return 'Workspace[remote=%s, directory=%s, baseurl=%s, file_groups=%s, files=%s]' % (
114
- not not self.is_remote,
113
+ self.is_remote,
115
114
  self.directory,
116
115
  self.baseurl,
117
116
  self.mets.file_groups,
@@ -648,7 +647,7 @@ class Workspace():
648
647
  log = getLogger('ocrd.workspace.image_from_page')
649
648
  page_image_info = self.resolve_image_exif(page.imageFilename)
650
649
  page_image = self._resolve_image_as_pil(page.imageFilename)
651
- page_coords = dict()
650
+ page_coords = {}
652
651
  # use identity as initial affine coordinate transform:
653
652
  page_coords['transform'] = np.eye(3)
654
653
  # interim bbox (updated with each change to the transform):
@@ -1091,7 +1090,7 @@ class Workspace():
1091
1090
  The (absolute) path of the created file.
1092
1091
  """
1093
1092
  log = getLogger('ocrd.workspace.save_image_file')
1094
- saveargs = dict()
1093
+ saveargs = {}
1095
1094
  if 'dpi' in image.info:
1096
1095
  saveargs['dpi'] = image.info['dpi']
1097
1096
  image_bytes = io.BytesIO()
@@ -1168,9 +1167,9 @@ def _reflect(log, name, orientation, segment_image, segment_coords, segment_xywh
1168
1167
  # Transpose in affine coordinate transform:
1169
1168
  # (consistent with image transposition or AlternativeImage below)
1170
1169
  transposition = {
1171
- 90: Image.ROTATE_90,
1172
- 180: Image.ROTATE_180,
1173
- 270: Image.ROTATE_270
1170
+ 90: Image.Transpose.ROTATE_90,
1171
+ 180: Image.Transpose.ROTATE_180,
1172
+ 270: Image.Transpose.ROTATE_270
1174
1173
  }.get(orientation) # no default
1175
1174
  segment_coords['transform'] = transpose_coordinates(
1176
1175
  segment_coords['transform'], transposition,
@@ -1238,5 +1237,5 @@ def _scale(log, name, factor, segment_image, segment_coords, segment_xywh, **kwa
1238
1237
  segment_image = segment_image.resize((int(segment_image.width * factor),
1239
1238
  int(segment_image.height * factor)),
1240
1239
  # slowest, but highest quality:
1241
- Image.BICUBIC)
1240
+ Image.Resampling.BICUBIC)
1242
1241
  return segment_image, segment_coords, segment_xywh
ocrd/workspace_backup.py CHANGED
@@ -1,6 +1,6 @@
1
1
  from datetime import datetime
2
2
  from os import makedirs
3
- from os.path import join, basename, getsize, abspath
3
+ from os.path import join, basename, getsize
4
4
  from glob import glob
5
5
  from shutil import copy
6
6
  import hashlib
{ocrd-3.0.0a2.dist-info → ocrd-3.0.0b1.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ocrd
3
- Version: 3.0.0a2
3
+ Version: 3.0.0b1
4
4
  Summary: OCR-D framework
5
5
  Author-email: Konstantin Baierer <unixprog@gmail.com>
6
6
  License: Apache License 2.0