ocrd 3.5.1__py3-none-any.whl → 3.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93) hide show
  1. ocrd/cli/__init__.py +8 -6
  2. ocrd/cli/bashlib.py +8 -114
  3. ocrd/cli/network.py +0 -2
  4. ocrd/cli/ocrd_tool.py +26 -4
  5. ocrd/cli/process.py +1 -0
  6. ocrd/cli/resmgr.py +0 -1
  7. ocrd/cli/validate.py +32 -13
  8. ocrd/cli/workspace.py +125 -52
  9. ocrd/cli/zip.py +13 -4
  10. ocrd/decorators/__init__.py +28 -52
  11. ocrd/decorators/loglevel_option.py +4 -0
  12. ocrd/decorators/mets_find_options.py +2 -1
  13. ocrd/decorators/ocrd_cli_options.py +3 -7
  14. ocrd/decorators/parameter_option.py +12 -11
  15. ocrd/mets_server.py +11 -15
  16. ocrd/processor/base.py +88 -71
  17. ocrd/processor/builtin/dummy_processor.py +7 -4
  18. ocrd/processor/builtin/filter_processor.py +3 -2
  19. ocrd/processor/helpers.py +5 -6
  20. ocrd/processor/ocrd_page_result.py +7 -5
  21. ocrd/resolver.py +42 -32
  22. ocrd/task_sequence.py +11 -4
  23. ocrd/workspace.py +64 -54
  24. ocrd/workspace_backup.py +3 -0
  25. ocrd/workspace_bagger.py +15 -8
  26. {ocrd-3.5.1.dist-info → ocrd-3.7.0.dist-info}/METADATA +2 -8
  27. ocrd-3.7.0.dist-info/RECORD +123 -0
  28. ocrd_modelfactory/__init__.py +4 -2
  29. ocrd_models/constants.py +18 -1
  30. ocrd_models/ocrd_agent.py +1 -1
  31. ocrd_models/ocrd_exif.py +7 -3
  32. ocrd_models/ocrd_file.py +24 -19
  33. ocrd_models/ocrd_mets.py +90 -67
  34. ocrd_models/ocrd_page.py +17 -13
  35. ocrd_models/ocrd_xml_base.py +1 -0
  36. ocrd_models/report.py +2 -1
  37. ocrd_models/utils.py +4 -3
  38. ocrd_models/xpath_functions.py +3 -1
  39. ocrd_network/__init__.py +1 -2
  40. ocrd_network/cli/__init__.py +0 -2
  41. ocrd_network/cli/client.py +122 -50
  42. ocrd_network/cli/processing_server.py +1 -2
  43. ocrd_network/client.py +2 -2
  44. ocrd_network/client_utils.py +30 -13
  45. ocrd_network/constants.py +1 -6
  46. ocrd_network/database.py +3 -3
  47. ocrd_network/logging_utils.py +2 -7
  48. ocrd_network/models/__init__.py +0 -2
  49. ocrd_network/models/job.py +31 -33
  50. ocrd_network/models/messages.py +3 -2
  51. ocrd_network/models/workspace.py +5 -5
  52. ocrd_network/process_helpers.py +54 -17
  53. ocrd_network/processing_server.py +63 -114
  54. ocrd_network/processing_worker.py +6 -5
  55. ocrd_network/rabbitmq_utils/__init__.py +2 -0
  56. ocrd_network/rabbitmq_utils/helpers.py +24 -7
  57. ocrd_network/runtime_data/__init__.py +1 -2
  58. ocrd_network/runtime_data/deployer.py +12 -85
  59. ocrd_network/runtime_data/hosts.py +61 -130
  60. ocrd_network/runtime_data/network_agents.py +7 -31
  61. ocrd_network/runtime_data/network_services.py +1 -1
  62. ocrd_network/server_cache.py +1 -1
  63. ocrd_network/server_utils.py +13 -52
  64. ocrd_network/utils.py +1 -0
  65. ocrd_utils/__init__.py +4 -4
  66. ocrd_utils/config.py +86 -76
  67. ocrd_utils/deprecate.py +3 -0
  68. ocrd_utils/image.py +51 -23
  69. ocrd_utils/introspect.py +8 -3
  70. ocrd_utils/logging.py +15 -7
  71. ocrd_utils/os.py +17 -4
  72. ocrd_utils/str.py +32 -16
  73. ocrd_validators/json_validator.py +4 -1
  74. ocrd_validators/ocrd_tool_validator.py +2 -1
  75. ocrd_validators/ocrd_zip_validator.py +5 -4
  76. ocrd_validators/page_validator.py +21 -9
  77. ocrd_validators/parameter_validator.py +3 -2
  78. ocrd_validators/processing_server_config.schema.yml +1 -33
  79. ocrd_validators/resource_list_validator.py +3 -1
  80. ocrd_validators/workspace_validator.py +30 -20
  81. ocrd_validators/xsd_mets_validator.py +2 -1
  82. ocrd_validators/xsd_page_validator.py +2 -1
  83. ocrd_validators/xsd_validator.py +4 -2
  84. ocrd/cli/log.py +0 -51
  85. ocrd/lib.bash +0 -317
  86. ocrd-3.5.1.dist-info/RECORD +0 -128
  87. ocrd_network/cli/processor_server.py +0 -31
  88. ocrd_network/models/ocrd_tool.py +0 -12
  89. ocrd_network/processor_server.py +0 -255
  90. {ocrd-3.5.1.dist-info → ocrd-3.7.0.dist-info}/LICENSE +0 -0
  91. {ocrd-3.5.1.dist-info → ocrd-3.7.0.dist-info}/WHEEL +0 -0
  92. {ocrd-3.5.1.dist-info → ocrd-3.7.0.dist-info}/entry_points.txt +0 -0
  93. {ocrd-3.5.1.dist-info → ocrd-3.7.0.dist-info}/top_level.txt +0 -0
ocrd/resolver.py CHANGED
@@ -20,6 +20,7 @@ from ocrd.workspace import Workspace
20
20
  from ocrd_models import OcrdMets
21
21
  from ocrd_models.utils import handle_oai_response
22
22
 
23
+
23
24
  class Resolver():
24
25
  """
25
26
  Handle uploads, downloads, repository access, and manage temporary directories
@@ -31,11 +32,13 @@ class Resolver():
31
32
 
32
33
  If ``url`` looks like a file path, check whether that exists.
33
34
  If it does exist and is within ``directory`` already, return early.
34
- If it does exist but is outside of ``directory``. copy it.
35
- If ``url` does not appear to be a file path, try downloading via HTTP, retrying ``retries`` times with timeout ``timeout`` between calls.
35
+ If it does exist but is outside of ``directory``, copy it.
36
+ If ``url`` does not appear to be a file path, try downloading via HTTP,
37
+ retrying ``retries`` times with timeout ``timeout`` between calls.
36
38
 
37
39
  If ``basename`` is not given but ``subdir`` is, set ``basename`` to the last path segment of ``url``.
38
40
 
41
+ \b
39
42
  If the target file already exists within ``directory``, behavior depends on ``if_exists``:
40
43
  - ``skip`` (default): do nothing and return early. Note that this
41
44
  - ``overwrite``: overwrite the existing file
@@ -56,11 +59,12 @@ class Resolver():
56
59
  Returns:
57
60
  Local filename string, *relative* to directory
58
61
  """
59
- log = getLogger('ocrd.resolver.download_to_directory') # pylint: disable=redefined-outer-name
60
- log.debug("directory=|%s| url=|%s| basename=|%s| if_exists=|%s| subdir=|%s|", directory, url, basename, if_exists, subdir)
62
+ log = getLogger('ocrd.resolver.download_to_directory') # pylint: disable=redefined-outer-name
63
+ log.debug("directory=|%s| url=|%s| basename=|%s| if_exists=|%s| subdir=|%s|",
64
+ directory, url, basename, if_exists, subdir)
61
65
 
62
66
  if not url:
63
- raise ValueError(f"'url' must be a non-empty string, not '{url}'") # actually Path also ok
67
+ raise ValueError(f"'url' must be a non-empty string, not '{url}'") # actually Path also ok
64
68
  if not directory:
65
69
  raise ValueError(f"'directory' must be a non-empty string, not '{url}'") # actually Path would also work
66
70
 
@@ -123,25 +127,25 @@ class Resolver():
123
127
  retries = Retry(total=retries or 0,
124
128
  status_forcelist=[
125
129
  # probably too wide (only transient failures):
126
- 408, # Request Timeout
127
- 409, # Conflict
128
- 412, # Precondition Failed
129
- 417, # Expectation Failed
130
- 423, # Locked
131
- 424, # Fail
132
- 425, # Too Early
133
- 426, # Upgrade Required
134
- 428, # Precondition Required
135
- 429, # Too Many Requests
136
- 440, # Login Timeout
137
- 500, # Internal Server Error
138
- 503, # Service Unavailable
139
- 504, # Gateway Timeout
140
- 509, # Bandwidth Limit Exceeded
141
- 529, # Site Overloaded
142
- 598, # Proxy Read Timeout
143
- 599, # Proxy Connect Timeout
144
- ])
130
+ 408, # Request Timeout
131
+ 409, # Conflict
132
+ 412, # Precondition Failed
133
+ 417, # Expectation Failed
134
+ 423, # Locked
135
+ 424, # Fail
136
+ 425, # Too Early
137
+ 426, # Upgrade Required
138
+ 428, # Precondition Required
139
+ 429, # Too Many Requests
140
+ 440, # Login Timeout
141
+ 500, # Internal Server Error
142
+ 503, # Service Unavailable
143
+ 504, # Gateway Timeout
144
+ 509, # Bandwidth Limit Exceeded
145
+ 529, # Site Overloaded
146
+ 598, # Proxy Read Timeout
147
+ 599, # Proxy Connect Timeout
148
+ ])
145
149
  adapter = HTTPAdapter(max_retries=retries)
146
150
  session.mount('http://', adapter)
147
151
  session.mount('https://', adapter)
@@ -181,7 +185,7 @@ class Resolver():
181
185
  the filesystem directly.
182
186
  **kwargs (): Passed on to ``OcrdMets.find_files`` if download == True
183
187
 
184
- Download (clone) :py:attr:`mets_url` to ``mets.xml`` in :py:attr:`dst_dir`, unless
188
+ Download (clone) :py:attr:`mets_url` to ``mets.xml`` in :py:attr:`dst_dir`, unless
185
189
  the former is already local and the latter is ``none`` or already identical to its directory name.
186
190
 
187
191
  Returns:
@@ -218,11 +222,13 @@ class Resolver():
218
222
  Path(dst_dir).mkdir(parents=True, exist_ok=False)
219
223
  dst_dir = str(Path(dst_dir).resolve())
220
224
 
221
- log.debug("workspace_from_url\nmets_basename='%s'\nmets_url='%s'\nsrc_baseurl='%s'\ndst_dir='%s'",
222
- mets_basename, mets_url, src_baseurl, dst_dir)
223
- self.download_to_directory(dst_dir, mets_url, basename=mets_basename, if_exists='overwrite' if clobber_mets else 'raise')
225
+ log.debug("mets_basename='%s' mets_url='%s' src_baseurl='%s' dst_dir='%s'",
226
+ mets_basename, mets_url, src_baseurl, dst_dir)
227
+ self.download_to_directory(dst_dir, mets_url, basename=mets_basename,
228
+ if_exists='overwrite' if clobber_mets else 'raise')
224
229
 
225
- workspace = Workspace(self, dst_dir, mets_basename=mets_basename, baseurl=src_baseurl, mets_server_url=mets_server_url)
230
+ workspace = Workspace(self, dst_dir,
231
+ mets_basename=mets_basename, baseurl=src_baseurl, mets_server_url=mets_server_url)
226
232
 
227
233
  if download:
228
234
  for f in workspace.mets.find_files(**kwargs):
@@ -273,7 +279,8 @@ class Resolver():
273
279
  # if directory and mets_url and not mets_is_remote:
274
280
  # raise ValueError("Use either --mets or --directory, not both")
275
281
 
276
- # If --mets is a URL, a directory must be explicitly provided (not strictly necessary, but retained for legacy behavior)
282
+ # If --mets is a URL, a directory must be explicitly provided
283
+ # (not strictly necessary, but retained for legacy behavior)
277
284
  if not directory and mets_is_remote:
278
285
  raise ValueError("--mets is an http(s) URL but no --directory was given")
279
286
 
@@ -297,7 +304,7 @@ class Resolver():
297
304
  elif not directory and mets_url:
298
305
  mets_url = Path(mets_url).resolve()
299
306
  directory = mets_url.parent
300
- else: # == directory and mets_url:
307
+ else: # == directory and mets_url:
301
308
  directory = Path(directory).resolve()
302
309
  if not mets_is_remote:
303
310
  # --mets is just a basename and --directory is set, so treat --mets as --mets-basename
@@ -306,10 +313,13 @@ class Resolver():
306
313
  else:
307
314
  mets_url = Path(mets_url).resolve()
308
315
  if not is_file_in_directory(directory, mets_url):
309
- raise ValueError("--mets '%s' has a directory part inconsistent with --directory '%s'" % (mets_url, directory))
316
+ raise ValueError("--mets '%s' has a directory part inconsistent with --directory '%s'" % (
317
+ mets_url, directory))
310
318
 
311
319
  if mets_server_url and not mets_server_url.startswith('http://'):
312
320
  # UDS socket
313
321
  mets_server_url = str(Path(mets_server_url).resolve())
314
322
 
323
+ log.debug("directory='%s' mets_url='%s', mets_basename='%s', mets_server_url='%s'" % (
324
+ directory, str(mets_url), str(mets_basename), mets_server_url))
315
325
  return str(Path(directory).resolve()), str(mets_url), str(mets_basename), mets_server_url
ocrd/task_sequence.py CHANGED
@@ -9,6 +9,7 @@ from ocrd.resolver import Resolver
9
9
  from ocrd_validators import ParameterValidator, WorkspaceValidator
10
10
  from ocrd_models import ValidationReport
11
11
 
12
+
12
13
  class ProcessorTask():
13
14
 
14
15
  @classmethod
@@ -85,6 +86,7 @@ class ProcessorTask():
85
86
  ret += " -p '%s'" % json.dumps(self.parameters)
86
87
  return ret
87
88
 
89
+
88
90
  def validate_tasks(tasks, workspace, page_id=None, overwrite=False):
89
91
  report = ValidationReport()
90
92
  prev_output_file_grps = workspace.mets.file_groups
@@ -93,14 +95,18 @@ def validate_tasks(tasks, workspace, page_id=None, overwrite=False):
93
95
  first_task.validate()
94
96
 
95
97
  # first task: check input/output file groups from METS
96
- WorkspaceValidator.check_file_grp(workspace, first_task.input_file_grps, '' if overwrite else first_task.output_file_grps, page_id, report)
98
+ WorkspaceValidator.check_file_grp(workspace,
99
+ first_task.input_file_grps,
100
+ '' if overwrite else first_task.output_file_grps,
101
+ page_id,
102
+ report)
97
103
 
98
104
  prev_output_file_grps += first_task.output_file_grps
99
105
  for task in tasks[1:]:
100
106
  task.validate()
101
107
  # check either existing fileGrp or output-file group of previous task matches current input_file_group
102
108
  for input_file_grp in task.input_file_grps:
103
- if not input_file_grp in prev_output_file_grps:
109
+ if input_file_grp not in prev_output_file_grps:
104
110
  report.add_error("Input file group not contained in METS or produced by previous steps: %s" % input_file_grp)
105
111
  if not overwrite:
106
112
  WorkspaceValidator.check_file_grp(workspace, [], task.output_file_grps, page_id, report)
@@ -157,5 +163,6 @@ def run_tasks(mets, log_level, page_id, task_strs, overwrite=False, mets_server_
157
163
 
158
164
  # check output file groups are in mets
159
165
  for output_file_grp in task.output_file_grps:
160
- if not output_file_grp in workspace.mets.file_groups:
161
- raise Exception("Invalid state: expected output file group '%s' not in METS (despite processor success)" % output_file_grp)
166
+ if output_file_grp not in workspace.mets.file_groups:
167
+ raise Exception("Invalid state: expected output file group '%s' not in METS "
168
+ "(despite processor success)" % output_file_grp)
ocrd/workspace.py CHANGED
@@ -28,16 +28,13 @@ from ocrd_utils import (
28
28
  scale_coordinates,
29
29
  shift_coordinates,
30
30
  rotate_coordinates,
31
- transform_coordinates,
32
31
  transpose_coordinates,
33
32
  crop_image,
34
33
  rotate_image,
35
34
  transpose_image,
36
35
  bbox_from_polygon,
37
- polygon_from_points,
38
36
  xywh_from_bbox,
39
37
  pushd_popd,
40
- is_local_filename,
41
38
  deprecated_alias,
42
39
  DEFAULT_METS_BASENAME,
43
40
  MIME_TO_EXT,
@@ -51,6 +48,7 @@ from .mets_server import ClientSideOcrdMets
51
48
 
52
49
  __all__ = ['Workspace']
53
50
 
51
+
54
52
  @contextmanager
55
53
  def download_temporary_file(url):
56
54
  with NamedTemporaryFile(prefix='ocrd-download-') as f:
@@ -82,7 +80,7 @@ class Workspace():
82
80
  self,
83
81
  resolver,
84
82
  directory,
85
- mets : Optional[Union[OcrdMets, ClientSideOcrdMets]] = None,
83
+ mets: Optional[Union[OcrdMets, ClientSideOcrdMets]] = None,
86
84
  mets_basename=DEFAULT_METS_BASENAME,
87
85
  automatic_backup=False,
88
86
  baseurl=None,
@@ -96,8 +94,9 @@ class Workspace():
96
94
  if self.is_remote:
97
95
  mets = ClientSideOcrdMets(mets_server_url, self.directory)
98
96
  if mets.workspace_path != self.directory:
99
- raise ValueError(f"METS server {mets_server_url} workspace directory '{mets.workspace_path}' differs "
100
- f"from local workspace directory '{self.directory}'. These are not the same workspaces.")
97
+ raise ValueError(
98
+ f"METS server {mets_server_url} workspace directory '{mets.workspace_path}' differs "
99
+ f"from local workspace directory '{self.directory}'. These are not the same workspaces.")
101
100
  else:
102
101
  mets = OcrdMets(filename=self.mets_target)
103
102
  self.mets = mets
@@ -148,7 +147,7 @@ class Workspace():
148
147
  if not copy_files:
149
148
  fpath_src = Path(other_workspace.directory).resolve()
150
149
  fpath_dst = Path(self.directory).resolve()
151
- dstprefix = fpath_src.relative_to(fpath_dst) # raises ValueError if not a subpath
150
+ dstprefix = fpath_src.relative_to(fpath_dst) # raises ValueError if not a subpath
152
151
  f.local_filename = dstprefix / f.local_filename
153
152
  return
154
153
  fpath_src = Path(other_workspace.directory, f.local_filename)
@@ -171,7 +170,6 @@ class Workspace():
171
170
 
172
171
  self.mets.merge(other_workspace.mets, after_add_cb=after_add_cb, **kwargs)
173
172
 
174
-
175
173
  @deprecated(version='1.0.0', reason="Use workspace.download_file")
176
174
  def download_url(self, url, **kwargs):
177
175
  """
@@ -199,19 +197,23 @@ class Workspace():
199
197
  file_path = Path(f.local_filename).absolute()
200
198
  if file_path.exists():
201
199
  try:
202
- file_path.relative_to(Path(self.directory).resolve()) # raises ValueError if not relative
200
+ file_path.relative_to(Path(self.directory).resolve()) # raises ValueError if not relative
203
201
  # If the f.local_filename exists and is within self.directory, nothing to do
204
202
  log.debug(f"'local_filename' {f.local_filename} already within {self.directory} - nothing to do")
205
203
  except ValueError:
206
204
  # f.local_filename exists, but not within self.directory, copy it
207
- log.debug("Copying 'local_filename' %s to workspace directory %s" % (f.local_filename, self.directory))
208
- f.local_filename = self.resolver.download_to_directory(self.directory, f.local_filename, subdir=f.fileGrp)
205
+ log.debug("Copying 'local_filename' %s to workspace directory %s" % (
206
+ f.local_filename, self.directory))
207
+ f.local_filename = self.resolver.download_to_directory(self.directory, f.local_filename,
208
+ subdir=f.fileGrp)
209
209
  return f
210
210
  if f.url:
211
- log.debug("OcrdFile has 'local_filename' but it doesn't resolve - trying to download from 'url' %s", f.url)
211
+ log.debug("OcrdFile has 'local_filename' but it doesn't resolve - "
212
+ "trying to download from 'url' %s", f.url)
212
213
  url = f.url
213
214
  elif self.baseurl:
214
- log.debug("OcrdFile has 'local_filename' but it doesn't resolve, and no 'url' - trying 'baseurl' %s with 'local_filename' %s",
215
+ log.debug("OcrdFile has 'local_filename' but it doesn't resolve, and no 'url' - "
216
+ "trying 'baseurl' %s with 'local_filename' %s",
215
217
  self.baseurl, f.local_filename)
216
218
  url = '%s/%s' % (self.baseurl, f.local_filename)
217
219
  else:
@@ -223,7 +225,8 @@ class Workspace():
223
225
  if f.url:
224
226
  # If f.url is set, download the file to the workspace
225
227
  basename = '%s%s' % (f.ID, MIME_TO_EXT.get(f.mimetype, '')) if f.ID else f.basename
226
- f.local_filename = self.resolver.download_to_directory(self.directory, f.url, subdir=f.fileGrp, basename=basename)
228
+ f.local_filename = self.resolver.download_to_directory(self.directory, f.url,
229
+ subdir=f.fileGrp, basename=basename)
227
230
  return f
228
231
  # If neither f.local_filename nor f.url is set, fail
229
232
  raise ValueError(f"OcrdFile {f} has neither 'url' nor 'local_filename', so cannot be downloaded")
@@ -281,7 +284,8 @@ class Workspace():
281
284
  if not force:
282
285
  raise e
283
286
 
284
- def remove_file_group(self, USE, recursive=False, force=False, keep_files=False, page_recursive=False, page_same_group=False):
287
+ def remove_file_group(self, USE, recursive=False, force=False, keep_files=False,
288
+ page_recursive=False, page_same_group=False):
285
289
  """
286
290
  Remove a METS `fileGrp`.
287
291
 
@@ -302,7 +306,8 @@ class Workspace():
302
306
  file_dirs = []
303
307
  if recursive:
304
308
  for f in self.mets.find_files(fileGrp=USE):
305
- self.remove_file(f, force=force, keep_file=keep_files, page_recursive=page_recursive, page_same_group=page_same_group)
309
+ self.remove_file(
310
+ f, force=force, keep_file=keep_files, page_recursive=page_recursive, page_same_group=page_same_group)
306
311
  if f.local_filename:
307
312
  f_dir = path.dirname(f.local_filename)
308
313
  if f_dir:
@@ -320,7 +325,6 @@ class Workspace():
320
325
  if Path(file_dir).is_dir() and not listdir(file_dir):
321
326
  Path(file_dir).rmdir()
322
327
 
323
-
324
328
  def rename_file_group(self, old, new):
325
329
  """
326
330
  Rename a METS `fileGrp`.
@@ -361,7 +365,8 @@ class Workspace():
361
365
  new_id = sub(r'^%s' % old, r'%s' % new, mets_file.ID)
362
366
  try:
363
367
  next(self.mets.find_files(ID=new_id))
364
- log.warning("ID %s already exists, not changing ID while renaming %s -> %s" % (new_id, old_local_filename, new_local_filename))
368
+ log.warning("ID %s already exists, not changing ID while renaming %s -> %s" % (
369
+ new_id, old_local_filename, new_local_filename))
365
370
  except StopIteration:
366
371
  mets_file.ID = new_id
367
372
  # change file paths in PAGE-XML imageFilename and filename attributes
@@ -378,7 +383,8 @@ class Workspace():
378
383
  for old_local_filename, new_local_filename in local_filename_replacements.items():
379
384
  if ai.filename == old_local_filename:
380
385
  changed = True
381
- log.debug("Rename pc:Page/../AlternativeImage: %s -> %s" % (old_local_filename, new_local_filename))
386
+ log.debug("Rename pc:Page/../AlternativeImage: %s -> %s" % (
387
+ old_local_filename, new_local_filename))
382
388
  ai.filename = new_local_filename
383
389
  if changed:
384
390
  log.debug("PAGE-XML changed, writing %s" % (page_file.local_filename))
@@ -502,7 +508,7 @@ class Workspace():
502
508
  def _resolve_image_as_pil(self, image_url, coords=None):
503
509
  log = getLogger('ocrd.workspace._resolve_image_as_pil')
504
510
  pil_image = self._apply_mets_file(image_url, Image.open)
505
- pil_image.load() # alloc and give up the FD
511
+ pil_image.load() # alloc and give up the FD
506
512
 
507
513
  # Pillow does not properly support higher color depths
508
514
  # (e.g. 16-bit or 32-bit or floating point grayscale),
@@ -544,7 +550,7 @@ class Workspace():
544
550
 
545
551
  # FIXME: remove or replace this by (image_from_polygon+) crop_image ...
546
552
  log.debug("Converting PIL to OpenCV: %s", image_url)
547
- color_conversion = COLOR_GRAY2BGR if pil_image.mode in ('1', 'L') else COLOR_RGB2BGR
553
+ color_conversion = COLOR_GRAY2BGR if pil_image.mode in ('1', 'L') else COLOR_RGB2BGR
548
554
  pil_as_np_array = np.array(pil_image).astype('uint8') if pil_image.mode == '1' else np.array(pil_image)
549
555
  cv2_image = cvtColor(pil_as_np_array, color_conversion)
550
556
 
@@ -659,8 +665,8 @@ class Workspace():
659
665
  orientation = (page_coords['angle'] + 45) % 360
660
666
  orientation = orientation - (orientation % 90)
661
667
  skew = (page_coords['angle'] % 360) - orientation
662
- skew = 180 - (180 - skew) % 360 # map to [-45,45]
663
- page_coords['angle'] = 0 # nothing applied yet (depends on filters)
668
+ skew = 180 - (180 - skew) % 360 # map to [-45,45]
669
+ page_coords['angle'] = 0 # nothing applied yet (depends on filters)
664
670
  log.debug("page '%s' has %s orientation=%d skew=%.2f",
665
671
  page_id, "border," if border else "", orientation, skew)
666
672
  if page_image_info.resolution != 1:
@@ -696,7 +702,7 @@ class Workspace():
696
702
  for feature in feature_selector.split(',') if feature) and
697
703
  not any(feature in featureset
698
704
  for feature in feature_filter.split(',') if feature) and
699
- len(featureset.difference(auto_features)) >= \
705
+ len(featureset.difference(auto_features)) >=
700
706
  len(best_features.difference(auto_features))):
701
707
  best_features = featureset
702
708
  best_image = alternative_image
@@ -705,7 +711,7 @@ class Workspace():
705
711
  alternative_images.index(best_image) + 1,
706
712
  best_features, page_id)
707
713
  page_image = self._resolve_image_as_pil(best_image.get_filename())
708
- page_coords['features'] = best_image.get_comments() # including duplicates
714
+ page_coords['features'] = best_image.get_comments() # including duplicates
709
715
 
710
716
  # adjust the coord transformation to the steps applied on the image,
711
717
  # and apply steps on the existing image in case it is missing there,
@@ -727,18 +733,18 @@ class Workspace():
727
733
  for i, feature in enumerate(alternative_image_features +
728
734
  (['cropped']
729
735
  if (border and
730
- not 'cropped' in alternative_image_features and
731
- not 'cropped' in feature_filter.split(','))
736
+ 'cropped' not in alternative_image_features and
737
+ 'cropped' not in feature_filter.split(','))
732
738
  else []) +
733
739
  (['rotated-%d' % orientation]
734
740
  if (orientation and
735
- not 'rotated-%d' % orientation in alternative_image_features and
736
- not 'rotated-%d' % orientation in feature_filter.split(','))
741
+ 'rotated-%d' % orientation not in alternative_image_features and
742
+ 'rotated-%d' % orientation not in feature_filter.split(','))
737
743
  else []) +
738
744
  (['deskewed']
739
745
  if (skew and
740
- not 'deskewed' in alternative_image_features and
741
- not 'deskewed' in feature_filter.split(','))
746
+ 'deskewed' not in alternative_image_features and
747
+ 'deskewed' not in feature_filter.split(','))
742
748
  else []) +
743
749
  # not a feature to be added, but merely as a fallback position
744
750
  # to always enter loop at i == len(alternative_image_features)
@@ -931,15 +937,15 @@ class Workspace():
931
937
  orientation = (angle + 45) % 360
932
938
  orientation = orientation - (orientation % 90)
933
939
  skew = (angle % 360) - orientation
934
- skew = 180 - (180 - skew) % 360 # map to [-45,45]
940
+ skew = 180 - (180 - skew) % 360 # map to [-45,45]
935
941
  log.debug("segment '%s' has orientation=%d skew=%.2f",
936
942
  segment.id, orientation, skew)
937
943
  else:
938
944
  orientation = 0
939
945
  skew = 0
940
- segment_coords['angle'] = parent_coords['angle'] # nothing applied yet (depends on filters)
946
+ segment_coords['angle'] = parent_coords['angle'] # nothing applied yet (depends on filters)
941
947
  if 'DPI' in parent_coords:
942
- segment_coords['DPI'] = parent_coords['DPI'] # not rescaled yet
948
+ segment_coords['DPI'] = parent_coords['DPI'] # not rescaled yet
943
949
 
944
950
  # initialize AlternativeImage@comments classes from parent, except
945
951
  # for those operations that can apply on multiple hierarchy levels:
@@ -971,7 +977,7 @@ class Workspace():
971
977
  for feature in feature_selector.split(',') if feature) and
972
978
  not any(feature in featureset
973
979
  for feature in feature_filter.split(',') if feature) and
974
- len(featureset.difference(auto_features)) >= \
980
+ len(featureset.difference(auto_features)) >=
975
981
  len(best_features.difference(auto_features))):
976
982
  best_features = featureset
977
983
  best_image = alternative_image
@@ -980,7 +986,7 @@ class Workspace():
980
986
  alternative_images.index(best_image) + 1,
981
987
  best_features, segment.id)
982
988
  segment_image = self._resolve_image_as_pil(alternative_image.get_filename())
983
- segment_coords['features'] = best_image.get_comments() # including duplicates
989
+ segment_coords['features'] = best_image.get_comments() # including duplicates
984
990
 
985
991
  alternative_image_features = segment_coords['features'].split(',')
986
992
  for duplicate_feature in set([feature for feature in alternative_image_features
@@ -993,13 +999,13 @@ class Workspace():
993
999
  for i, feature in enumerate(alternative_image_features +
994
1000
  (['rotated-%d' % orientation]
995
1001
  if (orientation and
996
- not 'rotated-%d' % orientation in alternative_image_features and
997
- not 'rotated-%d' % orientation in feature_filter.split(','))
1002
+ 'rotated-%d' % orientation not in alternative_image_features and
1003
+ 'rotated-%d' % orientation not in feature_filter.split(','))
998
1004
  else []) +
999
1005
  (['deskewed']
1000
1006
  if (skew and
1001
- not 'deskewed' in alternative_image_features and
1002
- not 'deskewed' in feature_filter.split(','))
1007
+ 'deskewed' not in alternative_image_features and
1008
+ 'deskewed' not in feature_filter.split(','))
1003
1009
  else []) +
1004
1010
  # not a feature to be added, but merely as a fallback position
1005
1011
  # to always enter loop at i == len(alternative_image_features)
@@ -1052,13 +1058,13 @@ class Workspace():
1052
1058
  return segment_image, segment_coords
1053
1059
 
1054
1060
  # pylint: disable=redefined-builtin
1055
- def save_image_file(self, image : Image.Image,
1056
- file_id : str,
1057
- file_grp : str,
1058
- file_path : Optional[str] = None,
1059
- page_id : Optional[str] = None,
1060
- mimetype : str = 'image/png',
1061
- force : bool = False) -> str:
1061
+ def save_image_file(self, image: Image.Image,
1062
+ file_id: str,
1063
+ file_grp: str,
1064
+ file_path: Optional[str] = None,
1065
+ page_id: Optional[str] = None,
1066
+ mimetype: str = 'image/png',
1067
+ force: bool = False) -> str:
1062
1068
  """Store an image in the filesystem and reference it as new file in the METS.
1063
1069
 
1064
1070
  Args:
@@ -1120,6 +1126,7 @@ class Workspace():
1120
1126
  with pushd_popd(self.directory):
1121
1127
  return self.mets.find_files(*args, **kwargs)
1122
1128
 
1129
+
1123
1130
  def _crop(log, name, segment, parent_image, parent_coords, op='cropped', **kwargs):
1124
1131
  segment_coords = parent_coords.copy()
1125
1132
  # get polygon outline of segment relative to parent image:
@@ -1131,8 +1138,8 @@ def _crop(log, name, segment, parent_image, parent_coords, op='cropped', **kwarg
1131
1138
  # also possibly different from size after rotation below/AlternativeImage):
1132
1139
  segment_xywh = xywh_from_bbox(*segment_bbox)
1133
1140
  # crop, if (still) necessary:
1134
- if (not isinstance(segment, BorderType) or # always crop below page level
1135
- not op in parent_coords['features']):
1141
+ if (not isinstance(segment, BorderType) or # always crop below page level
1142
+ op not in parent_coords['features']):
1136
1143
  if op == 'recropped':
1137
1144
  log.debug("Recropping %s", name)
1138
1145
  elif isinstance(segment, BorderType):
@@ -1152,6 +1159,7 @@ def _crop(log, name, segment, parent_image, parent_coords, op='cropped', **kwarg
1152
1159
  -segment_bbox[1]]))
1153
1160
  return segment_image, segment_coords, segment_xywh
1154
1161
 
1162
+
1155
1163
  def _reflect(log, name, orientation, segment_image, segment_coords, segment_xywh):
1156
1164
  # Transpose in affine coordinate transform:
1157
1165
  # (consistent with image transposition or AlternativeImage below)
@@ -1159,7 +1167,7 @@ def _reflect(log, name, orientation, segment_image, segment_coords, segment_xywh
1159
1167
  90: Image.Transpose.ROTATE_90,
1160
1168
  180: Image.Transpose.ROTATE_180,
1161
1169
  270: Image.Transpose.ROTATE_270
1162
- }.get(orientation) # no default
1170
+ }.get(orientation) # no default
1163
1171
  segment_coords['transform'] = transpose_coordinates(
1164
1172
  segment_coords['transform'], transposition,
1165
1173
  np.array([0.5 * segment_xywh['w'],
@@ -1174,6 +1182,7 @@ def _reflect(log, name, orientation, segment_image, segment_coords, segment_xywh
1174
1182
  segment_coords['features'] += ',rotated-%d' % orientation
1175
1183
  return segment_image, segment_coords, segment_xywh
1176
1184
 
1185
+
1177
1186
  def _rotate(log, name, skew, segment, segment_image, segment_coords, segment_xywh, **kwargs):
1178
1187
  # Rotate around center in affine coordinate transform:
1179
1188
  # (consistent with image rotation or AlternativeImage below)
@@ -1185,12 +1194,12 @@ def _rotate(log, name, skew, segment, segment_image, segment_coords, segment_xyw
1185
1194
  [segment_xywh['w'], segment_xywh['h']], skew)
1186
1195
  segment_coords['angle'] += skew
1187
1196
  # deskew, if (still) necessary:
1188
- if not 'deskewed' in segment_coords['features']:
1197
+ if 'deskewed' not in segment_coords['features']:
1189
1198
  log.debug("Rotating %s by %.2f°", name, skew)
1190
1199
  segment_image = rotate_image(segment_image, skew, **kwargs)
1191
1200
  segment_coords['features'] += ',deskewed'
1192
1201
  if (segment and
1193
- (not isinstance(segment, BorderType) or # always crop below page level
1202
+ (not isinstance(segment, BorderType) or # always crop below page level
1194
1203
  'cropped' in segment_coords['features'])):
1195
1204
  # re-crop to new bbox (which may deviate
1196
1205
  # if segment polygon was not a rectangle)
@@ -1198,7 +1207,7 @@ def _rotate(log, name, skew, segment, segment_image, segment_coords, segment_xyw
1198
1207
  log, name, segment, segment_image, segment_coords,
1199
1208
  op='recropped', **kwargs)
1200
1209
  elif (segment and
1201
- (not isinstance(segment, BorderType) or # always crop below page level
1210
+ (not isinstance(segment, BorderType) or # always crop below page level
1202
1211
  'cropped' in segment_coords['features'])):
1203
1212
  # only shift coordinates as if re-cropping
1204
1213
  segment_polygon = coordinates_of_segment(segment, segment_image, segment_coords)
@@ -1210,6 +1219,7 @@ def _rotate(log, name, skew, segment, segment_image, segment_coords, segment_xyw
1210
1219
  -segment_bbox[1]]))
1211
1220
  return segment_image, segment_coords, segment_xywh
1212
1221
 
1222
+
1213
1223
  def _scale(log, name, factor, segment_image, segment_coords, segment_xywh, **kwargs):
1214
1224
  # Resize linearly
1215
1225
  segment_coords['transform'] = scale_coordinates(
@@ -1218,7 +1228,7 @@ def _scale(log, name, factor, segment_image, segment_coords, segment_xywh, **kwa
1218
1228
  segment_xywh['w'] *= factor
1219
1229
  segment_xywh['h'] *= factor
1220
1230
  # resize, if (still) necessary
1221
- if not 'scaled' in segment_coords['features']:
1231
+ if 'scaled' not in segment_coords['features']:
1222
1232
  log.debug("Scaling %s by %.2f", name, factor)
1223
1233
  segment_coords['features'] += ',scaled'
1224
1234
  # FIXME: validate factor against PAGE-XML attributes
ocrd/workspace_backup.py CHANGED
@@ -10,9 +10,11 @@ from ocrd_utils import getLogger, atomic_write, DEFAULT_METS_BASENAME
10
10
 
11
11
  from .constants import BACKUP_DIR
12
12
 
13
+
13
14
  def _chksum(s):
14
15
  return hashlib.sha256(s).hexdigest()
15
16
 
17
+
16
18
  class WorkspaceBackup():
17
19
 
18
20
  @classmethod
@@ -37,6 +39,7 @@ class WorkspaceBackup():
37
39
  self.mets_xml.file_groups
38
40
  )
39
41
 
42
+
40
43
  class WorkspaceBackupManager():
41
44
  """
42
45
  Manages backups of a workspace in a directory BACKUP_DIR
ocrd/workspace_bagger.py CHANGED
@@ -1,5 +1,5 @@
1
1
  from datetime import datetime
2
- from os import makedirs, chdir, walk
2
+ from os import makedirs, walk
3
3
  from os.path import join, isdir, basename as os_path_basename, exists, relpath
4
4
  from pathlib import Path
5
5
  from shutil import make_archive, rmtree, copyfile, move, copytree
@@ -7,7 +7,11 @@ from tempfile import mkdtemp, TemporaryDirectory
7
7
  import re
8
8
  import tempfile
9
9
  import sys
10
- from bagit import Bag, make_manifests, _load_tag_file, _make_tag_file, _make_tagmanifest_file # pylint: disable=no-name-in-module
10
+ from bagit import (
11
+ Bag,
12
+ make_manifests,
13
+ _load_tag_file, _make_tag_file, _make_tagmanifest_file, # pylint: disable=no-name-in-module
14
+ )
11
15
 
12
16
  from ocrd_utils import (
13
17
  pushd_popd,
@@ -25,10 +29,11 @@ from ocrd_models.ocrd_page import to_xml
25
29
 
26
30
  from .workspace import Workspace
27
31
 
28
- tempfile.tempdir = '/tmp' # TODO hard-coded
32
+ tempfile.tempdir = '/tmp' # TODO hard-coded
29
33
 
30
34
  BACKUPDIR = join('/tmp', TMP_BAGIT_PREFIX + 'backup')
31
35
 
36
+
32
37
  class WorkspaceBagger():
33
38
  """
34
39
  Serialize/De-serialize from OCRD-ZIP to workspace and back.
@@ -50,7 +55,7 @@ class WorkspaceBagger():
50
55
  def _log_or_raise(self, msg):
51
56
  log = getLogger('ocrd.workspace_bagger')
52
57
  if self.strict:
53
- raise(Exception(msg))
58
+ raise Exception(msg)
54
59
  else:
55
60
  log.info(msg)
56
61
 
@@ -112,10 +117,11 @@ class WorkspaceBagger():
112
117
  log.info("New vs. old: %s" % changed_local_filenames)
113
118
  return total_bytes, total_files
114
119
 
115
- def _set_bag_info(self, bag, total_bytes, total_files, ocrd_identifier, ocrd_base_version_checksum, ocrd_mets=DEFAULT_METS_BASENAME):
120
+ def _set_bag_info(self, bag, total_bytes, total_files, ocrd_identifier, ocrd_base_version_checksum,
121
+ ocrd_mets=DEFAULT_METS_BASENAME):
116
122
  bag.info['BagIt-Profile-Identifier'] = OCRD_BAGIT_PROFILE_URL
117
123
  bag.info['Bag-Software-Agent'] = 'ocrd/core %s (bagit.py %s, bagit_profile %s) [cmdline: "%s"]' % (
118
- VERSION, # TODO
124
+ VERSION, # TODO
119
125
  dist_version('ocrd-fork-bagit'),
120
126
  dist_version('ocrd-fork-bagit_profile'),
121
127
  ' '.join(sys.argv))
@@ -139,7 +145,7 @@ class WorkspaceBagger():
139
145
  tag_files=None,
140
146
  include_fileGrp=None,
141
147
  exclude_fileGrp=None,
142
- ):
148
+ ):
143
149
  """
144
150
  Bag a workspace
145
151
 
@@ -178,7 +184,8 @@ class WorkspaceBagger():
178
184
  f.write(BAGIT_TXT.encode('utf-8'))
179
185
 
180
186
  # create manifests
181
- total_bytes, total_files = self._bag_mets_files(workspace, bagdir, ocrd_mets, processes, include_fileGrp, exclude_fileGrp)
187
+ total_bytes, total_files = self._bag_mets_files(workspace, bagdir, ocrd_mets, processes,
188
+ include_fileGrp, exclude_fileGrp)
182
189
 
183
190
  # create bag-info.txt
184
191
  bag = Bag(bagdir)