ocrd 3.7.0__py3-none-any.whl → 3.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. ocrd/cli/network.py +2 -0
  2. ocrd/cli/resmgr.py +29 -65
  3. ocrd/constants.py +0 -2
  4. ocrd/processor/base.py +6 -16
  5. ocrd/processor/builtin/dummy/ocrd-tool.json +25 -0
  6. ocrd/processor/builtin/merge_processor.py +131 -0
  7. ocrd/processor/builtin/param_command_header2unordered.json +7 -0
  8. ocrd/processor/builtin/param_command_heading2unordered.json +7 -0
  9. ocrd/processor/builtin/param_command_lines2orientation.json +6 -0
  10. ocrd/processor/builtin/param_command_page-update-version.json +5 -0
  11. ocrd/processor/builtin/param_command_transkribus-to-prima.json +8 -0
  12. ocrd/processor/builtin/shell_processor.py +128 -0
  13. ocrd/resource_manager.py +213 -124
  14. {ocrd-3.7.0.dist-info → ocrd-3.8.0.dist-info}/METADATA +22 -3
  15. {ocrd-3.7.0.dist-info → ocrd-3.8.0.dist-info}/RECORD +33 -25
  16. {ocrd-3.7.0.dist-info → ocrd-3.8.0.dist-info}/entry_points.txt +2 -0
  17. ocrd_models/ocrd_agent.py +3 -3
  18. ocrd_network/__init__.py +1 -0
  19. ocrd_network/cli/__init__.py +2 -0
  20. ocrd_network/cli/resmgr_server.py +23 -0
  21. ocrd_network/constants.py +3 -0
  22. ocrd_network/logging_utils.py +5 -0
  23. ocrd_network/resource_manager_server.py +182 -0
  24. ocrd_network/runtime_data/connection_clients.py +1 -1
  25. ocrd_network/runtime_data/hosts.py +43 -16
  26. ocrd_network/runtime_data/network_agents.py +15 -1
  27. ocrd_utils/__init__.py +5 -1
  28. ocrd_utils/constants.py +5 -0
  29. ocrd_utils/os.py +141 -61
  30. ocrd_validators/ocrd_tool.schema.yml +7 -4
  31. ocrd/resource_list.yml +0 -61
  32. {ocrd-3.7.0.dist-info → ocrd-3.8.0.dist-info}/LICENSE +0 -0
  33. {ocrd-3.7.0.dist-info → ocrd-3.8.0.dist-info}/WHEEL +0 -0
  34. {ocrd-3.7.0.dist-info → ocrd-3.8.0.dist-info}/top_level.txt +0 -0
ocrd/resource_manager.py CHANGED
@@ -1,3 +1,4 @@
1
+ from logging import Logger
1
2
  from pathlib import Path
2
3
  from os.path import join
3
4
  from os import environ, listdir, getcwd, unlink
@@ -5,12 +6,14 @@ from shutil import copytree, rmtree, copy
5
6
  from fnmatch import filter as apply_glob
6
7
  from datetime import datetime
7
8
  from tarfile import open as open_tarfile
9
+ from typing import Dict, Optional
8
10
  from urllib.parse import urlparse, unquote
9
11
  from zipfile import ZipFile
10
12
 
11
13
  import requests
12
14
  from gdown.parse_url import parse_url as gparse_url
13
15
  from gdown.download import get_url_from_gdrive_confirmation
16
+ from git import Repo
14
17
  from yaml import safe_load, safe_dump
15
18
 
16
19
  # pylint: disable=wrong-import-position
@@ -29,8 +32,9 @@ yaml.constructor.SafeConstructor.yaml_constructors['tag:yaml.org,2002:timestamp'
29
32
 
30
33
  from ocrd_validators import OcrdResourceListValidator
31
34
  from ocrd_utils import getLogger, directory_size, get_moduledir, guess_media_type, config
32
- from ocrd_utils.os import get_processor_resource_types, list_all_resources, pushd_popd, get_ocrd_tool_json
33
- from .constants import RESOURCE_LIST_FILENAME, RESOURCE_USER_LIST_COMMENT
35
+ from ocrd_utils.constants import RESOURCES_DIR_SYSTEM, RESOURCE_TYPES, MIME_TO_EXT
36
+ from ocrd_utils.os import get_processor_resource_types, is_git_url, list_all_resources, pushd_popd, get_ocrd_tool_json
37
+ from .constants import RESOURCE_USER_LIST_COMMENT
34
38
 
35
39
 
36
40
  class OcrdResourceManager:
@@ -47,14 +51,23 @@ class OcrdResourceManager:
47
51
  self._userdir = userdir
48
52
  self.user_list = Path(self.xdg_config_home, 'ocrd', 'resources.yml')
49
53
 
54
+ self.log.info(f"OcrdResourceManager data home path: {self.xdg_data_home}")
55
+ self.log.info(f"OcrdResourceManager config home path: {self.xdg_config_home}")
56
+ self.log.info(f"OcrdResourceManager user list path: {self.user_list}")
57
+
50
58
  if not skip_init:
51
- self.load_resource_list(Path(RESOURCE_LIST_FILENAME))
52
59
  if not self.user_list.exists():
53
60
  if not self.user_list.parent.exists():
54
61
  self.user_list.parent.mkdir(parents=True)
55
62
  self.save_user_list()
56
63
  self.load_resource_list(self.user_list)
57
64
 
65
+ def __repr__(self):
66
+ return f"user_list={str(self.user_list)} " + \
67
+ f"exists={self.user_list.exists()} " + \
68
+ f"database: {len(self.database)} executables " + \
69
+ f"{sum(map(len, self.database.values()))} resources"
70
+
58
71
  @property
59
72
  def userdir(self):
60
73
  if not self._userdir:
@@ -69,19 +82,22 @@ class OcrdResourceManager:
69
82
 
70
83
  @property
71
84
  def xdg_config_home(self):
72
- if self._xdg_config_home:
73
- return self._xdg_config_home
74
- return config.XDG_CONFIG_HOME
85
+ if not self._xdg_config_home:
86
+ self._xdg_config_home = config.XDG_CONFIG_HOME
87
+ return self._xdg_config_home
75
88
 
76
89
  def save_user_list(self, database=None):
77
90
  if not database:
78
91
  database = self.database
92
+ self.log.info(f"Saving resources to path: {self.user_list}")
93
+ self._dedup_database()
79
94
  with open(self.user_list, 'w', encoding='utf-8') as f:
80
95
  f.write(RESOURCE_USER_LIST_COMMENT)
81
96
  f.write('\n')
82
97
  f.write(safe_dump(database))
83
98
 
84
- def load_resource_list(self, list_filename, database=None):
99
+ def load_resource_list(self, list_filename: Path, database=None):
100
+ self.log.info(f"Loading resources from path: {list_filename}")
85
101
  if not database:
86
102
  database = self.database
87
103
  if list_filename.is_file():
@@ -98,30 +114,36 @@ class OcrdResourceManager:
98
114
  database[executable] = list_loaded[executable] + database[executable]
99
115
  return database
100
116
 
101
- def list_available(self, executable=None, dynamic=True, name=None, database=None, url=None):
117
+ def _search_executables(self, executable: Optional[str]):
118
+ skip_executables = ["ocrd-cis-data", "ocrd-import", "ocrd-make"]
119
+ for exec_dir in environ['PATH'].split(':'):
120
+ self.log.debug(f"Searching for executables inside path: {exec_dir}")
121
+ for exec_path in Path(exec_dir).glob(f'{executable}'):
122
+ if not exec_path.name.startswith('ocrd-'):
123
+ self.log.warning(f"OCR-D processor executable '{exec_path}' has no 'ocrd-' prefix")
124
+ if exec_path.name in skip_executables:
125
+ self.log.debug(f"Not an OCR-D processor CLI, skipping '{exec_path}'")
126
+ continue
127
+ self.log.debug(f"Inspecting '{exec_path} --dump-json' for resources")
128
+ ocrd_tool = get_ocrd_tool_json(exec_path)
129
+ for res_dict in ocrd_tool.get('resources', ()):
130
+ if exec_path.name not in self.database:
131
+ self.database[exec_path.name] = []
132
+ self.database[exec_path.name].insert(0, res_dict)
133
+
134
+ def list_available(
135
+ self, executable: str = None, dynamic: bool = True, name: str = None, database: Dict = None, url: str = None
136
+ ):
102
137
  """
103
138
  List models available for download by processor
104
139
  """
105
140
  if not database:
106
141
  database = self.database
107
142
  if not executable:
108
- return database.items()
143
+ return list(database.items())
109
144
  if dynamic:
110
- skip_executables = ["ocrd-cis-data", "ocrd-import", "ocrd-make"]
111
- for exec_dir in environ['PATH'].split(':'):
112
- for exec_path in Path(exec_dir).glob(f'{executable}'):
113
- if not exec_path.name.startswith('ocrd-'):
114
- self.log.warning(f"OCR-D processor executable '{exec_path}' has no 'ocrd-' prefix")
115
- if exec_path.name in skip_executables:
116
- self.log.debug(f"Not an OCR-D processor CLI, skipping '{exec_path}'")
117
- continue
118
- self.log.debug(f"Inspecting '{exec_path} --dump-json' for resources")
119
- ocrd_tool = get_ocrd_tool_json(exec_path)
120
- for resdict in ocrd_tool.get('resources', ()):
121
- if exec_path.name not in database:
122
- database[exec_path.name] = []
123
- database[exec_path.name].insert(0, resdict)
124
- database = self._dedup_database(database)
145
+ self._search_executables(executable)
146
+ self.save_user_list()
125
147
  found = False
126
148
  ret = []
127
149
  for k in database:
@@ -139,7 +161,7 @@ class OcrdResourceManager:
139
161
  ret = [(executable, [])]
140
162
  return ret
141
163
 
142
- def list_installed(self, executable=None):
164
+ def list_installed(self, executable: str = None):
143
165
  """
144
166
  List installed resources, matching with registry by ``name``
145
167
  """
@@ -150,28 +172,24 @@ class OcrdResourceManager:
150
172
  # resources we know about
151
173
  all_executables = list(self.database.keys())
152
174
  # resources in the file system
153
- parent_dirs = [join(x, 'ocrd-resources') for x in [self.xdg_data_home, '/usr/local/share']]
175
+ parent_dirs = [f"{join(self.xdg_data_home, 'ocrd-resources')}", RESOURCES_DIR_SYSTEM]
154
176
  for parent_dir in parent_dirs:
155
177
  if Path(parent_dir).exists():
156
178
  all_executables += [x for x in listdir(parent_dir) if x.startswith('ocrd-')]
157
179
  for this_executable in set(all_executables):
158
180
  reslist = []
159
- mimetypes = get_processor_resource_types(this_executable)
160
181
  moduledir = get_moduledir(this_executable)
161
- for res_filename in list_all_resources(this_executable, moduled=moduledir, xdg_data_home=self.xdg_data_home):
162
- res_filename = Path(res_filename)
163
- if not '*/*' in mimetypes:
164
- if res_filename.is_dir() and not 'text/directory' in mimetypes:
165
- continue
166
- if res_filename.is_file() and ['text/directory'] == mimetypes:
167
- continue
182
+ resdict_list = self.list_available(executable=this_executable)[0][1]
183
+ for res_filename in list_all_resources(this_executable,
184
+ moduled=moduledir,
185
+ xdg_data_home=self.xdg_data_home):
186
+ res_filename = Path(res_filename).resolve()
168
187
  res_name = res_filename.name
169
188
  res_type = 'file' if res_filename.is_file() else 'directory'
170
189
  res_size = res_filename.stat().st_size if res_filename.is_file() else directory_size(res_filename)
171
- resdict_list = [x for x in self.database.get(this_executable, []) if x['name'] == res_name]
172
- if resdict_list:
173
- resdict = resdict_list[0]
174
- elif str(res_filename.parent) == moduledir:
190
+ if resdict := next((res for res in resdict_list if res['name'] == res_name), False):
191
+ pass
192
+ elif str(res_filename.parent).startswith(moduledir):
175
193
  resdict = {
176
194
  'name': res_name,
177
195
  'url': str(res_filename),
@@ -181,28 +199,28 @@ class OcrdResourceManager:
181
199
  }
182
200
  else:
183
201
  resdict = self.add_to_user_database(this_executable, res_filename, resource_type=res_type)
184
- resdict['path'] = str(res_filename)
202
+ # resdict['path'] = str(res_filename)
185
203
  reslist.append(resdict)
186
204
  ret.append((this_executable, reslist))
205
+ self.save_user_list()
187
206
  return ret
188
207
 
189
208
  def add_to_user_database(self, executable, res_filename, url=None, resource_type='file'):
190
209
  """
191
210
  Add a stub entry to the user resource.yml
192
211
  """
193
- res_name = Path(res_filename).name
194
- self.log.info(f"{executable} resource '{res_name}' ({str(res_filename)}) not a known resource, "
195
- f"creating stub in {self.user_list}'")
212
+ res_name = res_filename.name
196
213
  if Path(res_filename).is_dir():
197
214
  res_size = directory_size(res_filename)
198
215
  else:
199
216
  res_size = Path(res_filename).stat().st_size
200
- with open(self.user_list, 'r', encoding='utf-8') as f:
201
- user_database = safe_load(f) or {}
217
+ user_database = self.load_resource_list(self.user_list)
202
218
  if executable not in user_database:
203
219
  user_database[executable] = []
204
220
  resources_found = self.list_available(executable=executable, name=res_name, database=user_database)[0][1]
205
221
  if not resources_found:
222
+ self.log.info(f"{executable} resource '{res_name}' ({str(res_filename)}) not a known resource, "
223
+ f"creating stub in {self.user_list}'")
206
224
  resdict = {
207
225
  'name': res_name,
208
226
  'url': url if url else '???',
@@ -222,20 +240,45 @@ class OcrdResourceManager:
222
240
  def default_resource_dir(self):
223
241
  return self.location_to_resource_dir('data')
224
242
 
225
- def location_to_resource_dir(self, location):
226
- return '/usr/local/share/ocrd-resources' if location == 'system' else \
227
- join(self.xdg_data_home, 'ocrd-resources') if location == 'data' else \
228
- getcwd()
243
+ def location_to_resource_dir(self, location: str) -> str:
244
+ if location == 'data':
245
+ return join(self.xdg_data_home, 'ocrd-resources')
246
+ if location == 'system':
247
+ return RESOURCES_DIR_SYSTEM
248
+ return getcwd()
229
249
 
230
- def resource_dir_to_location(self, resource_path):
250
+ def resource_dir_to_location(self, resource_path: Path) -> str:
231
251
  resource_path = str(resource_path)
232
- return 'system' if resource_path.startswith('/usr/local/share/ocrd-resources') else \
233
- 'data' if resource_path.startswith(join(self.xdg_data_home, 'ocrd-resources')) else \
234
- 'cwd' if resource_path.startswith(getcwd()) else \
235
- resource_path
252
+ if resource_path.startswith(RESOURCES_DIR_SYSTEM):
253
+ return 'system'
254
+ if resource_path.startswith(join(self.xdg_data_home, 'ocrd-resources')):
255
+ return 'data'
256
+ if resource_path.startswith(getcwd()):
257
+ return 'cwd'
258
+ return resource_path
259
+
260
+ def build_resource_dest_dir(self, location: str, executable: str) -> Path:
261
+ if location == 'module':
262
+ base_dir = get_moduledir(executable)
263
+ if not base_dir:
264
+ base_dir = self.location_to_resource_dir('data')
265
+ else:
266
+ base_dir = self.location_to_resource_dir(location)
267
+ no_subdir = location in ['cwd', 'module']
268
+ dest_dir = Path(base_dir) if no_subdir else Path(base_dir, executable)
269
+ return dest_dir
270
+
271
+ @staticmethod
272
+ def remove_resource(log: Logger, resource_path: Path):
273
+ if resource_path.is_dir():
274
+ log.info(f"Removing existing target resource directory {resource_path}")
275
+ rmtree(str(resource_path))
276
+ else:
277
+ log.info(f"Removing existing target resource file {resource_path}")
278
+ unlink(str(resource_path))
236
279
 
237
280
  @staticmethod
238
- def parameter_usage(name, usage='as-is'):
281
+ def parameter_usage(name: str, usage: str = 'as-is') -> str:
239
282
  if usage == 'as-is':
240
283
  return name
241
284
  elif usage == 'without-extension':
@@ -243,8 +286,7 @@ class OcrdResourceManager:
243
286
  raise ValueError(f"No such usage '{usage}'")
244
287
 
245
288
  @staticmethod
246
- def _download_impl(url, filename, progress_cb=None, size=None):
247
- log = getLogger('ocrd.resource_manager._download_impl')
289
+ def _download_impl(log: Logger, url: str, filename):
248
290
  log.info(f"Downloading {url} to {filename}")
249
291
  try:
250
292
  gdrive_file_id, is_gdrive_download_link = gparse_url(url, warning=False)
@@ -256,36 +298,36 @@ class OcrdResourceManager:
256
298
  if "Content-Disposition" not in r.headers:
257
299
  url = get_url_from_gdrive_confirmation(r.text)
258
300
  except RuntimeError as e:
259
- log.warning("Cannot unwrap Google Drive URL: %s", e)
260
- with open(filename, 'wb') as f:
261
- with requests.get(url, stream=True) as r:
262
- r.raise_for_status()
263
- for data in r.iter_content(chunk_size=4096):
264
- if progress_cb:
265
- progress_cb(len(data))
266
- f.write(data)
301
+ log.warning(f"Cannot unwrap Google Drive URL: {e}")
302
+ if is_git_url(url):
303
+ log.info("Cloning a git repository")
304
+ repo = Repo.clone_from(url, filename, depth=1)
305
+ # keep only the checkout
306
+ rmtree(join(filename, '.git'))
307
+ else:
308
+ with open(filename, 'wb') as f:
309
+ with requests.get(url, stream=True) as r:
310
+ r.raise_for_status()
311
+ for data in r.iter_content(chunk_size=4096):
312
+ f.write(data)
267
313
  except Exception as e:
268
314
  rmtree(filename, ignore_errors=True)
269
315
  Path(filename).unlink(missing_ok=True)
270
316
  raise e
271
317
 
272
318
  @staticmethod
273
- def _copy_file(src, dst, progress_cb=None):
274
- log = getLogger('ocrd.resource_manager._copy_file')
319
+ def _copy_file(log: Logger, src, dst):
275
320
  log.info(f"Copying file {src} to {dst}")
276
321
  with open(dst, 'wb') as f_out, open(src, 'rb') as f_in:
277
322
  while True:
278
323
  chunk = f_in.read(4096)
279
324
  if chunk:
280
325
  f_out.write(chunk)
281
- if progress_cb:
282
- progress_cb(len(chunk))
283
326
  else:
284
327
  break
285
328
 
286
329
  @staticmethod
287
- def _copy_dir(src, dst, progress_cb=None):
288
- log = getLogger('ocrd.resource_manager._copy_dir')
330
+ def _copy_dir(log: Logger, src, dst):
289
331
  log.info(f"Copying dir recursively from {src} to {dst}")
290
332
  if not Path(src).is_dir():
291
333
  raise ValueError(f"The source is not a directory: {src}")
@@ -293,76 +335,123 @@ class OcrdResourceManager:
293
335
  for child in Path(src).rglob('*'):
294
336
  child_dst = Path(dst) / child.relative_to(src)
295
337
  if Path(child).is_dir():
296
- OcrdResourceManager._copy_dir(child, child_dst, progress_cb)
338
+ OcrdResourceManager._copy_dir(log, child, child_dst)
297
339
  else:
298
- OcrdResourceManager._copy_file(child, child_dst, progress_cb)
340
+ OcrdResourceManager._copy_file(log, child, child_dst)
299
341
 
300
342
  @staticmethod
301
- def _copy_impl(src_filename, filename, progress_cb=None):
302
- log = getLogger('ocrd.resource_manager._copy_impl')
343
+ def _copy_impl(log: Logger, src_filename, filename):
303
344
  log.info(f"Copying {src_filename} to {filename}")
304
345
  if Path(src_filename).is_dir():
305
- OcrdResourceManager._copy_dir(src_filename, filename, progress_cb)
346
+ OcrdResourceManager._copy_dir(log, src_filename, filename)
306
347
  else:
307
- OcrdResourceManager._copy_file(src_filename, filename, progress_cb)
348
+ OcrdResourceManager._copy_file(log, src_filename, filename)
349
+
350
+ @staticmethod
351
+ def _extract_archive(log: Logger, tempdir: Path, path_in_archive: str, fpath: Path, archive_fname: str):
352
+ Path('out').mkdir()
353
+ with pushd_popd('out'):
354
+ mimetype = guess_media_type(f'../{archive_fname}', fallback='application/octet-stream')
355
+ log.info(f"Extracting {mimetype} archive to {tempdir}/out")
356
+ if mimetype == 'application/zip':
357
+ with ZipFile(f'../{archive_fname}', 'r') as zipf:
358
+ zipf.extractall()
359
+ elif mimetype in ('application/gzip', 'application/x-xz'):
360
+ with open_tarfile(f'../{archive_fname}', 'r:*') as tar:
361
+ tar.extractall()
362
+ else:
363
+ raise RuntimeError(f"Unable to handle extraction of {mimetype} archive")
364
+ log.info(f"Copying '{path_in_archive}' from archive to {fpath}")
365
+ if Path(path_in_archive).is_dir():
366
+ copytree(path_in_archive, str(fpath))
367
+ else:
368
+ copy(path_in_archive, str(fpath))
369
+
370
+ def copy_resource(
371
+ self, log: Logger, url: str, fpath: Path, resource_type: str = 'file', path_in_archive: str = '.'
372
+ ) -> Path:
373
+ """
374
+ Copy a local resource to another destination
375
+ """
376
+ if resource_type == 'archive':
377
+ archive_fname = 'download.tar.xx'
378
+ with pushd_popd(tempdir=True) as tempdir:
379
+ self._copy_impl(log, url, archive_fname)
380
+ self._extract_archive(log, tempdir, path_in_archive, fpath, archive_fname)
381
+ else:
382
+ self._copy_impl(log, url, fpath)
383
+ return fpath
384
+
385
+ def download_resource(
386
+ self, log: Logger, url: str, fpath: Path, resource_type: str = 'file', path_in_archive: str = '.'
387
+ ) -> Path:
388
+ """
389
+ Download a resource by URL to a destination directory
390
+ """
391
+ if resource_type == 'archive':
392
+ archive_fname = 'download.tar.xx'
393
+ with pushd_popd(tempdir=True) as tempdir:
394
+ self._download_impl(log, url, archive_fname)
395
+ self._extract_archive(log, tempdir, path_in_archive, fpath, archive_fname)
396
+ else:
397
+ self._download_impl(log, url, fpath)
398
+ return fpath
308
399
 
309
400
  # TODO Proper caching (make head request for size, If-Modified etc)
310
- def download(
311
- self, executable, url, basedir, overwrite=False, no_subdir=False, name=None, resource_type='file',
312
- path_in_archive='.', progress_cb=None,
313
- ):
401
+ def handle_resource(
402
+ self, res_dict: Dict, executable: str, dest_dir: Path, any_url: str, overwrite: bool = False,
403
+ resource_type: str = 'file', path_in_archive: str = '.'
404
+ ) -> Optional[Path]:
314
405
  """
315
- Download a resource by URL
406
+ Download or Copy a resource by URL to a destination directory
316
407
  """
317
- log = getLogger('ocrd.resource_manager.download')
318
- destdir = Path(basedir) if no_subdir else Path(basedir, executable)
319
- if not name:
320
- url_parsed = urlparse(url)
321
- name = Path(unquote(url_parsed.path)).name
322
- fpath = Path(destdir, name)
323
- is_url = url.startswith('https://') or url.startswith('http://')
408
+ log = getLogger('ocrd.resource_manager.handle_resource')
409
+ registered = "registered" if "size" in res_dict else "unregistered"
410
+ resource_type = res_dict.get('type', resource_type)
411
+ resource_name = res_dict.get('name', None)
412
+ path_in_archive = res_dict.get('path_in_archive', path_in_archive)
413
+
414
+ if resource_type not in RESOURCE_TYPES:
415
+ raise ValueError(f"Unknown resource type: {resource_type}, must be one of: {RESOURCE_TYPES}")
416
+ if any_url:
417
+ res_dict['url'] = any_url
418
+ if not resource_name:
419
+ url_parsed = urlparse(res_dict['url'])
420
+ resource_name = Path(unquote(url_parsed.path)).name
421
+ if resource_type == 'archive' and path_in_archive != '.':
422
+ resource_name = Path(path_in_archive).name
423
+ if res_dict['url'] == '???':
424
+ log.warning(f"Skipping user resource {resource_name} since download url is: {res_dict['url']}")
425
+ return None
426
+
427
+ fpath = Path(dest_dir, resource_name)
324
428
  if fpath.exists():
325
429
  if not overwrite:
326
430
  fpath_type = 'Directory' if fpath.is_dir() else 'File'
327
431
  log.warning(f"{fpath_type} {fpath} already exists but --overwrite is not set, skipping the download")
328
432
  # raise FileExistsError(f"{fpath_type} {fpath} already exists but --overwrite is not set")
329
433
  return fpath
330
- if fpath.is_dir():
331
- log.info(f"Removing existing target directory {fpath}")
332
- rmtree(str(fpath))
333
- else:
334
- log.info(f"Removing existing target file {fpath}")
335
- unlink(str(fpath))
336
- destdir.mkdir(parents=True, exist_ok=True)
337
- if resource_type in ('file', 'directory'):
338
- if is_url:
339
- self._download_impl(url, fpath, progress_cb)
340
- else:
341
- self._copy_impl(url, fpath, progress_cb)
342
- elif resource_type == 'archive':
343
- archive_fname = 'download.tar.xx'
344
- with pushd_popd(tempdir=True) as tempdir:
345
- if is_url:
346
- self._download_impl(url, archive_fname, progress_cb)
347
- else:
348
- self._copy_impl(url, archive_fname, progress_cb)
349
- Path('out').mkdir()
350
- with pushd_popd('out'):
351
- mimetype = guess_media_type(f'../{archive_fname}', fallback='application/octet-stream')
352
- log.info(f"Extracting {mimetype} archive to {tempdir}/out")
353
- if mimetype == 'application/zip':
354
- with ZipFile(f'../{archive_fname}', 'r') as zipf:
355
- zipf.extractall()
356
- elif mimetype in ('application/gzip', 'application/x-xz'):
357
- with open_tarfile(f'../{archive_fname}', 'r:*') as tar:
358
- tar.extractall()
359
- else:
360
- raise RuntimeError(f"Unable to handle extraction of {mimetype} archive {url}")
361
- log.info(f"Copying '{path_in_archive}' from archive to {fpath}")
362
- if Path(path_in_archive).is_dir():
363
- copytree(path_in_archive, str(fpath))
364
- else:
365
- copy(path_in_archive, str(fpath))
434
+ self.remove_resource(log, resource_path=fpath)
435
+ dest_dir.mkdir(parents=True, exist_ok=True)
436
+
437
+ # TODO @mehmedGIT: Consider properly handling cases for invalid URLs.
438
+ if res_dict['url'].startswith('https://') or res_dict['url'].startswith('http://'):
439
+ log.info(f"Downloading {registered} resource '{resource_name}' ({res_dict['url']})")
440
+ if 'size' not in res_dict:
441
+ with requests.head(res_dict['url']) as r:
442
+ res_dict['size'] = int(r.headers.get('content-length', 0))
443
+ fpath = self.download_resource(log, res_dict['url'], fpath, resource_type, path_in_archive)
444
+ else:
445
+ log.info(f"Copying {registered} resource '{resource_name}' ({res_dict['url']})")
446
+ urlpath = Path(res_dict['url'])
447
+ res_dict['url'] = str(urlpath.resolve())
448
+ res_dict['size'] = directory_size(urlpath) if Path(urlpath).is_dir() else urlpath.stat().st_size
449
+ fpath = self.copy_resource(log, res_dict['url'], fpath, resource_type, path_in_archive)
450
+
451
+ if registered == 'unregistered':
452
+ self.add_to_user_database(executable, fpath, url=res_dict['url'])
453
+ self.save_user_list()
454
+ log.info(f"Installed resource {res_dict['url']} under {fpath}")
366
455
  return fpath
367
456
 
368
457
  def _dedup_database(self, database=None, dedup_key='name'):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ocrd
3
- Version: 3.7.0
3
+ Version: 3.8.0
4
4
  Summary: OCR-D framework
5
5
  Author-email: Konstantin Baierer <unixprog@gmail.com>
6
6
  License: Apache License 2.0
@@ -16,12 +16,13 @@ Requires-Dist: beanie~=1.7
16
16
  Requires-Dist: click>=7
17
17
  Requires-Dist: cryptography<43.0.0
18
18
  Requires-Dist: Deprecated==1.2.0
19
- Requires-Dist: docker
19
+ Requires-Dist: docker>=7.1.0
20
20
  Requires-Dist: elementpath
21
21
  Requires-Dist: fastapi>=0.78.0
22
22
  Requires-Dist: filetype
23
23
  Requires-Dist: Flask
24
24
  Requires-Dist: frozendict>=2.4.0
25
+ Requires-Dist: gitpython
25
26
  Requires-Dist: gdown
26
27
  Requires-Dist: httpx>=0.22.0
27
28
  Requires-Dist: jsonschema>=4
@@ -68,6 +69,9 @@ Requires-Dist: shapely>=2; python_version >= "3.9"
68
69
  * [Command line tools](#command-line-tools)
69
70
  * [`ocrd` CLI](#ocrd-cli)
70
71
  * [`ocrd-dummy` CLI](#ocrd-dummy-cli)
72
+ * [`ocrd-filter` CLI](#ocrd-filter-cli)
73
+ * [`ocrd-command` CLI](#ocrd-command-cli)
74
+ * [`ocrd-merge` CLI](#ocrd-merge-cli)
71
75
  * [Configuration](#configuration)
72
76
  * [Packages](#packages)
73
77
  * [ocrd_utils](#ocrd_utils)
@@ -76,7 +80,6 @@ Requires-Dist: shapely>=2; python_version >= "3.9"
76
80
  * [ocrd_validators](#ocrd_validators)
77
81
  * [ocrd_network](#ocrd_network)
78
82
  * [ocrd](#ocrd)
79
- * [bash library](#bash-library)
80
83
  * [Testing](#testing)
81
84
  * [See Also](#see-also)
82
85
 
@@ -121,6 +124,22 @@ supported flags, options and arguments.
121
124
 
122
125
  A minimal [OCR-D processor](https://ocr-d.de/en/user_guide#using-the-ocr-d-processors) that copies from `-I/--input-file-grp` to `-O/--output-file-grp`
123
126
 
127
+ ### `ocrd-filter` CLI
128
+
129
+ A simple [OCR-D processor](https://ocr-d.de/en/user_guide#using-the-ocr-d-processors) that removes segments in PAGE-XML files from `-I/--input-file-grp` to `-O/--output-file-grp` with arbitrary selection based on powerful XPath 2.0 expressions.
130
+
131
+ ### `ocrd-command` CLI
132
+
133
+ A simple [OCR-D processor](https://ocr-d.de/en/user_guide#using-the-ocr-d-processors) that runs arbitrary shell commands to transform PAGE-XML files from `-I/--input-file-grp` to `-O/--output-file-grp` (in effect "wrapping" them for OCR-D).
134
+
135
+ ### `ocrd-merge` CLI
136
+
137
+ A simple [OCR-D processor](https://ocr-d.de/en/user_guide#using-the-ocr-d-processors) that (for every page) joins PAGE-XML files from multiple `-I/--input-file-grp` into a single `-O/--output-file-grp`, ensuring that
138
+ - `Border` polygons are joined
139
+ - all regions are concatenated, while
140
+ - ensuring segment identifiers do not clash,
141
+ - and the reading order simply gets concatenated.
142
+
124
143
  ## Configuration
125
144
 
126
145
  Almost all behaviour of the OCR-D/core software is configured via CLI options and flags, which can be listed with the `--help` flag that all CLIs support.