ocrd 3.7.0__py3-none-any.whl → 3.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ocrd/cli/network.py +2 -0
- ocrd/cli/resmgr.py +29 -65
- ocrd/constants.py +0 -2
- ocrd/ocrd-all-tool.json +25 -0
- ocrd/processor/base.py +6 -16
- ocrd/processor/builtin/dummy/ocrd-tool.json +25 -0
- ocrd/processor/builtin/merge_processor.py +131 -0
- ocrd/processor/builtin/param_command_header2unordered.json +7 -0
- ocrd/processor/builtin/param_command_heading2unordered.json +7 -0
- ocrd/processor/builtin/param_command_lines2orientation.json +6 -0
- ocrd/processor/builtin/param_command_page-update-version.json +5 -0
- ocrd/processor/builtin/param_command_transkribus-to-prima.json +8 -0
- ocrd/processor/builtin/shell_processor.py +128 -0
- ocrd/resource_manager.py +213 -124
- {ocrd-3.7.0.dist-info → ocrd-3.8.1.dist-info}/METADATA +22 -3
- {ocrd-3.7.0.dist-info → ocrd-3.8.1.dist-info}/RECORD +34 -26
- {ocrd-3.7.0.dist-info → ocrd-3.8.1.dist-info}/entry_points.txt +2 -0
- ocrd_models/ocrd_agent.py +3 -3
- ocrd_network/__init__.py +1 -0
- ocrd_network/cli/__init__.py +2 -0
- ocrd_network/cli/resmgr_server.py +23 -0
- ocrd_network/constants.py +3 -0
- ocrd_network/logging_utils.py +5 -0
- ocrd_network/resource_manager_server.py +182 -0
- ocrd_network/runtime_data/connection_clients.py +1 -1
- ocrd_network/runtime_data/hosts.py +43 -16
- ocrd_network/runtime_data/network_agents.py +15 -1
- ocrd_utils/__init__.py +5 -1
- ocrd_utils/constants.py +5 -0
- ocrd_utils/os.py +141 -61
- ocrd_validators/ocrd_tool.schema.yml +7 -4
- ocrd/resource_list.yml +0 -61
- {ocrd-3.7.0.dist-info → ocrd-3.8.1.dist-info}/LICENSE +0 -0
- {ocrd-3.7.0.dist-info → ocrd-3.8.1.dist-info}/WHEEL +0 -0
- {ocrd-3.7.0.dist-info → ocrd-3.8.1.dist-info}/top_level.txt +0 -0
ocrd/resource_manager.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
from logging import Logger
|
|
1
2
|
from pathlib import Path
|
|
2
3
|
from os.path import join
|
|
3
4
|
from os import environ, listdir, getcwd, unlink
|
|
@@ -5,12 +6,14 @@ from shutil import copytree, rmtree, copy
|
|
|
5
6
|
from fnmatch import filter as apply_glob
|
|
6
7
|
from datetime import datetime
|
|
7
8
|
from tarfile import open as open_tarfile
|
|
9
|
+
from typing import Dict, Optional
|
|
8
10
|
from urllib.parse import urlparse, unquote
|
|
9
11
|
from zipfile import ZipFile
|
|
10
12
|
|
|
11
13
|
import requests
|
|
12
14
|
from gdown.parse_url import parse_url as gparse_url
|
|
13
15
|
from gdown.download import get_url_from_gdrive_confirmation
|
|
16
|
+
from git import Repo
|
|
14
17
|
from yaml import safe_load, safe_dump
|
|
15
18
|
|
|
16
19
|
# pylint: disable=wrong-import-position
|
|
@@ -29,8 +32,9 @@ yaml.constructor.SafeConstructor.yaml_constructors['tag:yaml.org,2002:timestamp'
|
|
|
29
32
|
|
|
30
33
|
from ocrd_validators import OcrdResourceListValidator
|
|
31
34
|
from ocrd_utils import getLogger, directory_size, get_moduledir, guess_media_type, config
|
|
32
|
-
from ocrd_utils.
|
|
33
|
-
from .
|
|
35
|
+
from ocrd_utils.constants import RESOURCES_DIR_SYSTEM, RESOURCE_TYPES, MIME_TO_EXT
|
|
36
|
+
from ocrd_utils.os import get_processor_resource_types, is_git_url, list_all_resources, pushd_popd, get_ocrd_tool_json
|
|
37
|
+
from .constants import RESOURCE_USER_LIST_COMMENT
|
|
34
38
|
|
|
35
39
|
|
|
36
40
|
class OcrdResourceManager:
|
|
@@ -47,14 +51,23 @@ class OcrdResourceManager:
|
|
|
47
51
|
self._userdir = userdir
|
|
48
52
|
self.user_list = Path(self.xdg_config_home, 'ocrd', 'resources.yml')
|
|
49
53
|
|
|
54
|
+
self.log.info(f"OcrdResourceManager data home path: {self.xdg_data_home}")
|
|
55
|
+
self.log.info(f"OcrdResourceManager config home path: {self.xdg_config_home}")
|
|
56
|
+
self.log.info(f"OcrdResourceManager user list path: {self.user_list}")
|
|
57
|
+
|
|
50
58
|
if not skip_init:
|
|
51
|
-
self.load_resource_list(Path(RESOURCE_LIST_FILENAME))
|
|
52
59
|
if not self.user_list.exists():
|
|
53
60
|
if not self.user_list.parent.exists():
|
|
54
61
|
self.user_list.parent.mkdir(parents=True)
|
|
55
62
|
self.save_user_list()
|
|
56
63
|
self.load_resource_list(self.user_list)
|
|
57
64
|
|
|
65
|
+
def __repr__(self):
|
|
66
|
+
return f"user_list={str(self.user_list)} " + \
|
|
67
|
+
f"exists={self.user_list.exists()} " + \
|
|
68
|
+
f"database: {len(self.database)} executables " + \
|
|
69
|
+
f"{sum(map(len, self.database.values()))} resources"
|
|
70
|
+
|
|
58
71
|
@property
|
|
59
72
|
def userdir(self):
|
|
60
73
|
if not self._userdir:
|
|
@@ -69,19 +82,22 @@ class OcrdResourceManager:
|
|
|
69
82
|
|
|
70
83
|
@property
|
|
71
84
|
def xdg_config_home(self):
|
|
72
|
-
if self._xdg_config_home:
|
|
73
|
-
|
|
74
|
-
return
|
|
85
|
+
if not self._xdg_config_home:
|
|
86
|
+
self._xdg_config_home = config.XDG_CONFIG_HOME
|
|
87
|
+
return self._xdg_config_home
|
|
75
88
|
|
|
76
89
|
def save_user_list(self, database=None):
|
|
77
90
|
if not database:
|
|
78
91
|
database = self.database
|
|
92
|
+
self.log.info(f"Saving resources to path: {self.user_list}")
|
|
93
|
+
self._dedup_database()
|
|
79
94
|
with open(self.user_list, 'w', encoding='utf-8') as f:
|
|
80
95
|
f.write(RESOURCE_USER_LIST_COMMENT)
|
|
81
96
|
f.write('\n')
|
|
82
97
|
f.write(safe_dump(database))
|
|
83
98
|
|
|
84
|
-
def load_resource_list(self, list_filename, database=None):
|
|
99
|
+
def load_resource_list(self, list_filename: Path, database=None):
|
|
100
|
+
self.log.info(f"Loading resources from path: {list_filename}")
|
|
85
101
|
if not database:
|
|
86
102
|
database = self.database
|
|
87
103
|
if list_filename.is_file():
|
|
@@ -98,30 +114,36 @@ class OcrdResourceManager:
|
|
|
98
114
|
database[executable] = list_loaded[executable] + database[executable]
|
|
99
115
|
return database
|
|
100
116
|
|
|
101
|
-
def
|
|
117
|
+
def _search_executables(self, executable: Optional[str]):
|
|
118
|
+
skip_executables = ["ocrd-cis-data", "ocrd-import", "ocrd-make"]
|
|
119
|
+
for exec_dir in environ['PATH'].split(':'):
|
|
120
|
+
self.log.debug(f"Searching for executables inside path: {exec_dir}")
|
|
121
|
+
for exec_path in Path(exec_dir).glob(f'{executable}'):
|
|
122
|
+
if not exec_path.name.startswith('ocrd-'):
|
|
123
|
+
self.log.warning(f"OCR-D processor executable '{exec_path}' has no 'ocrd-' prefix")
|
|
124
|
+
if exec_path.name in skip_executables:
|
|
125
|
+
self.log.debug(f"Not an OCR-D processor CLI, skipping '{exec_path}'")
|
|
126
|
+
continue
|
|
127
|
+
self.log.debug(f"Inspecting '{exec_path} --dump-json' for resources")
|
|
128
|
+
ocrd_tool = get_ocrd_tool_json(exec_path)
|
|
129
|
+
for res_dict in ocrd_tool.get('resources', ()):
|
|
130
|
+
if exec_path.name not in self.database:
|
|
131
|
+
self.database[exec_path.name] = []
|
|
132
|
+
self.database[exec_path.name].insert(0, res_dict)
|
|
133
|
+
|
|
134
|
+
def list_available(
|
|
135
|
+
self, executable: str = None, dynamic: bool = True, name: str = None, database: Dict = None, url: str = None
|
|
136
|
+
):
|
|
102
137
|
"""
|
|
103
138
|
List models available for download by processor
|
|
104
139
|
"""
|
|
105
140
|
if not database:
|
|
106
141
|
database = self.database
|
|
107
142
|
if not executable:
|
|
108
|
-
return database.items()
|
|
143
|
+
return list(database.items())
|
|
109
144
|
if dynamic:
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
for exec_path in Path(exec_dir).glob(f'{executable}'):
|
|
113
|
-
if not exec_path.name.startswith('ocrd-'):
|
|
114
|
-
self.log.warning(f"OCR-D processor executable '{exec_path}' has no 'ocrd-' prefix")
|
|
115
|
-
if exec_path.name in skip_executables:
|
|
116
|
-
self.log.debug(f"Not an OCR-D processor CLI, skipping '{exec_path}'")
|
|
117
|
-
continue
|
|
118
|
-
self.log.debug(f"Inspecting '{exec_path} --dump-json' for resources")
|
|
119
|
-
ocrd_tool = get_ocrd_tool_json(exec_path)
|
|
120
|
-
for resdict in ocrd_tool.get('resources', ()):
|
|
121
|
-
if exec_path.name not in database:
|
|
122
|
-
database[exec_path.name] = []
|
|
123
|
-
database[exec_path.name].insert(0, resdict)
|
|
124
|
-
database = self._dedup_database(database)
|
|
145
|
+
self._search_executables(executable)
|
|
146
|
+
self.save_user_list()
|
|
125
147
|
found = False
|
|
126
148
|
ret = []
|
|
127
149
|
for k in database:
|
|
@@ -139,7 +161,7 @@ class OcrdResourceManager:
|
|
|
139
161
|
ret = [(executable, [])]
|
|
140
162
|
return ret
|
|
141
163
|
|
|
142
|
-
def list_installed(self, executable=None):
|
|
164
|
+
def list_installed(self, executable: str = None):
|
|
143
165
|
"""
|
|
144
166
|
List installed resources, matching with registry by ``name``
|
|
145
167
|
"""
|
|
@@ -150,28 +172,24 @@ class OcrdResourceManager:
|
|
|
150
172
|
# resources we know about
|
|
151
173
|
all_executables = list(self.database.keys())
|
|
152
174
|
# resources in the file system
|
|
153
|
-
parent_dirs = [join(
|
|
175
|
+
parent_dirs = [f"{join(self.xdg_data_home, 'ocrd-resources')}", RESOURCES_DIR_SYSTEM]
|
|
154
176
|
for parent_dir in parent_dirs:
|
|
155
177
|
if Path(parent_dir).exists():
|
|
156
178
|
all_executables += [x for x in listdir(parent_dir) if x.startswith('ocrd-')]
|
|
157
179
|
for this_executable in set(all_executables):
|
|
158
180
|
reslist = []
|
|
159
|
-
mimetypes = get_processor_resource_types(this_executable)
|
|
160
181
|
moduledir = get_moduledir(this_executable)
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
if res_filename.is_file() and ['text/directory'] == mimetypes:
|
|
167
|
-
continue
|
|
182
|
+
resdict_list = self.list_available(executable=this_executable)[0][1]
|
|
183
|
+
for res_filename in list_all_resources(this_executable,
|
|
184
|
+
moduled=moduledir,
|
|
185
|
+
xdg_data_home=self.xdg_data_home):
|
|
186
|
+
res_filename = Path(res_filename).resolve()
|
|
168
187
|
res_name = res_filename.name
|
|
169
188
|
res_type = 'file' if res_filename.is_file() else 'directory'
|
|
170
189
|
res_size = res_filename.stat().st_size if res_filename.is_file() else directory_size(res_filename)
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
elif str(res_filename.parent) == moduledir:
|
|
190
|
+
if resdict := next((res for res in resdict_list if res['name'] == res_name), False):
|
|
191
|
+
pass
|
|
192
|
+
elif str(res_filename.parent).startswith(moduledir):
|
|
175
193
|
resdict = {
|
|
176
194
|
'name': res_name,
|
|
177
195
|
'url': str(res_filename),
|
|
@@ -181,28 +199,28 @@ class OcrdResourceManager:
|
|
|
181
199
|
}
|
|
182
200
|
else:
|
|
183
201
|
resdict = self.add_to_user_database(this_executable, res_filename, resource_type=res_type)
|
|
184
|
-
resdict['path'] = str(res_filename)
|
|
202
|
+
# resdict['path'] = str(res_filename)
|
|
185
203
|
reslist.append(resdict)
|
|
186
204
|
ret.append((this_executable, reslist))
|
|
205
|
+
self.save_user_list()
|
|
187
206
|
return ret
|
|
188
207
|
|
|
189
208
|
def add_to_user_database(self, executable, res_filename, url=None, resource_type='file'):
|
|
190
209
|
"""
|
|
191
210
|
Add a stub entry to the user resource.yml
|
|
192
211
|
"""
|
|
193
|
-
res_name =
|
|
194
|
-
self.log.info(f"{executable} resource '{res_name}' ({str(res_filename)}) not a known resource, "
|
|
195
|
-
f"creating stub in {self.user_list}'")
|
|
212
|
+
res_name = res_filename.name
|
|
196
213
|
if Path(res_filename).is_dir():
|
|
197
214
|
res_size = directory_size(res_filename)
|
|
198
215
|
else:
|
|
199
216
|
res_size = Path(res_filename).stat().st_size
|
|
200
|
-
|
|
201
|
-
user_database = safe_load(f) or {}
|
|
217
|
+
user_database = self.load_resource_list(self.user_list)
|
|
202
218
|
if executable not in user_database:
|
|
203
219
|
user_database[executable] = []
|
|
204
220
|
resources_found = self.list_available(executable=executable, name=res_name, database=user_database)[0][1]
|
|
205
221
|
if not resources_found:
|
|
222
|
+
self.log.info(f"{executable} resource '{res_name}' ({str(res_filename)}) not a known resource, "
|
|
223
|
+
f"creating stub in {self.user_list}'")
|
|
206
224
|
resdict = {
|
|
207
225
|
'name': res_name,
|
|
208
226
|
'url': url if url else '???',
|
|
@@ -222,20 +240,45 @@ class OcrdResourceManager:
|
|
|
222
240
|
def default_resource_dir(self):
|
|
223
241
|
return self.location_to_resource_dir('data')
|
|
224
242
|
|
|
225
|
-
def location_to_resource_dir(self, location):
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
243
|
+
def location_to_resource_dir(self, location: str) -> str:
|
|
244
|
+
if location == 'data':
|
|
245
|
+
return join(self.xdg_data_home, 'ocrd-resources')
|
|
246
|
+
if location == 'system':
|
|
247
|
+
return RESOURCES_DIR_SYSTEM
|
|
248
|
+
return getcwd()
|
|
229
249
|
|
|
230
|
-
def resource_dir_to_location(self, resource_path):
|
|
250
|
+
def resource_dir_to_location(self, resource_path: Path) -> str:
|
|
231
251
|
resource_path = str(resource_path)
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
252
|
+
if resource_path.startswith(RESOURCES_DIR_SYSTEM):
|
|
253
|
+
return 'system'
|
|
254
|
+
if resource_path.startswith(join(self.xdg_data_home, 'ocrd-resources')):
|
|
255
|
+
return 'data'
|
|
256
|
+
if resource_path.startswith(getcwd()):
|
|
257
|
+
return 'cwd'
|
|
258
|
+
return resource_path
|
|
259
|
+
|
|
260
|
+
def build_resource_dest_dir(self, location: str, executable: str) -> Path:
|
|
261
|
+
if location == 'module':
|
|
262
|
+
base_dir = get_moduledir(executable)
|
|
263
|
+
if not base_dir:
|
|
264
|
+
base_dir = self.location_to_resource_dir('data')
|
|
265
|
+
else:
|
|
266
|
+
base_dir = self.location_to_resource_dir(location)
|
|
267
|
+
no_subdir = location in ['cwd', 'module']
|
|
268
|
+
dest_dir = Path(base_dir) if no_subdir else Path(base_dir, executable)
|
|
269
|
+
return dest_dir
|
|
270
|
+
|
|
271
|
+
@staticmethod
|
|
272
|
+
def remove_resource(log: Logger, resource_path: Path):
|
|
273
|
+
if resource_path.is_dir():
|
|
274
|
+
log.info(f"Removing existing target resource directory {resource_path}")
|
|
275
|
+
rmtree(str(resource_path))
|
|
276
|
+
else:
|
|
277
|
+
log.info(f"Removing existing target resource file {resource_path}")
|
|
278
|
+
unlink(str(resource_path))
|
|
236
279
|
|
|
237
280
|
@staticmethod
|
|
238
|
-
def parameter_usage(name, usage='as-is'):
|
|
281
|
+
def parameter_usage(name: str, usage: str = 'as-is') -> str:
|
|
239
282
|
if usage == 'as-is':
|
|
240
283
|
return name
|
|
241
284
|
elif usage == 'without-extension':
|
|
@@ -243,8 +286,7 @@ class OcrdResourceManager:
|
|
|
243
286
|
raise ValueError(f"No such usage '{usage}'")
|
|
244
287
|
|
|
245
288
|
@staticmethod
|
|
246
|
-
def _download_impl(
|
|
247
|
-
log = getLogger('ocrd.resource_manager._download_impl')
|
|
289
|
+
def _download_impl(log: Logger, url: str, filename):
|
|
248
290
|
log.info(f"Downloading {url} to {filename}")
|
|
249
291
|
try:
|
|
250
292
|
gdrive_file_id, is_gdrive_download_link = gparse_url(url, warning=False)
|
|
@@ -256,36 +298,36 @@ class OcrdResourceManager:
|
|
|
256
298
|
if "Content-Disposition" not in r.headers:
|
|
257
299
|
url = get_url_from_gdrive_confirmation(r.text)
|
|
258
300
|
except RuntimeError as e:
|
|
259
|
-
log.warning("Cannot unwrap Google Drive URL:
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
301
|
+
log.warning(f"Cannot unwrap Google Drive URL: {e}")
|
|
302
|
+
if is_git_url(url):
|
|
303
|
+
log.info("Cloning a git repository")
|
|
304
|
+
repo = Repo.clone_from(url, filename, depth=1)
|
|
305
|
+
# keep only the checkout
|
|
306
|
+
rmtree(join(filename, '.git'))
|
|
307
|
+
else:
|
|
308
|
+
with open(filename, 'wb') as f:
|
|
309
|
+
with requests.get(url, stream=True) as r:
|
|
310
|
+
r.raise_for_status()
|
|
311
|
+
for data in r.iter_content(chunk_size=4096):
|
|
312
|
+
f.write(data)
|
|
267
313
|
except Exception as e:
|
|
268
314
|
rmtree(filename, ignore_errors=True)
|
|
269
315
|
Path(filename).unlink(missing_ok=True)
|
|
270
316
|
raise e
|
|
271
317
|
|
|
272
318
|
@staticmethod
|
|
273
|
-
def _copy_file(src, dst
|
|
274
|
-
log = getLogger('ocrd.resource_manager._copy_file')
|
|
319
|
+
def _copy_file(log: Logger, src, dst):
|
|
275
320
|
log.info(f"Copying file {src} to {dst}")
|
|
276
321
|
with open(dst, 'wb') as f_out, open(src, 'rb') as f_in:
|
|
277
322
|
while True:
|
|
278
323
|
chunk = f_in.read(4096)
|
|
279
324
|
if chunk:
|
|
280
325
|
f_out.write(chunk)
|
|
281
|
-
if progress_cb:
|
|
282
|
-
progress_cb(len(chunk))
|
|
283
326
|
else:
|
|
284
327
|
break
|
|
285
328
|
|
|
286
329
|
@staticmethod
|
|
287
|
-
def _copy_dir(src, dst
|
|
288
|
-
log = getLogger('ocrd.resource_manager._copy_dir')
|
|
330
|
+
def _copy_dir(log: Logger, src, dst):
|
|
289
331
|
log.info(f"Copying dir recursively from {src} to {dst}")
|
|
290
332
|
if not Path(src).is_dir():
|
|
291
333
|
raise ValueError(f"The source is not a directory: {src}")
|
|
@@ -293,76 +335,123 @@ class OcrdResourceManager:
|
|
|
293
335
|
for child in Path(src).rglob('*'):
|
|
294
336
|
child_dst = Path(dst) / child.relative_to(src)
|
|
295
337
|
if Path(child).is_dir():
|
|
296
|
-
OcrdResourceManager._copy_dir(child, child_dst
|
|
338
|
+
OcrdResourceManager._copy_dir(log, child, child_dst)
|
|
297
339
|
else:
|
|
298
|
-
OcrdResourceManager._copy_file(child, child_dst
|
|
340
|
+
OcrdResourceManager._copy_file(log, child, child_dst)
|
|
299
341
|
|
|
300
342
|
@staticmethod
|
|
301
|
-
def _copy_impl(src_filename, filename
|
|
302
|
-
log = getLogger('ocrd.resource_manager._copy_impl')
|
|
343
|
+
def _copy_impl(log: Logger, src_filename, filename):
|
|
303
344
|
log.info(f"Copying {src_filename} to {filename}")
|
|
304
345
|
if Path(src_filename).is_dir():
|
|
305
|
-
OcrdResourceManager._copy_dir(src_filename, filename
|
|
346
|
+
OcrdResourceManager._copy_dir(log, src_filename, filename)
|
|
306
347
|
else:
|
|
307
|
-
OcrdResourceManager._copy_file(src_filename, filename
|
|
348
|
+
OcrdResourceManager._copy_file(log, src_filename, filename)
|
|
349
|
+
|
|
350
|
+
@staticmethod
|
|
351
|
+
def _extract_archive(log: Logger, tempdir: Path, path_in_archive: str, fpath: Path, archive_fname: str):
|
|
352
|
+
Path('out').mkdir()
|
|
353
|
+
with pushd_popd('out'):
|
|
354
|
+
mimetype = guess_media_type(f'../{archive_fname}', fallback='application/octet-stream')
|
|
355
|
+
log.info(f"Extracting {mimetype} archive to {tempdir}/out")
|
|
356
|
+
if mimetype == 'application/zip':
|
|
357
|
+
with ZipFile(f'../{archive_fname}', 'r') as zipf:
|
|
358
|
+
zipf.extractall()
|
|
359
|
+
elif mimetype in ('application/gzip', 'application/x-xz'):
|
|
360
|
+
with open_tarfile(f'../{archive_fname}', 'r:*') as tar:
|
|
361
|
+
tar.extractall()
|
|
362
|
+
else:
|
|
363
|
+
raise RuntimeError(f"Unable to handle extraction of {mimetype} archive")
|
|
364
|
+
log.info(f"Copying '{path_in_archive}' from archive to {fpath}")
|
|
365
|
+
if Path(path_in_archive).is_dir():
|
|
366
|
+
copytree(path_in_archive, str(fpath))
|
|
367
|
+
else:
|
|
368
|
+
copy(path_in_archive, str(fpath))
|
|
369
|
+
|
|
370
|
+
def copy_resource(
|
|
371
|
+
self, log: Logger, url: str, fpath: Path, resource_type: str = 'file', path_in_archive: str = '.'
|
|
372
|
+
) -> Path:
|
|
373
|
+
"""
|
|
374
|
+
Copy a local resource to another destination
|
|
375
|
+
"""
|
|
376
|
+
if resource_type == 'archive':
|
|
377
|
+
archive_fname = 'download.tar.xx'
|
|
378
|
+
with pushd_popd(tempdir=True) as tempdir:
|
|
379
|
+
self._copy_impl(log, url, archive_fname)
|
|
380
|
+
self._extract_archive(log, tempdir, path_in_archive, fpath, archive_fname)
|
|
381
|
+
else:
|
|
382
|
+
self._copy_impl(log, url, fpath)
|
|
383
|
+
return fpath
|
|
384
|
+
|
|
385
|
+
def download_resource(
|
|
386
|
+
self, log: Logger, url: str, fpath: Path, resource_type: str = 'file', path_in_archive: str = '.'
|
|
387
|
+
) -> Path:
|
|
388
|
+
"""
|
|
389
|
+
Download a resource by URL to a destination directory
|
|
390
|
+
"""
|
|
391
|
+
if resource_type == 'archive':
|
|
392
|
+
archive_fname = 'download.tar.xx'
|
|
393
|
+
with pushd_popd(tempdir=True) as tempdir:
|
|
394
|
+
self._download_impl(log, url, archive_fname)
|
|
395
|
+
self._extract_archive(log, tempdir, path_in_archive, fpath, archive_fname)
|
|
396
|
+
else:
|
|
397
|
+
self._download_impl(log, url, fpath)
|
|
398
|
+
return fpath
|
|
308
399
|
|
|
309
400
|
# TODO Proper caching (make head request for size, If-Modified etc)
|
|
310
|
-
def
|
|
311
|
-
self, executable,
|
|
312
|
-
|
|
313
|
-
):
|
|
401
|
+
def handle_resource(
|
|
402
|
+
self, res_dict: Dict, executable: str, dest_dir: Path, any_url: str, overwrite: bool = False,
|
|
403
|
+
resource_type: str = 'file', path_in_archive: str = '.'
|
|
404
|
+
) -> Optional[Path]:
|
|
314
405
|
"""
|
|
315
|
-
Download a resource by URL
|
|
406
|
+
Download or Copy a resource by URL to a destination directory
|
|
316
407
|
"""
|
|
317
|
-
log = getLogger('ocrd.resource_manager.
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
408
|
+
log = getLogger('ocrd.resource_manager.handle_resource')
|
|
409
|
+
registered = "registered" if "size" in res_dict else "unregistered"
|
|
410
|
+
resource_type = res_dict.get('type', resource_type)
|
|
411
|
+
resource_name = res_dict.get('name', None)
|
|
412
|
+
path_in_archive = res_dict.get('path_in_archive', path_in_archive)
|
|
413
|
+
|
|
414
|
+
if resource_type not in RESOURCE_TYPES:
|
|
415
|
+
raise ValueError(f"Unknown resource type: {resource_type}, must be one of: {RESOURCE_TYPES}")
|
|
416
|
+
if any_url:
|
|
417
|
+
res_dict['url'] = any_url
|
|
418
|
+
if not resource_name:
|
|
419
|
+
url_parsed = urlparse(res_dict['url'])
|
|
420
|
+
resource_name = Path(unquote(url_parsed.path)).name
|
|
421
|
+
if resource_type == 'archive' and path_in_archive != '.':
|
|
422
|
+
resource_name = Path(path_in_archive).name
|
|
423
|
+
if res_dict['url'] == '???':
|
|
424
|
+
log.warning(f"Skipping user resource {resource_name} since download url is: {res_dict['url']}")
|
|
425
|
+
return None
|
|
426
|
+
|
|
427
|
+
fpath = Path(dest_dir, resource_name)
|
|
324
428
|
if fpath.exists():
|
|
325
429
|
if not overwrite:
|
|
326
430
|
fpath_type = 'Directory' if fpath.is_dir() else 'File'
|
|
327
431
|
log.warning(f"{fpath_type} {fpath} already exists but --overwrite is not set, skipping the download")
|
|
328
432
|
# raise FileExistsError(f"{fpath_type} {fpath} already exists but --overwrite is not set")
|
|
329
433
|
return fpath
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
mimetype = guess_media_type(f'../{archive_fname}', fallback='application/octet-stream')
|
|
352
|
-
log.info(f"Extracting {mimetype} archive to {tempdir}/out")
|
|
353
|
-
if mimetype == 'application/zip':
|
|
354
|
-
with ZipFile(f'../{archive_fname}', 'r') as zipf:
|
|
355
|
-
zipf.extractall()
|
|
356
|
-
elif mimetype in ('application/gzip', 'application/x-xz'):
|
|
357
|
-
with open_tarfile(f'../{archive_fname}', 'r:*') as tar:
|
|
358
|
-
tar.extractall()
|
|
359
|
-
else:
|
|
360
|
-
raise RuntimeError(f"Unable to handle extraction of {mimetype} archive {url}")
|
|
361
|
-
log.info(f"Copying '{path_in_archive}' from archive to {fpath}")
|
|
362
|
-
if Path(path_in_archive).is_dir():
|
|
363
|
-
copytree(path_in_archive, str(fpath))
|
|
364
|
-
else:
|
|
365
|
-
copy(path_in_archive, str(fpath))
|
|
434
|
+
self.remove_resource(log, resource_path=fpath)
|
|
435
|
+
dest_dir.mkdir(parents=True, exist_ok=True)
|
|
436
|
+
|
|
437
|
+
# TODO @mehmedGIT: Consider properly handling cases for invalid URLs.
|
|
438
|
+
if res_dict['url'].startswith('https://') or res_dict['url'].startswith('http://'):
|
|
439
|
+
log.info(f"Downloading {registered} resource '{resource_name}' ({res_dict['url']})")
|
|
440
|
+
if 'size' not in res_dict:
|
|
441
|
+
with requests.head(res_dict['url']) as r:
|
|
442
|
+
res_dict['size'] = int(r.headers.get('content-length', 0))
|
|
443
|
+
fpath = self.download_resource(log, res_dict['url'], fpath, resource_type, path_in_archive)
|
|
444
|
+
else:
|
|
445
|
+
log.info(f"Copying {registered} resource '{resource_name}' ({res_dict['url']})")
|
|
446
|
+
urlpath = Path(res_dict['url'])
|
|
447
|
+
res_dict['url'] = str(urlpath.resolve())
|
|
448
|
+
res_dict['size'] = directory_size(urlpath) if Path(urlpath).is_dir() else urlpath.stat().st_size
|
|
449
|
+
fpath = self.copy_resource(log, res_dict['url'], fpath, resource_type, path_in_archive)
|
|
450
|
+
|
|
451
|
+
if registered == 'unregistered':
|
|
452
|
+
self.add_to_user_database(executable, fpath, url=res_dict['url'])
|
|
453
|
+
self.save_user_list()
|
|
454
|
+
log.info(f"Installed resource {res_dict['url']} under {fpath}")
|
|
366
455
|
return fpath
|
|
367
456
|
|
|
368
457
|
def _dedup_database(self, database=None, dedup_key='name'):
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: ocrd
|
|
3
|
-
Version: 3.
|
|
3
|
+
Version: 3.8.1
|
|
4
4
|
Summary: OCR-D framework
|
|
5
5
|
Author-email: Konstantin Baierer <unixprog@gmail.com>
|
|
6
6
|
License: Apache License 2.0
|
|
@@ -16,12 +16,13 @@ Requires-Dist: beanie~=1.7
|
|
|
16
16
|
Requires-Dist: click>=7
|
|
17
17
|
Requires-Dist: cryptography<43.0.0
|
|
18
18
|
Requires-Dist: Deprecated==1.2.0
|
|
19
|
-
Requires-Dist: docker
|
|
19
|
+
Requires-Dist: docker>=7.1.0
|
|
20
20
|
Requires-Dist: elementpath
|
|
21
21
|
Requires-Dist: fastapi>=0.78.0
|
|
22
22
|
Requires-Dist: filetype
|
|
23
23
|
Requires-Dist: Flask
|
|
24
24
|
Requires-Dist: frozendict>=2.4.0
|
|
25
|
+
Requires-Dist: gitpython
|
|
25
26
|
Requires-Dist: gdown
|
|
26
27
|
Requires-Dist: httpx>=0.22.0
|
|
27
28
|
Requires-Dist: jsonschema>=4
|
|
@@ -68,6 +69,9 @@ Requires-Dist: shapely>=2; python_version >= "3.9"
|
|
|
68
69
|
* [Command line tools](#command-line-tools)
|
|
69
70
|
* [`ocrd` CLI](#ocrd-cli)
|
|
70
71
|
* [`ocrd-dummy` CLI](#ocrd-dummy-cli)
|
|
72
|
+
* [`ocrd-filter` CLI](#ocrd-filter-cli)
|
|
73
|
+
* [`ocrd-command` CLI](#ocrd-command-cli)
|
|
74
|
+
* [`ocrd-merge` CLI](#ocrd-merge-cli)
|
|
71
75
|
* [Configuration](#configuration)
|
|
72
76
|
* [Packages](#packages)
|
|
73
77
|
* [ocrd_utils](#ocrd_utils)
|
|
@@ -76,7 +80,6 @@ Requires-Dist: shapely>=2; python_version >= "3.9"
|
|
|
76
80
|
* [ocrd_validators](#ocrd_validators)
|
|
77
81
|
* [ocrd_network](#ocrd_network)
|
|
78
82
|
* [ocrd](#ocrd)
|
|
79
|
-
* [bash library](#bash-library)
|
|
80
83
|
* [Testing](#testing)
|
|
81
84
|
* [See Also](#see-also)
|
|
82
85
|
|
|
@@ -121,6 +124,22 @@ supported flags, options and arguments.
|
|
|
121
124
|
|
|
122
125
|
A minimal [OCR-D processor](https://ocr-d.de/en/user_guide#using-the-ocr-d-processors) that copies from `-I/--input-file-grp` to `-O/--output-file-grp`
|
|
123
126
|
|
|
127
|
+
### `ocrd-filter` CLI
|
|
128
|
+
|
|
129
|
+
A simple [OCR-D processor](https://ocr-d.de/en/user_guide#using-the-ocr-d-processors) that removes segments in PAGE-XML files from `-I/--input-file-grp` to `-O/--output-file-grp` with arbitrary selection based on powerful XPath 2.0 expressions.
|
|
130
|
+
|
|
131
|
+
### `ocrd-command` CLI
|
|
132
|
+
|
|
133
|
+
A simple [OCR-D processor](https://ocr-d.de/en/user_guide#using-the-ocr-d-processors) that runs arbitrary shell commands to transform PAGE-XML files from `-I/--input-file-grp` to `-O/--output-file-grp` (in effect "wrapping" them for OCR-D).
|
|
134
|
+
|
|
135
|
+
### `ocrd-merge` CLI
|
|
136
|
+
|
|
137
|
+
A simple [OCR-D processor](https://ocr-d.de/en/user_guide#using-the-ocr-d-processors) that (for every page) joins PAGE-XML files from multiple `-I/--input-file-grp` into a single `-O/--output-file-grp`, ensuring that
|
|
138
|
+
- `Border` polygons are joined
|
|
139
|
+
- all regions are concatenated, while
|
|
140
|
+
- ensuring segment identifiers do not clash,
|
|
141
|
+
- and the reading order simply gets concatenated.
|
|
142
|
+
|
|
124
143
|
## Configuration
|
|
125
144
|
|
|
126
145
|
Almost all behaviour of the OCR-D/core software is configured via CLI options and flags, which can be listed with the `--help` flag that all CLIs support.
|