ocrd 3.2.0__py3-none-any.whl → 3.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ocrd/cli/network.py CHANGED
@@ -13,7 +13,6 @@ from ocrd_network.cli import (
13
13
  processing_server_cli,
14
14
  processing_worker_cli,
15
15
  processor_server_cli,
16
- resource_manager_server_cli
17
16
  )
18
17
 
19
18
 
@@ -29,4 +28,3 @@ network_cli.add_command(client_cli)
29
28
  network_cli.add_command(processing_server_cli)
30
29
  network_cli.add_command(processing_worker_cli)
31
30
  network_cli.add_command(processor_server_cli)
32
- network_cli.add_command(resource_manager_server_cli)
ocrd/cli/resmgr.py CHANGED
@@ -18,9 +18,9 @@ from ocrd_utils import (
18
18
  getLogger,
19
19
  get_moduledir,
20
20
  get_ocrd_tool_json,
21
+ resource_filename,
21
22
  initLogging,
22
23
  RESOURCE_LOCATIONS,
23
- RESOURCE_TYPES
24
24
  )
25
25
  from ocrd.constants import RESOURCE_USER_LIST_COMMENT
26
26
 
@@ -72,13 +72,13 @@ def list_installed(executable=None):
72
72
  @click.option('-n', '--any-url', default='', help='URL of unregistered resource to download/copy from')
73
73
  @click.option('-D', '--no-dynamic', default=False, is_flag=True,
74
74
  help="Whether to skip looking into each processor's --dump-{json,module-dir} for module-level resources")
75
- @click.option('-t', '--resource-type', type=click.Choice(RESOURCE_TYPES), default='file',
75
+ @click.option('-t', '--resource-type', type=click.Choice(['file', 'directory', 'archive']), default='file',
76
76
  help='Type of resource',)
77
77
  @click.option('-P', '--path-in-archive', default='.', help='Path to extract in case of archive type')
78
78
  @click.option('-a', '--allow-uninstalled', is_flag=True,
79
79
  help="Allow installing resources for uninstalled processors",)
80
80
  @click.option('-o', '--overwrite', help='Overwrite existing resources', is_flag=True)
81
- @click.option('-l', '--location', type=click.Choice(RESOURCE_LOCATIONS), default='data',
81
+ @click.option('-l', '--location', type=click.Choice(RESOURCE_LOCATIONS),
82
82
  help="Where to store resources - defaults to first location in processor's 'resource_locations' "
83
83
  "list or finally 'data'")
84
84
  @click.argument('executable', required=True)
@@ -107,6 +107,8 @@ def download(any_url, no_dynamic, resource_type, path_in_archive, allow_uninstal
107
107
  executable = None
108
108
  if name == '*':
109
109
  name = None
110
+ is_url = (any_url.startswith('https://') or any_url.startswith('http://')) if any_url else False
111
+ is_filename = Path(any_url).exists() if any_url else False
110
112
  if executable and not which(executable):
111
113
  if not allow_uninstalled:
112
114
  log.error(f"Executable '{executable}' is not installed. "
@@ -125,30 +127,65 @@ def download(any_url, no_dynamic, resource_type, path_in_archive, allow_uninstal
125
127
  'path_in_archive': path_in_archive}]
126
128
  )]
127
129
  for this_executable, this_reslist in reslist:
128
- resource_locations = get_ocrd_tool_json(this_executable)['resource_locations']
129
- if not location:
130
- location = resource_locations[0]
131
- elif location not in resource_locations:
132
- log.warning(f"The selected --location {location} is not in the {this_executable}'s resource search path, "
133
- f"refusing to install to invalid location. Instead installing to: {resource_locations[0]}")
134
- res_dest_dir = resmgr.build_resource_dest_dir(location=location, executable=this_executable)
135
- for res_dict in this_reslist:
130
+ for resdict in this_reslist:
131
+ if 'size' in resdict:
132
+ registered = "registered"
133
+ else:
134
+ registered = "unregistered"
135
+ if any_url:
136
+ resdict['url'] = any_url
137
+ if resdict['url'] == '???':
138
+ log.warning(f"Cannot download user resource {resdict['name']}")
139
+ continue
140
+ if resdict['url'].startswith('https://') or resdict['url'].startswith('http://'):
141
+ log.info(f"Downloading {registered} resource '{resdict['name']}' ({resdict['url']})")
142
+ if 'size' not in resdict:
143
+ with requests.head(resdict['url']) as r:
144
+ resdict['size'] = int(r.headers.get('content-length', 0))
145
+ else:
146
+ log.info(f"Copying {registered} resource '{resdict['name']}' ({resdict['url']})")
147
+ urlpath = Path(resdict['url'])
148
+ resdict['url'] = str(urlpath.resolve())
149
+ if Path(urlpath).is_dir():
150
+ resdict['size'] = directory_size(urlpath)
151
+ else:
152
+ resdict['size'] = urlpath.stat().st_size
153
+ if not location:
154
+ location = get_ocrd_tool_json(this_executable)['resource_locations'][0]
155
+ elif location not in get_ocrd_tool_json(this_executable)['resource_locations']:
156
+ log.error(f"The selected --location {location} is not in the {this_executable}'s resource search path, "
157
+ f"refusing to install to invalid location")
158
+ sys.exit(1)
159
+ if location != 'module':
160
+ basedir = resmgr.location_to_resource_dir(location)
161
+ else:
162
+ basedir = get_moduledir(this_executable)
163
+ if not basedir:
164
+ basedir = resmgr.location_to_resource_dir('data')
165
+
136
166
  try:
137
- fpath = resmgr.handle_resource(
138
- res_dict=res_dict,
139
- executable=this_executable,
140
- dest_dir=res_dest_dir,
141
- any_url=any_url,
142
- overwrite=overwrite,
143
- resource_type=resource_type,
144
- path_in_archive=path_in_archive
145
- )
146
- if not fpath:
147
- continue
167
+ with click.progressbar(length=resdict['size']) as bar:
168
+ fpath = resmgr.download(
169
+ this_executable,
170
+ resdict['url'],
171
+ basedir,
172
+ name=resdict['name'],
173
+ resource_type=resdict.get('type', resource_type),
174
+ path_in_archive=resdict.get('path_in_archive', path_in_archive),
175
+ overwrite=overwrite,
176
+ no_subdir=location in ['cwd', 'module'],
177
+ progress_cb=lambda delta: bar.update(delta)
178
+ )
179
+ if registered == 'unregistered':
180
+ log.info(f"{this_executable} resource '{name}' ({any_url}) not a known resource, creating stub "
181
+ f"in {resmgr.user_list}'")
182
+ resmgr.add_to_user_database(this_executable, fpath, url=any_url)
183
+ resmgr.save_user_list()
184
+ log.info(f"Installed resource {resdict['url']} under {fpath}")
148
185
  except FileExistsError as exc:
149
186
  log.info(str(exc))
150
- usage = res_dict.get('parameter_usage', 'as-is')
151
- log.info(f"Use in parameters as '{resmgr.parameter_usage(res_dict['name'], usage)}'")
187
+ log.info(f"Use in parameters as "
188
+ f"'{resmgr.parameter_usage(resdict['name'], usage=resdict.get('parameter_usage', 'as-is'))}'")
152
189
 
153
190
 
154
191
  @resmgr_cli.command('migrate')
@@ -107,10 +107,10 @@ def ocrd_cli_wrap_processor(
107
107
  if 'parameter_override' in kwargs:
108
108
  set_json_key_value_overrides(kwargs['parameter'], *kwargs.pop('parameter_override'))
109
109
  # Assert -I / -O
110
- if not kwargs['input_file_grp']:
110
+ if not kwargs.get('input_file_grp', None):
111
111
  raise ValueError('-I/--input-file-grp is required')
112
- if not kwargs['output_file_grp']:
113
- raise ValueError('-O/--output-file-grp is required')
112
+ if 'output_file_grp' not in kwargs:
113
+ raise ValueError('-O/--output-file-grp is required') # actually, it may be None
114
114
  resolver = Resolver()
115
115
  working_dir, mets, _, mets_server_url = \
116
116
  resolver.resolve_mets_arguments(working_dir, mets, None, mets_server_url)
ocrd/processor/base.py CHANGED
@@ -43,14 +43,15 @@ from .ocrd_page_result import OcrdPageResult
43
43
  from ocrd_utils import (
44
44
  VERSION as OCRD_VERSION,
45
45
  MIMETYPE_PAGE,
46
+ MIME_TO_EXT,
46
47
  config,
47
48
  getLogger,
48
49
  list_resource_candidates,
50
+ pushd_popd,
49
51
  list_all_resources,
50
52
  get_processor_resource_types,
51
53
  resource_filename,
52
54
  parse_json_file_with_comments,
53
- pushd_popd,
54
55
  make_file_id,
55
56
  deprecation_warning
56
57
  )
@@ -904,8 +905,9 @@ class Processor():
904
905
  cwd = self.old_pwd
905
906
  else:
906
907
  cwd = getcwd()
907
- ret = list(filter(exists, list_resource_candidates(executable, val,
908
- cwd=cwd, moduled=self.moduledir)))
908
+ ret = [cand for cand in list_resource_candidates(executable, val,
909
+ cwd=cwd, moduled=self.moduledir)
910
+ if exists(cand)]
909
911
  if ret:
910
912
  self._base_logger.debug("Resolved %s to absolute path %s" % (val, ret[0]))
911
913
  return ret[0]
@@ -936,9 +938,17 @@ class Processor():
936
938
  """
937
939
  List all resources found in the filesystem and matching content-type by filename suffix
938
940
  """
939
- for res in list_all_resources(self.executable, ocrd_tool=self.ocrd_tool, moduled=self.moduledir):
941
+ mimetypes = get_processor_resource_types(None, self.ocrd_tool)
942
+ for res in list_all_resources(self.ocrd_tool['executable'], moduled=self.moduledir):
940
943
  res = Path(res)
941
- yield res.name
944
+ if not '*/*' in mimetypes:
945
+ if res.is_dir() and not 'text/directory' in mimetypes:
946
+ continue
947
+ # if we do not know all MIME types, then keep the file, otherwise require suffix match
948
+ if res.is_file() and not any(res.suffix == MIME_TO_EXT.get(mime, res.suffix)
949
+ for mime in mimetypes):
950
+ continue
951
+ yield res
942
952
 
943
953
  @property
944
954
  def module(self):
@@ -1102,7 +1112,7 @@ class Processor():
1102
1112
  if not ifiles[i]:
1103
1113
  # could be from non-unique with on_error=skip or from true gap
1104
1114
  self._base_logger.error(f'Found no file for page {page} in file group {ifg}')
1105
- if config.OCRD_MISSING_INPUT == 'abort':
1115
+ if config.OCRD_MISSING_INPUT == 'ABORT':
1106
1116
  raise MissingInputFile(ifg, page, mimetype)
1107
1117
  if not any(ifiles):
1108
1118
  # must be from non-unique with on_error=skip
@@ -1,6 +1,7 @@
1
1
  {
2
2
  "version": "1.0.0",
3
3
  "git_url": "https://github.com/OCR-D/core",
4
+ "dockerhub": "ocrd/core",
4
5
  "tools": {
5
6
  "ocrd-dummy": {
6
7
  "executable": "ocrd-dummy",
ocrd/processor/helpers.py CHANGED
@@ -276,7 +276,7 @@ def get_processor(
276
276
  # set current processing parameters
277
277
  processor.workspace = workspace
278
278
  processor.page_id = page_id
279
- processor.input_file_grp = input_file_grp
280
- processor.output_file_grp = output_file_grp
279
+ processor.input_file_grp = input_file_grp or ''
280
+ processor.output_file_grp = output_file_grp or ''
281
281
  return processor
282
282
  raise ValueError("Processor class is not known")