ocrd 3.1.2__py3-none-any.whl → 3.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ocrd/cli/network.py CHANGED
@@ -13,6 +13,7 @@ from ocrd_network.cli import (
13
13
  processing_server_cli,
14
14
  processing_worker_cli,
15
15
  processor_server_cli,
16
+ resource_manager_server_cli
16
17
  )
17
18
 
18
19
 
@@ -28,3 +29,4 @@ network_cli.add_command(client_cli)
28
29
  network_cli.add_command(processing_server_cli)
29
30
  network_cli.add_command(processing_worker_cli)
30
31
  network_cli.add_command(processor_server_cli)
32
+ network_cli.add_command(resource_manager_server_cli)
ocrd/cli/resmgr.py CHANGED
@@ -18,9 +18,9 @@ from ocrd_utils import (
18
18
  getLogger,
19
19
  get_moduledir,
20
20
  get_ocrd_tool_json,
21
- resource_filename,
22
21
  initLogging,
23
22
  RESOURCE_LOCATIONS,
23
+ RESOURCE_TYPES
24
24
  )
25
25
  from ocrd.constants import RESOURCE_USER_LIST_COMMENT
26
26
 
@@ -72,13 +72,13 @@ def list_installed(executable=None):
72
72
  @click.option('-n', '--any-url', default='', help='URL of unregistered resource to download/copy from')
73
73
  @click.option('-D', '--no-dynamic', default=False, is_flag=True,
74
74
  help="Whether to skip looking into each processor's --dump-{json,module-dir} for module-level resources")
75
- @click.option('-t', '--resource-type', type=click.Choice(['file', 'directory', 'archive']), default='file',
75
+ @click.option('-t', '--resource-type', type=click.Choice(RESOURCE_TYPES), default='file',
76
76
  help='Type of resource',)
77
77
  @click.option('-P', '--path-in-archive', default='.', help='Path to extract in case of archive type')
78
78
  @click.option('-a', '--allow-uninstalled', is_flag=True,
79
79
  help="Allow installing resources for uninstalled processors",)
80
80
  @click.option('-o', '--overwrite', help='Overwrite existing resources', is_flag=True)
81
- @click.option('-l', '--location', type=click.Choice(RESOURCE_LOCATIONS),
81
+ @click.option('-l', '--location', type=click.Choice(RESOURCE_LOCATIONS), default='data',
82
82
  help="Where to store resources - defaults to first location in processor's 'resource_locations' "
83
83
  "list or finally 'data'")
84
84
  @click.argument('executable', required=True)
@@ -107,8 +107,6 @@ def download(any_url, no_dynamic, resource_type, path_in_archive, allow_uninstal
107
107
  executable = None
108
108
  if name == '*':
109
109
  name = None
110
- is_url = (any_url.startswith('https://') or any_url.startswith('http://')) if any_url else False
111
- is_filename = Path(any_url).exists() if any_url else False
112
110
  if executable and not which(executable):
113
111
  if not allow_uninstalled:
114
112
  log.error(f"Executable '{executable}' is not installed. "
@@ -127,65 +125,30 @@ def download(any_url, no_dynamic, resource_type, path_in_archive, allow_uninstal
127
125
  'path_in_archive': path_in_archive}]
128
126
  )]
129
127
  for this_executable, this_reslist in reslist:
130
- for resdict in this_reslist:
131
- if 'size' in resdict:
132
- registered = "registered"
133
- else:
134
- registered = "unregistered"
135
- if any_url:
136
- resdict['url'] = any_url
137
- if resdict['url'] == '???':
138
- log.warning(f"Cannot download user resource {resdict['name']}")
139
- continue
140
- if resdict['url'].startswith('https://') or resdict['url'].startswith('http://'):
141
- log.info(f"Downloading {registered} resource '{resdict['name']}' ({resdict['url']})")
142
- if 'size' not in resdict:
143
- with requests.head(resdict['url']) as r:
144
- resdict['size'] = int(r.headers.get('content-length', 0))
145
- else:
146
- log.info(f"Copying {registered} resource '{resdict['name']}' ({resdict['url']})")
147
- urlpath = Path(resdict['url'])
148
- resdict['url'] = str(urlpath.resolve())
149
- if Path(urlpath).is_dir():
150
- resdict['size'] = directory_size(urlpath)
151
- else:
152
- resdict['size'] = urlpath.stat().st_size
153
- if not location:
154
- location = get_ocrd_tool_json(this_executable)['resource_locations'][0]
155
- elif location not in get_ocrd_tool_json(this_executable)['resource_locations']:
156
- log.error(f"The selected --location {location} is not in the {this_executable}'s resource search path, "
157
- f"refusing to install to invalid location")
158
- sys.exit(1)
159
- if location != 'module':
160
- basedir = resmgr.location_to_resource_dir(location)
161
- else:
162
- basedir = get_moduledir(this_executable)
163
- if not basedir:
164
- basedir = resmgr.location_to_resource_dir('data')
165
-
128
+ resource_locations = get_ocrd_tool_json(this_executable)['resource_locations']
129
+ if not location:
130
+ location = resource_locations[0]
131
+ elif location not in resource_locations:
132
+ log.warning(f"The selected --location {location} is not in the {this_executable}'s resource search path, "
133
+ f"refusing to install to invalid location. Instead installing to: {resource_locations[0]}")
134
+ res_dest_dir = resmgr.build_resource_dest_dir(location=location, executable=this_executable)
135
+ for res_dict in this_reslist:
166
136
  try:
167
- with click.progressbar(length=resdict['size']) as bar:
168
- fpath = resmgr.download(
169
- this_executable,
170
- resdict['url'],
171
- basedir,
172
- name=resdict['name'],
173
- resource_type=resdict.get('type', resource_type),
174
- path_in_archive=resdict.get('path_in_archive', path_in_archive),
175
- overwrite=overwrite,
176
- no_subdir=location in ['cwd', 'module'],
177
- progress_cb=lambda delta: bar.update(delta)
178
- )
179
- if registered == 'unregistered':
180
- log.info(f"{this_executable} resource '{name}' ({any_url}) not a known resource, creating stub "
181
- f"in {resmgr.user_list}'")
182
- resmgr.add_to_user_database(this_executable, fpath, url=any_url)
183
- resmgr.save_user_list()
184
- log.info(f"Installed resource {resdict['url']} under {fpath}")
137
+ fpath = resmgr.handle_resource(
138
+ res_dict=res_dict,
139
+ executable=this_executable,
140
+ dest_dir=res_dest_dir,
141
+ any_url=any_url,
142
+ overwrite=overwrite,
143
+ resource_type=resource_type,
144
+ path_in_archive=path_in_archive
145
+ )
146
+ if not fpath:
147
+ continue
185
148
  except FileExistsError as exc:
186
149
  log.info(str(exc))
187
- log.info(f"Use in parameters as "
188
- f"'{resmgr.parameter_usage(resdict['name'], usage=resdict.get('parameter_usage', 'as-is'))}'")
150
+ usage = res_dict.get('parameter_usage', 'as-is')
151
+ log.info(f"Use in parameters as '{resmgr.parameter_usage(res_dict['name'], usage)}'")
189
152
 
190
153
 
191
154
  @resmgr_cli.command('migrate')
ocrd/cli/workspace.py CHANGED
@@ -88,8 +88,8 @@ def workspace_cli(ctx, directory, mets, mets_basename, mets_server_url, backup):
88
88
  @pass_workspace
89
89
  @click.option('-a', '--download', is_flag=True, help="Download all files")
90
90
  @click.option('-s', '--skip', help="Tests to skip", default=[], multiple=True, type=click.Choice(
91
- ['imagefilename', 'dimension', 'pixel_density', 'page', 'url', 'page_xsd', 'mets_fileid_page_pcgtsid',
92
- 'mets_unique_identifier', 'mets_file_group_names', 'mets_files', 'mets_xsd']))
91
+ ['imagefilename', 'alternativeimage_filename', 'alternativeimage_comments', 'dimension', 'pixel_density', 'page', 'page_xsd',
92
+ 'url', 'mets_fileid_page_pcgtsid', 'mets_unique_identifier', 'mets_files', 'mets_xsd']))
93
93
  @click.option('--page-textequiv-consistency', '--page-strictness', help="How strict to check PAGE multi-level textequiv consistency", type=click.Choice(['strict', 'lax', 'fix', 'off']), default='strict')
94
94
  @click.option('--page-coordinate-consistency', help="How fierce to check PAGE multi-level coordinate consistency", type=click.Choice(['poly', 'baseline', 'both', 'off']), default='poly')
95
95
  @click.argument('mets_url', default=None, required=False)
ocrd/processor/base.py CHANGED
@@ -43,15 +43,14 @@ from .ocrd_page_result import OcrdPageResult
43
43
  from ocrd_utils import (
44
44
  VERSION as OCRD_VERSION,
45
45
  MIMETYPE_PAGE,
46
- MIME_TO_EXT,
47
46
  config,
48
47
  getLogger,
49
48
  list_resource_candidates,
50
- pushd_popd,
51
49
  list_all_resources,
52
50
  get_processor_resource_types,
53
51
  resource_filename,
54
52
  parse_json_file_with_comments,
53
+ pushd_popd,
55
54
  make_file_id,
56
55
  deprecation_warning
57
56
  )
@@ -779,10 +778,14 @@ class Processor():
779
778
  to handle cases like multiple output fileGrps, non-PAGE input etc.)
780
779
  """
781
780
  input_pcgts : List[Optional[OcrdPage]] = [None] * len(input_files)
782
- assert isinstance(input_files[0], get_args(OcrdFileType))
783
- page_id = input_files[0].pageId
781
+ input_pos = next(i for i, input_file in enumerate(input_files) if input_file is not None)
782
+ page_id = input_files[input_pos].pageId
784
783
  self._base_logger.info("processing page %s", page_id)
785
784
  for i, input_file in enumerate(input_files):
785
+ if input_file is None:
786
+ grp = self.input_file_grp.split(',')[i]
787
+ self._base_logger.debug(f"ignoring missing file for input fileGrp {grp} for page {page_id}")
788
+ continue
786
789
  assert isinstance(input_file, get_args(OcrdFileType))
787
790
  self._base_logger.debug(f"parsing file {input_file.ID} for page {page_id}")
788
791
  try:
@@ -792,7 +795,10 @@ class Processor():
792
795
  except ValueError as err:
793
796
  # not PAGE and not an image to generate PAGE for
794
797
  self._base_logger.error(f"non-PAGE input for page {page_id}: {err}")
795
- output_file_id = make_file_id(input_files[0], self.output_file_grp)
798
+ output_file_id = make_file_id(input_files[input_pos], self.output_file_grp)
799
+ if input_files[input_pos].fileGrp == self.output_file_grp:
800
+ # input=output fileGrp: re-use ID exactly
801
+ output_file_id = input_files[input_pos].ID
796
802
  output_file = next(self.workspace.mets.find_files(ID=output_file_id), None)
797
803
  if output_file and config.OCRD_EXISTING_OUTPUT != 'OVERWRITE':
798
804
  # short-cut avoiding useless computation:
@@ -898,9 +904,8 @@ class Processor():
898
904
  cwd = self.old_pwd
899
905
  else:
900
906
  cwd = getcwd()
901
- ret = [cand for cand in list_resource_candidates(executable, val,
902
- cwd=cwd, moduled=self.moduledir)
903
- if exists(cand)]
907
+ ret = list(filter(exists, list_resource_candidates(executable, val,
908
+ cwd=cwd, moduled=self.moduledir)))
904
909
  if ret:
905
910
  self._base_logger.debug("Resolved %s to absolute path %s" % (val, ret[0]))
906
911
  return ret[0]
@@ -931,17 +936,9 @@ class Processor():
931
936
  """
932
937
  List all resources found in the filesystem and matching content-type by filename suffix
933
938
  """
934
- mimetypes = get_processor_resource_types(None, self.ocrd_tool)
935
- for res in list_all_resources(self.ocrd_tool['executable'], moduled=self.moduledir):
939
+ for res in list_all_resources(self.executable, ocrd_tool=self.ocrd_tool, moduled=self.moduledir):
936
940
  res = Path(res)
937
- if not '*/*' in mimetypes:
938
- if res.is_dir() and not 'text/directory' in mimetypes:
939
- continue
940
- # if we do not know all MIME types, then keep the file, otherwise require suffix match
941
- if res.is_file() and not any(res.suffix == MIME_TO_EXT.get(mime, res.suffix)
942
- for mime in mimetypes):
943
- continue
944
- yield res
941
+ yield res.name
945
942
 
946
943
  @property
947
944
  def module(self):