ocrd 3.0.0b3__py3-none-any.whl → 3.0.0b5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ocrd/cli/bashlib.py CHANGED
@@ -76,10 +76,10 @@ def bashlib_constants(name):
76
76
  @click.option('--ocrd-tool', help="path to ocrd-tool.json of processor to feed", default=None)
77
77
  @click.option('--executable', help="name of processor executable in ocrd-tool.json", default=None)
78
78
  @click.option('-m', '--mets', help="METS to process", default=DEFAULT_METS_BASENAME)
79
- @click.option('-w', '--working-dir', help="Working Directory")
79
+ @click.option('-U', '--mets-server-url', help='TCP host URI or UDS path of METS server', default=None)
80
+ @click.option('-d', '--working-dir', help="Working Directory")
80
81
  @click.option('-I', '--input-file-grp', help='File group(s) used as input.', default=None)
81
82
  @click.option('-O', '--output-file-grp', help='File group(s) used as output.', default=None)
82
- # repeat some other processor options for convenience (will be ignored here)
83
83
  @click.option('-g', '--page-id', help="ID(s) of the pages to process")
84
84
  @click.option('--overwrite', is_flag=True, default=False, help="Remove output pages/images if they already exist\n"
85
85
  "(with '--page-id', remove only those).\n"
@@ -126,9 +126,10 @@ def bashlib_input_files(ocrd_tool, executable, **kwargs):
126
126
  def executable(self):
127
127
  # needed for ocrd_tool lookup
128
128
  return executable
129
+ processor_class = FullBashlibProcessor
129
130
  else:
130
131
  # we have no true metadata file, so fill in just to make it work
131
- class FullBashlibProcessor(BashlibProcessor):
132
+ class UnknownBashlibProcessor(BashlibProcessor):
132
133
  @property
133
134
  def ocrd_tool(self):
134
135
  # needed to satisfy the validator
@@ -142,5 +143,6 @@ def bashlib_input_files(ocrd_tool, executable, **kwargs):
142
143
  def version(self):
143
144
  # needed to satisfy the validator and wrapper
144
145
  return '1.0'
146
+ processor_class = UnknownBashlibProcessor
145
147
 
146
- ocrd_cli_wrap_processor(FullBashlibProcessor, **kwargs)
148
+ ocrd_cli_wrap_processor(processor_class, **kwargs)
ocrd/cli/ocrd_tool.py CHANGED
@@ -125,7 +125,7 @@ def ocrd_tool_tool_list_resources(ctx):
125
125
  @click.argument('res_name')
126
126
  @pass_ocrd_tool
127
127
  def ocrd_tool_tool_resolve_resource(ctx, res_name):
128
- ctx.processor(None).resolve_resource(res_name)
128
+ print(ctx.processor(None).resolve_resource(res_name))
129
129
 
130
130
  @ocrd_tool_tool.command('show-resource', help="Dump a tool's file resource")
131
131
  @click.argument('res_name')
ocrd/cli/validate.py CHANGED
@@ -102,16 +102,19 @@ def validate_page(page, **kwargs):
102
102
  @validate_cli.command('tasks')
103
103
  @click.option('--workspace', nargs=1, required=False, help='Workspace directory these tasks are to be run. If omitted, only validate syntax')
104
104
  @click.option('-M', '--mets-basename', nargs=1, default=DEFAULT_METS_BASENAME, help='Basename of the METS file, used in conjunction with --workspace')
105
+ @click.option('-U', '--mets-server-url', help='TCP host URI or UDS path of METS server')
105
106
  @click.option('--overwrite', is_flag=True, default=False, help='When checking against a concrete workspace, simulate overwriting output or page range.')
106
107
  @click.option('-g', '--page-id', help="ID(s) of the pages to process")
107
108
  @click.argument('tasks', nargs=-1, required=True)
108
- def validate_process(tasks, workspace, mets_basename, overwrite, page_id):
109
+ def validate_process(tasks, workspace, mets_basename, mets_server_url, overwrite, page_id):
109
110
  '''
110
111
  Validate a sequence of tasks passable to `ocrd process`
111
112
  '''
112
113
  if workspace:
113
- _inform_of_result(validate_tasks([ProcessorTask.parse(t) for t in tasks],
114
- Workspace(Resolver(), directory=workspace, mets_basename=mets_basename), page_id=page_id, overwrite=overwrite))
114
+ _inform_of_result(validate_tasks(
115
+ [ProcessorTask.parse(t) for t in tasks],
116
+ Workspace(Resolver(), directory=workspace, mets_basename=mets_basename, mets_server_url=mets_server_url),
117
+ page_id=page_id, overwrite=overwrite))
115
118
  else:
116
119
  for t in [ProcessorTask.parse(t) for t in tasks]:
117
120
  _inform_of_result(t.validate())
ocrd/cli/workspace.py CHANGED
@@ -36,6 +36,17 @@ class WorkspaceCtx():
36
36
  = self.resolver.resolve_mets_arguments(directory, mets_url, mets_basename, mets_server_url)
37
37
  self.automatic_backup = automatic_backup
38
38
 
39
+ def workspace(self):
40
+ return Workspace(
41
+ self.resolver,
42
+ directory=self.directory,
43
+ mets_basename=self.mets_basename,
44
+ automatic_backup=self.automatic_backup,
45
+ mets_server_url=self.mets_server_url,
46
+ )
47
+ def backup_manager(self):
48
+ return WorkspaceBackupManager(self.workspace())
49
+
39
50
 
40
51
  pass_workspace = click.make_pass_decorator(WorkspaceCtx)
41
52
 
@@ -138,6 +149,7 @@ def workspace_clone(ctx, clobber_mets, download, file_grp, file_id, page_id, mim
138
149
  LOG.warning(DeprecationWarning("Use 'ocrd workspace --directory DIR clone' instead of argument 'WORKSPACE_DIR' ('%s')" % workspace_dir))
139
150
  ctx.directory = workspace_dir
140
151
 
152
+ assert not ctx.mets_server_url
141
153
  workspace = ctx.resolver.workspace_from_url(
142
154
  mets_url,
143
155
  dst_dir=ctx.directory,
@@ -173,10 +185,11 @@ def workspace_init(ctx, clobber_mets, directory):
173
185
  if directory:
174
186
  LOG.warning(DeprecationWarning("Use 'ocrd workspace --directory DIR init' instead of argument 'DIRECTORY' ('%s')" % directory))
175
187
  ctx.directory = directory
188
+ assert not ctx.mets_server_url
176
189
  workspace = ctx.resolver.workspace_from_nothing(
177
190
  directory=ctx.directory,
178
191
  mets_basename=ctx.mets_basename,
179
- clobber_mets=clobber_mets
192
+ clobber_mets=clobber_mets,
180
193
  )
181
194
  workspace.save_mets()
182
195
  print(workspace.directory)
@@ -200,13 +213,7 @@ def workspace_add_file(ctx, file_grp, file_id, mimetype, page_id, ignore, check_
200
213
  Add a file or http(s) URL FNAME to METS in a workspace.
201
214
  If FNAME is not an http(s) URL and is not a workspace-local existing file, try to copy to workspace.
202
215
  """
203
- workspace = Workspace(
204
- ctx.resolver,
205
- directory=ctx.directory,
206
- mets_basename=ctx.mets_basename,
207
- automatic_backup=ctx.automatic_backup,
208
- mets_server_url=ctx.mets_server_url,
209
- )
216
+ workspace = ctx.workspace()
210
217
 
211
218
  log = getLogger('ocrd.cli.workspace.add')
212
219
  if not mimetype:
@@ -313,13 +320,7 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, local_fi
313
320
 
314
321
  """
315
322
  log = getLogger('ocrd.cli.workspace.bulk-add') # pylint: disable=redefined-outer-name
316
- workspace = Workspace(
317
- ctx.resolver,
318
- directory=ctx.directory,
319
- mets_basename=ctx.mets_basename,
320
- automatic_backup=ctx.automatic_backup,
321
- mets_server_url=ctx.mets_server_url,
322
- )
323
+ workspace = ctx.workspace()
323
324
 
324
325
  try:
325
326
  pat = re.compile(regex)
@@ -455,12 +456,7 @@ def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, incl
455
456
  output_field = [snake_to_camel.get(x, x) for x in output_field]
456
457
  modified_mets = False
457
458
  ret = []
458
- workspace = Workspace(
459
- ctx.resolver,
460
- directory=ctx.directory,
461
- mets_basename=ctx.mets_basename,
462
- mets_server_url=ctx.mets_server_url,
463
- )
459
+ workspace = ctx.workspace()
464
460
  with pushd_popd(workspace.directory):
465
461
  for f in workspace.find_files(
466
462
  file_id=file_id,
@@ -510,7 +506,7 @@ def workspace_remove_file(ctx, id, force, keep_file): # pylint: disable=redefin
510
506
  (If any ``ID`` starts with ``//``, then its remainder
511
507
  will be interpreted as a regular expression.)
512
508
  """
513
- workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
509
+ workspace = ctx.workspace()
514
510
  for i in id:
515
511
  workspace.remove_file(i, force=force, keep_file=keep_file)
516
512
  workspace.save_mets()
@@ -528,7 +524,7 @@ def rename_group(ctx, old, new):
528
524
  """
529
525
  Rename fileGrp (USE attribute ``NEW`` to ``OLD``).
530
526
  """
531
- workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename)
527
+ workspace = ctx.workspace()
532
528
  workspace.rename_file_group(old, new)
533
529
  workspace.save_mets()
534
530
 
@@ -549,7 +545,7 @@ def remove_group(ctx, group, recursive, force, keep_files):
549
545
  (If any ``GROUP`` starts with ``//``, then its remainder
550
546
  will be interpreted as a regular expression.)
551
547
  """
552
- workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename)
548
+ workspace = ctx.workspace()
553
549
  for g in group:
554
550
  workspace.remove_file_group(g, recursive=recursive, force=force, keep_files=keep_files)
555
551
  workspace.save_mets()
@@ -571,7 +567,7 @@ def prune_files(ctx, file_grp, mimetype, page_id, file_id):
571
567
  (If any ``FILTER`` starts with ``//``, then its remainder
572
568
  will be interpreted as a regular expression.)
573
569
  """
574
- workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
570
+ workspace = ctx.workspace()
575
571
  with pushd_popd(workspace.directory):
576
572
  for f in workspace.find_files(
577
573
  file_id=file_id,
@@ -608,8 +604,7 @@ def clean(ctx, dry_run, directories, path_glob):
608
604
  If no PATH_GLOB are specified, then all files and directories
609
605
  may match.
610
606
  """
611
- log = getLogger('ocrd.cli.workspace.clean')
612
- workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
607
+ workspace = ctx.workspace()
613
608
  allowed_files = [normpath(f.local_filename) for f in workspace.find_files(local_only=True)]
614
609
  allowed_files.append(relpath(workspace.mets_target, start=workspace.directory))
615
610
  allowed_dirs = set(dirname(path) for path in allowed_files)
@@ -627,7 +622,7 @@ def clean(ctx, dry_run, directories, path_glob):
627
622
  if normpath(path) in allowed_files:
628
623
  continue
629
624
  if dry_run:
630
- log.info('unlink(%s)' % path)
625
+ ctx.log.info('unlink(%s)' % path)
631
626
  else:
632
627
  unlink(path)
633
628
  if not directories:
@@ -637,7 +632,7 @@ def clean(ctx, dry_run, directories, path_glob):
637
632
  if normpath(path) in allowed_dirs:
638
633
  continue
639
634
  if dry_run:
640
- log.info('rmdir(%s)' % path)
635
+ ctx.log.info('rmdir(%s)' % path)
641
636
  else:
642
637
  rmdir(path)
643
638
 
@@ -651,7 +646,7 @@ def list_groups(ctx):
651
646
  """
652
647
  List fileGrp USE attributes
653
648
  """
654
- workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename)
649
+ workspace = ctx.workspace()
655
650
  print("\n".join(workspace.mets.file_groups))
656
651
 
657
652
  # ----------------------------------------------------------------------
@@ -677,7 +672,7 @@ def list_pages(ctx, output_field, output_format, chunk_number, chunk_index, page
677
672
  (If any ``FILTER`` starts with ``//``, then its remainder
678
673
  will be interpreted as a regular expression.)
679
674
  """
680
- workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename)
675
+ workspace = ctx.workspace()
681
676
  find_kwargs = {}
682
677
  if page_id_range and 'ID' in output_field:
683
678
  find_kwargs['pageId'] = page_id_range
@@ -724,7 +719,7 @@ def get_id(ctx):
724
719
  """
725
720
  Get METS id if any
726
721
  """
727
- workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename)
722
+ workspace = ctx.workspace()
728
723
  ID = workspace.mets.unique_identifier
729
724
  if ID:
730
725
  print(ID)
@@ -744,7 +739,7 @@ def set_id(ctx, id): # pylint: disable=redefined-builtin
744
739
 
745
740
  Otherwise will create a new <mods:identifier type="purl">{{ ID }}</mods:identifier>.
746
741
  """
747
- workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
742
+ workspace = ctx.workspace()
748
743
  workspace.mets.unique_identifier = id
749
744
  workspace.save_mets()
750
745
 
@@ -767,7 +762,7 @@ def update_page(ctx, attr_value_pairs, order, orderlabel, contentids, page_id):
767
762
  if contentids:
768
763
  update_kwargs['CONTENTIDS'] = contentids
769
764
  try:
770
- workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
765
+ workspace = ctx.workspace()
771
766
  workspace.mets.update_physical_page_attributes(page_id, **update_kwargs)
772
767
  workspace.save_mets()
773
768
  except Exception as err:
@@ -805,7 +800,7 @@ def merge(ctx, overwrite, force, copy_files, filegrp_mapping, fileid_mapping, pa
805
800
  mets_path = Path(mets_path)
806
801
  if filegrp_mapping:
807
802
  filegrp_mapping = loads(filegrp_mapping)
808
- workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
803
+ workspace = ctx.workspace()
809
804
  other_workspace = Workspace(ctx.resolver, directory=str(mets_path.parent), mets_basename=str(mets_path.name))
810
805
  workspace.merge(
811
806
  other_workspace,
@@ -829,11 +824,12 @@ def merge(ctx, overwrite, force, copy_files, filegrp_mapping, fileid_mapping, pa
829
824
  # ----------------------------------------------------------------------
830
825
 
831
826
  @workspace_cli.group('backup')
832
- @click.pass_context
827
+ @pass_workspace
833
828
  def workspace_backup_cli(ctx): # pylint: disable=unused-argument
834
829
  """
835
830
  Backing and restoring workspaces - dev edition
836
831
  """
832
+ assert not ctx.mets_server_url, "Workspace backups currently not interoperable with METS Server"
837
833
 
838
834
  @workspace_backup_cli.command('add')
839
835
  @pass_workspace
@@ -841,7 +837,7 @@ def workspace_backup_add(ctx):
841
837
  """
842
838
  Create a new backup
843
839
  """
844
- backup_manager = WorkspaceBackupManager(Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup))
840
+ backup_manager = ctx.backup_manager()
845
841
  backup_manager.add()
846
842
 
847
843
  @workspace_backup_cli.command('list')
@@ -850,7 +846,7 @@ def workspace_backup_list(ctx):
850
846
  """
851
847
  List backups
852
848
  """
853
- backup_manager = WorkspaceBackupManager(Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup))
849
+ backup_manager = ctx.backup_manager()
854
850
  for b in backup_manager.list():
855
851
  print(b)
856
852
 
@@ -862,7 +858,7 @@ def workspace_backup_restore(ctx, choose_first, bak):
862
858
  """
863
859
  Restore backup BAK
864
860
  """
865
- backup_manager = WorkspaceBackupManager(Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup))
861
+ backup_manager = ctx.backup_manager()
866
862
  backup_manager.restore(bak, choose_first)
867
863
 
868
864
  @workspace_backup_cli.command('undo')
@@ -871,7 +867,7 @@ def workspace_backup_undo(ctx):
871
867
  """
872
868
  Restore the last backup
873
869
  """
874
- backup_manager = WorkspaceBackupManager(Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup))
870
+ backup_manager = ctx.backup_manager()
875
871
  backup_manager.undo()
876
872
 
877
873
 
@@ -888,15 +884,24 @@ def workspace_serve_cli(ctx): # pylint: disable=unused-argument
888
884
  @workspace_serve_cli.command('stop')
889
885
  @pass_workspace
890
886
  def workspace_serve_stop(ctx): # pylint: disable=unused-argument
891
- """Stop the METS server"""
892
- workspace = Workspace(
893
- ctx.resolver,
894
- directory=ctx.directory,
895
- mets_basename=ctx.mets_basename,
896
- mets_server_url=ctx.mets_server_url,
897
- )
887
+ """Stop the METS server (saving changes to disk)"""
888
+ workspace = ctx.workspace()
898
889
  workspace.mets.stop()
899
890
 
891
+ @workspace_serve_cli.command('reload')
892
+ @pass_workspace
893
+ def workspace_serve_reload(ctx): # pylint: disable=unused-argument
894
+ """Reload the METS server from disk"""
895
+ workspace = ctx.workspace()
896
+ workspace.mets.reload()
897
+
898
+ @workspace_serve_cli.command('save')
899
+ @pass_workspace
900
+ def workspace_serve_save(ctx): # pylint: disable=unused-argument
901
+ """Save the METS changes to disk"""
902
+ workspace = ctx.workspace()
903
+ workspace.mets.save()
904
+
900
905
  @workspace_serve_cli.command('start')
901
906
  @pass_workspace
902
907
  def workspace_serve_start(ctx): # pylint: disable=unused-argument
@@ -43,6 +43,7 @@ def ocrd_cli_options(f):
43
43
  option('--address', type=ServerAddressParamType()),
44
44
  option('--queue', type=QueueServerParamType()),
45
45
  option('--database', type=DatabaseParamType()),
46
+ option('-R', '--resolve-resource'),
46
47
  option('-C', '--show-resource'),
47
48
  option('-L', '--list-resources', is_flag=True, default=False),
48
49
  option('-J', '--dump-json', is_flag=True, default=False),
ocrd/lib.bash CHANGED
@@ -27,8 +27,8 @@ ocrd__log () {
27
27
  ## Ensure minimum version
28
28
  # ht https://stackoverflow.com/posts/4025065
29
29
  ocrd__minversion () {
30
- local minversion_raw="$1"
31
30
  set -e
31
+ local minversion_raw="$1"
32
32
  local version_raw=$(ocrd --version|sed 's/ocrd, version //')
33
33
  local version_mmp=$(echo "$version_raw" | grep -Eo '([0-9]+\.?){3}')
34
34
  local version_prerelease_suffix="${version_raw#$version_mmp}"
@@ -123,6 +123,7 @@ ocrd__usage () {
123
123
  ## declare -A ocrd__argv=()
124
124
  ## ```
125
125
  ocrd__parse_argv () {
126
+ set -e
126
127
 
127
128
  # if [[ -n "$ZSH_VERSION" ]];then
128
129
  # print -r -- ${+ocrd__argv} ${(t)ocrd__argv}
@@ -135,11 +136,16 @@ ocrd__parse_argv () {
135
136
  ocrd__raise "Must set \$params (declare -A params)"
136
137
  fi
137
138
 
139
+ if ! declare -p "params_json" >/dev/null 2>/dev/null ;then
140
+ ocrd__raise "Must set \$params_json (declare params_json)"
141
+ fi
142
+
138
143
  if [[ $# = 0 ]];then
139
144
  ocrd__usage
140
145
  exit 1
141
146
  fi
142
147
 
148
+ ocrd__argv[debug]=false
143
149
  ocrd__argv[overwrite]=false
144
150
  ocrd__argv[profile]=false
145
151
  ocrd__argv[profile_file]=
@@ -170,6 +176,7 @@ ocrd__parse_argv () {
170
176
  -w|--working-dir) ocrd__argv[working_dir]=$(realpath "$2") ; shift ;;
171
177
  -m|--mets) ocrd__argv[mets_file]=$(realpath "$2") ; shift ;;
172
178
  -U|--mets-server-url) ocrd__argv[mets_server_url]="$2" ; shift ;;
179
+ --debug) ocrd__argv[debug]=true ;;
173
180
  --overwrite) ocrd__argv[overwrite]=true ;;
174
181
  --profile) ocrd__argv[profile]=true ;;
175
182
  --profile-file) ocrd__argv[profile_file]=$(realpath "$2") ; shift ;;
@@ -242,17 +249,6 @@ ocrd__parse_argv () {
242
249
  trap showtime DEBUG
243
250
  fi
244
251
 
245
- # check fileGrps
246
- local _valopts=( --workspace "${ocrd__argv[working_dir]}" --mets-basename "$(basename ${ocrd__argv[mets_file]})" )
247
- if [[ ${ocrd__argv[overwrite]} = true ]]; then
248
- _valopts+=( --overwrite )
249
- fi
250
- if [[ -n "${ocrd__argv[page_id]:-}" ]]; then
251
- _valopts+=( --page-id "${ocrd__argv[page_id]}" )
252
- fi
253
- _valopts+=( "${OCRD_TOOL_NAME#ocrd-} -I ${ocrd__argv[input_file_grp]} -O ${ocrd__argv[output_file_grp]} ${__parameters[*]@Q} ${__parameter_overrides[*]@Q}" )
254
- ocrd validate tasks "${_valopts[@]}" || exit $?
255
-
256
252
  # check parameters
257
253
  local params_parsed retval
258
254
  params_parsed="$(ocrd ocrd-tool "$OCRD_TOOL_JSON" tool $OCRD_TOOL_NAME parse-params "${__parameters[@]}" "${__parameter_overrides[@]}")" || {
@@ -261,10 +257,12 @@ ocrd__parse_argv () {
261
257
  $params_parsed"
262
258
  }
263
259
  eval "$params_parsed"
260
+ params_json="$(ocrd ocrd-tool "$OCRD_TOOL_JSON" tool $OCRD_TOOL_NAME parse-params --json "${__parameters[@]}" "${__parameter_overrides[@]}")"
264
261
 
265
262
  }
266
263
 
267
264
  ocrd__wrap () {
265
+ set -e
268
266
 
269
267
  declare -gx OCRD_TOOL_JSON="$1"
270
268
  declare -gx OCRD_TOOL_NAME="$2"
@@ -272,6 +270,7 @@ ocrd__wrap () {
272
270
  shift
273
271
  declare -Agx params
274
272
  params=()
273
+ declare -g params_json
275
274
  declare -Agx ocrd__argv
276
275
  ocrd__argv=()
277
276
 
@@ -293,22 +292,26 @@ ocrd__wrap () {
293
292
 
294
293
  ocrd__parse_argv "$@"
295
294
 
296
- i=0
297
- declare -ag ocrd__files=()
298
- while read line; do
299
- eval declare -Ag "ocrd__file$i=( $line )"
300
- eval "ocrd__files[$i]=ocrd__file$i"
301
- let ++i
302
- done < <(ocrd bashlib input-files \
295
+ declare -ag ocrd__files
296
+ IFS=$'\n'
297
+ ocrd__files=( $(ocrd bashlib input-files \
303
298
  --ocrd-tool $OCRD_TOOL_JSON \
304
299
  --executable $OCRD_TOOL_NAME \
300
+ $(if [[ ${ocrd__argv[debug]} = true ]]; then echo --debug; fi) \
301
+ $(if [[ ${ocrd__argv[overwrite]} = true ]]; then echo --overwrite; fi) \
305
302
  -m "${ocrd__argv[mets_file]}" \
303
+ -d "${ocrd__argv[working_dir]}" \
304
+ ${ocrd__argv[mets_server_url]:+-U} ${ocrd__argv[mets_server_url]:-} \
305
+ -p "$params_json" \
306
306
  -I "${ocrd__argv[input_file_grp]}" \
307
307
  -O "${ocrd__argv[output_file_grp]}" \
308
- ${ocrd__argv[page_id]:+-g} ${ocrd__argv[page_id]:-})
308
+ ${ocrd__argv[page_id]:+-g} ${ocrd__argv[page_id]:-}) )
309
+ IFS=$' \t\n'
309
310
  }
310
311
 
311
312
  ## usage: pageId=$(ocrd__input_file 3 pageId)
312
313
  ocrd__input_file() {
313
- eval echo "\${${ocrd__files[$1]}[$2]}"
314
+ declare -A input_file
315
+ eval input_file=( "${ocrd__files[$1]}" )
316
+ eval echo "${input_file[$2]}"
314
317
  }
ocrd/mets_server.py CHANGED
@@ -88,6 +88,14 @@ class OcrdFileGroupListModel(BaseModel):
88
88
  return OcrdFileGroupListModel(file_groups=file_groups)
89
89
 
90
90
 
91
+ class OcrdPageListModel(BaseModel):
92
+ physical_pages: List[str] = Field()
93
+
94
+ @staticmethod
95
+ def create(physical_pages: List[str]):
96
+ return OcrdPageListModel(physical_pages=physical_pages)
97
+
98
+
91
99
  class OcrdAgentListModel(BaseModel):
92
100
  agents: List[OcrdAgentModel] = Field()
93
101
 
@@ -210,6 +218,17 @@ class ClientSideOcrdMets:
210
218
  ).json()["text"]
211
219
  return self.ws_dir_path
212
220
 
221
+ @property
222
+ def physical_pages(self) -> List[str]:
223
+ if not self.multiplexing_mode:
224
+ return self.session.request("GET", f"{self.url}/physical_pages").json()["physical_pages"]
225
+ else:
226
+ return self.session.request(
227
+ "POST",
228
+ self.url,
229
+ json=MpxReq.physical_pages(self.ws_dir_path)
230
+ ).json()["physical_pages"]
231
+
213
232
  @property
214
233
  def file_groups(self):
215
234
  if not self.multiplexing_mode:
@@ -284,15 +303,17 @@ class ClientSideOcrdMets:
284
303
  file_id=ID, page_id=pageId,
285
304
  mimetype=mimetype, url=url, local_filename=local_filename
286
305
  )
306
+ # add force+ignore
307
+ kwargs = {**kwargs, **data.dict()}
287
308
 
288
309
  if not self.multiplexing_mode:
289
- r = self.session.request("POST", f"{self.url}/file", data=data.dict())
290
- if not r:
291
- raise RuntimeError("Add file failed. Please check provided parameters")
310
+ r = self.session.request("POST", f"{self.url}/file", data=kwargs)
311
+ if not r.ok:
312
+ raise RuntimeError(f"Failed to add file ({str(data)}): {r.json()}")
292
313
  else:
293
- r = self.session.request("POST", self.url, json=MpxReq.add_file(self.ws_dir_path, data.dict()))
294
- if "error" in r:
295
- raise RuntimeError(f"Add file failed: Msg: {r['error']}")
314
+ r = self.session.request("POST", self.url, json=MpxReq.add_file(self.ws_dir_path, kwargs))
315
+ if not r.ok:
316
+ raise RuntimeError(f"Failed to add file ({str(data)}): {r.json()[errors]}")
296
317
 
297
318
  return ClientSideOcrdFile(
298
319
  None, fileGrp=file_grp,
@@ -347,6 +368,11 @@ class MpxReq:
347
368
  return MpxReq.__args_wrapper(
348
369
  ws_dir_path, method_type="GET", response_type="text", request_url="workspace_path", request_data={})
349
370
 
371
+ @staticmethod
372
+ def physical_pages(ws_dir_path: str) -> Dict:
373
+ return MpxReq.__args_wrapper(
374
+ ws_dir_path, method_type="GET", response_type="dict", request_url="physical_pages", request_data={})
375
+
350
376
  @staticmethod
351
377
  def file_groups(ws_dir_path: str) -> Dict:
352
378
  return MpxReq.__args_wrapper(
@@ -466,6 +492,10 @@ class OcrdMetsServer:
466
492
  async def workspace_path():
467
493
  return Response(content=workspace.directory, media_type="text/plain")
468
494
 
495
+ @app.get(path='/physical_pages', response_model=OcrdPageListModel)
496
+ async def physical_pages():
497
+ return {'physical_pages': workspace.mets.physical_pages}
498
+
469
499
  @app.get(path='/file_groups', response_model=OcrdFileGroupListModel)
470
500
  async def file_groups():
471
501
  return {'file_groups': workspace.mets.file_groups}
@@ -505,7 +535,8 @@ class OcrdMetsServer:
505
535
  page_id: Optional[str] = Form(),
506
536
  mimetype: str = Form(),
507
537
  url: Optional[str] = Form(None),
508
- local_filename: Optional[str] = Form(None)
538
+ local_filename: Optional[str] = Form(None),
539
+ force: bool = Form(False),
509
540
  ):
510
541
  """
511
542
  Add a file
@@ -517,7 +548,7 @@ class OcrdMetsServer:
517
548
  )
518
549
  # Add to workspace
519
550
  kwargs = file_resource.dict()
520
- workspace.add_file(**kwargs)
551
+ workspace.add_file(**kwargs, force=force)
521
552
  return file_resource
522
553
 
523
554
  # ------------- #
ocrd/processor/base.py CHANGED
@@ -16,7 +16,7 @@ import json
16
16
  import os
17
17
  from os import getcwd
18
18
  from pathlib import Path
19
- from typing import List, Optional, Union, get_args
19
+ from typing import Any, List, Optional, Union, get_args
20
20
  import sys
21
21
  import inspect
22
22
  import tarfile
@@ -166,11 +166,14 @@ class Processor():
166
166
 
167
167
  (Override if ``ocrd-tool.json`` is not distributed with the Python package.)
168
168
  """
169
- # XXX HACK
170
- module_tokens = self.__module__.split('.')
171
- if module_tokens[0] == 'src':
172
- module_tokens.pop(0)
173
- return resource_filename(module_tokens[0], self.metadata_filename)
169
+ module = inspect.getmodule(self)
170
+ module_tokens = module.__package__.split('.')
171
+ # for namespace packages, we cannot just use the first token
172
+ for i in range(len(module_tokens)):
173
+ prefix = '.'.join(module_tokens[:i + 1])
174
+ if sys.modules[prefix].__spec__.has_location:
175
+ return resource_filename(prefix, self.metadata_filename)
176
+ raise Exception("cannot find top-level module prefix for %s", module.__package__)
174
177
 
175
178
  @cached_property
176
179
  def metadata_rawdict(self) -> dict:
@@ -336,7 +339,7 @@ class Processor():
336
339
  self._finalizer = weakref.finalize(self, self.shutdown)
337
340
  # workaround for deprecated#72 (@deprecated decorator does not work for subclasses):
338
341
  setattr(self, 'process',
339
- deprecated(version='3.0', reason='process() should be replaced with process_page() and process_workspace()')(getattr(self, 'process')))
342
+ deprecated(version='3.0', reason='process() should be replaced with process_page_pcgts() or process_page_file() or process_workspace()')(getattr(self, 'process')))
340
343
 
341
344
  def show_help(self, subcommand=None):
342
345
  """
@@ -355,6 +358,7 @@ class Processor():
355
358
  """
356
359
  Verify that :py:attr:`input_file_grp` and :py:attr:`output_file_grp` fulfill the processor's requirements.
357
360
  """
361
+ # verify input and output file groups in parameters
358
362
  assert self.input_file_grp is not None
359
363
  assert self.output_file_grp is not None
360
364
  input_file_grps = self.input_file_grp.split(',')
@@ -371,12 +375,23 @@ class Processor():
371
375
  assert len(grps) >= minimum, msg % (len(grps), str(spec))
372
376
  if maximum > 0:
373
377
  assert len(grps) <= maximum, msg % (len(grps), str(spec))
374
- assert_file_grp_cardinality(input_file_grps, self.ocrd_tool['input_file_grp_cardinality'],
375
- "Unexpected number of input file groups %d vs %s")
376
- assert_file_grp_cardinality(output_file_grps, self.ocrd_tool['output_file_grp_cardinality'],
377
- "Unexpected number of output file groups %d vs %s")
378
+ # FIXME: enforce unconditionally as soon as grace period for deprecation is over
379
+ if 'input_file_grp_cardinality' in self.ocrd_tool:
380
+ assert_file_grp_cardinality(input_file_grps, self.ocrd_tool['input_file_grp_cardinality'],
381
+ "Unexpected number of input file groups %d vs %s")
382
+ if 'output_file_grp_cardinality' in self.ocrd_tool:
383
+ assert_file_grp_cardinality(output_file_grps, self.ocrd_tool['output_file_grp_cardinality'],
384
+ "Unexpected number of output file groups %d vs %s")
385
+ # verify input and output file groups in METS
378
386
  for input_file_grp in input_file_grps:
379
- assert input_file_grp in self.workspace.mets.file_groups
387
+ assert input_file_grp in self.workspace.mets.file_groups, \
388
+ f"input fileGrp {input_file_grp} does not exist in workspace {self.workspace}"
389
+ for output_file_grp in output_file_grps:
390
+ assert output_file_grp not in self.workspace.mets.file_groups \
391
+ or config.OCRD_EXISTING_OUTPUT in ['OVERWRITE', 'SKIP'] \
392
+ or not any(self.workspace.mets.find_files(
393
+ pageId=self.page_id, fileGrp=output_file_grp)), \
394
+ f"output fileGrp {output_file_grp} already exists in workspace {self.workspace}"
380
395
  # keep this for backwards compatibility:
381
396
  return True
382
397
 
@@ -455,17 +470,17 @@ class Processor():
455
470
  nr_copied = 0
456
471
 
457
472
  # set up multithreading
458
- if self.max_workers <= 0:
459
- max_workers = max(0, config.OCRD_MAX_PARALLEL_PAGES)
460
- else:
461
- max_workers = max(0, min(config.OCRD_MAX_PARALLEL_PAGES, self.max_workers))
473
+ max_workers = max(0, config.OCRD_MAX_PARALLEL_PAGES)
474
+ if self.max_workers > 0 and self.max_workers < config.OCRD_MAX_PARALLEL_PAGES:
475
+ self._base_logger.info("limiting number of threads from %d to %d", max_workers, self.max_workers)
476
+ max_workers = self.max_workers
462
477
  if max_workers > 1:
463
478
  assert isinstance(workspace.mets, ClientSideOcrdMets), \
464
479
  "OCRD_MAX_PARALLEL_PAGES>1 requires also using --mets-server-url"
465
- if self.max_page_seconds <= 0:
466
- max_seconds = max(0, config.OCRD_PROCESSING_PAGE_TIMEOUT)
467
- else:
468
- max_seconds = max(0, min(config.OCRD_PROCESSING_PAGE_TIMEOUT, self.max_page_seconds))
480
+ max_seconds = max(0, config.OCRD_PROCESSING_PAGE_TIMEOUT)
481
+ if self.max_page_seconds > 0 and self.max_page_seconds < config.OCRD_PROCESSING_PAGE_TIMEOUT:
482
+ self._base_logger.info("limiting page timeout from %d to %d sec", max_seconds, self.max_page_seconds)
483
+ max_seconds = self.max_page_seconds
469
484
  executor = ThreadPoolExecutor(
470
485
  max_workers=max_workers or 1,
471
486
  thread_name_prefix=f"pagetask.{workspace.mets.unique_identifier}"
@@ -543,7 +558,11 @@ class Processor():
543
558
 
544
559
  except NotImplementedError:
545
560
  # fall back to deprecated method
546
- self.process()
561
+ try:
562
+ self.process()
563
+ except Exception as err:
564
+ # suppress the NotImplementedError context
565
+ raise err from None
547
566
 
548
567
  def _copy_page_file(self, input_file : OcrdFileType) -> None:
549
568
  """
@@ -571,7 +590,6 @@ class Processor():
571
590
  local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'),
572
591
  mimetype=MIMETYPE_PAGE,
573
592
  content=to_xml(input_pcgts),
574
- force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
575
593
  )
576
594
 
577
595
  def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None:
@@ -613,6 +631,8 @@ class Processor():
613
631
  image_result.alternative_image.set_imageHeight(image_result.pil.height)
614
632
  elif isinstance(image_result.alternative_image, AlternativeImageType):
615
633
  image_result.alternative_image.set_filename(image_file_path)
634
+ elif image_result.alternative_image is None:
635
+ pass # do not reference in PAGE result
616
636
  else:
617
637
  raise ValueError(f"process_page_pcgts returned an OcrdPageResultImage of unknown type "
618
638
  f"{type(image_result.alternative_image)}")
@@ -622,7 +642,6 @@ class Processor():
622
642
  self.output_file_grp,
623
643
  page_id=page_id,
624
644
  file_path=image_file_path,
625
- force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
626
645
  )
627
646
  result.pcgts.set_pcGtsId(output_file_id)
628
647
  self.add_metadata(result.pcgts)
@@ -633,7 +652,6 @@ class Processor():
633
652
  local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'),
634
653
  mimetype=MIMETYPE_PAGE,
635
654
  content=to_xml(result.pcgts),
636
- force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
637
655
  )
638
656
 
639
657
  def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult:
@@ -47,7 +47,6 @@ class DummyProcessor(Processor):
47
47
  mimetype=input_file.mimetype,
48
48
  local_filename=local_filename,
49
49
  content=f.read(),
50
- force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
51
50
  )
52
51
  file_id = file_id + '_PAGE'
53
52
  pcgts = page_from_file(output_file)
@@ -62,7 +61,6 @@ class DummyProcessor(Processor):
62
61
  local_filename=join(self.output_file_grp, file_id + '.xml'),
63
62
  mimetype=MIMETYPE_PAGE,
64
63
  content=to_xml(pcgts),
65
- force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
66
64
  )
67
65
  else:
68
66
  if self.parameter['copy_files']:
ocrd/processor/helpers.py CHANGED
@@ -89,7 +89,7 @@ def run_processor(
89
89
 
90
90
  ocrd_tool = processor.ocrd_tool
91
91
  name = '%s v%s' % (ocrd_tool['executable'], processor.version)
92
- otherrole = ocrd_tool['steps'][0]
92
+ otherrole = ocrd_tool.get('steps', [''])[0]
93
93
  logProfile = getLogger('ocrd.process.profile')
94
94
  log.debug("Processor instance %s (%s doing %s)", processor, name, otherrole)
95
95
  t0_wall = perf_counter()
@@ -234,10 +234,10 @@ def get_cached_processor(parameter: dict, processor_class):
234
234
  def get_processor(
235
235
  processor_class,
236
236
  parameter: Optional[dict] = None,
237
- workspace: Workspace = None,
238
- page_id: str = None,
239
- input_file_grp: List[str] = None,
240
- output_file_grp: List[str] = None,
237
+ workspace: Optional[Workspace] = None,
238
+ page_id: Optional[str] = None,
239
+ input_file_grp: Optional[List[str]] = None,
240
+ output_file_grp: Optional[List[str]] = None,
241
241
  instance_caching: bool = False,
242
242
  ):
243
243
  if processor_class:
@@ -258,6 +258,7 @@ def get_processor(
258
258
  else:
259
259
  # avoid passing workspace already (deprecated chdir behaviour)
260
260
  processor = processor_class(None, parameter=parameter)
261
+ assert processor
261
262
  # set current processing parameters
262
263
  processor.workspace = workspace
263
264
  processor.page_id = page_id
@@ -1,5 +1,5 @@
1
1
  from dataclasses import dataclass, field
2
- from typing import List, Union
2
+ from typing import List, Union, Optional
3
3
  from ocrd_models.ocrd_page import OcrdPage
4
4
  from PIL.Image import Image
5
5
 
@@ -9,7 +9,7 @@ from ocrd_models.ocrd_page_generateds import AlternativeImageType, PageType
9
9
  class OcrdPageResultImage():
10
10
  pil : Image
11
11
  file_id_suffix : str
12
- alternative_image : Union[AlternativeImageType, PageType]
12
+ alternative_image : Optional[Union[AlternativeImageType, PageType]]
13
13
 
14
14
  @dataclass
15
15
  class OcrdPageResult():
ocrd/workspace.py CHANGED
@@ -19,6 +19,7 @@ from ocrd_models.ocrd_page import parse, BorderType, to_xml
19
19
  from ocrd_modelfactory import exif_from_filename, page_from_file
20
20
  from ocrd_utils import (
21
21
  atomic_write,
22
+ config,
22
23
  getLogger,
23
24
  image_from_polygon,
24
25
  coordinates_of_segment,
@@ -121,7 +122,10 @@ class Workspace():
121
122
  """
122
123
  Reload METS from the filesystem.
123
124
  """
124
- self.mets = OcrdMets(filename=self.mets_target)
125
+ if self.is_remote:
126
+ self.mets.reload()
127
+ else:
128
+ self.mets = OcrdMets(filename=self.mets_target)
125
129
 
126
130
  @deprecated_alias(pageId="page_id")
127
131
  @deprecated_alias(ID="file_id")
@@ -424,6 +428,8 @@ class Workspace():
424
428
  kwargs["pageId"] = kwargs.pop("page_id")
425
429
  if "file_id" in kwargs:
426
430
  kwargs["ID"] = kwargs.pop("file_id")
431
+ if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE':
432
+ kwargs["force"] = True
427
433
 
428
434
  ret = self.mets.add_file(file_grp, **kwargs)
429
435
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ocrd
3
- Version: 3.0.0b3
3
+ Version: 3.0.0b5
4
4
  Summary: OCR-D framework
5
5
  Author-email: Konstantin Baierer <unixprog@gmail.com>
6
6
  License: Apache License 2.0
@@ -1,36 +1,36 @@
1
1
  ocrd/__init__.py,sha256=ZswMVmlqFhAEIzMR3my6IKPq9XLH21aDPC_m_8Jh4dA,1076
2
2
  ocrd/constants.py,sha256=6dn3mG54WqHsKInmLZp4kJjNqqPtBoFoSuLUuRbOps0,740
3
- ocrd/lib.bash,sha256=dlOQd36OVsibwEPXg7i-e_JmIc-3uH-SIwGcE8OJ3Cc,10744
4
- ocrd/mets_server.py,sha256=SrCnCmWBuyEauHrNF0jLZO_Wi8WoPCMWxhec2wE5Y3w,20160
3
+ ocrd/lib.bash,sha256=le6XqAOEacdjP3JNSlPkxwRH1y0oVjNQM2tX5d6QFO4,10901
4
+ ocrd/mets_server.py,sha256=U62eih1_O_N0StunVFkEustFs2PlrcMzccraj6_QRk4,21295
5
5
  ocrd/ocrd-all-tool.json,sha256=9bX2VYnUwhTAzAvKaoT77BFzbgBGgyIt7qBqARpwWNc,586
6
6
  ocrd/resolver.py,sha256=Ba9ALQbTXz6_mla4VqN9tAfHoj6aKuNJAU4tIDnjcHE,14952
7
7
  ocrd/resource_list.yml,sha256=82-PiqkZnka1kTj3MQqNn4wXWKHHtoFchsQuetWuqFs,2633
8
8
  ocrd/resource_manager.py,sha256=8BMVKJq8J56hugi8vtGn9Ffuk7oRkbs197aG74aKbCY,16733
9
9
  ocrd/task_sequence.py,sha256=spiaUQaMM7M8WdBDoQGmLuTPm7tOugYXD6rcJ2UXzxw,6991
10
- ocrd/workspace.py,sha256=4s0qscEosS7rQ0jfn1qJeT9B3eC31YippAX-RUjXghA,65608
10
+ ocrd/workspace.py,sha256=cedqK7es2i2nwQCiUiVyWk3j4-nH7bsi6TF7v8siTio,65794
11
11
  ocrd/workspace_backup.py,sha256=iab_JjZ_mMP-G8NIUk4PZmfpNlQuGRoqc3NbTSSew1w,3621
12
12
  ocrd/workspace_bagger.py,sha256=yU8H3xR5WmQKvgQewac71ie-DUWcfLnMS01D55zsEHQ,11971
13
13
  ocrd/cli/__init__.py,sha256=lNR6wMf7JhQ8Jf33tUkowJr0mB3423OMY0_6dkMRLvU,2672
14
- ocrd/cli/bashlib.py,sha256=XGcO-MmYM3xJBRkSCLEZcGs0hqbw2GR8oyijJPtKnYM,5888
14
+ ocrd/cli/bashlib.py,sha256=ypFBM3-IULz_IEBx0Y04eGt9VbQWwEWm4ujm9g_hPWY,6009
15
15
  ocrd/cli/log.py,sha256=6_FrVmTKIIVNUaNLkuOJx8pvPhensHMuayJ0PA7T-XA,1562
16
16
  ocrd/cli/network.py,sha256=oWBHFEURxfUdb_t-F4svP_ri7o5mqBoNQnLZLbsZLTA,602
17
- ocrd/cli/ocrd_tool.py,sha256=Sqh9Q5-H3u5IPPovp9bpL-QaP01zAHKZDSsi5MhvnQ8,7028
17
+ ocrd/cli/ocrd_tool.py,sha256=qaJgt-LNH0tXkaupMNrEKXasxcgsabHdfLdYESEsomk,7035
18
18
  ocrd/cli/process.py,sha256=8KD0i7LT01H9u5CC1vktYMEVpS67da_rp_09_EOECmw,1233
19
19
  ocrd/cli/resmgr.py,sha256=bTE-MpF7RRCHhgAbknqZUFHgHScIK6FR3S4h4DEAets,10080
20
- ocrd/cli/validate.py,sha256=s5GtioCtO0UfRmYi_tdxNkKx0bJIzdAXGMCPA-PTbto,5563
21
- ocrd/cli/workspace.py,sha256=ETx3qjqUIXJp-Addv8eYmI2T_Js4PJR5UuZ4O6H7mis,40640
20
+ ocrd/cli/validate.py,sha256=nvageDaHCETcE71X5lu7i_4JKpgo9MrvJKinVPLYUTI,5727
21
+ ocrd/cli/workspace.py,sha256=InIQ5rfQWPn4Qsd1s_xA6AC6ndZLCsuyhoAEiqP8bK4,39479
22
22
  ocrd/cli/zip.py,sha256=MMJLw3OXWiJVfVtrdJcBkbB8vA1IzSautluazZRuCQ0,5910
23
23
  ocrd/decorators/__init__.py,sha256=IJlA1XcdVBO6Hxm9rNDya7QYcqeWcaXXuLtGjfjcen8,7596
24
24
  ocrd/decorators/loglevel_option.py,sha256=tgipROEu3t4hkwWvFssd80k2SbTBwBIC4WNE6Gc-XAg,798
25
25
  ocrd/decorators/mets_find_options.py,sha256=d4oATKMP6bFQHNqOK6nLqgUiWF2FYdkPvzkTVRMYpKo,635
26
- ocrd/decorators/ocrd_cli_options.py,sha256=4pcBLAFPSpYZLj6r9Yj1GZOQl4r_RWU00pyA4mHwFQk,2621
26
+ ocrd/decorators/ocrd_cli_options.py,sha256=hr2EugwAY_-GJ7F7g77Od9o9eAqhfLBHSpfmCql2OCU,2665
27
27
  ocrd/decorators/parameter_option.py,sha256=n8hYw7XVTd3i3tvpK8F1Jx_CqRp6EGF9qJVH95yj92Q,1076
28
28
  ocrd/processor/__init__.py,sha256=39ymNwYRdc-b_OJzzKmWCvo2ga3KdsGSYDHE1Hzkn_w,274
29
- ocrd/processor/base.py,sha256=341APZGx6zCbuxgX-XTkKhPfeQkqblykmC9zSMPH3ss,48843
30
- ocrd/processor/helpers.py,sha256=Lp9zbHYCLpT3GnPzl-p7UCSFU5Nx99gYEYXwW04v0RI,10157
31
- ocrd/processor/ocrd_page_result.py,sha256=AazEmnWyPEN47TxXVg0WUQpgFNV_mlIiExwwycUj0nQ,490
29
+ ocrd/processor/base.py,sha256=_TvaxKf_oMaIfuUcaFWas8YimhZ-l1d3RWGELlBJfy8,50307
30
+ ocrd/processor/helpers.py,sha256=vPYUri6ucuhdTNrideywriJ0fCa8UE2QyBXOmS-7RcQ,10232
31
+ ocrd/processor/ocrd_page_result.py,sha256=eDkpyVHcpaBzTHXiGrcNk9PP9Xr-XZru2w_uoX_ZeNA,510
32
32
  ocrd/processor/builtin/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
33
- ocrd/processor/builtin/dummy_processor.py,sha256=6ORike_59wb_UUivhA6Iw_Ldg0AaMzX37F7qL9R8S_A,3704
33
+ ocrd/processor/builtin/dummy_processor.py,sha256=iWiw_jJXOqwr7-hFjdkmTCCo1xGr6MLGOshx81PTu-8,3548
34
34
  ocrd/processor/builtin/dummy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
35
35
  ocrd/processor/builtin/dummy/ocrd-tool.json,sha256=aTA2FZRsRsrkbTctkazFeRu4xsTF6yCdeY07cMzOyt4,677
36
36
  ocrd_modelfactory/__init__.py,sha256=0baYSJXrOCTCguHkE6hBeqpGNVUe3aZUocv64A-DMDk,4094
@@ -42,7 +42,7 @@ ocrd_models/ocrd_exif.py,sha256=5BRLjvB6jg36V68i8jvVnT2SSNnpqLbhLsaMuP51Scw,4583
42
42
  ocrd_models/ocrd_file.py,sha256=7lyHezuNnl2FEYV1lV35-QTCrgYAL-3wO2ulFUNq2Ak,9717
43
43
  ocrd_models/ocrd_mets.py,sha256=h3y_WI5fVLsbBoUIRNH2ebjuO1-_P6T3BMIULX-ZOIs,42514
44
44
  ocrd_models/ocrd_page.py,sha256=sVIvvMeBT8eZnOfW0DTjQUNyu62-llz0v_Ga5Xo-tUM,5393
45
- ocrd_models/ocrd_page_generateds.py,sha256=XpzRaJAvGc_N58Mb2_Sk42S59pDqMRFiplqce1pUubw,772504
45
+ ocrd_models/ocrd_page_generateds.py,sha256=wfx3vESMAi08rl6-16zNVJe4E3B6APIvL6RCr1roAzg,774092
46
46
  ocrd_models/ocrd_xml_base.py,sha256=OW57mXLlwm1nH8CNefvXmwLRws9KL9zSrb-3vH--mX8,1641
47
47
  ocrd_models/report.py,sha256=luZxvzAAQyGYOlRNSJQUIUIANG81iGmBW5ag-uXxKCA,2026
48
48
  ocrd_models/utils.py,sha256=0_WHf5NEn1WC8MKJc6X_RK8gW-70Z09_mslkKOj7uF8,2369
@@ -95,7 +95,7 @@ ocrd_utils/introspect.py,sha256=gfBlmeEFuRmRUSgdSK0jOxRpYqDRXl2IAE6gv2MZ6as,1977
95
95
  ocrd_utils/logging.py,sha256=5_-5T5OWSYicNk8SQyjVqdRj2bVl-gDK1Th-C7oW_HE,8248
96
96
  ocrd_utils/ocrd_logging.conf,sha256=kl9x9JS1d8h8F0QZabvrjZtW1iApIaChvkImYafKO5g,3623
97
97
  ocrd_utils/os.py,sha256=acRRdDBI8L6BK0Mf773yKEzwdpZSFRBJEKB2crL4EjU,9865
98
- ocrd_utils/str.py,sha256=JIhsyWphqJuxJAzhRQJUqlZ44AGOeObEPJMFhfWhfhQ,10084
98
+ ocrd_utils/str.py,sha256=cRgqYILDGOAqWr0qrCrV52I3y4wvpwDVtnBGEUjXNS4,10116
99
99
  ocrd_validators/__init__.py,sha256=ZFc-UqRVBk9o1YesZFmr9lOepttNJ_NKx1Zdb7g_YsU,972
100
100
  ocrd_validators/bagit-profile.yml,sha256=sdQJlSi7TOn1E9WYMOZ1shewJ-i_nPaKmsAFkh28TGY,1011
101
101
  ocrd_validators/constants.py,sha256=FLP57T3F39weka_XovG40RgVMW1GunnbK04QRQ9tmlE,1802
@@ -118,9 +118,9 @@ ocrd_validators/xlink.xsd,sha256=8fW7YAMWXN2PbB_MMvj9H5ZeFoEBDzuYBtlGC8_6ijw,318
118
118
  ocrd_validators/xsd_mets_validator.py,sha256=4GWfLyqkmca0x7osDuXuExYuM0HWVrKoqn0S35sFhHU,467
119
119
  ocrd_validators/xsd_page_validator.py,sha256=BNz_9u-Ek4UCeyZu3KxSQoolfW9lvuaSR9nIu1XXxeE,467
120
120
  ocrd_validators/xsd_validator.py,sha256=6HrVAf6SzCvfUIuQdIzz9bOq4V-zhyii9yrUPoK2Uvo,2094
121
- ocrd-3.0.0b3.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
122
- ocrd-3.0.0b3.dist-info/METADATA,sha256=WZhPkJV0F8A5k-0IVK8HZ5zGWVWwYSa6FuDlpkuh4Xc,10397
123
- ocrd-3.0.0b3.dist-info/WHEEL,sha256=UvcQYKBHoFqaQd6LKyqHw9fxEolWLQnlzP0h_LgJAfI,91
124
- ocrd-3.0.0b3.dist-info/entry_points.txt,sha256=tV_gAdO8cbnOjS0GmKfJKbN60xBAV2DQRX6hEjleSjE,94
125
- ocrd-3.0.0b3.dist-info/top_level.txt,sha256=pUgiN42t4KXC5rvpi6V8atza31XP4SCznXpXlVlvomM,75
126
- ocrd-3.0.0b3.dist-info/RECORD,,
121
+ ocrd-3.0.0b5.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
122
+ ocrd-3.0.0b5.dist-info/METADATA,sha256=eIpkAoobj7QocP9VXYASLqE82wN35JlvYuYjSBLk30o,10397
123
+ ocrd-3.0.0b5.dist-info/WHEEL,sha256=5Mi1sN9lKoFv_gxcPtisEVrJZihrm_beibeg5R6xb4I,91
124
+ ocrd-3.0.0b5.dist-info/entry_points.txt,sha256=tV_gAdO8cbnOjS0GmKfJKbN60xBAV2DQRX6hEjleSjE,94
125
+ ocrd-3.0.0b5.dist-info/top_level.txt,sha256=pUgiN42t4KXC5rvpi6V8atza31XP4SCznXpXlVlvomM,75
126
+ ocrd-3.0.0b5.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (74.0.0)
2
+ Generator: setuptools (75.0.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -2,30 +2,28 @@
2
2
  # -*- coding: utf-8 -*-
3
3
 
4
4
  #
5
- # Generated Wed Nov 3 12:30:32 2021 by generateDS.py version 2.35.20.
6
- # Python 3.6.9 (default, Jan 26 2021, 15:33:00) [GCC 8.4.0]
5
+ # Generated Sat Sep 7 14:17:39 2024 by generateDS.py version 2.35.20.
6
+ # Python 3.8.17+ (heads/3.8-dirty:1663f8ba84, Aug 15 2023, 18:13:01) [GCC 8.3.0]
7
7
  #
8
8
  # Command line options:
9
9
  # ('-f', '')
10
10
  # ('--root-element', 'PcGts')
11
- # ('-o', 'ocrd_models/ocrd_models/ocrd_page_generateds.py')
11
+ # ('-o', 'src/ocrd_models/ocrd_page_generateds.py')
12
12
  # ('--silence', '')
13
13
  # ('--export', 'write etree')
14
14
  # ('--disable-generatedssuper-lookup', '')
15
- # ('--user-methods', 'ocrd_models/ocrd_page_user_methods.py')
15
+ # ('--user-methods', 'src/ocrd_page_user_methods.py')
16
16
  #
17
17
  # Command line arguments:
18
- # ocrd_validators/ocrd_validators/page.xsd
18
+ # src/ocrd_validators/page.xsd
19
19
  #
20
20
  # Command line:
21
- # /home/kba/monorepo/ocrd_all/venv/bin/generateDS -f --root-element="PcGts" -o "ocrd_models/ocrd_models/ocrd_page_generateds.py" --silence --export="write etree" --disable-generatedssuper-lookup --user-methods="ocrd_models/ocrd_page_user_methods.py" ocrd_validators/ocrd_validators/page.xsd
21
+ # /data/ocr-d/ocrd_all/venv38/bin/generateDS -f --root-element="PcGts" -o "src/ocrd_models/ocrd_page_generateds.py" --silence --export="write etree" --disable-generatedssuper-lookup --user-methods="src/ocrd_page_user_methods.py" src/ocrd_validators/page.xsd
22
22
  #
23
23
  # Current working directory (os.getcwd()):
24
24
  # core
25
25
  #
26
26
 
27
- # type: ignore
28
-
29
27
  from itertools import zip_longest
30
28
  import os
31
29
  import sys
@@ -223,7 +221,7 @@ class GeneratedsSuper(object):
223
221
  try:
224
222
  int(value)
225
223
  except (TypeError, ValueError):
226
- raise_parse_error(node, 'Requires sequence of integer values')
224
+ raise_parse_error(node, 'Requires sequence of integer valuess')
227
225
  return values
228
226
  def gds_format_float(self, input_data, input_name=''):
229
227
  return ('%.15f' % input_data).rstrip('0')
@@ -1230,9 +1228,10 @@ class PcGtsType(GeneratedsSuper):
1230
1228
  return hash(self.id)
1231
1229
  @property
1232
1230
  def id(self):
1231
+ from ocrd_utils import make_xml_id
1233
1232
  if hasattr(self, 'pcGtsId'):
1234
1233
  return self.pcGtsId or ''
1235
- return self.imageFilename
1234
+ return make_xml_id(self.imageFilename)
1236
1235
  def get_AllAlternativeImagePaths(self, page=True, region=True, line=True, word=True, glyph=True):
1237
1236
  """
1238
1237
  Get all the ``pc:AlternativeImage/@filename`` paths referenced in the PAGE-XML document.
@@ -3116,9 +3115,10 @@ class PageType(GeneratedsSuper):
3116
3115
  return hash(self.id)
3117
3116
  @property
3118
3117
  def id(self):
3118
+ from ocrd_utils import make_xml_id
3119
3119
  if hasattr(self, 'pcGtsId'):
3120
3120
  return self.pcGtsId or ''
3121
- return self.imageFilename
3121
+ return make_xml_id(self.imageFilename)
3122
3122
  # pylint: disable=line-too-long,invalid-name,protected-access,missing-module-docstring
3123
3123
  def _region_class(self, x): # pylint: disable=unused-argument
3124
3124
  return x.__class__.__name__.replace('RegionType', '')
@@ -3314,6 +3314,39 @@ class PageType(GeneratedsSuper):
3314
3314
  ret += lines if lo in ['top-to-bottom', 'left-to-right'] else list(reversed(lines))
3315
3315
  return ret
3316
3316
 
3317
+ def get_ReadingOrderGroups(self) -> dict:
3318
+ """
3319
+ Aggregate recursive ReadingOrder into a dictionary, mapping each regionRef
3320
+ (i.e. segment `@id`) to its referring group object (i.e one of
3321
+
3322
+ \b
3323
+ - :py:class:`.RegionRefType`
3324
+ - :py:class:`.RegionRefIndexedType`
3325
+ - :py:class:`.OrderedGroupType`
3326
+ - :py:class:`.OrderedGroupIndexedType`
3327
+ - :py:class:`.UnoderedGroupType`
3328
+ - :py:class:`.UnoderedGroupIndexedType`
3329
+ """
3330
+ def get_groupdict(group):
3331
+ regionrefs = list()
3332
+ if isinstance(group, (OrderedGroupType, OrderedGroupIndexedType)):
3333
+ regionrefs = (group.get_RegionRefIndexed() +
3334
+ group.get_OrderedGroupIndexed() +
3335
+ group.get_UnorderedGroupIndexed())
3336
+ if isinstance(group, (UnorderedGroupType, UnorderedGroupIndexedType)):
3337
+ regionrefs = (group.get_RegionRef() +
3338
+ group.get_OrderedGroup() +
3339
+ group.get_UnorderedGroup())
3340
+ refdict = {}
3341
+ for elem in regionrefs:
3342
+ refdict[elem.get_regionRef()] = elem
3343
+ if not isinstance(elem, (RegionRefType, RegionRefIndexedType)):
3344
+ refdict = {**refdict, **get_groupdict(elem)}
3345
+ return refdict
3346
+ ro = self.get_ReadingOrder()
3347
+ if ro is None:
3348
+ return {}
3349
+ return get_groupdict(ro.get_OrderedGroup() or ro.get_UnorderedGroup())
3317
3350
  def set_orientation(self, orientation):
3318
3351
  """
3319
3352
  Set deskewing angle to given `orientation` number.
ocrd_utils/str.py CHANGED
@@ -108,10 +108,11 @@ def make_xml_id(idstr: str) -> str:
108
108
  ret = idstr
109
109
  if not REGEX_FILE_ID.fullmatch(ret):
110
110
  ret = ret.replace(':', '_')
111
+ ret = ret.replace('/', '_')
111
112
  ret = re.sub(r'^([^a-zA-Z_])', r'id_\1', ret)
112
113
  ret = re.sub(r'[^\w.-]', r'', ret)
113
114
  return ret
114
-
115
+
115
116
  def nth_url_segment(url, n=-1):
116
117
  """
117
118
  Return the last /-delimited segment of a URL-like string