ocrd 3.0.0b3__py3-none-any.whl → 3.0.0b5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ocrd/cli/bashlib.py +6 -4
- ocrd/cli/ocrd_tool.py +1 -1
- ocrd/cli/validate.py +6 -3
- ocrd/cli/workspace.py +52 -47
- ocrd/decorators/ocrd_cli_options.py +1 -0
- ocrd/lib.bash +24 -21
- ocrd/mets_server.py +39 -8
- ocrd/processor/base.py +42 -24
- ocrd/processor/builtin/dummy_processor.py +0 -2
- ocrd/processor/helpers.py +6 -5
- ocrd/processor/ocrd_page_result.py +2 -2
- ocrd/workspace.py +7 -1
- {ocrd-3.0.0b3.dist-info → ocrd-3.0.0b5.dist-info}/METADATA +1 -1
- {ocrd-3.0.0b3.dist-info → ocrd-3.0.0b5.dist-info}/RECORD +20 -20
- {ocrd-3.0.0b3.dist-info → ocrd-3.0.0b5.dist-info}/WHEEL +1 -1
- ocrd_models/ocrd_page_generateds.py +44 -11
- ocrd_utils/str.py +2 -1
- {ocrd-3.0.0b3.dist-info → ocrd-3.0.0b5.dist-info}/LICENSE +0 -0
- {ocrd-3.0.0b3.dist-info → ocrd-3.0.0b5.dist-info}/entry_points.txt +0 -0
- {ocrd-3.0.0b3.dist-info → ocrd-3.0.0b5.dist-info}/top_level.txt +0 -0
ocrd/cli/bashlib.py
CHANGED
|
@@ -76,10 +76,10 @@ def bashlib_constants(name):
|
|
|
76
76
|
@click.option('--ocrd-tool', help="path to ocrd-tool.json of processor to feed", default=None)
|
|
77
77
|
@click.option('--executable', help="name of processor executable in ocrd-tool.json", default=None)
|
|
78
78
|
@click.option('-m', '--mets', help="METS to process", default=DEFAULT_METS_BASENAME)
|
|
79
|
-
@click.option('-
|
|
79
|
+
@click.option('-U', '--mets-server-url', help='TCP host URI or UDS path of METS server', default=None)
|
|
80
|
+
@click.option('-d', '--working-dir', help="Working Directory")
|
|
80
81
|
@click.option('-I', '--input-file-grp', help='File group(s) used as input.', default=None)
|
|
81
82
|
@click.option('-O', '--output-file-grp', help='File group(s) used as output.', default=None)
|
|
82
|
-
# repeat some other processor options for convenience (will be ignored here)
|
|
83
83
|
@click.option('-g', '--page-id', help="ID(s) of the pages to process")
|
|
84
84
|
@click.option('--overwrite', is_flag=True, default=False, help="Remove output pages/images if they already exist\n"
|
|
85
85
|
"(with '--page-id', remove only those).\n"
|
|
@@ -126,9 +126,10 @@ def bashlib_input_files(ocrd_tool, executable, **kwargs):
|
|
|
126
126
|
def executable(self):
|
|
127
127
|
# needed for ocrd_tool lookup
|
|
128
128
|
return executable
|
|
129
|
+
processor_class = FullBashlibProcessor
|
|
129
130
|
else:
|
|
130
131
|
# we have no true metadata file, so fill in just to make it work
|
|
131
|
-
class
|
|
132
|
+
class UnknownBashlibProcessor(BashlibProcessor):
|
|
132
133
|
@property
|
|
133
134
|
def ocrd_tool(self):
|
|
134
135
|
# needed to satisfy the validator
|
|
@@ -142,5 +143,6 @@ def bashlib_input_files(ocrd_tool, executable, **kwargs):
|
|
|
142
143
|
def version(self):
|
|
143
144
|
# needed to satisfy the validator and wrapper
|
|
144
145
|
return '1.0'
|
|
146
|
+
processor_class = UnknownBashlibProcessor
|
|
145
147
|
|
|
146
|
-
ocrd_cli_wrap_processor(
|
|
148
|
+
ocrd_cli_wrap_processor(processor_class, **kwargs)
|
ocrd/cli/ocrd_tool.py
CHANGED
|
@@ -125,7 +125,7 @@ def ocrd_tool_tool_list_resources(ctx):
|
|
|
125
125
|
@click.argument('res_name')
|
|
126
126
|
@pass_ocrd_tool
|
|
127
127
|
def ocrd_tool_tool_resolve_resource(ctx, res_name):
|
|
128
|
-
ctx.processor(None).resolve_resource(res_name)
|
|
128
|
+
print(ctx.processor(None).resolve_resource(res_name))
|
|
129
129
|
|
|
130
130
|
@ocrd_tool_tool.command('show-resource', help="Dump a tool's file resource")
|
|
131
131
|
@click.argument('res_name')
|
ocrd/cli/validate.py
CHANGED
|
@@ -102,16 +102,19 @@ def validate_page(page, **kwargs):
|
|
|
102
102
|
@validate_cli.command('tasks')
|
|
103
103
|
@click.option('--workspace', nargs=1, required=False, help='Workspace directory these tasks are to be run. If omitted, only validate syntax')
|
|
104
104
|
@click.option('-M', '--mets-basename', nargs=1, default=DEFAULT_METS_BASENAME, help='Basename of the METS file, used in conjunction with --workspace')
|
|
105
|
+
@click.option('-U', '--mets-server-url', help='TCP host URI or UDS path of METS server')
|
|
105
106
|
@click.option('--overwrite', is_flag=True, default=False, help='When checking against a concrete workspace, simulate overwriting output or page range.')
|
|
106
107
|
@click.option('-g', '--page-id', help="ID(s) of the pages to process")
|
|
107
108
|
@click.argument('tasks', nargs=-1, required=True)
|
|
108
|
-
def validate_process(tasks, workspace, mets_basename, overwrite, page_id):
|
|
109
|
+
def validate_process(tasks, workspace, mets_basename, mets_server_url, overwrite, page_id):
|
|
109
110
|
'''
|
|
110
111
|
Validate a sequence of tasks passable to `ocrd process`
|
|
111
112
|
'''
|
|
112
113
|
if workspace:
|
|
113
|
-
_inform_of_result(validate_tasks(
|
|
114
|
-
|
|
114
|
+
_inform_of_result(validate_tasks(
|
|
115
|
+
[ProcessorTask.parse(t) for t in tasks],
|
|
116
|
+
Workspace(Resolver(), directory=workspace, mets_basename=mets_basename, mets_server_url=mets_server_url),
|
|
117
|
+
page_id=page_id, overwrite=overwrite))
|
|
115
118
|
else:
|
|
116
119
|
for t in [ProcessorTask.parse(t) for t in tasks]:
|
|
117
120
|
_inform_of_result(t.validate())
|
ocrd/cli/workspace.py
CHANGED
|
@@ -36,6 +36,17 @@ class WorkspaceCtx():
|
|
|
36
36
|
= self.resolver.resolve_mets_arguments(directory, mets_url, mets_basename, mets_server_url)
|
|
37
37
|
self.automatic_backup = automatic_backup
|
|
38
38
|
|
|
39
|
+
def workspace(self):
|
|
40
|
+
return Workspace(
|
|
41
|
+
self.resolver,
|
|
42
|
+
directory=self.directory,
|
|
43
|
+
mets_basename=self.mets_basename,
|
|
44
|
+
automatic_backup=self.automatic_backup,
|
|
45
|
+
mets_server_url=self.mets_server_url,
|
|
46
|
+
)
|
|
47
|
+
def backup_manager(self):
|
|
48
|
+
return WorkspaceBackupManager(self.workspace())
|
|
49
|
+
|
|
39
50
|
|
|
40
51
|
pass_workspace = click.make_pass_decorator(WorkspaceCtx)
|
|
41
52
|
|
|
@@ -138,6 +149,7 @@ def workspace_clone(ctx, clobber_mets, download, file_grp, file_id, page_id, mim
|
|
|
138
149
|
LOG.warning(DeprecationWarning("Use 'ocrd workspace --directory DIR clone' instead of argument 'WORKSPACE_DIR' ('%s')" % workspace_dir))
|
|
139
150
|
ctx.directory = workspace_dir
|
|
140
151
|
|
|
152
|
+
assert not ctx.mets_server_url
|
|
141
153
|
workspace = ctx.resolver.workspace_from_url(
|
|
142
154
|
mets_url,
|
|
143
155
|
dst_dir=ctx.directory,
|
|
@@ -173,10 +185,11 @@ def workspace_init(ctx, clobber_mets, directory):
|
|
|
173
185
|
if directory:
|
|
174
186
|
LOG.warning(DeprecationWarning("Use 'ocrd workspace --directory DIR init' instead of argument 'DIRECTORY' ('%s')" % directory))
|
|
175
187
|
ctx.directory = directory
|
|
188
|
+
assert not ctx.mets_server_url
|
|
176
189
|
workspace = ctx.resolver.workspace_from_nothing(
|
|
177
190
|
directory=ctx.directory,
|
|
178
191
|
mets_basename=ctx.mets_basename,
|
|
179
|
-
clobber_mets=clobber_mets
|
|
192
|
+
clobber_mets=clobber_mets,
|
|
180
193
|
)
|
|
181
194
|
workspace.save_mets()
|
|
182
195
|
print(workspace.directory)
|
|
@@ -200,13 +213,7 @@ def workspace_add_file(ctx, file_grp, file_id, mimetype, page_id, ignore, check_
|
|
|
200
213
|
Add a file or http(s) URL FNAME to METS in a workspace.
|
|
201
214
|
If FNAME is not an http(s) URL and is not a workspace-local existing file, try to copy to workspace.
|
|
202
215
|
"""
|
|
203
|
-
workspace =
|
|
204
|
-
ctx.resolver,
|
|
205
|
-
directory=ctx.directory,
|
|
206
|
-
mets_basename=ctx.mets_basename,
|
|
207
|
-
automatic_backup=ctx.automatic_backup,
|
|
208
|
-
mets_server_url=ctx.mets_server_url,
|
|
209
|
-
)
|
|
216
|
+
workspace = ctx.workspace()
|
|
210
217
|
|
|
211
218
|
log = getLogger('ocrd.cli.workspace.add')
|
|
212
219
|
if not mimetype:
|
|
@@ -313,13 +320,7 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, local_fi
|
|
|
313
320
|
|
|
314
321
|
"""
|
|
315
322
|
log = getLogger('ocrd.cli.workspace.bulk-add') # pylint: disable=redefined-outer-name
|
|
316
|
-
workspace =
|
|
317
|
-
ctx.resolver,
|
|
318
|
-
directory=ctx.directory,
|
|
319
|
-
mets_basename=ctx.mets_basename,
|
|
320
|
-
automatic_backup=ctx.automatic_backup,
|
|
321
|
-
mets_server_url=ctx.mets_server_url,
|
|
322
|
-
)
|
|
323
|
+
workspace = ctx.workspace()
|
|
323
324
|
|
|
324
325
|
try:
|
|
325
326
|
pat = re.compile(regex)
|
|
@@ -455,12 +456,7 @@ def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, incl
|
|
|
455
456
|
output_field = [snake_to_camel.get(x, x) for x in output_field]
|
|
456
457
|
modified_mets = False
|
|
457
458
|
ret = []
|
|
458
|
-
workspace =
|
|
459
|
-
ctx.resolver,
|
|
460
|
-
directory=ctx.directory,
|
|
461
|
-
mets_basename=ctx.mets_basename,
|
|
462
|
-
mets_server_url=ctx.mets_server_url,
|
|
463
|
-
)
|
|
459
|
+
workspace = ctx.workspace()
|
|
464
460
|
with pushd_popd(workspace.directory):
|
|
465
461
|
for f in workspace.find_files(
|
|
466
462
|
file_id=file_id,
|
|
@@ -510,7 +506,7 @@ def workspace_remove_file(ctx, id, force, keep_file): # pylint: disable=redefin
|
|
|
510
506
|
(If any ``ID`` starts with ``//``, then its remainder
|
|
511
507
|
will be interpreted as a regular expression.)
|
|
512
508
|
"""
|
|
513
|
-
workspace =
|
|
509
|
+
workspace = ctx.workspace()
|
|
514
510
|
for i in id:
|
|
515
511
|
workspace.remove_file(i, force=force, keep_file=keep_file)
|
|
516
512
|
workspace.save_mets()
|
|
@@ -528,7 +524,7 @@ def rename_group(ctx, old, new):
|
|
|
528
524
|
"""
|
|
529
525
|
Rename fileGrp (USE attribute ``NEW`` to ``OLD``).
|
|
530
526
|
"""
|
|
531
|
-
workspace =
|
|
527
|
+
workspace = ctx.workspace()
|
|
532
528
|
workspace.rename_file_group(old, new)
|
|
533
529
|
workspace.save_mets()
|
|
534
530
|
|
|
@@ -549,7 +545,7 @@ def remove_group(ctx, group, recursive, force, keep_files):
|
|
|
549
545
|
(If any ``GROUP`` starts with ``//``, then its remainder
|
|
550
546
|
will be interpreted as a regular expression.)
|
|
551
547
|
"""
|
|
552
|
-
workspace =
|
|
548
|
+
workspace = ctx.workspace()
|
|
553
549
|
for g in group:
|
|
554
550
|
workspace.remove_file_group(g, recursive=recursive, force=force, keep_files=keep_files)
|
|
555
551
|
workspace.save_mets()
|
|
@@ -571,7 +567,7 @@ def prune_files(ctx, file_grp, mimetype, page_id, file_id):
|
|
|
571
567
|
(If any ``FILTER`` starts with ``//``, then its remainder
|
|
572
568
|
will be interpreted as a regular expression.)
|
|
573
569
|
"""
|
|
574
|
-
workspace =
|
|
570
|
+
workspace = ctx.workspace()
|
|
575
571
|
with pushd_popd(workspace.directory):
|
|
576
572
|
for f in workspace.find_files(
|
|
577
573
|
file_id=file_id,
|
|
@@ -608,8 +604,7 @@ def clean(ctx, dry_run, directories, path_glob):
|
|
|
608
604
|
If no PATH_GLOB are specified, then all files and directories
|
|
609
605
|
may match.
|
|
610
606
|
"""
|
|
611
|
-
|
|
612
|
-
workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
|
|
607
|
+
workspace = ctx.workspace()
|
|
613
608
|
allowed_files = [normpath(f.local_filename) for f in workspace.find_files(local_only=True)]
|
|
614
609
|
allowed_files.append(relpath(workspace.mets_target, start=workspace.directory))
|
|
615
610
|
allowed_dirs = set(dirname(path) for path in allowed_files)
|
|
@@ -627,7 +622,7 @@ def clean(ctx, dry_run, directories, path_glob):
|
|
|
627
622
|
if normpath(path) in allowed_files:
|
|
628
623
|
continue
|
|
629
624
|
if dry_run:
|
|
630
|
-
log.info('unlink(%s)' % path)
|
|
625
|
+
ctx.log.info('unlink(%s)' % path)
|
|
631
626
|
else:
|
|
632
627
|
unlink(path)
|
|
633
628
|
if not directories:
|
|
@@ -637,7 +632,7 @@ def clean(ctx, dry_run, directories, path_glob):
|
|
|
637
632
|
if normpath(path) in allowed_dirs:
|
|
638
633
|
continue
|
|
639
634
|
if dry_run:
|
|
640
|
-
log.info('rmdir(%s)' % path)
|
|
635
|
+
ctx.log.info('rmdir(%s)' % path)
|
|
641
636
|
else:
|
|
642
637
|
rmdir(path)
|
|
643
638
|
|
|
@@ -651,7 +646,7 @@ def list_groups(ctx):
|
|
|
651
646
|
"""
|
|
652
647
|
List fileGrp USE attributes
|
|
653
648
|
"""
|
|
654
|
-
workspace =
|
|
649
|
+
workspace = ctx.workspace()
|
|
655
650
|
print("\n".join(workspace.mets.file_groups))
|
|
656
651
|
|
|
657
652
|
# ----------------------------------------------------------------------
|
|
@@ -677,7 +672,7 @@ def list_pages(ctx, output_field, output_format, chunk_number, chunk_index, page
|
|
|
677
672
|
(If any ``FILTER`` starts with ``//``, then its remainder
|
|
678
673
|
will be interpreted as a regular expression.)
|
|
679
674
|
"""
|
|
680
|
-
workspace =
|
|
675
|
+
workspace = ctx.workspace()
|
|
681
676
|
find_kwargs = {}
|
|
682
677
|
if page_id_range and 'ID' in output_field:
|
|
683
678
|
find_kwargs['pageId'] = page_id_range
|
|
@@ -724,7 +719,7 @@ def get_id(ctx):
|
|
|
724
719
|
"""
|
|
725
720
|
Get METS id if any
|
|
726
721
|
"""
|
|
727
|
-
workspace =
|
|
722
|
+
workspace = ctx.workspace()
|
|
728
723
|
ID = workspace.mets.unique_identifier
|
|
729
724
|
if ID:
|
|
730
725
|
print(ID)
|
|
@@ -744,7 +739,7 @@ def set_id(ctx, id): # pylint: disable=redefined-builtin
|
|
|
744
739
|
|
|
745
740
|
Otherwise will create a new <mods:identifier type="purl">{{ ID }}</mods:identifier>.
|
|
746
741
|
"""
|
|
747
|
-
workspace =
|
|
742
|
+
workspace = ctx.workspace()
|
|
748
743
|
workspace.mets.unique_identifier = id
|
|
749
744
|
workspace.save_mets()
|
|
750
745
|
|
|
@@ -767,7 +762,7 @@ def update_page(ctx, attr_value_pairs, order, orderlabel, contentids, page_id):
|
|
|
767
762
|
if contentids:
|
|
768
763
|
update_kwargs['CONTENTIDS'] = contentids
|
|
769
764
|
try:
|
|
770
|
-
workspace =
|
|
765
|
+
workspace = ctx.workspace()
|
|
771
766
|
workspace.mets.update_physical_page_attributes(page_id, **update_kwargs)
|
|
772
767
|
workspace.save_mets()
|
|
773
768
|
except Exception as err:
|
|
@@ -805,7 +800,7 @@ def merge(ctx, overwrite, force, copy_files, filegrp_mapping, fileid_mapping, pa
|
|
|
805
800
|
mets_path = Path(mets_path)
|
|
806
801
|
if filegrp_mapping:
|
|
807
802
|
filegrp_mapping = loads(filegrp_mapping)
|
|
808
|
-
workspace =
|
|
803
|
+
workspace = ctx.workspace()
|
|
809
804
|
other_workspace = Workspace(ctx.resolver, directory=str(mets_path.parent), mets_basename=str(mets_path.name))
|
|
810
805
|
workspace.merge(
|
|
811
806
|
other_workspace,
|
|
@@ -829,11 +824,12 @@ def merge(ctx, overwrite, force, copy_files, filegrp_mapping, fileid_mapping, pa
|
|
|
829
824
|
# ----------------------------------------------------------------------
|
|
830
825
|
|
|
831
826
|
@workspace_cli.group('backup')
|
|
832
|
-
@
|
|
827
|
+
@pass_workspace
|
|
833
828
|
def workspace_backup_cli(ctx): # pylint: disable=unused-argument
|
|
834
829
|
"""
|
|
835
830
|
Backing and restoring workspaces - dev edition
|
|
836
831
|
"""
|
|
832
|
+
assert not ctx.mets_server_url, "Workspace backups currently not interoperable with METS Server"
|
|
837
833
|
|
|
838
834
|
@workspace_backup_cli.command('add')
|
|
839
835
|
@pass_workspace
|
|
@@ -841,7 +837,7 @@ def workspace_backup_add(ctx):
|
|
|
841
837
|
"""
|
|
842
838
|
Create a new backup
|
|
843
839
|
"""
|
|
844
|
-
backup_manager =
|
|
840
|
+
backup_manager = ctx.backup_manager()
|
|
845
841
|
backup_manager.add()
|
|
846
842
|
|
|
847
843
|
@workspace_backup_cli.command('list')
|
|
@@ -850,7 +846,7 @@ def workspace_backup_list(ctx):
|
|
|
850
846
|
"""
|
|
851
847
|
List backups
|
|
852
848
|
"""
|
|
853
|
-
backup_manager =
|
|
849
|
+
backup_manager = ctx.backup_manager()
|
|
854
850
|
for b in backup_manager.list():
|
|
855
851
|
print(b)
|
|
856
852
|
|
|
@@ -862,7 +858,7 @@ def workspace_backup_restore(ctx, choose_first, bak):
|
|
|
862
858
|
"""
|
|
863
859
|
Restore backup BAK
|
|
864
860
|
"""
|
|
865
|
-
backup_manager =
|
|
861
|
+
backup_manager = ctx.backup_manager()
|
|
866
862
|
backup_manager.restore(bak, choose_first)
|
|
867
863
|
|
|
868
864
|
@workspace_backup_cli.command('undo')
|
|
@@ -871,7 +867,7 @@ def workspace_backup_undo(ctx):
|
|
|
871
867
|
"""
|
|
872
868
|
Restore the last backup
|
|
873
869
|
"""
|
|
874
|
-
backup_manager =
|
|
870
|
+
backup_manager = ctx.backup_manager()
|
|
875
871
|
backup_manager.undo()
|
|
876
872
|
|
|
877
873
|
|
|
@@ -888,15 +884,24 @@ def workspace_serve_cli(ctx): # pylint: disable=unused-argument
|
|
|
888
884
|
@workspace_serve_cli.command('stop')
|
|
889
885
|
@pass_workspace
|
|
890
886
|
def workspace_serve_stop(ctx): # pylint: disable=unused-argument
|
|
891
|
-
"""Stop the METS server"""
|
|
892
|
-
workspace =
|
|
893
|
-
ctx.resolver,
|
|
894
|
-
directory=ctx.directory,
|
|
895
|
-
mets_basename=ctx.mets_basename,
|
|
896
|
-
mets_server_url=ctx.mets_server_url,
|
|
897
|
-
)
|
|
887
|
+
"""Stop the METS server (saving changes to disk)"""
|
|
888
|
+
workspace = ctx.workspace()
|
|
898
889
|
workspace.mets.stop()
|
|
899
890
|
|
|
891
|
+
@workspace_serve_cli.command('reload')
|
|
892
|
+
@pass_workspace
|
|
893
|
+
def workspace_serve_reload(ctx): # pylint: disable=unused-argument
|
|
894
|
+
"""Reload the METS server from disk"""
|
|
895
|
+
workspace = ctx.workspace()
|
|
896
|
+
workspace.mets.reload()
|
|
897
|
+
|
|
898
|
+
@workspace_serve_cli.command('save')
|
|
899
|
+
@pass_workspace
|
|
900
|
+
def workspace_serve_save(ctx): # pylint: disable=unused-argument
|
|
901
|
+
"""Save the METS changes to disk"""
|
|
902
|
+
workspace = ctx.workspace()
|
|
903
|
+
workspace.mets.save()
|
|
904
|
+
|
|
900
905
|
@workspace_serve_cli.command('start')
|
|
901
906
|
@pass_workspace
|
|
902
907
|
def workspace_serve_start(ctx): # pylint: disable=unused-argument
|
|
@@ -43,6 +43,7 @@ def ocrd_cli_options(f):
|
|
|
43
43
|
option('--address', type=ServerAddressParamType()),
|
|
44
44
|
option('--queue', type=QueueServerParamType()),
|
|
45
45
|
option('--database', type=DatabaseParamType()),
|
|
46
|
+
option('-R', '--resolve-resource'),
|
|
46
47
|
option('-C', '--show-resource'),
|
|
47
48
|
option('-L', '--list-resources', is_flag=True, default=False),
|
|
48
49
|
option('-J', '--dump-json', is_flag=True, default=False),
|
ocrd/lib.bash
CHANGED
|
@@ -27,8 +27,8 @@ ocrd__log () {
|
|
|
27
27
|
## Ensure minimum version
|
|
28
28
|
# ht https://stackoverflow.com/posts/4025065
|
|
29
29
|
ocrd__minversion () {
|
|
30
|
-
local minversion_raw="$1"
|
|
31
30
|
set -e
|
|
31
|
+
local minversion_raw="$1"
|
|
32
32
|
local version_raw=$(ocrd --version|sed 's/ocrd, version //')
|
|
33
33
|
local version_mmp=$(echo "$version_raw" | grep -Eo '([0-9]+\.?){3}')
|
|
34
34
|
local version_prerelease_suffix="${version_raw#$version_mmp}"
|
|
@@ -123,6 +123,7 @@ ocrd__usage () {
|
|
|
123
123
|
## declare -A ocrd__argv=()
|
|
124
124
|
## ```
|
|
125
125
|
ocrd__parse_argv () {
|
|
126
|
+
set -e
|
|
126
127
|
|
|
127
128
|
# if [[ -n "$ZSH_VERSION" ]];then
|
|
128
129
|
# print -r -- ${+ocrd__argv} ${(t)ocrd__argv}
|
|
@@ -135,11 +136,16 @@ ocrd__parse_argv () {
|
|
|
135
136
|
ocrd__raise "Must set \$params (declare -A params)"
|
|
136
137
|
fi
|
|
137
138
|
|
|
139
|
+
if ! declare -p "params_json" >/dev/null 2>/dev/null ;then
|
|
140
|
+
ocrd__raise "Must set \$params_json (declare params_json)"
|
|
141
|
+
fi
|
|
142
|
+
|
|
138
143
|
if [[ $# = 0 ]];then
|
|
139
144
|
ocrd__usage
|
|
140
145
|
exit 1
|
|
141
146
|
fi
|
|
142
147
|
|
|
148
|
+
ocrd__argv[debug]=false
|
|
143
149
|
ocrd__argv[overwrite]=false
|
|
144
150
|
ocrd__argv[profile]=false
|
|
145
151
|
ocrd__argv[profile_file]=
|
|
@@ -170,6 +176,7 @@ ocrd__parse_argv () {
|
|
|
170
176
|
-w|--working-dir) ocrd__argv[working_dir]=$(realpath "$2") ; shift ;;
|
|
171
177
|
-m|--mets) ocrd__argv[mets_file]=$(realpath "$2") ; shift ;;
|
|
172
178
|
-U|--mets-server-url) ocrd__argv[mets_server_url]="$2" ; shift ;;
|
|
179
|
+
--debug) ocrd__argv[debug]=true ;;
|
|
173
180
|
--overwrite) ocrd__argv[overwrite]=true ;;
|
|
174
181
|
--profile) ocrd__argv[profile]=true ;;
|
|
175
182
|
--profile-file) ocrd__argv[profile_file]=$(realpath "$2") ; shift ;;
|
|
@@ -242,17 +249,6 @@ ocrd__parse_argv () {
|
|
|
242
249
|
trap showtime DEBUG
|
|
243
250
|
fi
|
|
244
251
|
|
|
245
|
-
# check fileGrps
|
|
246
|
-
local _valopts=( --workspace "${ocrd__argv[working_dir]}" --mets-basename "$(basename ${ocrd__argv[mets_file]})" )
|
|
247
|
-
if [[ ${ocrd__argv[overwrite]} = true ]]; then
|
|
248
|
-
_valopts+=( --overwrite )
|
|
249
|
-
fi
|
|
250
|
-
if [[ -n "${ocrd__argv[page_id]:-}" ]]; then
|
|
251
|
-
_valopts+=( --page-id "${ocrd__argv[page_id]}" )
|
|
252
|
-
fi
|
|
253
|
-
_valopts+=( "${OCRD_TOOL_NAME#ocrd-} -I ${ocrd__argv[input_file_grp]} -O ${ocrd__argv[output_file_grp]} ${__parameters[*]@Q} ${__parameter_overrides[*]@Q}" )
|
|
254
|
-
ocrd validate tasks "${_valopts[@]}" || exit $?
|
|
255
|
-
|
|
256
252
|
# check parameters
|
|
257
253
|
local params_parsed retval
|
|
258
254
|
params_parsed="$(ocrd ocrd-tool "$OCRD_TOOL_JSON" tool $OCRD_TOOL_NAME parse-params "${__parameters[@]}" "${__parameter_overrides[@]}")" || {
|
|
@@ -261,10 +257,12 @@ ocrd__parse_argv () {
|
|
|
261
257
|
$params_parsed"
|
|
262
258
|
}
|
|
263
259
|
eval "$params_parsed"
|
|
260
|
+
params_json="$(ocrd ocrd-tool "$OCRD_TOOL_JSON" tool $OCRD_TOOL_NAME parse-params --json "${__parameters[@]}" "${__parameter_overrides[@]}")"
|
|
264
261
|
|
|
265
262
|
}
|
|
266
263
|
|
|
267
264
|
ocrd__wrap () {
|
|
265
|
+
set -e
|
|
268
266
|
|
|
269
267
|
declare -gx OCRD_TOOL_JSON="$1"
|
|
270
268
|
declare -gx OCRD_TOOL_NAME="$2"
|
|
@@ -272,6 +270,7 @@ ocrd__wrap () {
|
|
|
272
270
|
shift
|
|
273
271
|
declare -Agx params
|
|
274
272
|
params=()
|
|
273
|
+
declare -g params_json
|
|
275
274
|
declare -Agx ocrd__argv
|
|
276
275
|
ocrd__argv=()
|
|
277
276
|
|
|
@@ -293,22 +292,26 @@ ocrd__wrap () {
|
|
|
293
292
|
|
|
294
293
|
ocrd__parse_argv "$@"
|
|
295
294
|
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
eval declare -Ag "ocrd__file$i=( $line )"
|
|
300
|
-
eval "ocrd__files[$i]=ocrd__file$i"
|
|
301
|
-
let ++i
|
|
302
|
-
done < <(ocrd bashlib input-files \
|
|
295
|
+
declare -ag ocrd__files
|
|
296
|
+
IFS=$'\n'
|
|
297
|
+
ocrd__files=( $(ocrd bashlib input-files \
|
|
303
298
|
--ocrd-tool $OCRD_TOOL_JSON \
|
|
304
299
|
--executable $OCRD_TOOL_NAME \
|
|
300
|
+
$(if [[ ${ocrd__argv[debug]} = true ]]; then echo --debug; fi) \
|
|
301
|
+
$(if [[ ${ocrd__argv[overwrite]} = true ]]; then echo --overwrite; fi) \
|
|
305
302
|
-m "${ocrd__argv[mets_file]}" \
|
|
303
|
+
-d "${ocrd__argv[working_dir]}" \
|
|
304
|
+
${ocrd__argv[mets_server_url]:+-U} ${ocrd__argv[mets_server_url]:-} \
|
|
305
|
+
-p "$params_json" \
|
|
306
306
|
-I "${ocrd__argv[input_file_grp]}" \
|
|
307
307
|
-O "${ocrd__argv[output_file_grp]}" \
|
|
308
|
-
${ocrd__argv[page_id]:+-g} ${ocrd__argv[page_id]:-})
|
|
308
|
+
${ocrd__argv[page_id]:+-g} ${ocrd__argv[page_id]:-}) )
|
|
309
|
+
IFS=$' \t\n'
|
|
309
310
|
}
|
|
310
311
|
|
|
311
312
|
## usage: pageId=$(ocrd__input_file 3 pageId)
|
|
312
313
|
ocrd__input_file() {
|
|
313
|
-
|
|
314
|
+
declare -A input_file
|
|
315
|
+
eval input_file=( "${ocrd__files[$1]}" )
|
|
316
|
+
eval echo "${input_file[$2]}"
|
|
314
317
|
}
|
ocrd/mets_server.py
CHANGED
|
@@ -88,6 +88,14 @@ class OcrdFileGroupListModel(BaseModel):
|
|
|
88
88
|
return OcrdFileGroupListModel(file_groups=file_groups)
|
|
89
89
|
|
|
90
90
|
|
|
91
|
+
class OcrdPageListModel(BaseModel):
|
|
92
|
+
physical_pages: List[str] = Field()
|
|
93
|
+
|
|
94
|
+
@staticmethod
|
|
95
|
+
def create(physical_pages: List[str]):
|
|
96
|
+
return OcrdPageListModel(physical_pages=physical_pages)
|
|
97
|
+
|
|
98
|
+
|
|
91
99
|
class OcrdAgentListModel(BaseModel):
|
|
92
100
|
agents: List[OcrdAgentModel] = Field()
|
|
93
101
|
|
|
@@ -210,6 +218,17 @@ class ClientSideOcrdMets:
|
|
|
210
218
|
).json()["text"]
|
|
211
219
|
return self.ws_dir_path
|
|
212
220
|
|
|
221
|
+
@property
|
|
222
|
+
def physical_pages(self) -> List[str]:
|
|
223
|
+
if not self.multiplexing_mode:
|
|
224
|
+
return self.session.request("GET", f"{self.url}/physical_pages").json()["physical_pages"]
|
|
225
|
+
else:
|
|
226
|
+
return self.session.request(
|
|
227
|
+
"POST",
|
|
228
|
+
self.url,
|
|
229
|
+
json=MpxReq.physical_pages(self.ws_dir_path)
|
|
230
|
+
).json()["physical_pages"]
|
|
231
|
+
|
|
213
232
|
@property
|
|
214
233
|
def file_groups(self):
|
|
215
234
|
if not self.multiplexing_mode:
|
|
@@ -284,15 +303,17 @@ class ClientSideOcrdMets:
|
|
|
284
303
|
file_id=ID, page_id=pageId,
|
|
285
304
|
mimetype=mimetype, url=url, local_filename=local_filename
|
|
286
305
|
)
|
|
306
|
+
# add force+ignore
|
|
307
|
+
kwargs = {**kwargs, **data.dict()}
|
|
287
308
|
|
|
288
309
|
if not self.multiplexing_mode:
|
|
289
|
-
r = self.session.request("POST", f"{self.url}/file", data=
|
|
290
|
-
if not r:
|
|
291
|
-
raise RuntimeError("
|
|
310
|
+
r = self.session.request("POST", f"{self.url}/file", data=kwargs)
|
|
311
|
+
if not r.ok:
|
|
312
|
+
raise RuntimeError(f"Failed to add file ({str(data)}): {r.json()}")
|
|
292
313
|
else:
|
|
293
|
-
r = self.session.request("POST", self.url, json=MpxReq.add_file(self.ws_dir_path,
|
|
294
|
-
if
|
|
295
|
-
raise RuntimeError(f"
|
|
314
|
+
r = self.session.request("POST", self.url, json=MpxReq.add_file(self.ws_dir_path, kwargs))
|
|
315
|
+
if not r.ok:
|
|
316
|
+
raise RuntimeError(f"Failed to add file ({str(data)}): {r.json()[errors]}")
|
|
296
317
|
|
|
297
318
|
return ClientSideOcrdFile(
|
|
298
319
|
None, fileGrp=file_grp,
|
|
@@ -347,6 +368,11 @@ class MpxReq:
|
|
|
347
368
|
return MpxReq.__args_wrapper(
|
|
348
369
|
ws_dir_path, method_type="GET", response_type="text", request_url="workspace_path", request_data={})
|
|
349
370
|
|
|
371
|
+
@staticmethod
|
|
372
|
+
def physical_pages(ws_dir_path: str) -> Dict:
|
|
373
|
+
return MpxReq.__args_wrapper(
|
|
374
|
+
ws_dir_path, method_type="GET", response_type="dict", request_url="physical_pages", request_data={})
|
|
375
|
+
|
|
350
376
|
@staticmethod
|
|
351
377
|
def file_groups(ws_dir_path: str) -> Dict:
|
|
352
378
|
return MpxReq.__args_wrapper(
|
|
@@ -466,6 +492,10 @@ class OcrdMetsServer:
|
|
|
466
492
|
async def workspace_path():
|
|
467
493
|
return Response(content=workspace.directory, media_type="text/plain")
|
|
468
494
|
|
|
495
|
+
@app.get(path='/physical_pages', response_model=OcrdPageListModel)
|
|
496
|
+
async def physical_pages():
|
|
497
|
+
return {'physical_pages': workspace.mets.physical_pages}
|
|
498
|
+
|
|
469
499
|
@app.get(path='/file_groups', response_model=OcrdFileGroupListModel)
|
|
470
500
|
async def file_groups():
|
|
471
501
|
return {'file_groups': workspace.mets.file_groups}
|
|
@@ -505,7 +535,8 @@ class OcrdMetsServer:
|
|
|
505
535
|
page_id: Optional[str] = Form(),
|
|
506
536
|
mimetype: str = Form(),
|
|
507
537
|
url: Optional[str] = Form(None),
|
|
508
|
-
local_filename: Optional[str] = Form(None)
|
|
538
|
+
local_filename: Optional[str] = Form(None),
|
|
539
|
+
force: bool = Form(False),
|
|
509
540
|
):
|
|
510
541
|
"""
|
|
511
542
|
Add a file
|
|
@@ -517,7 +548,7 @@ class OcrdMetsServer:
|
|
|
517
548
|
)
|
|
518
549
|
# Add to workspace
|
|
519
550
|
kwargs = file_resource.dict()
|
|
520
|
-
workspace.add_file(**kwargs)
|
|
551
|
+
workspace.add_file(**kwargs, force=force)
|
|
521
552
|
return file_resource
|
|
522
553
|
|
|
523
554
|
# ------------- #
|
ocrd/processor/base.py
CHANGED
|
@@ -16,7 +16,7 @@ import json
|
|
|
16
16
|
import os
|
|
17
17
|
from os import getcwd
|
|
18
18
|
from pathlib import Path
|
|
19
|
-
from typing import List, Optional, Union, get_args
|
|
19
|
+
from typing import Any, List, Optional, Union, get_args
|
|
20
20
|
import sys
|
|
21
21
|
import inspect
|
|
22
22
|
import tarfile
|
|
@@ -166,11 +166,14 @@ class Processor():
|
|
|
166
166
|
|
|
167
167
|
(Override if ``ocrd-tool.json`` is not distributed with the Python package.)
|
|
168
168
|
"""
|
|
169
|
-
|
|
170
|
-
module_tokens =
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
169
|
+
module = inspect.getmodule(self)
|
|
170
|
+
module_tokens = module.__package__.split('.')
|
|
171
|
+
# for namespace packages, we cannot just use the first token
|
|
172
|
+
for i in range(len(module_tokens)):
|
|
173
|
+
prefix = '.'.join(module_tokens[:i + 1])
|
|
174
|
+
if sys.modules[prefix].__spec__.has_location:
|
|
175
|
+
return resource_filename(prefix, self.metadata_filename)
|
|
176
|
+
raise Exception("cannot find top-level module prefix for %s", module.__package__)
|
|
174
177
|
|
|
175
178
|
@cached_property
|
|
176
179
|
def metadata_rawdict(self) -> dict:
|
|
@@ -336,7 +339,7 @@ class Processor():
|
|
|
336
339
|
self._finalizer = weakref.finalize(self, self.shutdown)
|
|
337
340
|
# workaround for deprecated#72 (@deprecated decorator does not work for subclasses):
|
|
338
341
|
setattr(self, 'process',
|
|
339
|
-
deprecated(version='3.0', reason='process() should be replaced with
|
|
342
|
+
deprecated(version='3.0', reason='process() should be replaced with process_page_pcgts() or process_page_file() or process_workspace()')(getattr(self, 'process')))
|
|
340
343
|
|
|
341
344
|
def show_help(self, subcommand=None):
|
|
342
345
|
"""
|
|
@@ -355,6 +358,7 @@ class Processor():
|
|
|
355
358
|
"""
|
|
356
359
|
Verify that :py:attr:`input_file_grp` and :py:attr:`output_file_grp` fulfill the processor's requirements.
|
|
357
360
|
"""
|
|
361
|
+
# verify input and output file groups in parameters
|
|
358
362
|
assert self.input_file_grp is not None
|
|
359
363
|
assert self.output_file_grp is not None
|
|
360
364
|
input_file_grps = self.input_file_grp.split(',')
|
|
@@ -371,12 +375,23 @@ class Processor():
|
|
|
371
375
|
assert len(grps) >= minimum, msg % (len(grps), str(spec))
|
|
372
376
|
if maximum > 0:
|
|
373
377
|
assert len(grps) <= maximum, msg % (len(grps), str(spec))
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
+
# FIXME: enforce unconditionally as soon as grace period for deprecation is over
|
|
379
|
+
if 'input_file_grp_cardinality' in self.ocrd_tool:
|
|
380
|
+
assert_file_grp_cardinality(input_file_grps, self.ocrd_tool['input_file_grp_cardinality'],
|
|
381
|
+
"Unexpected number of input file groups %d vs %s")
|
|
382
|
+
if 'output_file_grp_cardinality' in self.ocrd_tool:
|
|
383
|
+
assert_file_grp_cardinality(output_file_grps, self.ocrd_tool['output_file_grp_cardinality'],
|
|
384
|
+
"Unexpected number of output file groups %d vs %s")
|
|
385
|
+
# verify input and output file groups in METS
|
|
378
386
|
for input_file_grp in input_file_grps:
|
|
379
|
-
assert input_file_grp in self.workspace.mets.file_groups
|
|
387
|
+
assert input_file_grp in self.workspace.mets.file_groups, \
|
|
388
|
+
f"input fileGrp {input_file_grp} does not exist in workspace {self.workspace}"
|
|
389
|
+
for output_file_grp in output_file_grps:
|
|
390
|
+
assert output_file_grp not in self.workspace.mets.file_groups \
|
|
391
|
+
or config.OCRD_EXISTING_OUTPUT in ['OVERWRITE', 'SKIP'] \
|
|
392
|
+
or not any(self.workspace.mets.find_files(
|
|
393
|
+
pageId=self.page_id, fileGrp=output_file_grp)), \
|
|
394
|
+
f"output fileGrp {output_file_grp} already exists in workspace {self.workspace}"
|
|
380
395
|
# keep this for backwards compatibility:
|
|
381
396
|
return True
|
|
382
397
|
|
|
@@ -455,17 +470,17 @@ class Processor():
|
|
|
455
470
|
nr_copied = 0
|
|
456
471
|
|
|
457
472
|
# set up multithreading
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
max_workers =
|
|
473
|
+
max_workers = max(0, config.OCRD_MAX_PARALLEL_PAGES)
|
|
474
|
+
if self.max_workers > 0 and self.max_workers < config.OCRD_MAX_PARALLEL_PAGES:
|
|
475
|
+
self._base_logger.info("limiting number of threads from %d to %d", max_workers, self.max_workers)
|
|
476
|
+
max_workers = self.max_workers
|
|
462
477
|
if max_workers > 1:
|
|
463
478
|
assert isinstance(workspace.mets, ClientSideOcrdMets), \
|
|
464
479
|
"OCRD_MAX_PARALLEL_PAGES>1 requires also using --mets-server-url"
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
max_seconds =
|
|
480
|
+
max_seconds = max(0, config.OCRD_PROCESSING_PAGE_TIMEOUT)
|
|
481
|
+
if self.max_page_seconds > 0 and self.max_page_seconds < config.OCRD_PROCESSING_PAGE_TIMEOUT:
|
|
482
|
+
self._base_logger.info("limiting page timeout from %d to %d sec", max_seconds, self.max_page_seconds)
|
|
483
|
+
max_seconds = self.max_page_seconds
|
|
469
484
|
executor = ThreadPoolExecutor(
|
|
470
485
|
max_workers=max_workers or 1,
|
|
471
486
|
thread_name_prefix=f"pagetask.{workspace.mets.unique_identifier}"
|
|
@@ -543,7 +558,11 @@ class Processor():
|
|
|
543
558
|
|
|
544
559
|
except NotImplementedError:
|
|
545
560
|
# fall back to deprecated method
|
|
546
|
-
|
|
561
|
+
try:
|
|
562
|
+
self.process()
|
|
563
|
+
except Exception as err:
|
|
564
|
+
# suppress the NotImplementedError context
|
|
565
|
+
raise err from None
|
|
547
566
|
|
|
548
567
|
def _copy_page_file(self, input_file : OcrdFileType) -> None:
|
|
549
568
|
"""
|
|
@@ -571,7 +590,6 @@ class Processor():
|
|
|
571
590
|
local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'),
|
|
572
591
|
mimetype=MIMETYPE_PAGE,
|
|
573
592
|
content=to_xml(input_pcgts),
|
|
574
|
-
force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
|
|
575
593
|
)
|
|
576
594
|
|
|
577
595
|
def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None:
|
|
@@ -613,6 +631,8 @@ class Processor():
|
|
|
613
631
|
image_result.alternative_image.set_imageHeight(image_result.pil.height)
|
|
614
632
|
elif isinstance(image_result.alternative_image, AlternativeImageType):
|
|
615
633
|
image_result.alternative_image.set_filename(image_file_path)
|
|
634
|
+
elif image_result.alternative_image is None:
|
|
635
|
+
pass # do not reference in PAGE result
|
|
616
636
|
else:
|
|
617
637
|
raise ValueError(f"process_page_pcgts returned an OcrdPageResultImage of unknown type "
|
|
618
638
|
f"{type(image_result.alternative_image)}")
|
|
@@ -622,7 +642,6 @@ class Processor():
|
|
|
622
642
|
self.output_file_grp,
|
|
623
643
|
page_id=page_id,
|
|
624
644
|
file_path=image_file_path,
|
|
625
|
-
force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
|
|
626
645
|
)
|
|
627
646
|
result.pcgts.set_pcGtsId(output_file_id)
|
|
628
647
|
self.add_metadata(result.pcgts)
|
|
@@ -633,7 +652,6 @@ class Processor():
|
|
|
633
652
|
local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'),
|
|
634
653
|
mimetype=MIMETYPE_PAGE,
|
|
635
654
|
content=to_xml(result.pcgts),
|
|
636
|
-
force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
|
|
637
655
|
)
|
|
638
656
|
|
|
639
657
|
def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult:
|
|
@@ -47,7 +47,6 @@ class DummyProcessor(Processor):
|
|
|
47
47
|
mimetype=input_file.mimetype,
|
|
48
48
|
local_filename=local_filename,
|
|
49
49
|
content=f.read(),
|
|
50
|
-
force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
|
|
51
50
|
)
|
|
52
51
|
file_id = file_id + '_PAGE'
|
|
53
52
|
pcgts = page_from_file(output_file)
|
|
@@ -62,7 +61,6 @@ class DummyProcessor(Processor):
|
|
|
62
61
|
local_filename=join(self.output_file_grp, file_id + '.xml'),
|
|
63
62
|
mimetype=MIMETYPE_PAGE,
|
|
64
63
|
content=to_xml(pcgts),
|
|
65
|
-
force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE',
|
|
66
64
|
)
|
|
67
65
|
else:
|
|
68
66
|
if self.parameter['copy_files']:
|
ocrd/processor/helpers.py
CHANGED
|
@@ -89,7 +89,7 @@ def run_processor(
|
|
|
89
89
|
|
|
90
90
|
ocrd_tool = processor.ocrd_tool
|
|
91
91
|
name = '%s v%s' % (ocrd_tool['executable'], processor.version)
|
|
92
|
-
otherrole = ocrd_tool
|
|
92
|
+
otherrole = ocrd_tool.get('steps', [''])[0]
|
|
93
93
|
logProfile = getLogger('ocrd.process.profile')
|
|
94
94
|
log.debug("Processor instance %s (%s doing %s)", processor, name, otherrole)
|
|
95
95
|
t0_wall = perf_counter()
|
|
@@ -234,10 +234,10 @@ def get_cached_processor(parameter: dict, processor_class):
|
|
|
234
234
|
def get_processor(
|
|
235
235
|
processor_class,
|
|
236
236
|
parameter: Optional[dict] = None,
|
|
237
|
-
workspace: Workspace = None,
|
|
238
|
-
page_id: str = None,
|
|
239
|
-
input_file_grp: List[str] = None,
|
|
240
|
-
output_file_grp: List[str] = None,
|
|
237
|
+
workspace: Optional[Workspace] = None,
|
|
238
|
+
page_id: Optional[str] = None,
|
|
239
|
+
input_file_grp: Optional[List[str]] = None,
|
|
240
|
+
output_file_grp: Optional[List[str]] = None,
|
|
241
241
|
instance_caching: bool = False,
|
|
242
242
|
):
|
|
243
243
|
if processor_class:
|
|
@@ -258,6 +258,7 @@ def get_processor(
|
|
|
258
258
|
else:
|
|
259
259
|
# avoid passing workspace already (deprecated chdir behaviour)
|
|
260
260
|
processor = processor_class(None, parameter=parameter)
|
|
261
|
+
assert processor
|
|
261
262
|
# set current processing parameters
|
|
262
263
|
processor.workspace = workspace
|
|
263
264
|
processor.page_id = page_id
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
from dataclasses import dataclass, field
|
|
2
|
-
from typing import List, Union
|
|
2
|
+
from typing import List, Union, Optional
|
|
3
3
|
from ocrd_models.ocrd_page import OcrdPage
|
|
4
4
|
from PIL.Image import Image
|
|
5
5
|
|
|
@@ -9,7 +9,7 @@ from ocrd_models.ocrd_page_generateds import AlternativeImageType, PageType
|
|
|
9
9
|
class OcrdPageResultImage():
|
|
10
10
|
pil : Image
|
|
11
11
|
file_id_suffix : str
|
|
12
|
-
alternative_image : Union[AlternativeImageType, PageType]
|
|
12
|
+
alternative_image : Optional[Union[AlternativeImageType, PageType]]
|
|
13
13
|
|
|
14
14
|
@dataclass
|
|
15
15
|
class OcrdPageResult():
|
ocrd/workspace.py
CHANGED
|
@@ -19,6 +19,7 @@ from ocrd_models.ocrd_page import parse, BorderType, to_xml
|
|
|
19
19
|
from ocrd_modelfactory import exif_from_filename, page_from_file
|
|
20
20
|
from ocrd_utils import (
|
|
21
21
|
atomic_write,
|
|
22
|
+
config,
|
|
22
23
|
getLogger,
|
|
23
24
|
image_from_polygon,
|
|
24
25
|
coordinates_of_segment,
|
|
@@ -121,7 +122,10 @@ class Workspace():
|
|
|
121
122
|
"""
|
|
122
123
|
Reload METS from the filesystem.
|
|
123
124
|
"""
|
|
124
|
-
|
|
125
|
+
if self.is_remote:
|
|
126
|
+
self.mets.reload()
|
|
127
|
+
else:
|
|
128
|
+
self.mets = OcrdMets(filename=self.mets_target)
|
|
125
129
|
|
|
126
130
|
@deprecated_alias(pageId="page_id")
|
|
127
131
|
@deprecated_alias(ID="file_id")
|
|
@@ -424,6 +428,8 @@ class Workspace():
|
|
|
424
428
|
kwargs["pageId"] = kwargs.pop("page_id")
|
|
425
429
|
if "file_id" in kwargs:
|
|
426
430
|
kwargs["ID"] = kwargs.pop("file_id")
|
|
431
|
+
if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE':
|
|
432
|
+
kwargs["force"] = True
|
|
427
433
|
|
|
428
434
|
ret = self.mets.add_file(file_grp, **kwargs)
|
|
429
435
|
|
|
@@ -1,36 +1,36 @@
|
|
|
1
1
|
ocrd/__init__.py,sha256=ZswMVmlqFhAEIzMR3my6IKPq9XLH21aDPC_m_8Jh4dA,1076
|
|
2
2
|
ocrd/constants.py,sha256=6dn3mG54WqHsKInmLZp4kJjNqqPtBoFoSuLUuRbOps0,740
|
|
3
|
-
ocrd/lib.bash,sha256=
|
|
4
|
-
ocrd/mets_server.py,sha256=
|
|
3
|
+
ocrd/lib.bash,sha256=le6XqAOEacdjP3JNSlPkxwRH1y0oVjNQM2tX5d6QFO4,10901
|
|
4
|
+
ocrd/mets_server.py,sha256=U62eih1_O_N0StunVFkEustFs2PlrcMzccraj6_QRk4,21295
|
|
5
5
|
ocrd/ocrd-all-tool.json,sha256=9bX2VYnUwhTAzAvKaoT77BFzbgBGgyIt7qBqARpwWNc,586
|
|
6
6
|
ocrd/resolver.py,sha256=Ba9ALQbTXz6_mla4VqN9tAfHoj6aKuNJAU4tIDnjcHE,14952
|
|
7
7
|
ocrd/resource_list.yml,sha256=82-PiqkZnka1kTj3MQqNn4wXWKHHtoFchsQuetWuqFs,2633
|
|
8
8
|
ocrd/resource_manager.py,sha256=8BMVKJq8J56hugi8vtGn9Ffuk7oRkbs197aG74aKbCY,16733
|
|
9
9
|
ocrd/task_sequence.py,sha256=spiaUQaMM7M8WdBDoQGmLuTPm7tOugYXD6rcJ2UXzxw,6991
|
|
10
|
-
ocrd/workspace.py,sha256=
|
|
10
|
+
ocrd/workspace.py,sha256=cedqK7es2i2nwQCiUiVyWk3j4-nH7bsi6TF7v8siTio,65794
|
|
11
11
|
ocrd/workspace_backup.py,sha256=iab_JjZ_mMP-G8NIUk4PZmfpNlQuGRoqc3NbTSSew1w,3621
|
|
12
12
|
ocrd/workspace_bagger.py,sha256=yU8H3xR5WmQKvgQewac71ie-DUWcfLnMS01D55zsEHQ,11971
|
|
13
13
|
ocrd/cli/__init__.py,sha256=lNR6wMf7JhQ8Jf33tUkowJr0mB3423OMY0_6dkMRLvU,2672
|
|
14
|
-
ocrd/cli/bashlib.py,sha256=
|
|
14
|
+
ocrd/cli/bashlib.py,sha256=ypFBM3-IULz_IEBx0Y04eGt9VbQWwEWm4ujm9g_hPWY,6009
|
|
15
15
|
ocrd/cli/log.py,sha256=6_FrVmTKIIVNUaNLkuOJx8pvPhensHMuayJ0PA7T-XA,1562
|
|
16
16
|
ocrd/cli/network.py,sha256=oWBHFEURxfUdb_t-F4svP_ri7o5mqBoNQnLZLbsZLTA,602
|
|
17
|
-
ocrd/cli/ocrd_tool.py,sha256=
|
|
17
|
+
ocrd/cli/ocrd_tool.py,sha256=qaJgt-LNH0tXkaupMNrEKXasxcgsabHdfLdYESEsomk,7035
|
|
18
18
|
ocrd/cli/process.py,sha256=8KD0i7LT01H9u5CC1vktYMEVpS67da_rp_09_EOECmw,1233
|
|
19
19
|
ocrd/cli/resmgr.py,sha256=bTE-MpF7RRCHhgAbknqZUFHgHScIK6FR3S4h4DEAets,10080
|
|
20
|
-
ocrd/cli/validate.py,sha256=
|
|
21
|
-
ocrd/cli/workspace.py,sha256=
|
|
20
|
+
ocrd/cli/validate.py,sha256=nvageDaHCETcE71X5lu7i_4JKpgo9MrvJKinVPLYUTI,5727
|
|
21
|
+
ocrd/cli/workspace.py,sha256=InIQ5rfQWPn4Qsd1s_xA6AC6ndZLCsuyhoAEiqP8bK4,39479
|
|
22
22
|
ocrd/cli/zip.py,sha256=MMJLw3OXWiJVfVtrdJcBkbB8vA1IzSautluazZRuCQ0,5910
|
|
23
23
|
ocrd/decorators/__init__.py,sha256=IJlA1XcdVBO6Hxm9rNDya7QYcqeWcaXXuLtGjfjcen8,7596
|
|
24
24
|
ocrd/decorators/loglevel_option.py,sha256=tgipROEu3t4hkwWvFssd80k2SbTBwBIC4WNE6Gc-XAg,798
|
|
25
25
|
ocrd/decorators/mets_find_options.py,sha256=d4oATKMP6bFQHNqOK6nLqgUiWF2FYdkPvzkTVRMYpKo,635
|
|
26
|
-
ocrd/decorators/ocrd_cli_options.py,sha256=
|
|
26
|
+
ocrd/decorators/ocrd_cli_options.py,sha256=hr2EugwAY_-GJ7F7g77Od9o9eAqhfLBHSpfmCql2OCU,2665
|
|
27
27
|
ocrd/decorators/parameter_option.py,sha256=n8hYw7XVTd3i3tvpK8F1Jx_CqRp6EGF9qJVH95yj92Q,1076
|
|
28
28
|
ocrd/processor/__init__.py,sha256=39ymNwYRdc-b_OJzzKmWCvo2ga3KdsGSYDHE1Hzkn_w,274
|
|
29
|
-
ocrd/processor/base.py,sha256=
|
|
30
|
-
ocrd/processor/helpers.py,sha256=
|
|
31
|
-
ocrd/processor/ocrd_page_result.py,sha256=
|
|
29
|
+
ocrd/processor/base.py,sha256=_TvaxKf_oMaIfuUcaFWas8YimhZ-l1d3RWGELlBJfy8,50307
|
|
30
|
+
ocrd/processor/helpers.py,sha256=vPYUri6ucuhdTNrideywriJ0fCa8UE2QyBXOmS-7RcQ,10232
|
|
31
|
+
ocrd/processor/ocrd_page_result.py,sha256=eDkpyVHcpaBzTHXiGrcNk9PP9Xr-XZru2w_uoX_ZeNA,510
|
|
32
32
|
ocrd/processor/builtin/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
33
|
-
ocrd/processor/builtin/dummy_processor.py,sha256=
|
|
33
|
+
ocrd/processor/builtin/dummy_processor.py,sha256=iWiw_jJXOqwr7-hFjdkmTCCo1xGr6MLGOshx81PTu-8,3548
|
|
34
34
|
ocrd/processor/builtin/dummy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
35
35
|
ocrd/processor/builtin/dummy/ocrd-tool.json,sha256=aTA2FZRsRsrkbTctkazFeRu4xsTF6yCdeY07cMzOyt4,677
|
|
36
36
|
ocrd_modelfactory/__init__.py,sha256=0baYSJXrOCTCguHkE6hBeqpGNVUe3aZUocv64A-DMDk,4094
|
|
@@ -42,7 +42,7 @@ ocrd_models/ocrd_exif.py,sha256=5BRLjvB6jg36V68i8jvVnT2SSNnpqLbhLsaMuP51Scw,4583
|
|
|
42
42
|
ocrd_models/ocrd_file.py,sha256=7lyHezuNnl2FEYV1lV35-QTCrgYAL-3wO2ulFUNq2Ak,9717
|
|
43
43
|
ocrd_models/ocrd_mets.py,sha256=h3y_WI5fVLsbBoUIRNH2ebjuO1-_P6T3BMIULX-ZOIs,42514
|
|
44
44
|
ocrd_models/ocrd_page.py,sha256=sVIvvMeBT8eZnOfW0DTjQUNyu62-llz0v_Ga5Xo-tUM,5393
|
|
45
|
-
ocrd_models/ocrd_page_generateds.py,sha256=
|
|
45
|
+
ocrd_models/ocrd_page_generateds.py,sha256=wfx3vESMAi08rl6-16zNVJe4E3B6APIvL6RCr1roAzg,774092
|
|
46
46
|
ocrd_models/ocrd_xml_base.py,sha256=OW57mXLlwm1nH8CNefvXmwLRws9KL9zSrb-3vH--mX8,1641
|
|
47
47
|
ocrd_models/report.py,sha256=luZxvzAAQyGYOlRNSJQUIUIANG81iGmBW5ag-uXxKCA,2026
|
|
48
48
|
ocrd_models/utils.py,sha256=0_WHf5NEn1WC8MKJc6X_RK8gW-70Z09_mslkKOj7uF8,2369
|
|
@@ -95,7 +95,7 @@ ocrd_utils/introspect.py,sha256=gfBlmeEFuRmRUSgdSK0jOxRpYqDRXl2IAE6gv2MZ6as,1977
|
|
|
95
95
|
ocrd_utils/logging.py,sha256=5_-5T5OWSYicNk8SQyjVqdRj2bVl-gDK1Th-C7oW_HE,8248
|
|
96
96
|
ocrd_utils/ocrd_logging.conf,sha256=kl9x9JS1d8h8F0QZabvrjZtW1iApIaChvkImYafKO5g,3623
|
|
97
97
|
ocrd_utils/os.py,sha256=acRRdDBI8L6BK0Mf773yKEzwdpZSFRBJEKB2crL4EjU,9865
|
|
98
|
-
ocrd_utils/str.py,sha256=
|
|
98
|
+
ocrd_utils/str.py,sha256=cRgqYILDGOAqWr0qrCrV52I3y4wvpwDVtnBGEUjXNS4,10116
|
|
99
99
|
ocrd_validators/__init__.py,sha256=ZFc-UqRVBk9o1YesZFmr9lOepttNJ_NKx1Zdb7g_YsU,972
|
|
100
100
|
ocrd_validators/bagit-profile.yml,sha256=sdQJlSi7TOn1E9WYMOZ1shewJ-i_nPaKmsAFkh28TGY,1011
|
|
101
101
|
ocrd_validators/constants.py,sha256=FLP57T3F39weka_XovG40RgVMW1GunnbK04QRQ9tmlE,1802
|
|
@@ -118,9 +118,9 @@ ocrd_validators/xlink.xsd,sha256=8fW7YAMWXN2PbB_MMvj9H5ZeFoEBDzuYBtlGC8_6ijw,318
|
|
|
118
118
|
ocrd_validators/xsd_mets_validator.py,sha256=4GWfLyqkmca0x7osDuXuExYuM0HWVrKoqn0S35sFhHU,467
|
|
119
119
|
ocrd_validators/xsd_page_validator.py,sha256=BNz_9u-Ek4UCeyZu3KxSQoolfW9lvuaSR9nIu1XXxeE,467
|
|
120
120
|
ocrd_validators/xsd_validator.py,sha256=6HrVAf6SzCvfUIuQdIzz9bOq4V-zhyii9yrUPoK2Uvo,2094
|
|
121
|
-
ocrd-3.0.
|
|
122
|
-
ocrd-3.0.
|
|
123
|
-
ocrd-3.0.
|
|
124
|
-
ocrd-3.0.
|
|
125
|
-
ocrd-3.0.
|
|
126
|
-
ocrd-3.0.
|
|
121
|
+
ocrd-3.0.0b5.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
122
|
+
ocrd-3.0.0b5.dist-info/METADATA,sha256=eIpkAoobj7QocP9VXYASLqE82wN35JlvYuYjSBLk30o,10397
|
|
123
|
+
ocrd-3.0.0b5.dist-info/WHEEL,sha256=5Mi1sN9lKoFv_gxcPtisEVrJZihrm_beibeg5R6xb4I,91
|
|
124
|
+
ocrd-3.0.0b5.dist-info/entry_points.txt,sha256=tV_gAdO8cbnOjS0GmKfJKbN60xBAV2DQRX6hEjleSjE,94
|
|
125
|
+
ocrd-3.0.0b5.dist-info/top_level.txt,sha256=pUgiN42t4KXC5rvpi6V8atza31XP4SCznXpXlVlvomM,75
|
|
126
|
+
ocrd-3.0.0b5.dist-info/RECORD,,
|
|
@@ -2,30 +2,28 @@
|
|
|
2
2
|
# -*- coding: utf-8 -*-
|
|
3
3
|
|
|
4
4
|
#
|
|
5
|
-
# Generated
|
|
6
|
-
# Python 3.
|
|
5
|
+
# Generated Sat Sep 7 14:17:39 2024 by generateDS.py version 2.35.20.
|
|
6
|
+
# Python 3.8.17+ (heads/3.8-dirty:1663f8ba84, Aug 15 2023, 18:13:01) [GCC 8.3.0]
|
|
7
7
|
#
|
|
8
8
|
# Command line options:
|
|
9
9
|
# ('-f', '')
|
|
10
10
|
# ('--root-element', 'PcGts')
|
|
11
|
-
# ('-o', '
|
|
11
|
+
# ('-o', 'src/ocrd_models/ocrd_page_generateds.py')
|
|
12
12
|
# ('--silence', '')
|
|
13
13
|
# ('--export', 'write etree')
|
|
14
14
|
# ('--disable-generatedssuper-lookup', '')
|
|
15
|
-
# ('--user-methods', '
|
|
15
|
+
# ('--user-methods', 'src/ocrd_page_user_methods.py')
|
|
16
16
|
#
|
|
17
17
|
# Command line arguments:
|
|
18
|
-
#
|
|
18
|
+
# src/ocrd_validators/page.xsd
|
|
19
19
|
#
|
|
20
20
|
# Command line:
|
|
21
|
-
# /
|
|
21
|
+
# /data/ocr-d/ocrd_all/venv38/bin/generateDS -f --root-element="PcGts" -o "src/ocrd_models/ocrd_page_generateds.py" --silence --export="write etree" --disable-generatedssuper-lookup --user-methods="src/ocrd_page_user_methods.py" src/ocrd_validators/page.xsd
|
|
22
22
|
#
|
|
23
23
|
# Current working directory (os.getcwd()):
|
|
24
24
|
# core
|
|
25
25
|
#
|
|
26
26
|
|
|
27
|
-
# type: ignore
|
|
28
|
-
|
|
29
27
|
from itertools import zip_longest
|
|
30
28
|
import os
|
|
31
29
|
import sys
|
|
@@ -223,7 +221,7 @@ class GeneratedsSuper(object):
|
|
|
223
221
|
try:
|
|
224
222
|
int(value)
|
|
225
223
|
except (TypeError, ValueError):
|
|
226
|
-
raise_parse_error(node, 'Requires sequence of integer
|
|
224
|
+
raise_parse_error(node, 'Requires sequence of integer valuess')
|
|
227
225
|
return values
|
|
228
226
|
def gds_format_float(self, input_data, input_name=''):
|
|
229
227
|
return ('%.15f' % input_data).rstrip('0')
|
|
@@ -1230,9 +1228,10 @@ class PcGtsType(GeneratedsSuper):
|
|
|
1230
1228
|
return hash(self.id)
|
|
1231
1229
|
@property
|
|
1232
1230
|
def id(self):
|
|
1231
|
+
from ocrd_utils import make_xml_id
|
|
1233
1232
|
if hasattr(self, 'pcGtsId'):
|
|
1234
1233
|
return self.pcGtsId or ''
|
|
1235
|
-
return self.imageFilename
|
|
1234
|
+
return make_xml_id(self.imageFilename)
|
|
1236
1235
|
def get_AllAlternativeImagePaths(self, page=True, region=True, line=True, word=True, glyph=True):
|
|
1237
1236
|
"""
|
|
1238
1237
|
Get all the ``pc:AlternativeImage/@filename`` paths referenced in the PAGE-XML document.
|
|
@@ -3116,9 +3115,10 @@ class PageType(GeneratedsSuper):
|
|
|
3116
3115
|
return hash(self.id)
|
|
3117
3116
|
@property
|
|
3118
3117
|
def id(self):
|
|
3118
|
+
from ocrd_utils import make_xml_id
|
|
3119
3119
|
if hasattr(self, 'pcGtsId'):
|
|
3120
3120
|
return self.pcGtsId or ''
|
|
3121
|
-
return self.imageFilename
|
|
3121
|
+
return make_xml_id(self.imageFilename)
|
|
3122
3122
|
# pylint: disable=line-too-long,invalid-name,protected-access,missing-module-docstring
|
|
3123
3123
|
def _region_class(self, x): # pylint: disable=unused-argument
|
|
3124
3124
|
return x.__class__.__name__.replace('RegionType', '')
|
|
@@ -3314,6 +3314,39 @@ class PageType(GeneratedsSuper):
|
|
|
3314
3314
|
ret += lines if lo in ['top-to-bottom', 'left-to-right'] else list(reversed(lines))
|
|
3315
3315
|
return ret
|
|
3316
3316
|
|
|
3317
|
+
def get_ReadingOrderGroups(self) -> dict:
|
|
3318
|
+
"""
|
|
3319
|
+
Aggregate recursive ReadingOrder into a dictionary, mapping each regionRef
|
|
3320
|
+
(i.e. segment `@id`) to its referring group object (i.e one of
|
|
3321
|
+
|
|
3322
|
+
\b
|
|
3323
|
+
- :py:class:`.RegionRefType`
|
|
3324
|
+
- :py:class:`.RegionRefIndexedType`
|
|
3325
|
+
- :py:class:`.OrderedGroupType`
|
|
3326
|
+
- :py:class:`.OrderedGroupIndexedType`
|
|
3327
|
+
- :py:class:`.UnorderedGroupType`
|
|
3328
|
+
- :py:class:`.UnorderedGroupIndexedType`
|
|
3329
|
+
"""
|
|
3330
|
+
def get_groupdict(group):
|
|
3331
|
+
regionrefs = list()
|
|
3332
|
+
if isinstance(group, (OrderedGroupType, OrderedGroupIndexedType)):
|
|
3333
|
+
regionrefs = (group.get_RegionRefIndexed() +
|
|
3334
|
+
group.get_OrderedGroupIndexed() +
|
|
3335
|
+
group.get_UnorderedGroupIndexed())
|
|
3336
|
+
if isinstance(group, (UnorderedGroupType, UnorderedGroupIndexedType)):
|
|
3337
|
+
regionrefs = (group.get_RegionRef() +
|
|
3338
|
+
group.get_OrderedGroup() +
|
|
3339
|
+
group.get_UnorderedGroup())
|
|
3340
|
+
refdict = {}
|
|
3341
|
+
for elem in regionrefs:
|
|
3342
|
+
refdict[elem.get_regionRef()] = elem
|
|
3343
|
+
if not isinstance(elem, (RegionRefType, RegionRefIndexedType)):
|
|
3344
|
+
refdict = {**refdict, **get_groupdict(elem)}
|
|
3345
|
+
return refdict
|
|
3346
|
+
ro = self.get_ReadingOrder()
|
|
3347
|
+
if ro is None:
|
|
3348
|
+
return {}
|
|
3349
|
+
return get_groupdict(ro.get_OrderedGroup() or ro.get_UnorderedGroup())
|
|
3317
3350
|
def set_orientation(self, orientation):
|
|
3318
3351
|
"""
|
|
3319
3352
|
Set deskewing angle to given `orientation` number.
|
ocrd_utils/str.py
CHANGED
|
@@ -108,10 +108,11 @@ def make_xml_id(idstr: str) -> str:
|
|
|
108
108
|
ret = idstr
|
|
109
109
|
if not REGEX_FILE_ID.fullmatch(ret):
|
|
110
110
|
ret = ret.replace(':', '_')
|
|
111
|
+
ret = ret.replace('/', '_')
|
|
111
112
|
ret = re.sub(r'^([^a-zA-Z_])', r'id_\1', ret)
|
|
112
113
|
ret = re.sub(r'[^\w.-]', r'', ret)
|
|
113
114
|
return ret
|
|
114
|
-
|
|
115
|
+
|
|
115
116
|
def nth_url_segment(url, n=-1):
|
|
116
117
|
"""
|
|
117
118
|
Return the last /-delimited segment of a URL-like string
|
|
File without changes
|
|
File without changes
|
|
File without changes
|