ocrd 3.1.2__py3-none-any.whl → 3.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ocrd/cli/workspace.py CHANGED
@@ -88,8 +88,8 @@ def workspace_cli(ctx, directory, mets, mets_basename, mets_server_url, backup):
88
88
  @pass_workspace
89
89
  @click.option('-a', '--download', is_flag=True, help="Download all files")
90
90
  @click.option('-s', '--skip', help="Tests to skip", default=[], multiple=True, type=click.Choice(
91
- ['imagefilename', 'dimension', 'pixel_density', 'page', 'url', 'page_xsd', 'mets_fileid_page_pcgtsid',
92
- 'mets_unique_identifier', 'mets_file_group_names', 'mets_files', 'mets_xsd']))
91
+ ['imagefilename', 'alternativeimage_filename', 'alternativeimage_comments', 'dimension', 'pixel_density', 'page', 'page_xsd',
92
+ 'url', 'mets_fileid_page_pcgtsid', 'mets_unique_identifier', 'mets_files', 'mets_xsd']))
93
93
  @click.option('--page-textequiv-consistency', '--page-strictness', help="How strict to check PAGE multi-level textequiv consistency", type=click.Choice(['strict', 'lax', 'fix', 'off']), default='strict')
94
94
  @click.option('--page-coordinate-consistency', help="How fierce to check PAGE multi-level coordinate consistency", type=click.Choice(['poly', 'baseline', 'both', 'off']), default='poly')
95
95
  @click.argument('mets_url', default=None, required=False)
ocrd/processor/base.py CHANGED
@@ -779,10 +779,14 @@ class Processor():
779
779
  to handle cases like multiple output fileGrps, non-PAGE input etc.)
780
780
  """
781
781
  input_pcgts : List[Optional[OcrdPage]] = [None] * len(input_files)
782
- assert isinstance(input_files[0], get_args(OcrdFileType))
783
- page_id = input_files[0].pageId
782
+ input_pos = next(i for i, input_file in enumerate(input_files) if input_file is not None)
783
+ page_id = input_files[input_pos].pageId
784
784
  self._base_logger.info("processing page %s", page_id)
785
785
  for i, input_file in enumerate(input_files):
786
+ if input_file is None:
787
+ grp = self.input_file_grp.split(',')[i]
788
+ self._base_logger.debug(f"ignoring missing file for input fileGrp {grp} for page {page_id}")
789
+ continue
786
790
  assert isinstance(input_file, get_args(OcrdFileType))
787
791
  self._base_logger.debug(f"parsing file {input_file.ID} for page {page_id}")
788
792
  try:
@@ -792,7 +796,10 @@ class Processor():
792
796
  except ValueError as err:
793
797
  # not PAGE and not an image to generate PAGE for
794
798
  self._base_logger.error(f"non-PAGE input for page {page_id}: {err}")
795
- output_file_id = make_file_id(input_files[0], self.output_file_grp)
799
+ output_file_id = make_file_id(input_files[input_pos], self.output_file_grp)
800
+ if input_files[input_pos].fileGrp == self.output_file_grp:
801
+ # input=output fileGrp: re-use ID exactly
802
+ output_file_id = input_files[input_pos].ID
796
803
  output_file = next(self.workspace.mets.find_files(ID=output_file_id), None)
797
804
  if output_file and config.OCRD_EXISTING_OUTPUT != 'OVERWRITE':
798
805
  # short-cut avoiding useless computation:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ocrd
3
- Version: 3.1.2
3
+ Version: 3.3.0
4
4
  Summary: OCR-D framework
5
5
  Author-email: Konstantin Baierer <unixprog@gmail.com>
6
6
  License: Apache License 2.0
@@ -18,7 +18,7 @@ ocrd/cli/ocrd_tool.py,sha256=EyD5VdLm2WTzQnR-hZKpn-D4-dsWr2PIE5IoY1O3mfE,7357
18
18
  ocrd/cli/process.py,sha256=8KD0i7LT01H9u5CC1vktYMEVpS67da_rp_09_EOECmw,1233
19
19
  ocrd/cli/resmgr.py,sha256=mk8KZweC_7ENAFnC6FvFf7Zv_W1wqJTmk0EMd9XSvf4,10132
20
20
  ocrd/cli/validate.py,sha256=nvageDaHCETcE71X5lu7i_4JKpgo9MrvJKinVPLYUTI,5727
21
- ocrd/cli/workspace.py,sha256=KTbSzIUrba5WoYETvM9ElRZVsDUHCGVvjoFgBGZS2nU,40468
21
+ ocrd/cli/workspace.py,sha256=bsp6YXEgwABIUFbSENmxV1c4oxRwc2L-BpeDPlYfhHE,40501
22
22
  ocrd/cli/zip.py,sha256=MMJLw3OXWiJVfVtrdJcBkbB8vA1IzSautluazZRuCQ0,5910
23
23
  ocrd/decorators/__init__.py,sha256=PyXX7vxdWkRHixas9dWUtyO3YLczcly8ZEpfZDSMVp8,7639
24
24
  ocrd/decorators/loglevel_option.py,sha256=tgipROEu3t4hkwWvFssd80k2SbTBwBIC4WNE6Gc-XAg,798
@@ -26,7 +26,7 @@ ocrd/decorators/mets_find_options.py,sha256=d4oATKMP6bFQHNqOK6nLqgUiWF2FYdkPvzkT
26
26
  ocrd/decorators/ocrd_cli_options.py,sha256=lIvtE8re1VmpHm45u71ltE0QJS8nyd28HhLC7zGSvlo,2691
27
27
  ocrd/decorators/parameter_option.py,sha256=n8hYw7XVTd3i3tvpK8F1Jx_CqRp6EGF9qJVH95yj92Q,1076
28
28
  ocrd/processor/__init__.py,sha256=39ymNwYRdc-b_OJzzKmWCvo2ga3KdsGSYDHE1Hzkn_w,274
29
- ocrd/processor/base.py,sha256=GcfVrgCvfHbrxngwl1VzcaZ5z7QV2e1Cn7CIjBYdcHc,59480
29
+ ocrd/processor/base.py,sha256=yN_sMfwm2B89wtr2ShNkEtcTjXNqnvAtjM4TbWTUNCk,59929
30
30
  ocrd/processor/helpers.py,sha256=gIc6PdvOS1sR0UkYlrdZopImAXxXglDBNpgNZGWHO7Y,10987
31
31
  ocrd/processor/ocrd_page_result.py,sha256=eDkpyVHcpaBzTHXiGrcNk9PP9Xr-XZru2w_uoX_ZeNA,510
32
32
  ocrd/processor/builtin/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -36,7 +36,7 @@ ocrd/processor/builtin/dummy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
36
36
  ocrd/processor/builtin/dummy/ocrd-tool.json,sha256=VoI37paWiUyMkTN5Qqau8R1Clmw24-HcZu4wjy1Br9Y,2311
37
37
  ocrd_modelfactory/__init__.py,sha256=NyJT1uSvmeEwibRFOkh0AEoVnYfP0mzxU--pP23B-TQ,4404
38
38
  ocrd_models/__init__.py,sha256=A0aj0mOraNb-xfiUueACdoaqISnp0qH-F49nTJg2vCs,380
39
- ocrd_models/constants.py,sha256=fI6Qz4OPOm6UBLQ_P2dlpjcwB0XFJZ7AgxxKqgc75X0,2724
39
+ ocrd_models/constants.py,sha256=kvvAAro_1YOTRWwFgbrGEFeDZ8_u0S624Y3icNNk4Oo,2987
40
40
  ocrd_models/mets-empty.xml,sha256=dFixfbxSXrgjZx9BfdIKWHX-khNmp7dNYaFe2qQSwCY,1203
41
41
  ocrd_models/ocrd_agent.py,sha256=E9OtDhz9UfKb6ou2qvsuCL9NlO1V6zMb0s8nVq8dVos,5609
42
42
  ocrd_models/ocrd_exif.py,sha256=wRSprHxCy9LCXw41Fi9kp-CbFc5NFX9ZFIFNszB41qk,4585
@@ -115,14 +115,14 @@ ocrd_validators/parameter_validator.py,sha256=_5Y3IS24Sf_xHBkB3TE3jB9VTCbbjWO8bS
115
115
  ocrd_validators/processing_server_config.schema.yml,sha256=8NQbhSshm1exTvbdYiu694rZZ-Xe70_vQtsJ0nd7ZCM,5432
116
116
  ocrd_validators/processing_server_config_validator.py,sha256=lQ2-ZxsvbFki_SvE_N4_1ptBnBHcwOTJ5grtL2G9F8A,810
117
117
  ocrd_validators/resource_list_validator.py,sha256=cFMj0n_x-tjhuNUpjgEvPP8iPVm7lme9TWAaqATasV0,776
118
- ocrd_validators/workspace_validator.py,sha256=rEXIwjtNpt8HcTv94fKed3vVlA3U4z7Xmm1ZL1VHC84,17892
118
+ ocrd_validators/workspace_validator.py,sha256=JNPsRVPgQI0vsaxcs_c3qj22GagdZcgO3v9u3sbBbBI,20340
119
119
  ocrd_validators/xlink.xsd,sha256=8fW7YAMWXN2PbB_MMvj9H5ZeFoEBDzuYBtlGC8_6ijw,3180
120
120
  ocrd_validators/xsd_mets_validator.py,sha256=4GWfLyqkmca0x7osDuXuExYuM0HWVrKoqn0S35sFhHU,467
121
121
  ocrd_validators/xsd_page_validator.py,sha256=BNz_9u-Ek4UCeyZu3KxSQoolfW9lvuaSR9nIu1XXxeE,467
122
122
  ocrd_validators/xsd_validator.py,sha256=6HrVAf6SzCvfUIuQdIzz9bOq4V-zhyii9yrUPoK2Uvo,2094
123
- ocrd-3.1.2.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
124
- ocrd-3.1.2.dist-info/METADATA,sha256=gtLDkn539WBI_GpwiwsMe5iIv_bmePJCFqgRfkf9Gnc,10442
125
- ocrd-3.1.2.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
126
- ocrd-3.1.2.dist-info/entry_points.txt,sha256=4hcJ2LkK_OlIabHnKgFit35Ap7b5Lz1Gb4hzkxV0Kiw,152
127
- ocrd-3.1.2.dist-info/top_level.txt,sha256=pUgiN42t4KXC5rvpi6V8atza31XP4SCznXpXlVlvomM,75
128
- ocrd-3.1.2.dist-info/RECORD,,
123
+ ocrd-3.3.0.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
124
+ ocrd-3.3.0.dist-info/METADATA,sha256=K8u-P2RKlBlGPPq8h8sv5hLhWi0XgRmhF-Bf-F3Qgpc,10442
125
+ ocrd-3.3.0.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
126
+ ocrd-3.3.0.dist-info/entry_points.txt,sha256=4hcJ2LkK_OlIabHnKgFit35Ap7b5Lz1Gb4hzkxV0Kiw,152
127
+ ocrd-3.3.0.dist-info/top_level.txt,sha256=pUgiN42t4KXC5rvpi6V8atza31XP4SCznXpXlVlvomM,75
128
+ ocrd-3.3.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.3.0)
2
+ Generator: setuptools (75.3.2)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
ocrd_models/constants.py CHANGED
@@ -28,6 +28,8 @@ __all__ = [
28
28
  'TAG_PAGE_TEXTEQUIV',
29
29
  'TAG_PAGE_TEXTREGION',
30
30
  'METS_PAGE_DIV_ATTRIBUTE',
31
+ 'PAGE_REGION_TYPES',
32
+ 'PAGE_ALTIMG_FEATURES',
31
33
  ]
32
34
 
33
35
 
@@ -72,6 +74,20 @@ PAGE_REGION_TYPES = [
72
74
  'Separator', 'Table', 'Text', 'Unknown'
73
75
  ]
74
76
 
77
+ PAGE_ALTIMG_FEATURES = [
78
+ 'binarized',
79
+ 'grayscale_normalized',
80
+ 'despeckled',
81
+ 'cropped',
82
+ 'deskewed',
83
+ 'rotated-90',
84
+ 'rotated-180',
85
+ 'rotated-270',
86
+ 'dewarped',
87
+ 'clipped',
88
+ ]
89
+
90
+
75
91
  class METS_PAGE_DIV_ATTRIBUTE(Enum):
76
92
  ID = auto()
77
93
  ORDER = auto()
@@ -7,6 +7,7 @@ from pathlib import Path
7
7
 
8
8
  from ocrd_utils import getLogger, MIMETYPE_PAGE, pushd_popd, is_local_filename, DEFAULT_METS_BASENAME
9
9
  from ocrd_models import ValidationReport
10
+ from ocrd_models.constants import PAGE_ALTIMG_FEATURES
10
11
  from ocrd_modelfactory import page_from_file
11
12
 
12
13
  from .constants import FILE_GROUP_CATEGORIES, FILE_GROUP_PREFIX
@@ -98,6 +99,9 @@ class WorkspaceValidator():
98
99
  self.page_coordinate_consistency = page_coordinate_consistency
99
100
  # there will be more options to come
100
101
  self.page_checks = [check for check in ['mets_fileid_page_pcgtsid',
102
+ 'imagefilename',
103
+ 'alternativeimage_filename',
104
+ 'alternativeimage_comments',
101
105
  'dimension',
102
106
  'page',
103
107
  'page_xsd']
@@ -118,7 +122,7 @@ class WorkspaceValidator():
118
122
  mets_url (string): URL of the METS file
119
123
  src_dir (string, None): Directory containing mets file
120
124
  skip (list): Validation checks to omit. One or more of
121
- 'mets_unique_identifier', 'mets_file_group_names',
125
+ 'mets_unique_identifier',
122
126
  'mets_files', 'pixel_density', 'dimension', 'url',
123
127
  'multipage', 'page', 'page_xsd', 'mets_xsd',
124
128
  'mets_fileid_page_pcgtsid'
@@ -145,8 +149,6 @@ class WorkspaceValidator():
145
149
  try:
146
150
  if 'mets_unique_identifier' not in self.skip:
147
151
  self._validate_mets_unique_identifier()
148
- if 'mets_file_group_names' not in self.skip:
149
- self._validate_mets_file_group_names()
150
152
  if 'mets_files' not in self.skip:
151
153
  self._validate_mets_files()
152
154
  if 'pixel_density' not in self.skip:
@@ -192,7 +194,11 @@ class WorkspaceValidator():
192
194
  self.workspace.download_file(f)
193
195
  page = page_from_file(f).get_Page()
194
196
  imageFilename = page.imageFilename
195
- if not self.mets.find_files(url=imageFilename, **self.find_kwargs):
197
+ if is_local_filename(imageFilename):
198
+ kwargs = dict(local_filename=imageFilename, **self.find_kwargs)
199
+ else:
200
+ kwargs = dict(url=imageFilename, **self.find_kwargs)
201
+ if not self.mets.find_files(**kwargs):
196
202
  self.report.add_error(f"PAGE '{f.ID}': imageFilename '{imageFilename}' not found in METS")
197
203
  if is_local_filename(imageFilename) and not Path(imageFilename).exists():
198
204
  self.report.add_warning(f"PAGE '{f.ID}': imageFilename '{imageFilename}' points to non-existent local file")
@@ -295,6 +301,9 @@ class WorkspaceValidator():
295
301
  if f.url and 'url' not in self.skip:
296
302
  if re.match(r'^file:/[^/]', f.url):
297
303
  self.report.add_error(f"File '{f.ID}' has an invalid (Java-specific) file URL '{f.url}'")
304
+ elif ':' not in f.url:
305
+ self.report.add_error(f"File '{f.ID}' has an invalid (non-URI) file URL '{f.url}'")
306
+ continue
298
307
  scheme = f.url[0:f.url.index(':')]
299
308
  if scheme not in ('http', 'https', 'file'):
300
309
  self.report.add_warning(f"File '{f.ID}' has non-HTTP, non-file URL '{f.url}'")
@@ -321,17 +330,43 @@ class WorkspaceValidator():
321
330
  pcgts = page_from_file(f)
322
331
  page = pcgts.get_Page()
323
332
  if 'dimension' in self.page_checks:
324
- _, _, exif = self.workspace.image_from_page(page, f.pageId)
325
- if page.imageHeight != exif.height:
326
- self.report.add_error(f"PAGE '{f.ID}': @imageHeight != image's actual height ({page.imageHeight} != {exif.height})")
327
- if page.imageWidth != exif.width:
328
- self.report.add_error(f"PAGE '{f.ID}': @imageWidth != image's actual width ({page.imageWidth} != {exif.width})")
333
+ img = self.workspace._resolve_image_as_pil(page.imageFilename)
334
+ if page.imageHeight != img.height:
335
+ self.report.add_error(f"PAGE '{f.ID}': @imageHeight != image's actual height ({page.imageHeight} != {img.height})")
336
+ if page.imageWidth != img.width:
337
+ self.report.add_error(f"PAGE '{f.ID}': @imageWidth != image's actual width ({page.imageWidth} != {img.width})")
329
338
  if 'imagefilename' in self.page_checks:
330
339
  imageFilename = page.imageFilename
331
- if not self.mets.find_files(url=imageFilename):
340
+ if is_local_filename(imageFilename):
341
+ kwargs = dict(local_filename=imageFilename, **self.find_kwargs)
342
+ else:
343
+ kwargs = dict(url=imageFilename, **self.find_kwargs)
344
+ if not self.mets.find_files(**kwargs):
332
345
  self.report.add_error(f"PAGE '{f.ID}': imageFilename '{imageFilename}' not found in METS")
333
346
  if is_local_filename(imageFilename) and not Path(imageFilename).exists():
334
347
  self.report.add_warning(f"PAGE '{f.ID}': imageFilename '{imageFilename}' points to non-existent local file")
348
+ if 'alternativeimage_filename' in self.page_checks:
349
+ for altimg in page.get_AllAlternativeImages():
350
+ if is_local_filename(altimg.filename):
351
+ kwargs = dict(local_filename=altimg.filename, **self.find_kwargs)
352
+ else:
353
+ kwargs = dict(url=altimg.filename, **self.find_kwargs)
354
+ if not self.mets.find_files(**kwargs):
355
+ self.report.add_error(f"PAGE '{f.ID}': {altimg.parent_object_.id} AlternativeImage "
356
+ f"'{altimg.filename}' not found in METS")
357
+ if is_local_filename(altimg.filename) and not Path(altimg.filename).exists():
358
+ self.report.add_warning(f"PAGE '{f.ID}': {altimg.parent_object_.id} AlternativeImage "
359
+ f"'{altimg.filename}' points to non-existent local file")
360
+ if 'alternativeimage_comments' in self.page_checks:
361
+ for altimg in page.get_AllAlternativeImages():
362
+ if altimg.comments is None:
363
+ self.report.add_error(f"PAGE '{f.ID}': {altimg.parent_object_.id} AlternativeImage "
364
+ f"'{altimg.filename}' features not specified in PAGE")
365
+ else:
366
+ for feature in altimg.comments.split(','):
367
+ if feature not in PAGE_ALTIMG_FEATURES:
368
+ self.report.add_error(f"PAGE '{f.ID}': {altimg.parent_object_.id} AlternativeImage "
369
+ f"'{altimg.filename}' feature '{feature}' not standardized for PAGE")
335
370
  if 'mets_fileid_page_pcgtsid' in self.page_checks and pcgts.pcGtsId != f.ID:
336
371
  self.report.add_warning('pc:PcGts/@pcGtsId differs from mets:file/@ID: "%s" !== "%s"' % (pcgts.pcGtsId or '', f.ID or ''))
337
372
 
File without changes