ocrd 3.8.1__py3-none-any.whl → 3.9.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ocrd/processor/base.py CHANGED
@@ -824,51 +824,59 @@ class Processor():
824
824
  if not any(input_pcgts):
825
825
  self._base_logger.warning(f'skipping page {page_id}')
826
826
  return
827
- output_file_id = make_file_id(input_files[input_pos], self.output_file_grp)
828
- if input_files[input_pos].fileGrp == self.output_file_grp:
829
- # input=output fileGrp: re-use ID exactly
830
- output_file_id = input_files[input_pos].ID
831
- output_file = next(self.workspace.mets.find_files(ID=output_file_id), None)
832
- if output_file and config.OCRD_EXISTING_OUTPUT != 'OVERWRITE':
833
- # short-cut avoiding useless computation:
834
- raise FileExistsError(
835
- f"A file with ID=={output_file_id} already exists {output_file} and neither force nor ignore are set"
836
- )
837
- result = self.process_page_pcgts(*input_pcgts, page_id=page_id)
838
- for image_result in result.images:
839
- image_file_id = f'{output_file_id}_{image_result.file_id_suffix}'
840
- image_file_path = join(self.output_file_grp, f'{image_file_id}.png')
841
- if isinstance(image_result.alternative_image, PageType):
842
- # special case: not an alternative image, but replacing the original image
843
- # (this is needed by certain processors when the original's coordinate system
844
- # cannot or must not be kept)
845
- image_result.alternative_image.set_imageFilename(image_file_path)
846
- image_result.alternative_image.set_imageWidth(image_result.pil.width)
847
- image_result.alternative_image.set_imageHeight(image_result.pil.height)
848
- elif isinstance(image_result.alternative_image, AlternativeImageType):
849
- image_result.alternative_image.set_filename(image_file_path)
850
- elif image_result.alternative_image is None:
851
- pass # do not reference in PAGE result
852
- else:
853
- raise ValueError(f"process_page_pcgts returned an OcrdPageResultImage of unknown type "
854
- f"{type(image_result.alternative_image)}")
855
- self.workspace.save_image_file(
856
- image_result.pil,
857
- image_file_id,
858
- self.output_file_grp,
827
+ output_file_grps = self.output_file_grp.split(',')
828
+ output_file_ids = [make_file_id(input_files[input_pos], output_file_grp)
829
+ if input_files[input_pos].fileGrp != output_file_grp else
830
+ # input=output fileGrp: re-use ID exactly
831
+ input_files[input_pos].ID
832
+ for output_file_grp in output_file_grps]
833
+ if config.OCRD_EXISTING_OUTPUT != 'OVERWRITE':
834
+ for output_file_id in output_file_ids:
835
+ if output_file := next(self.workspace.mets.find_files(ID=output_file_id), None):
836
+ # short-cut avoiding useless computation:
837
+ raise FileExistsError(
838
+ f"A file with ID=={output_file_id} already exists {output_file}"
839
+ " and OCRD_EXISTING_OUTPUT != OVERWRITE"
840
+ )
841
+ results = self.process_page_pcgts(*input_pcgts, page_id=page_id)
842
+ if len(results) > len(output_file_grps):
843
+ self._base_logger.error(f"processor returned {len(results) - len(output_file_grps)} "
844
+ f"more results than specified output fileGrps for page {page_id}")
845
+ for result, output_file_id, output_file_grp in zip(results, output_file_ids, output_file_grps):
846
+ for image_result in result.images:
847
+ image_file_id = f'{output_file_id}_{image_result.file_id_suffix}'
848
+ image_file_path = join(output_file_grp, f'{image_file_id}.png')
849
+ if isinstance(image_result.alternative_image, PageType):
850
+ # special case: not an alternative image, but replacing the original image
851
+ # (this is needed by certain processors when the original's coordinate system
852
+ # cannot or must not be kept, e.g. dewarping)
853
+ image_result.alternative_image.set_imageFilename(image_file_path)
854
+ image_result.alternative_image.set_imageWidth(image_result.pil.width)
855
+ image_result.alternative_image.set_imageHeight(image_result.pil.height)
856
+ elif isinstance(image_result.alternative_image, AlternativeImageType):
857
+ image_result.alternative_image.set_filename(image_file_path)
858
+ elif image_result.alternative_image is None:
859
+ pass # do not reference in PAGE result
860
+ else:
861
+ raise ValueError(f"process_page_pcgts returned an OcrdPageResultImage of unknown type "
862
+ f"{type(image_result.alternative_image)}")
863
+ self.workspace.save_image_file(
864
+ image_result.pil,
865
+ image_file_id,
866
+ output_file_grp,
867
+ page_id=page_id,
868
+ file_path=image_file_path,
869
+ )
870
+ result.pcgts.set_pcGtsId(output_file_id)
871
+ self.add_metadata(result.pcgts)
872
+ self.workspace.add_file(
873
+ file_id=output_file_id,
874
+ file_grp=output_file_grp,
859
875
  page_id=page_id,
860
- file_path=image_file_path,
876
+ local_filename=os.path.join(output_file_grp, output_file_id + '.xml'),
877
+ mimetype=MIMETYPE_PAGE,
878
+ content=to_xml(result.pcgts),
861
879
  )
862
- result.pcgts.set_pcGtsId(output_file_id)
863
- self.add_metadata(result.pcgts)
864
- self.workspace.add_file(
865
- file_id=output_file_id,
866
- file_grp=self.output_file_grp,
867
- page_id=page_id,
868
- local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'),
869
- mimetype=MIMETYPE_PAGE,
870
- content=to_xml(result.pcgts),
871
- )
872
880
 
873
881
  def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult:
874
882
  """
@@ -1,4 +1,5 @@
1
1
  from dataclasses import dataclass, field
2
+ import copy
2
3
  from typing import List, Union, Optional
3
4
  from ocrd_models.ocrd_page import OcrdPage
4
5
  from PIL.Image import Image
@@ -8,12 +9,85 @@ from ocrd_models.ocrd_page_generateds import AlternativeImageType, PageType
8
9
 
9
10
  @dataclass
10
11
  class OcrdPageResultImage():
12
+ """
13
+ Encapsulates a single ``AlternativeImage`` reference to be persisted
14
+ as image file to the :py:class:`ocrd.Workspace`.
15
+ """
11
16
  pil: Image
17
+ """
18
+ image data to be saved
19
+ """
12
20
  file_id_suffix: str
21
+ """
22
+ a suffix to append to the file name when saving
23
+ (something like ``.IMG`` according to OCR-D
24
+ conventions for PAGE-XML)
25
+ """
13
26
  alternative_image: Optional[Union[AlternativeImageType, PageType]]
27
+ """
28
+ the ``AlternativeImage`` instance that references this image;
29
+ to be amended with the actual (final) ``@filename`` when saving
30
+
31
+ alternatively, can be a ``Page`` instance: in that case,
32
+ amend its ``@imageFilename`` (i.e. replace the original image
33
+ of the PAGE-XML)
34
+ """
14
35
 
15
36
 
16
37
  @dataclass
17
38
  class OcrdPageResult():
39
+ """
40
+ Encapsulates the return type of :py:func:`ocrd.Processor.process_page_pcgts`,
41
+ i.e. an instance of :py:class:`ocrd_models.ocrd_page.OcrdPage` and an
42
+ accompanying list of :py:class:`OcrdPageResultImage` that contain all
43
+ image files referenced via ``AlternativeImage`` to be persisted into the
44
+ :py:class:`ocrd.Workspace` along with the PAGE-XML itself.
45
+ """
18
46
  pcgts: OcrdPage
19
47
  images: List[OcrdPageResultImage] = field(default_factory=list)
48
+
49
+ class OcrdPageResultVariadicListWrapper():
50
+ """
51
+ Proxy object for :py:class:`ocrd.SingleOcrdPageResult` allowing
52
+ list semantics (i.e. multi-valued return from
53
+ :py:func:`ocrd.Processor.process_page_pcgts`) without changing
54
+ the API introduced in version 3.0.
55
+
56
+ Everything but list access will yield the old (singular valued)
57
+ semantics.
58
+ """
59
+ def __init__(
60
+ self,
61
+ pcgts: OcrdPage,
62
+ *args):
63
+ self._results = [SingleOcrdPageResult(pcgts)] + [
64
+ SingleOcrdPageResult(arg) for arg in args]
65
+
66
+ def __getitem__(self, key):
67
+ return self._results[key]
68
+
69
+ def __contains__(self, key):
70
+ return key in self._results
71
+
72
+ def __len__(self):
73
+ return len(self._results)
74
+
75
+ def __iter__(self):
76
+ return iter(self._results)
77
+
78
+ def __repr__(self):
79
+ return repr(self._results)
80
+
81
+ # allow copy() without infinite recursion
82
+ def __copy__(self):
83
+ return OcrdPageResultVariadicListWrapper(*copy.copy(self._results))
84
+
85
+ # allow deepcopy() without infinite recursion
86
+ def __deepcopy__(self, memo):
87
+ return OcrdPageResultVariadicListWrapper(*copy.deepcopy(self._results))
88
+
89
+ # delegate to all members of first result
90
+ def __getattr__(self, name):
91
+ return getattr(self._results[0], name)
92
+
93
+ SingleOcrdPageResult, OcrdPageResult = OcrdPageResult, OcrdPageResultVariadicListWrapper
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ocrd
3
- Version: 3.8.1
3
+ Version: 3.9.1
4
4
  Summary: OCR-D framework
5
5
  Author-email: Konstantin Baierer <unixprog@gmail.com>
6
6
  License: Apache License 2.0
@@ -23,9 +23,9 @@ ocrd/decorators/mets_find_options.py,sha256=8fiSdk-415o6-iBPB2T9He_v52qE8cTj3cCn
23
23
  ocrd/decorators/ocrd_cli_options.py,sha256=Bemkq3V3QkOI3nNqGzphaNW7gjU9vNN-M5F2DvxvioM,2479
24
24
  ocrd/decorators/parameter_option.py,sha256=TnCIcV9L5oAnI1Ew2TyFzo5FAwiIzWl2pn8oaD9jfEU,1056
25
25
  ocrd/processor/__init__.py,sha256=39ymNwYRdc-b_OJzzKmWCvo2ga3KdsGSYDHE1Hzkn_w,274
26
- ocrd/processor/base.py,sha256=DxBsRn8VLsfNvc9_2BU0KxUv4t9XtHSSu9uiabxn8Nk,59850
26
+ ocrd/processor/base.py,sha256=yHwxd4ZkHLPuFgqQmOeDhMWAdCnHY_ptOjiSWj-FZqI,60600
27
27
  ocrd/processor/helpers.py,sha256=4lR_QvZsxvh7f8_uK9YzdHP5-hvFU4qqYM_Cu_k41KI,10937
28
- ocrd/processor/ocrd_page_result.py,sha256=qo9pGV4r9S5--NAq5clIJOfs4b1vavoDOTbDqAEAAKA,507
28
+ ocrd/processor/ocrd_page_result.py,sha256=hHV1TlKhKFN848cUCqR31v2R3HH4HEoeyGXqUc2DLkY,2945
29
29
  ocrd/processor/builtin/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
30
30
  ocrd/processor/builtin/dummy_processor.py,sha256=SmMRtN0w88kBU24654ThT-yf84SFsFW4BOcmwsDDWdc,3533
31
31
  ocrd/processor/builtin/filter_processor.py,sha256=9mbMq_XTJa8wrlbNdf46GUMNdjedz-enxafsCrnNhEo,4295
@@ -46,16 +46,16 @@ ocrd_models/ocrd_agent.py,sha256=Nm0XDNCmWZ8O3xsXaY-WmEghttXmh90UKmAObCL99IY,561
46
46
  ocrd_models/ocrd_exif.py,sha256=HSLPn_WBDRIlMtKNYilLHm8WjX-b14HgnqT_KfzjS_0,4680
47
47
  ocrd_models/ocrd_file.py,sha256=9-mfDb91RVy3p9rKryl-C39P4Of6Rb8OZBuxAee4VrI,9723
48
48
  ocrd_models/ocrd_mets.py,sha256=lz9mlDq9A9UmZDoN8lh5XRnBzdAtLLZywDZSbyZPS84,50905
49
- ocrd_models/ocrd_page.py,sha256=SRInM4HgcfDzbJH4ZO5B53ST2lAO7pgTXluwi0yzkf0,6211
50
- ocrd_models/ocrd_page_generateds.py,sha256=IWoN3V-v3C4JgyPaFh9OQC87ob__wUP1Q6ELBxhLA1w,841794
49
+ ocrd_models/ocrd_page.py,sha256=Hed1PJ4JWSkTVj7mVOWDaJqtZ9Fc9czzOfPr6flFohw,6818
50
+ ocrd_models/ocrd_page_generateds.py,sha256=hBIhOs_slXdQza_zokBfjjsrimX76h9I_6prRWbgVAk,911920
51
51
  ocrd_models/ocrd_xml_base.py,sha256=iOnDl2zBNhN-Q4moLWiFkSqXvfRzxE5wbp5Tjsu1W6A,1642
52
52
  ocrd_models/report.py,sha256=CX-t9ZDi2VmAy8M1Azsh83UsvE_f5pMeEC7tPaA-ztU,2021
53
53
  ocrd_models/utils.py,sha256=A-H11ZJ65ZjH4DPK9s_Yz6JtA9fbTQ2jY-__9s7Hrg8,2320
54
54
  ocrd_models/xpath_functions.py,sha256=VM2f9hl8ja4NrDOEQRSYdx7GewwAxfoyGMDjqjgA_7g,1439
55
55
  ocrd_network/__init__.py,sha256=NWlSgXi7z45ow37AmITxfCB1d-L39rO8ttyxNJ-z8G0,376
56
- ocrd_network/client.py,sha256=pL-g79cQgulXyGYgLOh--oxl1hZEMu48PTbuvMW1jIE,3007
57
- ocrd_network/client_utils.py,sha256=Ne1a0fteb-TBuc0EAD6X_fh2RAl4hmPt2oluhpB28iU,5371
58
- ocrd_network/constants.py,sha256=XyRYjFO38yIBD6s1wsA-z6V16tBmbUw4LXlFkj-tQC8,1943
56
+ ocrd_network/client.py,sha256=hi13uDUYC5t7xHtZEUYwNBAZOvovWaScfCtFSORVg7Q,3224
57
+ ocrd_network/client_utils.py,sha256=d5UE0MdDJxsYxIQemKcoUuALOiPJ8Cew8bjgsg9d71w,5709
58
+ ocrd_network/constants.py,sha256=mUjpkZDYPdRZmOeC0jyzQkuLuWrODLFzlrAHkguKWGg,1942
59
59
  ocrd_network/database.py,sha256=-SddvaMLKn0pjdONyvWmjxfPJd6viedAIp6Lj1sU1Zs,10705
60
60
  ocrd_network/logging_utils.py,sha256=hXwS46FzY_HTh92DgnxTuARxj8C18bOBmFKVrvBlUgc,2409
61
61
  ocrd_network/param_validators.py,sha256=Jl1VwiPPKJ50k-xEHLdvW-1QDOkJHCiMz4k9Ipqm-Uc,1489
@@ -68,7 +68,7 @@ ocrd_network/server_utils.py,sha256=Lxby62gHvrSbHgpWXvyZGdsWajp2TFzyxjHdMZWBESk,
68
68
  ocrd_network/tcp_to_uds_mets_proxy.py,sha256=yRW-O6ihd31gf7xqQBIBb_ZQQgqisMyOdRI216ehq_A,3160
69
69
  ocrd_network/utils.py,sha256=yE-nV_sv171tPp7weIFOxYw6HJlxvGBmrS8b1rIHS7c,6760
70
70
  ocrd_network/cli/__init__.py,sha256=VBjjXcn-2O5gerqE6UdNfS-EkVFEVPQFHylsn8F9kfY,317
71
- ocrd_network/cli/client.py,sha256=H5fiJhBqbFn4_B2p3V20GejGTIYO-mNglh3y5nzUGhs,10350
71
+ ocrd_network/cli/client.py,sha256=WoLt1NZAOtHeECegUBcop8K2_D0S8khrLjFZhV_38ww,10551
72
72
  ocrd_network/cli/processing_server.py,sha256=NsuI0f9h4KDwe39YugmHo5cJ_29chcLLQ7DThKfPO7s,770
73
73
  ocrd_network/cli/processing_worker.py,sha256=ZuaCkbKV_WKJV7cGOjZ6RLrjjppymnwNCiznFMlclAg,1897
74
74
  ocrd_network/cli/resmgr_server.py,sha256=sc0VX_RehTbg8Qp7ht_DvVqsrdL5b9Zw3bBgWcAD13A,826
@@ -81,7 +81,7 @@ ocrd_network/rabbitmq_utils/__init__.py,sha256=XLIqZhfin4I4m80G9B__UcP45Lz10_mEp
81
81
  ocrd_network/rabbitmq_utils/connector.py,sha256=N6mzjIf5FkVIno3FI1AksZY4F5jMUAm8baay0nXZx8w,11343
82
82
  ocrd_network/rabbitmq_utils/constants.py,sha256=Zu_dKJASfrgnIvEZZlFX9uDR9y6w7zy0KhW7gP7wHDE,1063
83
83
  ocrd_network/rabbitmq_utils/consumer.py,sha256=3WeryDmo0dSD9U0eLODbDElscvhEYjNeCBIewQHYfws,2488
84
- ocrd_network/rabbitmq_utils/helpers.py,sha256=5G0wrBlDtmCItzp-fMZjYr1oeaqDUcii5qeyjXI-ilM,5372
84
+ ocrd_network/rabbitmq_utils/helpers.py,sha256=gbP9Ks4c_ksMln-VQ7GCND6ok_lttm6wW-R7Wszo5qA,5374
85
85
  ocrd_network/rabbitmq_utils/ocrd_messages.py,sha256=wwzfMWbXmOFo_nd32_XySCso91_Ul-aGm_GhGncNxD4,4419
86
86
  ocrd_network/rabbitmq_utils/publisher.py,sha256=mw4XQQhRE1xUQVgEUseyG845iIgVO-9GdGwNH6nUFms,2433
87
87
  ocrd_network/runtime_data/__init__.py,sha256=PnWuuagElbkTzGtPWQEk5wlFtDxqT7B48S0Zrgt8H68,320
@@ -123,9 +123,9 @@ ocrd_validators/xlink.xsd,sha256=8fW7YAMWXN2PbB_MMvj9H5ZeFoEBDzuYBtlGC8_6ijw,318
123
123
  ocrd_validators/xsd_mets_validator.py,sha256=YgiuNtwNDtn3LuvdFFscnmsGREF_wQ4wtA76yE2Iljw,469
124
124
  ocrd_validators/xsd_page_validator.py,sha256=ggt-nmaz-DDyAPwm3ZMVvtChuV2BJ2ZEEbWpePL9vTk,469
125
125
  ocrd_validators/xsd_validator.py,sha256=ahJo_oVvTK_JB0Cu4CkMC8l_gbzsyW91AxGtelMjqrg,2115
126
- ocrd-3.8.1.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
127
- ocrd-3.8.1.dist-info/METADATA,sha256=cn6Wts1L_Gu7xs9Eachncz78IqE5YZS4N5FoJ8xZU7w,11396
128
- ocrd-3.8.1.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
129
- ocrd-3.8.1.dist-info/entry_points.txt,sha256=CI-NoDR1BYmsuAsJmPAn4NrN9guzdedHGUbC8QSmdGs,266
130
- ocrd-3.8.1.dist-info/top_level.txt,sha256=pUgiN42t4KXC5rvpi6V8atza31XP4SCznXpXlVlvomM,75
131
- ocrd-3.8.1.dist-info/RECORD,,
126
+ ocrd-3.9.1.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
127
+ ocrd-3.9.1.dist-info/METADATA,sha256=89_vM8lCOcs1eMAHyEO6YunESQbjiWG6jZNkWsrBYV0,11396
128
+ ocrd-3.9.1.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
129
+ ocrd-3.9.1.dist-info/entry_points.txt,sha256=CI-NoDR1BYmsuAsJmPAn4NrN9guzdedHGUbC8QSmdGs,266
130
+ ocrd-3.9.1.dist-info/top_level.txt,sha256=pUgiN42t4KXC5rvpi6V8atza31XP4SCznXpXlVlvomM,75
131
+ ocrd-3.9.1.dist-info/RECORD,,
ocrd_models/ocrd_page.py CHANGED
@@ -2,6 +2,7 @@
2
2
  API to PAGE-XML, generated with generateDS from XML schema.
3
3
  """
4
4
  from io import StringIO
5
+ import copy
5
6
  from typing import Dict, Union, Any
6
7
  from lxml import etree as ET
7
8
  from elementpath import XPath2Parser, XPathContext
@@ -212,6 +213,25 @@ class OcrdPage():
212
213
  self.xpath_context = XPathContext(self.etree)
213
214
  self.xpath = lambda expression: self.xpath_parser.parse(expression).get_results(self.xpath_context)
214
215
 
216
+ # allow copy() without infinite recursion
217
+ def __copy__(self):
218
+ return OcrdPage(
219
+ copy.copy(self._pcgts),
220
+ copy.copy(self.etree),
221
+ copy.copy(self.mapping),
222
+ copy.copy(self.revmap),
223
+ )
224
+
225
+ # allow deepcopy() without infinite recursion
226
+ def __deepcopy__(self, memo):
227
+ return OcrdPage(
228
+ copy.deepcopy(self._pcgts, memo),
229
+ copy.deepcopy(self.etree, memo),
230
+ copy.deepcopy(self.mapping, memo),
231
+ copy.deepcopy(self.revmap, memo),
232
+ )
233
+
234
+ # delegate to all members of ._pcgts
215
235
  def __getattr__(self, name):
216
236
  return getattr(self._pcgts, name)
217
237