ocrd 3.0.0b7__py3-none-any.whl → 3.0.1__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.
@@ -101,5 +101,11 @@ def page_from_file(input_file, **kwargs) -> OcrdPage:
101
101
  if input_file.mimetype.startswith('image'):
102
102
  return page_from_image(input_file)
103
103
  if input_file.mimetype == MIMETYPE_PAGE:
104
- return OcrdPage(*parseEtree(input_file.local_filename, silence=True))
104
+ revmap = {}
105
+ # the old/default gds.reverse_node_mapping is useless
106
+ # since 2.39.4, we can actually get the exact reverse mapping for perfect round-trip
107
+ # but awkwardly, we have to pass the dict in for that
108
+ page = OcrdPage(*parseEtree(input_file.local_filename, reverse_mapping=revmap, silence=True))
109
+ page.revmap = revmap
110
+ return page
105
111
  raise ValueError("Unsupported mimetype '%s'" % input_file.mimetype)
ocrd_models/ocrd_exif.py CHANGED
@@ -49,11 +49,11 @@ class OcrdExif():
49
49
  for prop in ['compression', 'photometric_interpretation']:
50
50
  setattr(self, prop, img.info[prop] if prop in img.info else None)
51
51
  if img.filename:
52
- ret = run(['identify', '-format', r'%[resolution.x] %[resolution.y] %U', img.filename], check=False, stderr=PIPE, stdout=PIPE)
52
+ ret = run(['identify', '-format', r'%[resolution.x] %[resolution.y] %U ', img.filename], check=False, stderr=PIPE, stdout=PIPE)
53
53
  else:
54
54
  with BytesIO() as bio:
55
55
  img.save(bio, format=img.format)
56
- ret = run(['identify', '-format', r'%[resolution.x] %[resolution.y] %U', '/dev/stdin'], check=False, stderr=PIPE, stdout=PIPE, input=bio.getvalue())
56
+ ret = run(['identify', '-format', r'%[resolution.x] %[resolution.y] %U ', '/dev/stdin'], check=False, stderr=PIPE, stdout=PIPE, input=bio.getvalue())
57
57
  if ret.returncode:
58
58
  stderr = ret.stderr.decode('utf-8')
59
59
  if 'no decode delegate for this image format' in stderr:
ocrd_models/ocrd_page.py CHANGED
@@ -2,8 +2,9 @@
2
2
  API to PAGE-XML, generated with generateDS from XML schema.
3
3
  """
4
4
  from io import StringIO
5
- from typing import Dict, Union
5
+ from typing import Dict, Union, Any
6
6
  from lxml import etree as ET
7
+ from elementpath import XPath2Parser, XPathContext
7
8
 
8
9
  __all__ = [
9
10
  'parse',
@@ -132,6 +133,7 @@ from .ocrd_page_generateds import (
132
133
  )
133
134
 
134
135
  from .constants import NAMESPACES
136
+ from .xpath_functions import pc_functions
135
137
 
136
138
  # add docstrings
137
139
  parse.__doc__ = (
@@ -189,12 +191,25 @@ class OcrdPage():
189
191
  pcgts : PcGtsType,
190
192
  etree : ET._Element,
191
193
  mapping : Dict[str, ET._Element],
192
- revmap : Dict[ET._Element, str],
194
+ revmap : Dict[ET._Element, Any],
193
195
  ):
194
196
  self._pcgts = pcgts
195
197
  self.etree = etree
196
198
  self.mapping = mapping
197
199
  self.revmap = revmap
200
+ self.xpath_parser = XPath2Parser(namespaces={
201
+ 'page': NAMESPACES['page'],
202
+ 'pc': NAMESPACES['page']})
203
+ for func in pc_functions:
204
+ name = func.__name__.replace('_', '-')
205
+ if name.startswith('pc-'):
206
+ name = name[3:]
207
+ elif name.startswith('pc'):
208
+ name = name[2:]
209
+ # register
210
+ self.xpath_parser.external_function(func, name=name, prefix='pc')
211
+ self.xpath_context = XPathContext(self.etree)
212
+ self.xpath = lambda expression: self.xpath_parser.parse(expression).get_results(self.xpath_context)
198
213
 
199
214
  def __getattr__(self, name):
200
215
  return getattr(self._pcgts, name)
@@ -208,11 +223,15 @@ def to_xml(el, skip_declaration=False) -> str:
208
223
  # XXX remove potential empty ReadingOrder
209
224
  if hasattr(el, 'prune_ReadingOrder'):
210
225
  el.prune_ReadingOrder()
226
+ if hasattr(el, 'original_tagname_'):
227
+ name = el.original_tagname_ or 'PcGts'
228
+ else:
229
+ name = 'PcGts'
211
230
  sio = StringIO()
212
231
  el.export(
213
232
  outfile=sio,
214
233
  level=0,
215
- name_='PcGts',
234
+ name_=name,
216
235
  namespaceprefix_='pc:',
217
236
  namespacedef_='xmlns:pc="%s" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="%s %s/pagecontent.xsd"' % (
218
237
  NAMESPACES['page'],