ocrd 3.8.1__py3-none-any.whl → 3.9.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,8 +2,8 @@
2
2
  # -*- coding: utf-8 -*-
3
3
 
4
4
  #
5
- # Generated Mon Feb 17 10:32:54 2025 by generateDS.py version 2.44.1.
6
- # Python 3.8.17+ (heads/3.8-dirty:1663f8ba84, Aug 15 2023, 18:13:01) [GCC 8.3.0]
5
+ # Generated Thu Dec 11 12:03:57 2025 by generateDS.py version 2.44.1.
6
+ # Python 3.8.19 (default, Mar 26 2024, 20:08:11) [GCC 8.5.0]
7
7
  #
8
8
  # Command line options:
9
9
  # ('-f', '')
@@ -3766,7 +3766,8 @@ class PageType(GeneratedsSuper):
3766
3766
  def get_AllRegions(self, classes=None, order='document', depth=0):
3767
3767
  """
3768
3768
  Get all the ``*Region`` elements, or only those provided by `classes`.
3769
- Return in document order, unless `order` is ``reading-order``.
3769
+ Return in document order, unless the top element is ``Page`` and
3770
+ `order` is ``reading-order``.
3770
3771
 
3771
3772
  Arguments:
3772
3773
  classes (list): Classes of regions that shall be returned, \
@@ -3775,7 +3776,8 @@ class PageType(GeneratedsSuper):
3775
3776
  return regions sorted by document order (``document``, default) or by
3776
3777
  reading order with regions not in the reading order at the end of the
3777
3778
  returned list (``reading-order``) or regions not in the reading order
3778
- omitted (``reading-order-only``)
3779
+ omitted (``reading-order-only``). The latter two are only available
3780
+ on page level.
3779
3781
  depth (int): Recursive depth to look for regions at, set to `0` for \
3780
3782
  all regions at any depth. Default: 0
3781
3783
 
@@ -3800,7 +3802,7 @@ class PageType(GeneratedsSuper):
3800
3802
  if depth < 0:
3801
3803
  raise Exception("Argument 'depth' must be an integer greater-or-equal 0, not '{}'".format(depth))
3802
3804
  ret = self._get_recursive_regions([self], depth + 1 if depth else 0, classes)
3803
- if order.startswith('reading-order'):
3805
+ if self.__class__.__name__ == 'PageType' and order.startswith('reading-order'):
3804
3806
  reading_order = self.get_ReadingOrder()
3805
3807
  if reading_order:
3806
3808
  reading_order = reading_order.get_OrderedGroup() or reading_order.get_UnorderedGroup()
@@ -3929,21 +3931,23 @@ class PageType(GeneratedsSuper):
3929
3931
  - :py:class:`.UnorderedGroupType`
3930
3932
  - :py:class:`.UnorderedGroupIndexedType`
3931
3933
  """
3934
+ from collections import OrderedDict as odict
3932
3935
  def get_groupdict(group):
3933
3936
  regionrefs = list()
3934
3937
  if isinstance(group, (OrderedGroupType, OrderedGroupIndexedType)):
3935
3938
  regionrefs = (group.get_RegionRefIndexed() +
3936
3939
  group.get_OrderedGroupIndexed() +
3937
3940
  group.get_UnorderedGroupIndexed())
3941
+ regionrefs = sorted(regionrefs, key=lambda x: x.index)
3938
3942
  if isinstance(group, (UnorderedGroupType, UnorderedGroupIndexedType)):
3939
3943
  regionrefs = (group.get_RegionRef() +
3940
3944
  group.get_OrderedGroup() +
3941
3945
  group.get_UnorderedGroup())
3942
- refdict = {}
3946
+ refdict = odict()
3943
3947
  for elem in regionrefs:
3944
3948
  refdict[elem.get_regionRef()] = elem
3945
3949
  if not isinstance(elem, (RegionRefType, RegionRefIndexedType)):
3946
- refdict = {**refdict, **get_groupdict(elem)}
3950
+ refdict = odict(**refdict, **get_groupdict(elem))
3947
3951
  return refdict
3948
3952
  ro = self.get_ReadingOrder()
3949
3953
  if ro is None:
@@ -12673,6 +12677,106 @@ class AdvertRegionType(RegionType):
12673
12677
  pass
12674
12678
  def __hash__(self):
12675
12679
  return hash(self.id)
12680
+ # pylint: disable=line-too-long,invalid-name,protected-access,missing-module-docstring
12681
+ def _region_class(self, x): # pylint: disable=unused-argument
12682
+ return x.__class__.__name__.replace('RegionType', '')
12683
+
12684
def _get_recursive_regions(self, regions, level, classes=None):
    """
    Collect `regions` together with their nested regions, depth-first.

    Arguments:
        regions (list): elements to start from (a page and/or regions)
        level (int): remaining depth; ``1`` stops descending, ``0`` keeps
            descending until no element has any child region
        classes (list): if non-empty, keep only regions whose short class
            label (see ``_region_class``) is listed

    Returns:
        a list in which every element is directly followed by its
        descendants; without `classes`, a leading ``PageType`` element is
        dropped from the result.
    """
    from .constants import PAGE_REGION_TYPES # pylint: disable=relative-beyond-top-level,import-outside-toplevel
    if level == 1:
        # end of recursion: apply the class filter (or strip the page itself)
        if classes:
            return [cand for cand in regions if self._region_class(cand) in classes]
        if not regions or regions[0].__class__.__name__ != 'PageType':
            return regions
        return regions[1:]
    # gather the direct child regions of each element
    children_per_region = []
    for parent in regions:
        children = []
        for kind in PAGE_REGION_TYPES:
            # 'Map' is not recursive in 2019 schema, so only a page may hold it
            if kind != 'Map' or isinstance(parent, PageType): # pylint: disable=undefined-variable
                children.extend(getattr(parent, 'get_{}Region'.format(kind))())
        children_per_region.append(children)
    if not any(children_per_region):
        # nothing nested below this level: finish as if depth were exhausted
        return self._get_recursive_regions(regions, 1, classes)
    next_level = level - 1 if level else 0
    collected = []
    for parent, children in zip(regions, children_per_region):
        collected.append(parent)
        collected.extend(self._get_recursive_regions(children, next_level, classes))
    return self._get_recursive_regions(collected, 1, classes)
12709
+
12710
def _get_recursive_reading_order(self, rogroup):
    """
    Flatten the reading order group `rogroup` into a list of region IDs.

    Each member contributes its ``regionRef``; a member that is itself a
    group is followed immediately by the IDs of its own members.

    Arguments:
        rogroup: an ordered or unordered reading order group (plain or
            indexed variant)

    Returns:
        a list of ``regionRef`` values in traversal order

    Raises:
        TypeError: if `rogroup` is none of the four group types
    """
    if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)): # pylint: disable=undefined-variable
        elements = rogroup.get_AllIndexed()
    elif isinstance(rogroup, (UnorderedGroupType, UnorderedGroupIndexedType)): # pylint: disable=undefined-variable
        elements = (rogroup.get_RegionRef() + rogroup.get_OrderedGroup() + rogroup.get_UnorderedGroup())
    else:
        # previously fell through to an UnboundLocalError on 'elements'
        raise TypeError("Expected a reading order group, not '{}'".format(rogroup.__class__.__name__))
    regionrefs = []
    for elem in elements:
        regionrefs.append(elem.get_regionRef())
        if not isinstance(elem, (RegionRefType, RegionRefIndexedType)): # pylint: disable=undefined-variable
            regionrefs.extend(self._get_recursive_reading_order(elem))
    return regionrefs
12721
+
12722
def get_AllRegions(self, classes=None, order='document', depth=0):
    """
    Get all the ``*Region`` elements, or only those provided by `classes`.
    Return in document order, unless the top element is ``Page`` and
    `order` is ``reading-order`` or ``reading-order-only``.

    Arguments:
        classes (list): Classes of regions that shall be returned,
            e.g. ``['Text', 'Image']``
        order ("document"|"reading-order"|"reading-order-only"): Whether to
            return regions sorted by document order (``document``, default) or by
            reading order with regions not in the reading order at the end of the
            returned list (``reading-order``) or regions not in the reading order
            omitted (``reading-order-only``). The latter two are only available
            on page level; on any other element the result stays in
            document order.
        depth (int): Recursive depth to look for regions at, set to `0` for
            all regions at any depth. Default: 0

    Returns:
        a list of :py:class:`TextRegionType`, :py:class:`ImageRegionType`,
        :py:class:`LineDrawingRegionType`, :py:class:`GraphicRegionType`,
        :py:class:`TableRegionType`, :py:class:`ChartRegionType`,
        :py:class:`MapRegionType`, :py:class:`SeparatorRegionType`,
        :py:class:`MathsRegionType`, :py:class:`ChemRegionType`,
        :py:class:`MusicRegionType`, :py:class:`AdvertRegionType`,
        :py:class:`NoiseRegionType`, :py:class:`UnknownRegionType`,
        and/or :py:class:`CustomRegionType`

    Raises:
        ValueError: if `order` is not one of the three allowed values,
            or if `depth` is negative

    For example, to get all text anywhere on the page in reading order, use:
    ::
        '\\n'.join(line.get_TextEquiv()[0].Unicode
                  for region in page.get_AllRegions(classes=['Text'], depth=0, order='reading-order')
                  for line in region.get_TextLine())
    """
    if order not in ('document', 'reading-order', 'reading-order-only'):
        raise ValueError("Argument 'order' must be either 'document', 'reading-order' or 'reading-order-only', not '{}'".format(order))
    if depth < 0:
        raise ValueError("Argument 'depth' must be an integer greater-or-equal 0, not '{}'".format(depth))
    # the recursion helper counts levels including self, with 0 meaning unlimited
    ret = self._get_recursive_regions([self], depth + 1 if depth else 0, classes)
    if order == 'document' or self.__class__.__name__ != 'PageType':
        # reading order is only consulted when called on the page itself
        return ret
    reading_order = self.get_ReadingOrder()
    if reading_order:
        reading_order = reading_order.get_OrderedGroup() or reading_order.get_UnorderedGroup()
    if reading_order:
        reading_order = self._get_recursive_reading_order(reading_order)
    if reading_order:
        id2region = {region.id: region for region in ret}
        # IDs in the reading order that match no collected region are skipped
        in_reading_order = [id2region[region_id] for region_id in reading_order if region_id in id2region]
        if order == 'reading-order-only':
            ret = in_reading_order
        else:
            ret = in_reading_order + [r for r in ret if r not in in_reading_order]
    return ret
12676
12780
  def set_orientation(self, orientation):
12677
12781
  """
12678
12782
  Set deskewing angle to given `orientation` number.
@@ -12835,6 +12939,106 @@ class MusicRegionType(RegionType):
12835
12939
  pass
12836
12940
  def __hash__(self):
12837
12941
  return hash(self.id)
12942
+ # pylint: disable=line-too-long,invalid-name,protected-access,missing-module-docstring
12943
+ def _region_class(self, x): # pylint: disable=unused-argument
12944
+ return x.__class__.__name__.replace('RegionType', '')
12945
+
12946
def _get_recursive_regions(self, regions, level, classes=None):
    """
    Collect `regions` together with their nested regions, depth-first.

    Arguments:
        regions (list): elements to start from (a page and/or regions)
        level (int): remaining depth; ``1`` stops descending, ``0`` keeps
            descending until no element has any child region
        classes (list): if non-empty, keep only regions whose short class
            label (see ``_region_class``) is listed

    Returns:
        a list in which every element is directly followed by its
        descendants; without `classes`, a leading ``PageType`` element is
        dropped from the result.
    """
    from .constants import PAGE_REGION_TYPES # pylint: disable=relative-beyond-top-level,import-outside-toplevel
    if level == 1:
        # end of recursion: apply the class filter (or strip the page itself)
        if classes:
            return [cand for cand in regions if self._region_class(cand) in classes]
        if not regions or regions[0].__class__.__name__ != 'PageType':
            return regions
        return regions[1:]
    # gather the direct child regions of each element
    children_per_region = []
    for parent in regions:
        children = []
        for kind in PAGE_REGION_TYPES:
            # 'Map' is not recursive in 2019 schema, so only a page may hold it
            if kind != 'Map' or isinstance(parent, PageType): # pylint: disable=undefined-variable
                children.extend(getattr(parent, 'get_{}Region'.format(kind))())
        children_per_region.append(children)
    if not any(children_per_region):
        # nothing nested below this level: finish as if depth were exhausted
        return self._get_recursive_regions(regions, 1, classes)
    next_level = level - 1 if level else 0
    collected = []
    for parent, children in zip(regions, children_per_region):
        collected.append(parent)
        collected.extend(self._get_recursive_regions(children, next_level, classes))
    return self._get_recursive_regions(collected, 1, classes)
12971
+
12972
def _get_recursive_reading_order(self, rogroup):
    """
    Flatten the reading order group `rogroup` into a list of region IDs.

    Each member contributes its ``regionRef``; a member that is itself a
    group is followed immediately by the IDs of its own members.

    Arguments:
        rogroup: an ordered or unordered reading order group (plain or
            indexed variant)

    Returns:
        a list of ``regionRef`` values in traversal order

    Raises:
        TypeError: if `rogroup` is none of the four group types
    """
    if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)): # pylint: disable=undefined-variable
        elements = rogroup.get_AllIndexed()
    elif isinstance(rogroup, (UnorderedGroupType, UnorderedGroupIndexedType)): # pylint: disable=undefined-variable
        elements = (rogroup.get_RegionRef() + rogroup.get_OrderedGroup() + rogroup.get_UnorderedGroup())
    else:
        # previously fell through to an UnboundLocalError on 'elements'
        raise TypeError("Expected a reading order group, not '{}'".format(rogroup.__class__.__name__))
    regionrefs = []
    for elem in elements:
        regionrefs.append(elem.get_regionRef())
        if not isinstance(elem, (RegionRefType, RegionRefIndexedType)): # pylint: disable=undefined-variable
            regionrefs.extend(self._get_recursive_reading_order(elem))
    return regionrefs
12983
+
12984
def get_AllRegions(self, classes=None, order='document', depth=0):
    """
    Get all the ``*Region`` elements, or only those provided by `classes`.
    Return in document order, unless the top element is ``Page`` and
    `order` is ``reading-order`` or ``reading-order-only``.

    Arguments:
        classes (list): Classes of regions that shall be returned,
            e.g. ``['Text', 'Image']``
        order ("document"|"reading-order"|"reading-order-only"): Whether to
            return regions sorted by document order (``document``, default) or by
            reading order with regions not in the reading order at the end of the
            returned list (``reading-order``) or regions not in the reading order
            omitted (``reading-order-only``). The latter two are only available
            on page level; on any other element the result stays in
            document order.
        depth (int): Recursive depth to look for regions at, set to `0` for
            all regions at any depth. Default: 0

    Returns:
        a list of :py:class:`TextRegionType`, :py:class:`ImageRegionType`,
        :py:class:`LineDrawingRegionType`, :py:class:`GraphicRegionType`,
        :py:class:`TableRegionType`, :py:class:`ChartRegionType`,
        :py:class:`MapRegionType`, :py:class:`SeparatorRegionType`,
        :py:class:`MathsRegionType`, :py:class:`ChemRegionType`,
        :py:class:`MusicRegionType`, :py:class:`AdvertRegionType`,
        :py:class:`NoiseRegionType`, :py:class:`UnknownRegionType`,
        and/or :py:class:`CustomRegionType`

    Raises:
        ValueError: if `order` is not one of the three allowed values,
            or if `depth` is negative

    For example, to get all text anywhere on the page in reading order, use:
    ::
        '\\n'.join(line.get_TextEquiv()[0].Unicode
                  for region in page.get_AllRegions(classes=['Text'], depth=0, order='reading-order')
                  for line in region.get_TextLine())
    """
    if order not in ('document', 'reading-order', 'reading-order-only'):
        raise ValueError("Argument 'order' must be either 'document', 'reading-order' or 'reading-order-only', not '{}'".format(order))
    if depth < 0:
        raise ValueError("Argument 'depth' must be an integer greater-or-equal 0, not '{}'".format(depth))
    # the recursion helper counts levels including self, with 0 meaning unlimited
    ret = self._get_recursive_regions([self], depth + 1 if depth else 0, classes)
    if order == 'document' or self.__class__.__name__ != 'PageType':
        # reading order is only consulted when called on the page itself
        return ret
    reading_order = self.get_ReadingOrder()
    if reading_order:
        reading_order = reading_order.get_OrderedGroup() or reading_order.get_UnorderedGroup()
    if reading_order:
        reading_order = self._get_recursive_reading_order(reading_order)
    if reading_order:
        id2region = {region.id: region for region in ret}
        # IDs in the reading order that match no collected region are skipped
        in_reading_order = [id2region[region_id] for region_id in reading_order if region_id in id2region]
        if order == 'reading-order-only':
            ret = in_reading_order
        else:
            ret = in_reading_order + [r for r in ret if r not in in_reading_order]
    return ret
12838
13042
  def set_orientation(self, orientation):
12839
13043
  """
12840
13044
  Set deskewing angle to given `orientation` number.
@@ -12965,69 +13169,169 @@ class MapRegionType(RegionType):
12965
13169
  pass
12966
13170
  def __hash__(self):
12967
13171
  return hash(self.id)
12968
- def set_orientation(self, orientation):
12969
- """
12970
- Set deskewing angle to given `orientation` number.
12971
- Moreover, invalidate self's ``pc:AlternativeImage``s
12972
- (because they will have been rotated and enlarged
12973
- with the angle of the previous value).
12974
- """
12975
- if hasattr(self, 'invalidate_AlternativeImage'):
12976
- # PageType, RegionType:
12977
- self.invalidate_AlternativeImage(feature_selector='deskewed')
12978
- self.orientation = orientation
12979
- # end class MapRegionType
12980
-
12981
-
12982
- class ChemRegionType(RegionType):
12983
- """ChemRegionType --
12984
- Regions containing chemical formulas.
12985
-
12986
- * orientation --
12987
- The angle the rectangle encapsulating a
12988
- region has to be rotated in clockwise
12989
- direction in order to correct the present
12990
- skew (negative values indicate
12991
- anti-clockwise rotation). Range:
12992
- -179.999,180
12993
-
12994
- * bgColour --
12995
- The background colour of the region
13172
+ # pylint: disable=line-too-long,invalid-name,protected-access,missing-module-docstring
13173
+ def _region_class(self, x): # pylint: disable=unused-argument
13174
+ return x.__class__.__name__.replace('RegionType', '')
12996
13175
 
12997
- """
12998
- __hash__ = GeneratedsSuper.__hash__
12999
- member_data_items_ = [
13000
- MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional', 'name': 'orientation'}),
13001
- MemberSpec_('bgColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional', 'name': 'bgColour'}),
13002
- ]
13003
- subclass = None
13004
- superclass = RegionType
13005
- def __init__(self, id=None, custom=None, comments=None, continuation=None, AlternativeImage=None, Coords=None, UserDefined=None, Labels=None, Roles=None, TextRegion=None, ImageRegion=None, LineDrawingRegion=None, GraphicRegion=None, TableRegion=None, ChartRegion=None, SeparatorRegion=None, MathsRegion=None, ChemRegion=None, MusicRegion=None, AdvertRegion=None, NoiseRegion=None, UnknownRegion=None, CustomRegion=None, orientation=None, bgColour=None, gds_collector_=None, **kwargs_):
13006
- self.gds_collector_ = gds_collector_
13007
- self.gds_elementtree_node_ = None
13008
- self.original_tagname_ = None
13009
- self.parent_object_ = kwargs_.get('parent_object_')
13010
- self.ns_prefix_ = "pc"
13011
- super(globals().get("ChemRegionType"), self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_)
13012
- self.orientation = _cast(float, orientation)
13013
- self.orientation_nsprefix_ = "pc"
13014
- self.bgColour = _cast(None, bgColour)
13015
- self.bgColour_nsprefix_ = "pc"
13016
- def factory(*args_, **kwargs_):
13017
- if CurrentSubclassModule_ is not None:
13018
- subclass = getSubclassFromModule_(
13019
- CurrentSubclassModule_, ChemRegionType)
13020
- if subclass is not None:
13021
- return subclass(*args_, **kwargs_)
13022
- if ChemRegionType.subclass:
13023
- return ChemRegionType.subclass(*args_, **kwargs_)
13024
- else:
13025
- return ChemRegionType(*args_, **kwargs_)
13026
- factory = staticmethod(factory)
13027
- def get_ns_prefix_(self):
13028
- return self.ns_prefix_
13029
- def set_ns_prefix_(self, ns_prefix):
13030
- self.ns_prefix_ = ns_prefix
13176
def _get_recursive_regions(self, regions, level, classes=None):
    """
    Collect `regions` together with their nested regions, depth-first.

    Arguments:
        regions (list): elements to start from (a page and/or regions)
        level (int): remaining depth; ``1`` stops descending, ``0`` keeps
            descending until no element has any child region
        classes (list): if non-empty, keep only regions whose short class
            label (see ``_region_class``) is listed

    Returns:
        a list in which every element is directly followed by its
        descendants; without `classes`, a leading ``PageType`` element is
        dropped from the result.
    """
    from .constants import PAGE_REGION_TYPES # pylint: disable=relative-beyond-top-level,import-outside-toplevel
    if level == 1:
        # end of recursion: apply the class filter (or strip the page itself)
        if classes:
            return [cand for cand in regions if self._region_class(cand) in classes]
        if not regions or regions[0].__class__.__name__ != 'PageType':
            return regions
        return regions[1:]
    # gather the direct child regions of each element
    children_per_region = []
    for parent in regions:
        children = []
        for kind in PAGE_REGION_TYPES:
            # 'Map' is not recursive in 2019 schema, so only a page may hold it
            if kind != 'Map' or isinstance(parent, PageType): # pylint: disable=undefined-variable
                children.extend(getattr(parent, 'get_{}Region'.format(kind))())
        children_per_region.append(children)
    if not any(children_per_region):
        # nothing nested below this level: finish as if depth were exhausted
        return self._get_recursive_regions(regions, 1, classes)
    next_level = level - 1 if level else 0
    collected = []
    for parent, children in zip(regions, children_per_region):
        collected.append(parent)
        collected.extend(self._get_recursive_regions(children, next_level, classes))
    return self._get_recursive_regions(collected, 1, classes)
13201
+
13202
def _get_recursive_reading_order(self, rogroup):
    """
    Flatten the reading order group `rogroup` into a list of region IDs.

    Each member contributes its ``regionRef``; a member that is itself a
    group is followed immediately by the IDs of its own members.

    Arguments:
        rogroup: an ordered or unordered reading order group (plain or
            indexed variant)

    Returns:
        a list of ``regionRef`` values in traversal order

    Raises:
        TypeError: if `rogroup` is none of the four group types
    """
    if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)): # pylint: disable=undefined-variable
        elements = rogroup.get_AllIndexed()
    elif isinstance(rogroup, (UnorderedGroupType, UnorderedGroupIndexedType)): # pylint: disable=undefined-variable
        elements = (rogroup.get_RegionRef() + rogroup.get_OrderedGroup() + rogroup.get_UnorderedGroup())
    else:
        # previously fell through to an UnboundLocalError on 'elements'
        raise TypeError("Expected a reading order group, not '{}'".format(rogroup.__class__.__name__))
    regionrefs = []
    for elem in elements:
        regionrefs.append(elem.get_regionRef())
        if not isinstance(elem, (RegionRefType, RegionRefIndexedType)): # pylint: disable=undefined-variable
            regionrefs.extend(self._get_recursive_reading_order(elem))
    return regionrefs
13213
+
13214
def get_AllRegions(self, classes=None, order='document', depth=0):
    """
    Get all the ``*Region`` elements, or only those provided by `classes`.
    Return in document order, unless the top element is ``Page`` and
    `order` is ``reading-order`` or ``reading-order-only``.

    Arguments:
        classes (list): Classes of regions that shall be returned,
            e.g. ``['Text', 'Image']``
        order ("document"|"reading-order"|"reading-order-only"): Whether to
            return regions sorted by document order (``document``, default) or by
            reading order with regions not in the reading order at the end of the
            returned list (``reading-order``) or regions not in the reading order
            omitted (``reading-order-only``). The latter two are only available
            on page level; on any other element the result stays in
            document order.
        depth (int): Recursive depth to look for regions at, set to `0` for
            all regions at any depth. Default: 0

    Returns:
        a list of :py:class:`TextRegionType`, :py:class:`ImageRegionType`,
        :py:class:`LineDrawingRegionType`, :py:class:`GraphicRegionType`,
        :py:class:`TableRegionType`, :py:class:`ChartRegionType`,
        :py:class:`MapRegionType`, :py:class:`SeparatorRegionType`,
        :py:class:`MathsRegionType`, :py:class:`ChemRegionType`,
        :py:class:`MusicRegionType`, :py:class:`AdvertRegionType`,
        :py:class:`NoiseRegionType`, :py:class:`UnknownRegionType`,
        and/or :py:class:`CustomRegionType`

    Raises:
        ValueError: if `order` is not one of the three allowed values,
            or if `depth` is negative

    For example, to get all text anywhere on the page in reading order, use:
    ::
        '\\n'.join(line.get_TextEquiv()[0].Unicode
                  for region in page.get_AllRegions(classes=['Text'], depth=0, order='reading-order')
                  for line in region.get_TextLine())
    """
    if order not in ('document', 'reading-order', 'reading-order-only'):
        raise ValueError("Argument 'order' must be either 'document', 'reading-order' or 'reading-order-only', not '{}'".format(order))
    if depth < 0:
        raise ValueError("Argument 'depth' must be an integer greater-or-equal 0, not '{}'".format(depth))
    # the recursion helper counts levels including self, with 0 meaning unlimited
    ret = self._get_recursive_regions([self], depth + 1 if depth else 0, classes)
    if order == 'document' or self.__class__.__name__ != 'PageType':
        # reading order is only consulted when called on the page itself
        return ret
    reading_order = self.get_ReadingOrder()
    if reading_order:
        reading_order = reading_order.get_OrderedGroup() or reading_order.get_UnorderedGroup()
    if reading_order:
        reading_order = self._get_recursive_reading_order(reading_order)
    if reading_order:
        id2region = {region.id: region for region in ret}
        # IDs in the reading order that match no collected region are skipped
        in_reading_order = [id2region[region_id] for region_id in reading_order if region_id in id2region]
        if order == 'reading-order-only':
            ret = in_reading_order
        else:
            ret = in_reading_order + [r for r in ret if r not in in_reading_order]
    return ret
13272
def set_orientation(self, orientation):
    """
    Store `orientation` as the new deskewing angle.

    Beforehand, if the object offers ``invalidate_AlternativeImage``
    (PageType, RegionType), invalidate its ``pc:AlternativeImage``s with
    feature ``deskewed``, because they will have been rotated and
    enlarged with the angle of the previous value.
    """
    missing = object()
    invalidate = getattr(self, 'invalidate_AlternativeImage', missing)
    if invalidate is not missing:
        invalidate(feature_selector='deskewed')
    self.orientation = orientation
13283
+ # end class MapRegionType
13284
+
13285
+
13286
+ class ChemRegionType(RegionType):
13287
+ """ChemRegionType --
13288
+ Regions containing chemical formulas.
13289
+
13290
+ * orientation --
13291
+ The angle the rectangle encapsulating a
13292
+ region has to be rotated in clockwise
13293
+ direction in order to correct the present
13294
+ skew (negative values indicate
13295
+ anti-clockwise rotation). Range:
13296
+ -179.999,180
13297
+
13298
+ * bgColour --
13299
+ The background colour of the region
13300
+
13301
+ """
13302
+ __hash__ = GeneratedsSuper.__hash__
13303
+ member_data_items_ = [
13304
+ MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional', 'name': 'orientation'}),
13305
+ MemberSpec_('bgColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional', 'name': 'bgColour'}),
13306
+ ]
13307
+ subclass = None
13308
+ superclass = RegionType
13309
+ def __init__(self, id=None, custom=None, comments=None, continuation=None, AlternativeImage=None, Coords=None, UserDefined=None, Labels=None, Roles=None, TextRegion=None, ImageRegion=None, LineDrawingRegion=None, GraphicRegion=None, TableRegion=None, ChartRegion=None, SeparatorRegion=None, MathsRegion=None, ChemRegion=None, MusicRegion=None, AdvertRegion=None, NoiseRegion=None, UnknownRegion=None, CustomRegion=None, orientation=None, bgColour=None, gds_collector_=None, **kwargs_):
13310
+ self.gds_collector_ = gds_collector_
13311
+ self.gds_elementtree_node_ = None
13312
+ self.original_tagname_ = None
13313
+ self.parent_object_ = kwargs_.get('parent_object_')
13314
+ self.ns_prefix_ = "pc"
13315
+ super(globals().get("ChemRegionType"), self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_)
13316
+ self.orientation = _cast(float, orientation)
13317
+ self.orientation_nsprefix_ = "pc"
13318
+ self.bgColour = _cast(None, bgColour)
13319
+ self.bgColour_nsprefix_ = "pc"
13320
+ def factory(*args_, **kwargs_):
13321
+ if CurrentSubclassModule_ is not None:
13322
+ subclass = getSubclassFromModule_(
13323
+ CurrentSubclassModule_, ChemRegionType)
13324
+ if subclass is not None:
13325
+ return subclass(*args_, **kwargs_)
13326
+ if ChemRegionType.subclass:
13327
+ return ChemRegionType.subclass(*args_, **kwargs_)
13328
+ else:
13329
+ return ChemRegionType(*args_, **kwargs_)
13330
+ factory = staticmethod(factory)
13331
+ def get_ns_prefix_(self):
13332
+ return self.ns_prefix_
13333
+ def set_ns_prefix_(self, ns_prefix):
13334
+ self.ns_prefix_ = ns_prefix
13031
13335
  def get_orientation(self):
13032
13336
  return self.orientation
13033
13337
  def set_orientation(self, orientation):
@@ -13128,6 +13432,106 @@ class ChemRegionType(RegionType):
13128
13432
  pass
13129
13433
  def __hash__(self):
13130
13434
  return hash(self.id)
13435
+ # pylint: disable=line-too-long,invalid-name,protected-access,missing-module-docstring
13436
+ def _region_class(self, x): # pylint: disable=unused-argument
13437
+ return x.__class__.__name__.replace('RegionType', '')
13438
+
13439
def _get_recursive_regions(self, regions, level, classes=None):
    """
    Collect `regions` together with their nested regions, depth-first.

    Arguments:
        regions (list): elements to start from (a page and/or regions)
        level (int): remaining depth; ``1`` stops descending, ``0`` keeps
            descending until no element has any child region
        classes (list): if non-empty, keep only regions whose short class
            label (see ``_region_class``) is listed

    Returns:
        a list in which every element is directly followed by its
        descendants; without `classes`, a leading ``PageType`` element is
        dropped from the result.
    """
    from .constants import PAGE_REGION_TYPES # pylint: disable=relative-beyond-top-level,import-outside-toplevel
    if level == 1:
        # end of recursion: apply the class filter (or strip the page itself)
        if classes:
            return [cand for cand in regions if self._region_class(cand) in classes]
        if not regions or regions[0].__class__.__name__ != 'PageType':
            return regions
        return regions[1:]
    # gather the direct child regions of each element
    children_per_region = []
    for parent in regions:
        children = []
        for kind in PAGE_REGION_TYPES:
            # 'Map' is not recursive in 2019 schema, so only a page may hold it
            if kind != 'Map' or isinstance(parent, PageType): # pylint: disable=undefined-variable
                children.extend(getattr(parent, 'get_{}Region'.format(kind))())
        children_per_region.append(children)
    if not any(children_per_region):
        # nothing nested below this level: finish as if depth were exhausted
        return self._get_recursive_regions(regions, 1, classes)
    next_level = level - 1 if level else 0
    collected = []
    for parent, children in zip(regions, children_per_region):
        collected.append(parent)
        collected.extend(self._get_recursive_regions(children, next_level, classes))
    return self._get_recursive_regions(collected, 1, classes)
13464
+
13465
def _get_recursive_reading_order(self, rogroup):
    """
    Flatten the reading order group `rogroup` into a list of region IDs.

    Each member contributes its ``regionRef``; a member that is itself a
    group is followed immediately by the IDs of its own members.

    Arguments:
        rogroup: an ordered or unordered reading order group (plain or
            indexed variant)

    Returns:
        a list of ``regionRef`` values in traversal order

    Raises:
        TypeError: if `rogroup` is none of the four group types
    """
    if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)): # pylint: disable=undefined-variable
        elements = rogroup.get_AllIndexed()
    elif isinstance(rogroup, (UnorderedGroupType, UnorderedGroupIndexedType)): # pylint: disable=undefined-variable
        elements = (rogroup.get_RegionRef() + rogroup.get_OrderedGroup() + rogroup.get_UnorderedGroup())
    else:
        # previously fell through to an UnboundLocalError on 'elements'
        raise TypeError("Expected a reading order group, not '{}'".format(rogroup.__class__.__name__))
    regionrefs = []
    for elem in elements:
        regionrefs.append(elem.get_regionRef())
        if not isinstance(elem, (RegionRefType, RegionRefIndexedType)): # pylint: disable=undefined-variable
            regionrefs.extend(self._get_recursive_reading_order(elem))
    return regionrefs
13476
+
13477
def get_AllRegions(self, classes=None, order='document', depth=0):
    """
    Get all the ``*Region`` elements, or only those provided by `classes`.
    Return in document order, unless the top element is ``Page`` and
    `order` is ``reading-order`` or ``reading-order-only``.

    Arguments:
        classes (list): Classes of regions that shall be returned,
            e.g. ``['Text', 'Image']``
        order ("document"|"reading-order"|"reading-order-only"): Whether to
            return regions sorted by document order (``document``, default) or by
            reading order with regions not in the reading order at the end of the
            returned list (``reading-order``) or regions not in the reading order
            omitted (``reading-order-only``). The latter two are only available
            on page level; on any other element the result stays in
            document order.
        depth (int): Recursive depth to look for regions at, set to `0` for
            all regions at any depth. Default: 0

    Returns:
        a list of :py:class:`TextRegionType`, :py:class:`ImageRegionType`,
        :py:class:`LineDrawingRegionType`, :py:class:`GraphicRegionType`,
        :py:class:`TableRegionType`, :py:class:`ChartRegionType`,
        :py:class:`MapRegionType`, :py:class:`SeparatorRegionType`,
        :py:class:`MathsRegionType`, :py:class:`ChemRegionType`,
        :py:class:`MusicRegionType`, :py:class:`AdvertRegionType`,
        :py:class:`NoiseRegionType`, :py:class:`UnknownRegionType`,
        and/or :py:class:`CustomRegionType`

    Raises:
        ValueError: if `order` is not one of the three allowed values,
            or if `depth` is negative

    For example, to get all text anywhere on the page in reading order, use:
    ::
        '\\n'.join(line.get_TextEquiv()[0].Unicode
                  for region in page.get_AllRegions(classes=['Text'], depth=0, order='reading-order')
                  for line in region.get_TextLine())
    """
    if order not in ('document', 'reading-order', 'reading-order-only'):
        raise ValueError("Argument 'order' must be either 'document', 'reading-order' or 'reading-order-only', not '{}'".format(order))
    if depth < 0:
        raise ValueError("Argument 'depth' must be an integer greater-or-equal 0, not '{}'".format(depth))
    # the recursion helper counts levels including self, with 0 meaning unlimited
    ret = self._get_recursive_regions([self], depth + 1 if depth else 0, classes)
    if order == 'document' or self.__class__.__name__ != 'PageType':
        # reading order is only consulted when called on the page itself
        return ret
    reading_order = self.get_ReadingOrder()
    if reading_order:
        reading_order = reading_order.get_OrderedGroup() or reading_order.get_UnorderedGroup()
    if reading_order:
        reading_order = self._get_recursive_reading_order(reading_order)
    if reading_order:
        id2region = {region.id: region for region in ret}
        # IDs in the reading order that match no collected region are skipped
        in_reading_order = [id2region[region_id] for region_id in reading_order if region_id in id2region]
        if order == 'reading-order-only':
            ret = in_reading_order
        else:
            ret = in_reading_order + [r for r in ret if r not in in_reading_order]
    return ret
13131
13535
  def set_orientation(self, orientation):
13132
13536
  """
13133
13537
  Set deskewing angle to given `orientation` number.
@@ -13291,6 +13695,106 @@ class MathsRegionType(RegionType):
13291
13695
  pass
13292
13696
  def __hash__(self):
13293
13697
  return hash(self.id)
13698
+ # pylint: disable=line-too-long,invalid-name,protected-access,missing-module-docstring
13699
+ def _region_class(self, x): # pylint: disable=unused-argument
13700
+ return x.__class__.__name__.replace('RegionType', '')
13701
+
13702
def _get_recursive_regions(self, regions, level, classes=None):
    """
    Collect `regions` together with the regions nested below them.

    Arguments:
        regions (list): Elements to start from; each one is kept in the
            result, directly followed by whatever is found below it.
        level (int): Remaining depth. ``1`` stops the descent and only
            filters `regions`. ``0`` is handed down unchanged, so the
            descent continues until no element has any child region.
        classes (list): Short class names as produced by
            ``_region_class``; when non-empty, only elements of these
            classes are returned.

    Returns:
        list: the filtered elements. Without `classes`, nothing is
        filtered except that a leading ``PageType`` element is dropped.
    """
    from .constants import PAGE_REGION_TYPES  # pylint: disable=relative-beyond-top-level,import-outside-toplevel
    if level == 1:
        # End of the descent: apply the class filter, or just strip the page itself.
        if classes:
            return [candidate for candidate in regions if self._region_class(candidate) in classes]
        if regions and regions[0].__class__.__name__ == 'PageType':
            return regions[1:]
        return regions
    # Gather the direct children of every element, one list per element.
    children_per_region = []
    for parent in regions:
        children = []
        for region_type in PAGE_REGION_TYPES:
            # 'Map' is not recursive in 2019 schema, so only the page is asked for it
            if region_type == 'Map' and not isinstance(parent, PageType):  # pylint: disable=undefined-variable
                continue
            children += getattr(parent, 'get_{}Region'.format(region_type))()
        children_per_region.append(children)
    if not any(children_per_region):
        # Nothing below this level: behave as if the depth limit was reached.
        return self._get_recursive_regions(regions, 1, classes)
    next_level = level - 1 if level else 0
    collected = []
    for parent, children in zip(regions, children_per_region):
        collected.append(parent)
        collected.extend(self._get_recursive_regions(children, next_level, classes))
    return self._get_recursive_regions(collected, 1, classes)
13727
+
13728
def _get_recursive_reading_order(self, rogroup):
    """
    Flatten a reading-order group into a list of ``regionRef`` values.

    For an ordered group the members come from ``get_AllIndexed()``; for an
    unordered group they are its region refs, then its ordered groups, then
    its unordered groups. The ``regionRef`` of every member is appended, and
    members that are themselves groups are expanded recursively right after
    their own ``regionRef``.

    Arguments:
        rogroup: an ``OrderedGroupType``, ``OrderedGroupIndexedType``,
            ``UnorderedGroupType`` or ``UnorderedGroupIndexedType``.

    Returns:
        list: the ``regionRef`` values in traversal order.
        NOTE(review): a group's own ``regionRef`` is included as well and is
        presumably ``None`` when the group carries no such attribute -- confirm.

    Raises:
        TypeError: if `rogroup` is none of the four group types (previously
            this surfaced as an ``UnboundLocalError``).
    """
    if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)):  # pylint: disable=undefined-variable
        elements = rogroup.get_AllIndexed()
    elif isinstance(rogroup, (UnorderedGroupType, UnorderedGroupIndexedType)):  # pylint: disable=undefined-variable
        elements = (rogroup.get_RegionRef() + rogroup.get_OrderedGroup() + rogroup.get_UnorderedGroup())
    else:
        raise TypeError("Argument 'rogroup' must be an ordered or unordered reading order group, not '{}'".format(type(rogroup).__name__))
    regionrefs = []
    for elem in elements:
        regionrefs.append(elem.get_regionRef())
        # Anything that is not a plain region reference is a nested group.
        if not isinstance(elem, (RegionRefType, RegionRefIndexedType)):  # pylint: disable=undefined-variable
            regionrefs.extend(self._get_recursive_reading_order(elem))
    return regionrefs
13739
+
13740
def get_AllRegions(self, classes=None, order='document', depth=0):
    """
    Get all the ``*Region`` elements, or only those provided by `classes`.
    Return in document order, unless the top element is ``Page`` and
    `order` is ``reading-order`` or ``reading-order-only``.

    Arguments:
        classes (list): Classes of regions that shall be returned,
            e.g. ``['Text', 'Image']``
        order ("document"|"reading-order"|"reading-order-only"): Whether to
            return regions sorted by document order (``document``, default) or by
            reading order with regions not in the reading order at the end of the
            returned list (``reading-order``) or regions not in the reading order
            omitted (``reading-order-only``). The latter two are only available
            on page level.
        depth (int): Recursive depth to look for regions at, set to `0` for
            all regions at any depth. Default: 0

    Returns:
        a list of :py:class:`TextRegionType`, :py:class:`ImageRegionType`,
        :py:class:`LineDrawingRegionType`, :py:class:`GraphicRegionType`,
        :py:class:`TableRegionType`, :py:class:`ChartRegionType`,
        :py:class:`MapRegionType`, :py:class:`SeparatorRegionType`,
        :py:class:`MathsRegionType`, :py:class:`ChemRegionType`,
        :py:class:`MusicRegionType`, :py:class:`AdvertRegionType`,
        :py:class:`NoiseRegionType`, :py:class:`UnknownRegionType`,
        and/or :py:class:`CustomRegionType`

    Raises:
        ValueError: if `order` is not one of the three accepted values or
            `depth` is negative. ``ValueError`` derives from ``Exception``,
            so handlers written for the plain ``Exception`` raised before
            keep working.

    For example, to get all text anywhere on the page in reading order, use:
    ::
        '\\n'.join(line.get_TextEquiv()[0].Unicode
                  for region in page.get_AllRegions(classes=['Text'], depth=0, order='reading-order')
                  for line in region.get_TextLine())
    """
    if order not in ('document', 'reading-order', 'reading-order-only'):
        raise ValueError("Argument 'order' must be either 'document', 'reading-order' or 'reading-order-only', not '{}'".format(order))
    if depth < 0:
        raise ValueError("Argument 'depth' must be an integer greater-or-equal 0, not '{}'".format(depth))
    # depth counts levels below self, the helper counts self as well; 0 means unlimited
    ret = self._get_recursive_regions([self], depth + 1 if depth else 0, classes)
    if self.__class__.__name__ != 'PageType' or not order.startswith('reading-order'):
        return ret
    reading_order = self.get_ReadingOrder()
    if reading_order:
        reading_order = reading_order.get_OrderedGroup() or reading_order.get_UnorderedGroup()
    if reading_order:
        reading_order = self._get_recursive_reading_order(reading_order)
    if not reading_order:
        # No usable reading order: fall back to document order.
        return ret
    id2region = {region.id: region for region in ret}
    in_reading_order = [id2region[region_id] for region_id in reading_order if region_id in id2region]
    if order == 'reading-order-only':
        return in_reading_order
    # Set lookup instead of scanning the list for every region; relies on the
    # elements hashing by their id (see __hash__).
    placed = set(in_reading_order)
    return in_reading_order + [region for region in ret if region not in placed]
13294
13798
  def set_orientation(self, orientation):
13295
13799
  """
13296
13800
  Set deskewing angle to given `orientation` number.
@@ -13455,6 +13959,106 @@ class SeparatorRegionType(RegionType):
13455
13959
  pass
13456
13960
  def __hash__(self):
13457
13961
  return hash(self.id)
13962
+ # pylint: disable=line-too-long,invalid-name,protected-access,missing-module-docstring
13963
+ def _region_class(self, x): # pylint: disable=unused-argument
13964
+ return x.__class__.__name__.replace('RegionType', '')
13965
+
13966
def _get_recursive_regions(self, regions, level, classes=None):
    """
    Collect `regions` together with the regions nested below them.

    Arguments:
        regions (list): Elements to start from; each one is kept in the
            result, directly followed by whatever is found below it.
        level (int): Remaining depth. ``1`` stops the descent and only
            filters `regions`. ``0`` is handed down unchanged, so the
            descent continues until no element has any child region.
        classes (list): Short class names as produced by
            ``_region_class``; when non-empty, only elements of these
            classes are returned.

    Returns:
        list: the filtered elements. Without `classes`, nothing is
        filtered except that a leading ``PageType`` element is dropped.
    """
    from .constants import PAGE_REGION_TYPES  # pylint: disable=relative-beyond-top-level,import-outside-toplevel
    if level == 1:
        # End of the descent: apply the class filter, or just strip the page itself.
        if classes:
            return [candidate for candidate in regions if self._region_class(candidate) in classes]
        if regions and regions[0].__class__.__name__ == 'PageType':
            return regions[1:]
        return regions
    # Gather the direct children of every element, one list per element.
    children_per_region = []
    for parent in regions:
        children = []
        for region_type in PAGE_REGION_TYPES:
            # 'Map' is not recursive in 2019 schema, so only the page is asked for it
            if region_type == 'Map' and not isinstance(parent, PageType):  # pylint: disable=undefined-variable
                continue
            children += getattr(parent, 'get_{}Region'.format(region_type))()
        children_per_region.append(children)
    if not any(children_per_region):
        # Nothing below this level: behave as if the depth limit was reached.
        return self._get_recursive_regions(regions, 1, classes)
    next_level = level - 1 if level else 0
    collected = []
    for parent, children in zip(regions, children_per_region):
        collected.append(parent)
        collected.extend(self._get_recursive_regions(children, next_level, classes))
    return self._get_recursive_regions(collected, 1, classes)
13991
+
13992
def _get_recursive_reading_order(self, rogroup):
    """
    Flatten a reading-order group into a list of ``regionRef`` values.

    For an ordered group the members come from ``get_AllIndexed()``; for an
    unordered group they are its region refs, then its ordered groups, then
    its unordered groups. The ``regionRef`` of every member is appended, and
    members that are themselves groups are expanded recursively right after
    their own ``regionRef``.

    Arguments:
        rogroup: an ``OrderedGroupType``, ``OrderedGroupIndexedType``,
            ``UnorderedGroupType`` or ``UnorderedGroupIndexedType``.

    Returns:
        list: the ``regionRef`` values in traversal order.
        NOTE(review): a group's own ``regionRef`` is included as well and is
        presumably ``None`` when the group carries no such attribute -- confirm.

    Raises:
        TypeError: if `rogroup` is none of the four group types (previously
            this surfaced as an ``UnboundLocalError``).
    """
    if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)):  # pylint: disable=undefined-variable
        elements = rogroup.get_AllIndexed()
    elif isinstance(rogroup, (UnorderedGroupType, UnorderedGroupIndexedType)):  # pylint: disable=undefined-variable
        elements = (rogroup.get_RegionRef() + rogroup.get_OrderedGroup() + rogroup.get_UnorderedGroup())
    else:
        raise TypeError("Argument 'rogroup' must be an ordered or unordered reading order group, not '{}'".format(type(rogroup).__name__))
    regionrefs = []
    for elem in elements:
        regionrefs.append(elem.get_regionRef())
        # Anything that is not a plain region reference is a nested group.
        if not isinstance(elem, (RegionRefType, RegionRefIndexedType)):  # pylint: disable=undefined-variable
            regionrefs.extend(self._get_recursive_reading_order(elem))
    return regionrefs
14003
+
14004
def get_AllRegions(self, classes=None, order='document', depth=0):
    """
    Get all the ``*Region`` elements, or only those provided by `classes`.
    Return in document order, unless the top element is ``Page`` and
    `order` is ``reading-order`` or ``reading-order-only``.

    Arguments:
        classes (list): Classes of regions that shall be returned,
            e.g. ``['Text', 'Image']``
        order ("document"|"reading-order"|"reading-order-only"): Whether to
            return regions sorted by document order (``document``, default) or by
            reading order with regions not in the reading order at the end of the
            returned list (``reading-order``) or regions not in the reading order
            omitted (``reading-order-only``). The latter two are only available
            on page level.
        depth (int): Recursive depth to look for regions at, set to `0` for
            all regions at any depth. Default: 0

    Returns:
        a list of :py:class:`TextRegionType`, :py:class:`ImageRegionType`,
        :py:class:`LineDrawingRegionType`, :py:class:`GraphicRegionType`,
        :py:class:`TableRegionType`, :py:class:`ChartRegionType`,
        :py:class:`MapRegionType`, :py:class:`SeparatorRegionType`,
        :py:class:`MathsRegionType`, :py:class:`ChemRegionType`,
        :py:class:`MusicRegionType`, :py:class:`AdvertRegionType`,
        :py:class:`NoiseRegionType`, :py:class:`UnknownRegionType`,
        and/or :py:class:`CustomRegionType`

    Raises:
        ValueError: if `order` is not one of the three accepted values or
            `depth` is negative. ``ValueError`` derives from ``Exception``,
            so handlers written for the plain ``Exception`` raised before
            keep working.

    For example, to get all text anywhere on the page in reading order, use:
    ::
        '\\n'.join(line.get_TextEquiv()[0].Unicode
                  for region in page.get_AllRegions(classes=['Text'], depth=0, order='reading-order')
                  for line in region.get_TextLine())
    """
    if order not in ('document', 'reading-order', 'reading-order-only'):
        raise ValueError("Argument 'order' must be either 'document', 'reading-order' or 'reading-order-only', not '{}'".format(order))
    if depth < 0:
        raise ValueError("Argument 'depth' must be an integer greater-or-equal 0, not '{}'".format(depth))
    # depth counts levels below self, the helper counts self as well; 0 means unlimited
    ret = self._get_recursive_regions([self], depth + 1 if depth else 0, classes)
    if self.__class__.__name__ != 'PageType' or not order.startswith('reading-order'):
        return ret
    reading_order = self.get_ReadingOrder()
    if reading_order:
        reading_order = reading_order.get_OrderedGroup() or reading_order.get_UnorderedGroup()
    if reading_order:
        reading_order = self._get_recursive_reading_order(reading_order)
    if not reading_order:
        # No usable reading order: fall back to document order.
        return ret
    id2region = {region.id: region for region in ret}
    in_reading_order = [id2region[region_id] for region_id in reading_order if region_id in id2region]
    if order == 'reading-order-only':
        return in_reading_order
    # Set lookup instead of scanning the list for every region; relies on the
    # elements hashing by their id (see __hash__).
    placed = set(in_reading_order)
    return in_reading_order + [region for region in ret if region not in placed]
13458
14062
  def set_orientation(self, orientation):
13459
14063
  """
13460
14064
  Set deskewing angle to given `orientation` number.
@@ -13696,6 +14300,106 @@ class ChartRegionType(RegionType):
13696
14300
  pass
13697
14301
  def __hash__(self):
13698
14302
  return hash(self.id)
14303
+ # pylint: disable=line-too-long,invalid-name,protected-access,missing-module-docstring
14304
+ def _region_class(self, x): # pylint: disable=unused-argument
14305
+ return x.__class__.__name__.replace('RegionType', '')
14306
+
14307
def _get_recursive_regions(self, regions, level, classes=None):
    """
    Collect `regions` together with the regions nested below them.

    Arguments:
        regions (list): Elements to start from; each one is kept in the
            result, directly followed by whatever is found below it.
        level (int): Remaining depth. ``1`` stops the descent and only
            filters `regions`. ``0`` is handed down unchanged, so the
            descent continues until no element has any child region.
        classes (list): Short class names as produced by
            ``_region_class``; when non-empty, only elements of these
            classes are returned.

    Returns:
        list: the filtered elements. Without `classes`, nothing is
        filtered except that a leading ``PageType`` element is dropped.
    """
    from .constants import PAGE_REGION_TYPES  # pylint: disable=relative-beyond-top-level,import-outside-toplevel
    if level == 1:
        # End of the descent: apply the class filter, or just strip the page itself.
        if classes:
            return [candidate for candidate in regions if self._region_class(candidate) in classes]
        if regions and regions[0].__class__.__name__ == 'PageType':
            return regions[1:]
        return regions
    # Gather the direct children of every element, one list per element.
    children_per_region = []
    for parent in regions:
        children = []
        for region_type in PAGE_REGION_TYPES:
            # 'Map' is not recursive in 2019 schema, so only the page is asked for it
            if region_type == 'Map' and not isinstance(parent, PageType):  # pylint: disable=undefined-variable
                continue
            children += getattr(parent, 'get_{}Region'.format(region_type))()
        children_per_region.append(children)
    if not any(children_per_region):
        # Nothing below this level: behave as if the depth limit was reached.
        return self._get_recursive_regions(regions, 1, classes)
    next_level = level - 1 if level else 0
    collected = []
    for parent, children in zip(regions, children_per_region):
        collected.append(parent)
        collected.extend(self._get_recursive_regions(children, next_level, classes))
    return self._get_recursive_regions(collected, 1, classes)
14332
+
14333
def _get_recursive_reading_order(self, rogroup):
    """
    Flatten a reading-order group into a list of ``regionRef`` values.

    For an ordered group the members come from ``get_AllIndexed()``; for an
    unordered group they are its region refs, then its ordered groups, then
    its unordered groups. The ``regionRef`` of every member is appended, and
    members that are themselves groups are expanded recursively right after
    their own ``regionRef``.

    Arguments:
        rogroup: an ``OrderedGroupType``, ``OrderedGroupIndexedType``,
            ``UnorderedGroupType`` or ``UnorderedGroupIndexedType``.

    Returns:
        list: the ``regionRef`` values in traversal order.
        NOTE(review): a group's own ``regionRef`` is included as well and is
        presumably ``None`` when the group carries no such attribute -- confirm.

    Raises:
        TypeError: if `rogroup` is none of the four group types (previously
            this surfaced as an ``UnboundLocalError``).
    """
    if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)):  # pylint: disable=undefined-variable
        elements = rogroup.get_AllIndexed()
    elif isinstance(rogroup, (UnorderedGroupType, UnorderedGroupIndexedType)):  # pylint: disable=undefined-variable
        elements = (rogroup.get_RegionRef() + rogroup.get_OrderedGroup() + rogroup.get_UnorderedGroup())
    else:
        raise TypeError("Argument 'rogroup' must be an ordered or unordered reading order group, not '{}'".format(type(rogroup).__name__))
    regionrefs = []
    for elem in elements:
        regionrefs.append(elem.get_regionRef())
        # Anything that is not a plain region reference is a nested group.
        if not isinstance(elem, (RegionRefType, RegionRefIndexedType)):  # pylint: disable=undefined-variable
            regionrefs.extend(self._get_recursive_reading_order(elem))
    return regionrefs
14344
+
14345
def get_AllRegions(self, classes=None, order='document', depth=0):
    """
    Get all the ``*Region`` elements, or only those provided by `classes`.
    Return in document order, unless the top element is ``Page`` and
    `order` is ``reading-order`` or ``reading-order-only``.

    Arguments:
        classes (list): Classes of regions that shall be returned,
            e.g. ``['Text', 'Image']``
        order ("document"|"reading-order"|"reading-order-only"): Whether to
            return regions sorted by document order (``document``, default) or by
            reading order with regions not in the reading order at the end of the
            returned list (``reading-order``) or regions not in the reading order
            omitted (``reading-order-only``). The latter two are only available
            on page level.
        depth (int): Recursive depth to look for regions at, set to `0` for
            all regions at any depth. Default: 0

    Returns:
        a list of :py:class:`TextRegionType`, :py:class:`ImageRegionType`,
        :py:class:`LineDrawingRegionType`, :py:class:`GraphicRegionType`,
        :py:class:`TableRegionType`, :py:class:`ChartRegionType`,
        :py:class:`MapRegionType`, :py:class:`SeparatorRegionType`,
        :py:class:`MathsRegionType`, :py:class:`ChemRegionType`,
        :py:class:`MusicRegionType`, :py:class:`AdvertRegionType`,
        :py:class:`NoiseRegionType`, :py:class:`UnknownRegionType`,
        and/or :py:class:`CustomRegionType`

    Raises:
        ValueError: if `order` is not one of the three accepted values or
            `depth` is negative. ``ValueError`` derives from ``Exception``,
            so handlers written for the plain ``Exception`` raised before
            keep working.

    For example, to get all text anywhere on the page in reading order, use:
    ::
        '\\n'.join(line.get_TextEquiv()[0].Unicode
                  for region in page.get_AllRegions(classes=['Text'], depth=0, order='reading-order')
                  for line in region.get_TextLine())
    """
    if order not in ('document', 'reading-order', 'reading-order-only'):
        raise ValueError("Argument 'order' must be either 'document', 'reading-order' or 'reading-order-only', not '{}'".format(order))
    if depth < 0:
        raise ValueError("Argument 'depth' must be an integer greater-or-equal 0, not '{}'".format(depth))
    # depth counts levels below self, the helper counts self as well; 0 means unlimited
    ret = self._get_recursive_regions([self], depth + 1 if depth else 0, classes)
    if self.__class__.__name__ != 'PageType' or not order.startswith('reading-order'):
        return ret
    reading_order = self.get_ReadingOrder()
    if reading_order:
        reading_order = reading_order.get_OrderedGroup() or reading_order.get_UnorderedGroup()
    if reading_order:
        reading_order = self._get_recursive_reading_order(reading_order)
    if not reading_order:
        # No usable reading order: fall back to document order.
        return ret
    id2region = {region.id: region for region in ret}
    in_reading_order = [id2region[region_id] for region_id in reading_order if region_id in id2region]
    if order == 'reading-order-only':
        return in_reading_order
    # Set lookup instead of scanning the list for every region; relies on the
    # elements hashing by their id (see __hash__).
    placed = set(in_reading_order)
    return in_reading_order + [region for region in ret if region not in placed]
13699
14403
  def set_orientation(self, orientation):
13700
14404
  """
13701
14405
  Set deskewing angle to given `orientation` number.
@@ -13991,6 +14695,106 @@ class TableRegionType(RegionType):
13991
14695
  super(TableRegionType, self)._buildChildren(child_, node, nodeName_, True)
13992
14696
  def __hash__(self):
13993
14697
  return hash(self.id)
14698
+ # pylint: disable=line-too-long,invalid-name,protected-access,missing-module-docstring
14699
+ def _region_class(self, x): # pylint: disable=unused-argument
14700
+ return x.__class__.__name__.replace('RegionType', '')
14701
+
14702
def _get_recursive_regions(self, regions, level, classes=None):
    """
    Collect `regions` together with the regions nested below them.

    Arguments:
        regions (list): Elements to start from; each one is kept in the
            result, directly followed by whatever is found below it.
        level (int): Remaining depth. ``1`` stops the descent and only
            filters `regions`. ``0`` is handed down unchanged, so the
            descent continues until no element has any child region.
        classes (list): Short class names as produced by
            ``_region_class``; when non-empty, only elements of these
            classes are returned.

    Returns:
        list: the filtered elements. Without `classes`, nothing is
        filtered except that a leading ``PageType`` element is dropped.
    """
    from .constants import PAGE_REGION_TYPES  # pylint: disable=relative-beyond-top-level,import-outside-toplevel
    if level == 1:
        # End of the descent: apply the class filter, or just strip the page itself.
        if classes:
            return [candidate for candidate in regions if self._region_class(candidate) in classes]
        if regions and regions[0].__class__.__name__ == 'PageType':
            return regions[1:]
        return regions
    # Gather the direct children of every element, one list per element.
    children_per_region = []
    for parent in regions:
        children = []
        for region_type in PAGE_REGION_TYPES:
            # 'Map' is not recursive in 2019 schema, so only the page is asked for it
            if region_type == 'Map' and not isinstance(parent, PageType):  # pylint: disable=undefined-variable
                continue
            children += getattr(parent, 'get_{}Region'.format(region_type))()
        children_per_region.append(children)
    if not any(children_per_region):
        # Nothing below this level: behave as if the depth limit was reached.
        return self._get_recursive_regions(regions, 1, classes)
    next_level = level - 1 if level else 0
    collected = []
    for parent, children in zip(regions, children_per_region):
        collected.append(parent)
        collected.extend(self._get_recursive_regions(children, next_level, classes))
    return self._get_recursive_regions(collected, 1, classes)
14727
+
14728
def _get_recursive_reading_order(self, rogroup):
    """
    Flatten a reading-order group into a list of ``regionRef`` values.

    For an ordered group the members come from ``get_AllIndexed()``; for an
    unordered group they are its region refs, then its ordered groups, then
    its unordered groups. The ``regionRef`` of every member is appended, and
    members that are themselves groups are expanded recursively right after
    their own ``regionRef``.

    Arguments:
        rogroup: an ``OrderedGroupType``, ``OrderedGroupIndexedType``,
            ``UnorderedGroupType`` or ``UnorderedGroupIndexedType``.

    Returns:
        list: the ``regionRef`` values in traversal order.
        NOTE(review): a group's own ``regionRef`` is included as well and is
        presumably ``None`` when the group carries no such attribute -- confirm.

    Raises:
        TypeError: if `rogroup` is none of the four group types (previously
            this surfaced as an ``UnboundLocalError``).
    """
    if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)):  # pylint: disable=undefined-variable
        elements = rogroup.get_AllIndexed()
    elif isinstance(rogroup, (UnorderedGroupType, UnorderedGroupIndexedType)):  # pylint: disable=undefined-variable
        elements = (rogroup.get_RegionRef() + rogroup.get_OrderedGroup() + rogroup.get_UnorderedGroup())
    else:
        raise TypeError("Argument 'rogroup' must be an ordered or unordered reading order group, not '{}'".format(type(rogroup).__name__))
    regionrefs = []
    for elem in elements:
        regionrefs.append(elem.get_regionRef())
        # Anything that is not a plain region reference is a nested group.
        if not isinstance(elem, (RegionRefType, RegionRefIndexedType)):  # pylint: disable=undefined-variable
            regionrefs.extend(self._get_recursive_reading_order(elem))
    return regionrefs
14739
+
14740
def get_AllRegions(self, classes=None, order='document', depth=0):
    """
    Get all the ``*Region`` elements, or only those provided by `classes`.
    Return in document order, unless the top element is ``Page`` and
    `order` is ``reading-order`` or ``reading-order-only``.

    Arguments:
        classes (list): Classes of regions that shall be returned,
            e.g. ``['Text', 'Image']``
        order ("document"|"reading-order"|"reading-order-only"): Whether to
            return regions sorted by document order (``document``, default) or by
            reading order with regions not in the reading order at the end of the
            returned list (``reading-order``) or regions not in the reading order
            omitted (``reading-order-only``). The latter two are only available
            on page level.
        depth (int): Recursive depth to look for regions at, set to `0` for
            all regions at any depth. Default: 0

    Returns:
        a list of :py:class:`TextRegionType`, :py:class:`ImageRegionType`,
        :py:class:`LineDrawingRegionType`, :py:class:`GraphicRegionType`,
        :py:class:`TableRegionType`, :py:class:`ChartRegionType`,
        :py:class:`MapRegionType`, :py:class:`SeparatorRegionType`,
        :py:class:`MathsRegionType`, :py:class:`ChemRegionType`,
        :py:class:`MusicRegionType`, :py:class:`AdvertRegionType`,
        :py:class:`NoiseRegionType`, :py:class:`UnknownRegionType`,
        and/or :py:class:`CustomRegionType`

    Raises:
        ValueError: if `order` is not one of the three accepted values or
            `depth` is negative. ``ValueError`` derives from ``Exception``,
            so handlers written for the plain ``Exception`` raised before
            keep working.

    For example, to get all text anywhere on the page in reading order, use:
    ::
        '\\n'.join(line.get_TextEquiv()[0].Unicode
                  for region in page.get_AllRegions(classes=['Text'], depth=0, order='reading-order')
                  for line in region.get_TextLine())
    """
    if order not in ('document', 'reading-order', 'reading-order-only'):
        raise ValueError("Argument 'order' must be either 'document', 'reading-order' or 'reading-order-only', not '{}'".format(order))
    if depth < 0:
        raise ValueError("Argument 'depth' must be an integer greater-or-equal 0, not '{}'".format(depth))
    # depth counts levels below self, the helper counts self as well; 0 means unlimited
    ret = self._get_recursive_regions([self], depth + 1 if depth else 0, classes)
    if self.__class__.__name__ != 'PageType' or not order.startswith('reading-order'):
        return ret
    reading_order = self.get_ReadingOrder()
    if reading_order:
        reading_order = reading_order.get_OrderedGroup() or reading_order.get_UnorderedGroup()
    if reading_order:
        reading_order = self._get_recursive_reading_order(reading_order)
    if not reading_order:
        # No usable reading order: fall back to document order.
        return ret
    id2region = {region.id: region for region in ret}
    in_reading_order = [id2region[region_id] for region_id in reading_order if region_id in id2region]
    if order == 'reading-order-only':
        return in_reading_order
    # Set lookup instead of scanning the list for every region; relies on the
    # elements hashing by their id (see __hash__).
    placed = set(in_reading_order)
    return in_reading_order + [region for region in ret if region not in placed]
13994
14798
  def set_orientation(self, orientation):
13995
14799
  """
13996
14800
  Set deskewing angle to given `orientation` number.
@@ -14199,6 +15003,106 @@ class GraphicRegionType(RegionType):
14199
15003
  pass
14200
15004
  def __hash__(self):
14201
15005
  return hash(self.id)
15006
+ # pylint: disable=line-too-long,invalid-name,protected-access,missing-module-docstring
15007
+ def _region_class(self, x): # pylint: disable=unused-argument
15008
+ return x.__class__.__name__.replace('RegionType', '')
15009
+
15010
def _get_recursive_regions(self, regions, level, classes=None):
    """
    Collect `regions` together with the regions nested below them.

    Arguments:
        regions (list): Elements to start from; each one is kept in the
            result, directly followed by whatever is found below it.
        level (int): Remaining depth. ``1`` stops the descent and only
            filters `regions`. ``0`` is handed down unchanged, so the
            descent continues until no element has any child region.
        classes (list): Short class names as produced by
            ``_region_class``; when non-empty, only elements of these
            classes are returned.

    Returns:
        list: the filtered elements. Without `classes`, nothing is
        filtered except that a leading ``PageType`` element is dropped.
    """
    from .constants import PAGE_REGION_TYPES  # pylint: disable=relative-beyond-top-level,import-outside-toplevel
    if level == 1:
        # End of the descent: apply the class filter, or just strip the page itself.
        if classes:
            return [candidate for candidate in regions if self._region_class(candidate) in classes]
        if regions and regions[0].__class__.__name__ == 'PageType':
            return regions[1:]
        return regions
    # Gather the direct children of every element, one list per element.
    children_per_region = []
    for parent in regions:
        children = []
        for region_type in PAGE_REGION_TYPES:
            # 'Map' is not recursive in 2019 schema, so only the page is asked for it
            if region_type == 'Map' and not isinstance(parent, PageType):  # pylint: disable=undefined-variable
                continue
            children += getattr(parent, 'get_{}Region'.format(region_type))()
        children_per_region.append(children)
    if not any(children_per_region):
        # Nothing below this level: behave as if the depth limit was reached.
        return self._get_recursive_regions(regions, 1, classes)
    next_level = level - 1 if level else 0
    collected = []
    for parent, children in zip(regions, children_per_region):
        collected.append(parent)
        collected.extend(self._get_recursive_regions(children, next_level, classes))
    return self._get_recursive_regions(collected, 1, classes)
15035
+
15036
def _get_recursive_reading_order(self, rogroup):
    """
    Flatten a reading-order group into a list of ``regionRef`` values.

    For an ordered group the members come from ``get_AllIndexed()``; for an
    unordered group they are its region refs, then its ordered groups, then
    its unordered groups. The ``regionRef`` of every member is appended, and
    members that are themselves groups are expanded recursively right after
    their own ``regionRef``.

    Arguments:
        rogroup: an ``OrderedGroupType``, ``OrderedGroupIndexedType``,
            ``UnorderedGroupType`` or ``UnorderedGroupIndexedType``.

    Returns:
        list: the ``regionRef`` values in traversal order.
        NOTE(review): a group's own ``regionRef`` is included as well and is
        presumably ``None`` when the group carries no such attribute -- confirm.

    Raises:
        TypeError: if `rogroup` is none of the four group types (previously
            this surfaced as an ``UnboundLocalError``).
    """
    if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)):  # pylint: disable=undefined-variable
        elements = rogroup.get_AllIndexed()
    elif isinstance(rogroup, (UnorderedGroupType, UnorderedGroupIndexedType)):  # pylint: disable=undefined-variable
        elements = (rogroup.get_RegionRef() + rogroup.get_OrderedGroup() + rogroup.get_UnorderedGroup())
    else:
        raise TypeError("Argument 'rogroup' must be an ordered or unordered reading order group, not '{}'".format(type(rogroup).__name__))
    regionrefs = []
    for elem in elements:
        regionrefs.append(elem.get_regionRef())
        # Anything that is not a plain region reference is a nested group.
        if not isinstance(elem, (RegionRefType, RegionRefIndexedType)):  # pylint: disable=undefined-variable
            regionrefs.extend(self._get_recursive_reading_order(elem))
    return regionrefs
15047
+
15048
def get_AllRegions(self, classes=None, order='document', depth=0):
    """
    Get all the ``*Region`` elements, or only those provided by `classes`.
    Return in document order, unless the top element is ``Page`` and
    `order` is ``reading-order`` or ``reading-order-only``.

    Arguments:
        classes (list): Classes of regions that shall be returned,
            e.g. ``['Text', 'Image']``
        order ("document"|"reading-order"|"reading-order-only"): Whether to
            return regions sorted by document order (``document``, default) or by
            reading order with regions not in the reading order at the end of the
            returned list (``reading-order``) or regions not in the reading order
            omitted (``reading-order-only``). The latter two are only available
            on page level.
        depth (int): Recursive depth to look for regions at, set to `0` for
            all regions at any depth. Default: 0

    Returns:
        a list of :py:class:`TextRegionType`, :py:class:`ImageRegionType`,
        :py:class:`LineDrawingRegionType`, :py:class:`GraphicRegionType`,
        :py:class:`TableRegionType`, :py:class:`ChartRegionType`,
        :py:class:`MapRegionType`, :py:class:`SeparatorRegionType`,
        :py:class:`MathsRegionType`, :py:class:`ChemRegionType`,
        :py:class:`MusicRegionType`, :py:class:`AdvertRegionType`,
        :py:class:`NoiseRegionType`, :py:class:`UnknownRegionType`,
        and/or :py:class:`CustomRegionType`

    Raises:
        ValueError: if `order` is not one of the three accepted values or
            `depth` is negative. ``ValueError`` derives from ``Exception``,
            so handlers written for the plain ``Exception`` raised before
            keep working.

    For example, to get all text anywhere on the page in reading order, use:
    ::
        '\\n'.join(line.get_TextEquiv()[0].Unicode
                  for region in page.get_AllRegions(classes=['Text'], depth=0, order='reading-order')
                  for line in region.get_TextLine())
    """
    if order not in ('document', 'reading-order', 'reading-order-only'):
        raise ValueError("Argument 'order' must be either 'document', 'reading-order' or 'reading-order-only', not '{}'".format(order))
    if depth < 0:
        raise ValueError("Argument 'depth' must be an integer greater-or-equal 0, not '{}'".format(depth))
    # depth counts levels below self, the helper counts self as well; 0 means unlimited
    ret = self._get_recursive_regions([self], depth + 1 if depth else 0, classes)
    if self.__class__.__name__ != 'PageType' or not order.startswith('reading-order'):
        return ret
    reading_order = self.get_ReadingOrder()
    if reading_order:
        reading_order = reading_order.get_OrderedGroup() or reading_order.get_UnorderedGroup()
    if reading_order:
        reading_order = self._get_recursive_reading_order(reading_order)
    if not reading_order:
        # No usable reading order: fall back to document order.
        return ret
    id2region = {region.id: region for region in ret}
    in_reading_order = [id2region[region_id] for region_id in reading_order if region_id in id2region]
    if order == 'reading-order-only':
        return in_reading_order
    # Set lookup instead of scanning the list for every region; relies on the
    # elements hashing by their id (see __hash__).
    placed = set(in_reading_order)
    return in_reading_order + [region for region in ret if region not in placed]
14202
15106
  def set_orientation(self, orientation):
14203
15107
  """
14204
15108
  Set deskewing angle to given `orientation` number.
@@ -14407,6 +15311,106 @@ class LineDrawingRegionType(RegionType):
14407
15311
  pass
14408
15312
  def __hash__(self):
14409
15313
  return hash(self.id)
15314
+ # pylint: disable=line-too-long,invalid-name,protected-access,missing-module-docstring
15315
+ def _region_class(self, x): # pylint: disable=unused-argument
15316
+ return x.__class__.__name__.replace('RegionType', '')
15317
+
15318
def _get_recursive_regions(self, regions, level, classes=None):
    """
    Collect `regions` together with the regions nested below them.

    Arguments:
        regions (list): Elements to start from; each one is kept in the
            result, directly followed by whatever is found below it.
        level (int): Remaining depth. ``1`` stops the descent and only
            filters `regions`. ``0`` is handed down unchanged, so the
            descent continues until no element has any child region.
        classes (list): Short class names as produced by
            ``_region_class``; when non-empty, only elements of these
            classes are returned.

    Returns:
        list: the filtered elements. Without `classes`, nothing is
        filtered except that a leading ``PageType`` element is dropped.
    """
    from .constants import PAGE_REGION_TYPES  # pylint: disable=relative-beyond-top-level,import-outside-toplevel
    if level == 1:
        # End of the descent: apply the class filter, or just strip the page itself.
        if classes:
            return [candidate for candidate in regions if self._region_class(candidate) in classes]
        if regions and regions[0].__class__.__name__ == 'PageType':
            return regions[1:]
        return regions
    # Gather the direct children of every element, one list per element.
    children_per_region = []
    for parent in regions:
        children = []
        for region_type in PAGE_REGION_TYPES:
            # 'Map' is not recursive in 2019 schema, so only the page is asked for it
            if region_type == 'Map' and not isinstance(parent, PageType):  # pylint: disable=undefined-variable
                continue
            children += getattr(parent, 'get_{}Region'.format(region_type))()
        children_per_region.append(children)
    if not any(children_per_region):
        # Nothing below this level: behave as if the depth limit was reached.
        return self._get_recursive_regions(regions, 1, classes)
    next_level = level - 1 if level else 0
    collected = []
    for parent, children in zip(regions, children_per_region):
        collected.append(parent)
        collected.extend(self._get_recursive_regions(children, next_level, classes))
    return self._get_recursive_regions(collected, 1, classes)
15343
+
15344
def _get_recursive_reading_order(self, rogroup):
    """
    Flatten a reading-order group into a list of ``regionRef`` values.

    For an ordered group the members come from ``get_AllIndexed()``; for an
    unordered group they are its region refs, then its ordered groups, then
    its unordered groups. The ``regionRef`` of every member is appended, and
    members that are themselves groups are expanded recursively right after
    their own ``regionRef``.

    Arguments:
        rogroup: an ``OrderedGroupType``, ``OrderedGroupIndexedType``,
            ``UnorderedGroupType`` or ``UnorderedGroupIndexedType``.

    Returns:
        list: the ``regionRef`` values in traversal order.
        NOTE(review): a group's own ``regionRef`` is included as well and is
        presumably ``None`` when the group carries no such attribute -- confirm.

    Raises:
        TypeError: if `rogroup` is none of the four group types (previously
            this surfaced as an ``UnboundLocalError``).
    """
    if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)):  # pylint: disable=undefined-variable
        elements = rogroup.get_AllIndexed()
    elif isinstance(rogroup, (UnorderedGroupType, UnorderedGroupIndexedType)):  # pylint: disable=undefined-variable
        elements = (rogroup.get_RegionRef() + rogroup.get_OrderedGroup() + rogroup.get_UnorderedGroup())
    else:
        raise TypeError("Argument 'rogroup' must be an ordered or unordered reading order group, not '{}'".format(type(rogroup).__name__))
    regionrefs = []
    for elem in elements:
        regionrefs.append(elem.get_regionRef())
        # Anything that is not a plain region reference is a nested group.
        if not isinstance(elem, (RegionRefType, RegionRefIndexedType)):  # pylint: disable=undefined-variable
            regionrefs.extend(self._get_recursive_reading_order(elem))
    return regionrefs
15355
+
15356
def get_AllRegions(self, classes=None, order='document', depth=0):
    """
    Get all the ``*Region`` elements, or only those provided by `classes`.
    Return in document order, unless the top element is ``Page`` and
    `order` is ``reading-order`` or ``reading-order-only``.

    Arguments:
        classes (list): Classes of regions that shall be returned,
            e.g. ``['Text', 'Image']``
        order ("document"|"reading-order"|"reading-order-only"): Whether to
            return regions sorted by document order (``document``, default) or by
            reading order with regions not in the reading order at the end of the
            returned list (``reading-order``) or regions not in the reading order
            omitted (``reading-order-only``). The latter two are only available
            on page level.
        depth (int): Recursive depth to look for regions at, set to `0` for
            all regions at any depth. Default: 0

    Returns:
        a list of :py:class:`TextRegionType`, :py:class:`ImageRegionType`,
        :py:class:`LineDrawingRegionType`, :py:class:`GraphicRegionType`,
        :py:class:`TableRegionType`, :py:class:`ChartRegionType`,
        :py:class:`MapRegionType`, :py:class:`SeparatorRegionType`,
        :py:class:`MathsRegionType`, :py:class:`ChemRegionType`,
        :py:class:`MusicRegionType`, :py:class:`AdvertRegionType`,
        :py:class:`NoiseRegionType`, :py:class:`UnknownRegionType`,
        and/or :py:class:`CustomRegionType`

    Raises:
        ValueError: if `order` is not one of the three accepted values or
            `depth` is negative. ``ValueError`` derives from ``Exception``,
            so handlers written for the plain ``Exception`` raised before
            keep working.

    For example, to get all text anywhere on the page in reading order, use:
    ::
        '\\n'.join(line.get_TextEquiv()[0].Unicode
                  for region in page.get_AllRegions(classes=['Text'], depth=0, order='reading-order')
                  for line in region.get_TextLine())
    """
    if order not in ('document', 'reading-order', 'reading-order-only'):
        raise ValueError("Argument 'order' must be either 'document', 'reading-order' or 'reading-order-only', not '{}'".format(order))
    if depth < 0:
        raise ValueError("Argument 'depth' must be an integer greater-or-equal 0, not '{}'".format(depth))
    # depth counts levels below self, the helper counts self as well; 0 means unlimited
    ret = self._get_recursive_regions([self], depth + 1 if depth else 0, classes)
    if self.__class__.__name__ != 'PageType' or not order.startswith('reading-order'):
        return ret
    reading_order = self.get_ReadingOrder()
    if reading_order:
        reading_order = reading_order.get_OrderedGroup() or reading_order.get_UnorderedGroup()
    if reading_order:
        reading_order = self._get_recursive_reading_order(reading_order)
    if not reading_order:
        # No usable reading order: fall back to document order.
        return ret
    id2region = {region.id: region for region in ret}
    in_reading_order = [id2region[region_id] for region_id in reading_order if region_id in id2region]
    if order == 'reading-order-only':
        return in_reading_order
    # Set lookup instead of scanning the list for every region; relies on the
    # elements hashing by their id (see __hash__).
    placed = set(in_reading_order)
    return in_reading_order + [region for region in ret if region not in placed]
14410
15414
  def set_orientation(self, orientation):
14411
15415
  """
14412
15416
  Set deskewing angle to given `orientation` number.
@@ -14628,6 +15632,106 @@ class ImageRegionType(RegionType):
14628
15632
  pass
14629
15633
  def __hash__(self):
14630
15634
  return hash(self.id)
15635
+ # pylint: disable=line-too-long,invalid-name,protected-access,missing-module-docstring
15636
def _region_class(self, x): # pylint: disable=unused-argument
    """
    Return the class name of `x` with every ``RegionType`` substring removed.

    For example, an instance of a class named ``TextRegionType`` yields
    ``Text``; a class name without that substring is returned unchanged.
    """
    class_name = x.__class__.__name__
    return class_name.replace('RegionType', '')
15638
+
15639
def _get_recursive_regions(self, regions, level, classes=None):
    """
    Flatten `regions` and their nested regions into a single list.

    Arguments:
        regions (list): Elements to start from (regions, or a page).
        level (int): Remaining recursion budget. ``1`` stops descending
            and applies the final filtering; ``0`` is never decremented,
            so descending continues until no element has child regions.
        classes (list): If truthy, only elements whose class name
            (without the ``RegionType`` suffix) is listed are kept.

    Returns:
        a list in which every element is directly followed by the
        regions found below it.
    """
    from .constants import PAGE_REGION_TYPES # pylint: disable=relative-beyond-top-level,import-outside-toplevel
    if level == 1:
        # end of recursion: apply the class filter, if there is one
        if classes:
            return [candidate for candidate in regions
                    if self._region_class(candidate) in classes]
        # without a filter, only a leading page element is dropped
        if regions and regions[0].__class__.__name__ == 'PageType':
            regions = regions[1:]
        return regions
    # gather the direct child regions of each element, one list per element
    children_per_region = []
    for parent in regions:
        children = []
        for kind in PAGE_REGION_TYPES:
            # 'Map' is not recursive in 2019 schema, so only a page is asked for it
            if kind != 'Map' or isinstance(parent, PageType): # pylint: disable=undefined-variable
                getter = getattr(parent, 'get_{}Region'.format(kind))
                children.extend(getter())
        children_per_region.append(children)
    if not any(children_per_region):
        # nothing nested anywhere: finish with the level-1 filtering
        return self._get_recursive_regions(regions, 1, classes)
    # an unlimited budget (0) stays unlimited, otherwise one level is used up
    next_level = level - 1 if level else 0
    flattened = []
    for parent, children in zip(regions, children_per_region):
        flattened.append(parent)
        flattened.extend(self._get_recursive_regions(children, next_level, classes))
    return self._get_recursive_regions(flattened, 1, classes)
15664
+
15665
def _get_recursive_reading_order(self, rogroup):
    """
    Flatten a reading-order group into a list of ``regionRef`` values.

    Members of an ordered group are taken from ``get_AllIndexed()``;
    members of an unordered group are its region references, followed by
    its ordered and then its unordered sub-groups. The ``regionRef`` of
    every member is recorded, and every member that is not a plain region
    reference is descended into right after it.

    NOTE: if `rogroup` is neither an ordered nor an unordered group, the
    member list is never assigned and iterating it raises
    ``UnboundLocalError``.
    """
    if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)): # pylint: disable=undefined-variable
        elements = rogroup.get_AllIndexed()
    if isinstance(rogroup, (UnorderedGroupType, UnorderedGroupIndexedType)): # pylint: disable=undefined-variable
        elements = (rogroup.get_RegionRef() + rogroup.get_OrderedGroup() + rogroup.get_UnorderedGroup())
    collected_refs = []
    for member in elements:
        collected_refs.append(member.get_regionRef())
        if isinstance(member, (RegionRefType, RegionRefIndexedType)): # pylint: disable=undefined-variable
            # plain references have nothing nested below them
            continue
        collected_refs += self._get_recursive_reading_order(member)
    return collected_refs
15676
+
15677
def get_AllRegions(self, classes=None, order='document', depth=0):
    """
    Collect the ``*Region`` elements below this element, optionally
    restricted to the region classes named in `classes`.

    Results are in document order. Reading order is only applied when
    this element is a ``Page`` and `order` starts with ``reading-order``.

    Arguments:
        classes (list): Classes of regions that shall be returned,
            e.g. ``['Text', 'Image']``
        order ("document"|"reading-order"|"reading-order-only"): Sort by
            document order (``document``, default), by reading order with
            the regions missing from the reading order appended at the end
            (``reading-order``), or by reading order with those regions
            left out (``reading-order-only``). The latter two only take
            effect on page level.
        depth (int): Recursive depth to look for regions at, set to `0`
            for all regions at any depth. Default: 0

    Returns:
        a list of :py:class:`TextRegionType`, :py:class:`ImageRegionType`,
        :py:class:`LineDrawingRegionType`, :py:class:`GraphicRegionType`,
        :py:class:`TableRegionType`, :py:class:`ChartRegionType`,
        :py:class:`MapRegionType`, :py:class:`SeparatorRegionType`,
        :py:class:`MathsRegionType`, :py:class:`ChemRegionType`,
        :py:class:`MusicRegionType`, :py:class:`AdvertRegionType`,
        :py:class:`NoiseRegionType`, :py:class:`UnknownRegionType`,
        and/or :py:class:`CustomRegionType`

    Raises:
        Exception: if `order` is not one of the supported values or
            `depth` is negative.

    For example, to get all text anywhere on the page in reading order, use:
    ::
        '\\n'.join(line.get_TextEquiv()[0].Unicode
                  for region in page.get_AllRegions(classes=['Text'], depth=0, order='reading-order')
                  for line in region.get_TextLine())
    """
    supported_orders = ['document', 'reading-order', 'reading-order-only']
    if order not in supported_orders:
        raise Exception("Argument 'order' must be either 'document', 'reading-order' or 'reading-order-only', not '{}'".format(order))
    if depth < 0:
        raise Exception("Argument 'depth' must be an integer greater-or-equal 0, not '{}'".format(depth))
    # the starting element occupies one level itself; 0 keeps meaning "unlimited"
    found = self._get_recursive_regions([self], depth + 1 if depth else 0, classes)
    if self.__class__.__name__ != 'PageType' or not order.startswith('reading-order'):
        return found
    ro_root = self.get_ReadingOrder()
    if not ro_root:
        return found
    ro_group = ro_root.get_OrderedGroup() or ro_root.get_UnorderedGroup()
    if not ro_group:
        return found
    ordered_ids = self._get_recursive_reading_order(ro_group)
    if not ordered_ids:
        return found
    region_by_id = {region.id: region for region in found}
    # references to regions that were not collected are ignored
    ordered = [region_by_id[ref] for ref in ordered_ids if ref in region_by_id]
    if order == 'reading-order-only':
        return ordered
    return ordered + [region for region in found if region not in ordered]
14631
15735
  def set_orientation(self, orientation):
14632
15736
  """
14633
15737
  Set deskewing angle to given `orientation` number.
@@ -15191,6 +16295,106 @@ class TextRegionType(RegionType):
15191
16295
  super(TextRegionType, self)._buildChildren(child_, node, nodeName_, True)
15192
16296
  def __hash__(self):
15193
16297
  return hash(self.id)
16298
+ # pylint: disable=line-too-long,invalid-name,protected-access,missing-module-docstring
16299
def _region_class(self, x): # pylint: disable=unused-argument
    """
    Return the class name of `x` with every ``RegionType`` substring removed.

    For example, an instance of a class named ``ImageRegionType`` yields
    ``Image``; a class name without that substring is returned unchanged.
    """
    class_name = x.__class__.__name__
    return class_name.replace('RegionType', '')
16301
+
16302
def _get_recursive_regions(self, regions, level, classes=None):
    """
    Flatten `regions` and their nested regions into a single list.

    Arguments:
        regions (list): Elements to start from (regions, or a page).
        level (int): Remaining recursion budget. ``1`` stops descending
            and applies the final filtering; ``0`` is never decremented,
            so descending continues until no element has child regions.
        classes (list): If truthy, only elements whose class name
            (without the ``RegionType`` suffix) is listed are kept.

    Returns:
        a list in which every element is directly followed by the
        regions found below it.
    """
    from .constants import PAGE_REGION_TYPES # pylint: disable=relative-beyond-top-level,import-outside-toplevel
    if level == 1:
        # end of recursion: apply the class filter, if there is one
        if classes:
            return [candidate for candidate in regions
                    if self._region_class(candidate) in classes]
        # without a filter, only a leading page element is dropped
        if regions and regions[0].__class__.__name__ == 'PageType':
            regions = regions[1:]
        return regions
    # gather the direct child regions of each element, one list per element
    children_per_region = []
    for parent in regions:
        children = []
        for kind in PAGE_REGION_TYPES:
            # 'Map' is not recursive in 2019 schema, so only a page is asked for it
            if kind != 'Map' or isinstance(parent, PageType): # pylint: disable=undefined-variable
                getter = getattr(parent, 'get_{}Region'.format(kind))
                children.extend(getter())
        children_per_region.append(children)
    if not any(children_per_region):
        # nothing nested anywhere: finish with the level-1 filtering
        return self._get_recursive_regions(regions, 1, classes)
    # an unlimited budget (0) stays unlimited, otherwise one level is used up
    next_level = level - 1 if level else 0
    flattened = []
    for parent, children in zip(regions, children_per_region):
        flattened.append(parent)
        flattened.extend(self._get_recursive_regions(children, next_level, classes))
    return self._get_recursive_regions(flattened, 1, classes)
16327
+
16328
def _get_recursive_reading_order(self, rogroup):
    """
    Flatten a reading-order group into a list of ``regionRef`` values.

    Members of an ordered group are taken from ``get_AllIndexed()``;
    members of an unordered group are its region references, followed by
    its ordered and then its unordered sub-groups. The ``regionRef`` of
    every member is recorded, and every member that is not a plain region
    reference is descended into right after it.

    NOTE: if `rogroup` is neither an ordered nor an unordered group, the
    member list is never assigned and iterating it raises
    ``UnboundLocalError``.
    """
    if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)): # pylint: disable=undefined-variable
        elements = rogroup.get_AllIndexed()
    if isinstance(rogroup, (UnorderedGroupType, UnorderedGroupIndexedType)): # pylint: disable=undefined-variable
        elements = (rogroup.get_RegionRef() + rogroup.get_OrderedGroup() + rogroup.get_UnorderedGroup())
    collected_refs = []
    for member in elements:
        collected_refs.append(member.get_regionRef())
        if isinstance(member, (RegionRefType, RegionRefIndexedType)): # pylint: disable=undefined-variable
            # plain references have nothing nested below them
            continue
        collected_refs += self._get_recursive_reading_order(member)
    return collected_refs
16339
+
16340
def get_AllRegions(self, classes=None, order='document', depth=0):
    """
    Collect the ``*Region`` elements below this element, optionally
    restricted to the region classes named in `classes`.

    Results are in document order. Reading order is only applied when
    this element is a ``Page`` and `order` starts with ``reading-order``.

    Arguments:
        classes (list): Classes of regions that shall be returned,
            e.g. ``['Text', 'Image']``
        order ("document"|"reading-order"|"reading-order-only"): Sort by
            document order (``document``, default), by reading order with
            the regions missing from the reading order appended at the end
            (``reading-order``), or by reading order with those regions
            left out (``reading-order-only``). The latter two only take
            effect on page level.
        depth (int): Recursive depth to look for regions at, set to `0`
            for all regions at any depth. Default: 0

    Returns:
        a list of :py:class:`TextRegionType`, :py:class:`ImageRegionType`,
        :py:class:`LineDrawingRegionType`, :py:class:`GraphicRegionType`,
        :py:class:`TableRegionType`, :py:class:`ChartRegionType`,
        :py:class:`MapRegionType`, :py:class:`SeparatorRegionType`,
        :py:class:`MathsRegionType`, :py:class:`ChemRegionType`,
        :py:class:`MusicRegionType`, :py:class:`AdvertRegionType`,
        :py:class:`NoiseRegionType`, :py:class:`UnknownRegionType`,
        and/or :py:class:`CustomRegionType`

    Raises:
        Exception: if `order` is not one of the supported values or
            `depth` is negative.

    For example, to get all text anywhere on the page in reading order, use:
    ::
        '\\n'.join(line.get_TextEquiv()[0].Unicode
                  for region in page.get_AllRegions(classes=['Text'], depth=0, order='reading-order')
                  for line in region.get_TextLine())
    """
    supported_orders = ['document', 'reading-order', 'reading-order-only']
    if order not in supported_orders:
        raise Exception("Argument 'order' must be either 'document', 'reading-order' or 'reading-order-only', not '{}'".format(order))
    if depth < 0:
        raise Exception("Argument 'depth' must be an integer greater-or-equal 0, not '{}'".format(depth))
    # the starting element occupies one level itself; 0 keeps meaning "unlimited"
    found = self._get_recursive_regions([self], depth + 1 if depth else 0, classes)
    if self.__class__.__name__ != 'PageType' or not order.startswith('reading-order'):
        return found
    ro_root = self.get_ReadingOrder()
    if not ro_root:
        return found
    ro_group = ro_root.get_OrderedGroup() or ro_root.get_UnorderedGroup()
    if not ro_group:
        return found
    ordered_ids = self._get_recursive_reading_order(ro_group)
    if not ordered_ids:
        return found
    region_by_id = {region.id: region for region in found}
    # references to regions that were not collected are ignored
    ordered = [region_by_id[ref] for ref in ordered_ids if ref in region_by_id]
    if order == 'reading-order-only':
        return ordered
    return ordered + [region for region in found if region not in ordered]
15194
16398
  def set_orientation(self, orientation):
15195
16399
  """
15196
16400
  Set deskewing angle to given `orientation` number.