ocrd 3.8.1__py3-none-any.whl → 3.9.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ocrd/processor/base.py +51 -43
- ocrd/processor/ocrd_page_result.py +74 -0
- {ocrd-3.8.1.dist-info → ocrd-3.9.1.dist-info}/METADATA +1 -1
- {ocrd-3.8.1.dist-info → ocrd-3.9.1.dist-info}/RECORD +15 -15
- ocrd_models/ocrd_page.py +20 -0
- ocrd_models/ocrd_page_generateds.py +1273 -69
- ocrd_network/cli/client.py +6 -2
- ocrd_network/client.py +4 -0
- ocrd_network/client_utils.py +9 -2
- ocrd_network/constants.py +1 -1
- ocrd_network/rabbitmq_utils/helpers.py +1 -1
- {ocrd-3.8.1.dist-info → ocrd-3.9.1.dist-info}/LICENSE +0 -0
- {ocrd-3.8.1.dist-info → ocrd-3.9.1.dist-info}/WHEEL +0 -0
- {ocrd-3.8.1.dist-info → ocrd-3.9.1.dist-info}/entry_points.txt +0 -0
- {ocrd-3.8.1.dist-info → ocrd-3.9.1.dist-info}/top_level.txt +0 -0
|
@@ -2,8 +2,8 @@
|
|
|
2
2
|
# -*- coding: utf-8 -*-
|
|
3
3
|
|
|
4
4
|
#
|
|
5
|
-
# Generated
|
|
6
|
-
# Python 3.8.
|
|
5
|
+
# Generated Thu Dec 11 12:03:57 2025 by generateDS.py version 2.44.1.
|
|
6
|
+
# Python 3.8.19 (default, Mar 26 2024, 20:08:11) [GCC 8.5.0]
|
|
7
7
|
#
|
|
8
8
|
# Command line options:
|
|
9
9
|
# ('-f', '')
|
|
@@ -3766,7 +3766,8 @@ class PageType(GeneratedsSuper):
|
|
|
3766
3766
|
def get_AllRegions(self, classes=None, order='document', depth=0):
|
|
3767
3767
|
"""
|
|
3768
3768
|
Get all the ``*Region`` elements, or only those provided by `classes`.
|
|
3769
|
-
Return in document order, unless
|
|
3769
|
+
Return in document order, unless the top element is ``Page`` and
|
|
3770
|
+
`order` is ``reading-order``.
|
|
3770
3771
|
|
|
3771
3772
|
Arguments:
|
|
3772
3773
|
classes (list): Classes of regions that shall be returned, \
|
|
@@ -3775,7 +3776,8 @@ class PageType(GeneratedsSuper):
|
|
|
3775
3776
|
return regions sorted by document order (``document``, default) or by
|
|
3776
3777
|
reading order with regions not in the reading order at the end of the
|
|
3777
3778
|
returned list (``reading-order``) or regions not in the reading order
|
|
3778
|
-
omitted (``reading-order-only``)
|
|
3779
|
+
omitted (``reading-order-only``). The latter two are only available
|
|
3780
|
+
on page level.
|
|
3779
3781
|
depth (int): Recursive depth to look for regions at, set to `0` for \
|
|
3780
3782
|
all regions at any depth. Default: 0
|
|
3781
3783
|
|
|
@@ -3800,7 +3802,7 @@ class PageType(GeneratedsSuper):
|
|
|
3800
3802
|
if depth < 0:
|
|
3801
3803
|
raise Exception("Argument 'depth' must be an integer greater-or-equal 0, not '{}'".format(depth))
|
|
3802
3804
|
ret = self._get_recursive_regions([self], depth + 1 if depth else 0, classes)
|
|
3803
|
-
if order.startswith('reading-order'):
|
|
3805
|
+
if self.__class__.__name__ == 'PageType' and order.startswith('reading-order'):
|
|
3804
3806
|
reading_order = self.get_ReadingOrder()
|
|
3805
3807
|
if reading_order:
|
|
3806
3808
|
reading_order = reading_order.get_OrderedGroup() or reading_order.get_UnorderedGroup()
|
|
@@ -3929,21 +3931,23 @@ class PageType(GeneratedsSuper):
|
|
|
3929
3931
|
- :py:class:`.UnoderedGroupType`
|
|
3930
3932
|
- :py:class:`.UnoderedGroupIndexedType`
|
|
3931
3933
|
"""
|
|
3934
|
+
from collections import OrderedDict as odict
|
|
3932
3935
|
def get_groupdict(group):
|
|
3933
3936
|
regionrefs = list()
|
|
3934
3937
|
if isinstance(group, (OrderedGroupType, OrderedGroupIndexedType)):
|
|
3935
3938
|
regionrefs = (group.get_RegionRefIndexed() +
|
|
3936
3939
|
group.get_OrderedGroupIndexed() +
|
|
3937
3940
|
group.get_UnorderedGroupIndexed())
|
|
3941
|
+
regionrefs = sorted(regionrefs, key=lambda x: x.index)
|
|
3938
3942
|
if isinstance(group, (UnorderedGroupType, UnorderedGroupIndexedType)):
|
|
3939
3943
|
regionrefs = (group.get_RegionRef() +
|
|
3940
3944
|
group.get_OrderedGroup() +
|
|
3941
3945
|
group.get_UnorderedGroup())
|
|
3942
|
-
refdict =
|
|
3946
|
+
refdict = odict()
|
|
3943
3947
|
for elem in regionrefs:
|
|
3944
3948
|
refdict[elem.get_regionRef()] = elem
|
|
3945
3949
|
if not isinstance(elem, (RegionRefType, RegionRefIndexedType)):
|
|
3946
|
-
refdict =
|
|
3950
|
+
refdict = odict(**refdict, **get_groupdict(elem))
|
|
3947
3951
|
return refdict
|
|
3948
3952
|
ro = self.get_ReadingOrder()
|
|
3949
3953
|
if ro is None:
|
|
@@ -12673,6 +12677,106 @@ class AdvertRegionType(RegionType):
|
|
|
12673
12677
|
pass
|
|
12674
12678
|
def __hash__(self):
|
|
12675
12679
|
return hash(self.id)
|
|
12680
|
+
# pylint: disable=line-too-long,invalid-name,protected-access,missing-module-docstring
|
|
12681
|
+
def _region_class(self, x): # pylint: disable=unused-argument
|
|
12682
|
+
return x.__class__.__name__.replace('RegionType', '')
|
|
12683
|
+
|
|
12684
|
+
def _get_recursive_regions(self, regions, level, classes=None):
|
|
12685
|
+
from .constants import PAGE_REGION_TYPES # pylint: disable=relative-beyond-top-level,import-outside-toplevel
|
|
12686
|
+
if level == 1:
|
|
12687
|
+
# stop recursion, filter classes
|
|
12688
|
+
if classes:
|
|
12689
|
+
return [r for r in regions if self._region_class(r) in classes]
|
|
12690
|
+
if regions and regions[0].__class__.__name__ == 'PageType':
|
|
12691
|
+
regions = regions[1:]
|
|
12692
|
+
return regions
|
|
12693
|
+
# find more regions recursively
|
|
12694
|
+
more_regions = []
|
|
12695
|
+
for region in regions:
|
|
12696
|
+
more_regions.append([])
|
|
12697
|
+
for class_ in PAGE_REGION_TYPES:
|
|
12698
|
+
if class_ == 'Map' and not isinstance(region, PageType): # pylint: disable=undefined-variable
|
|
12699
|
+
# 'Map' is not recursive in 2019 schema
|
|
12700
|
+
continue
|
|
12701
|
+
more_regions[-1] += getattr(region, 'get_{}Region'.format(class_))()
|
|
12702
|
+
if not any(more_regions):
|
|
12703
|
+
return self._get_recursive_regions(regions, 1, classes)
|
|
12704
|
+
ret = []
|
|
12705
|
+
for r, more in zip(regions, more_regions):
|
|
12706
|
+
ret.append(r)
|
|
12707
|
+
ret += self._get_recursive_regions(more, level - 1 if level else 0, classes)
|
|
12708
|
+
return self._get_recursive_regions(ret, 1, classes)
|
|
12709
|
+
|
|
12710
|
+
def _get_recursive_reading_order(self, rogroup):
|
|
12711
|
+
if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)): # pylint: disable=undefined-variable
|
|
12712
|
+
elements = rogroup.get_AllIndexed()
|
|
12713
|
+
if isinstance(rogroup, (UnorderedGroupType, UnorderedGroupIndexedType)): # pylint: disable=undefined-variable
|
|
12714
|
+
elements = (rogroup.get_RegionRef() + rogroup.get_OrderedGroup() + rogroup.get_UnorderedGroup())
|
|
12715
|
+
regionrefs = list()
|
|
12716
|
+
for elem in elements:
|
|
12717
|
+
regionrefs.append(elem.get_regionRef())
|
|
12718
|
+
if not isinstance(elem, (RegionRefType, RegionRefIndexedType)): # pylint: disable=undefined-variable
|
|
12719
|
+
regionrefs.extend(self._get_recursive_reading_order(elem))
|
|
12720
|
+
return regionrefs
|
|
12721
|
+
|
|
12722
|
+
def get_AllRegions(self, classes=None, order='document', depth=0):
|
|
12723
|
+
"""
|
|
12724
|
+
Get all the ``*Region`` elements, or only those provided by `classes`.
|
|
12725
|
+
Return in document order, unless the top element is ``Page`` and
|
|
12726
|
+
`order` is ``reading-order``.
|
|
12727
|
+
|
|
12728
|
+
Arguments:
|
|
12729
|
+
classes (list): Classes of regions that shall be returned, \
|
|
12730
|
+
e.g. ``['Text', 'Image']``
|
|
12731
|
+
order ("document"|"reading-order"|"reading-order-only"): Whether to \
|
|
12732
|
+
return regions sorted by document order (``document``, default) or by
|
|
12733
|
+
reading order with regions not in the reading order at the end of the
|
|
12734
|
+
returned list (``reading-order``) or regions not in the reading order
|
|
12735
|
+
omitted (``reading-order-only``). The latter two are only available
|
|
12736
|
+
on page level.
|
|
12737
|
+
depth (int): Recursive depth to look for regions at, set to `0` for \
|
|
12738
|
+
all regions at any depth. Default: 0
|
|
12739
|
+
|
|
12740
|
+
Returns:
|
|
12741
|
+
a list of :py:class:`TextRegionType`, :py:class:`ImageRegionType`, \
|
|
12742
|
+
:py:class:`LineDrawingRegionType`, :py:class:`GraphicRegionType`, \
|
|
12743
|
+
:py:class:`TableRegionType`, :py:class:`ChartRegionType`, \
|
|
12744
|
+
:py:class:`MapRegionType`, :py:class:`SeparatorRegionType`, \
|
|
12745
|
+
:py:class:`MathsRegionType`, :py:class:`ChemRegionType`, \
|
|
12746
|
+
:py:class:`MusicRegionType`, :py:class:`AdvertRegionType`, \
|
|
12747
|
+
:py:class:`NoiseRegionType`, :py:class:`UnknownRegionType`, \
|
|
12748
|
+
and/or :py:class:`CustomRegionType`
|
|
12749
|
+
|
|
12750
|
+
For example, to get all text anywhere on the page in reading order, use:
|
|
12751
|
+
::
|
|
12752
|
+
'\\n'.join(line.get_TextEquiv()[0].Unicode
|
|
12753
|
+
for region in page.get_AllRegions(classes=['Text'], depth=0, order='reading-order')
|
|
12754
|
+
for line in region.get_TextLine())
|
|
12755
|
+
"""
|
|
12756
|
+
if order not in ['document', 'reading-order', 'reading-order-only']:
|
|
12757
|
+
raise Exception("Argument 'order' must be either 'document', 'reading-order' or 'reading-order-only', not '{}'".format(order))
|
|
12758
|
+
if depth < 0:
|
|
12759
|
+
raise Exception("Argument 'depth' must be an integer greater-or-equal 0, not '{}'".format(depth))
|
|
12760
|
+
ret = self._get_recursive_regions([self], depth + 1 if depth else 0, classes)
|
|
12761
|
+
if self.__class__.__name__ == 'PageType' and order.startswith('reading-order'):
|
|
12762
|
+
reading_order = self.get_ReadingOrder()
|
|
12763
|
+
if reading_order:
|
|
12764
|
+
reading_order = reading_order.get_OrderedGroup() or reading_order.get_UnorderedGroup()
|
|
12765
|
+
if reading_order:
|
|
12766
|
+
reading_order = self._get_recursive_reading_order(reading_order)
|
|
12767
|
+
if reading_order:
|
|
12768
|
+
id2region = {region.id: region for region in ret}
|
|
12769
|
+
in_reading_order = [id2region[region_id] for region_id in reading_order if region_id in id2region]
|
|
12770
|
+
# print("ret: {} / in_ro: {} / not-in-ro: {}".format(
|
|
12771
|
+
# len(ret),
|
|
12772
|
+
# len([id2region[region_id] for region_id in reading_order if region_id in id2region]),
|
|
12773
|
+
# len([r for r in ret if r not in in_reading_order])
|
|
12774
|
+
# ))
|
|
12775
|
+
if order == 'reading-order-only':
|
|
12776
|
+
ret = in_reading_order
|
|
12777
|
+
else:
|
|
12778
|
+
ret = in_reading_order + [r for r in ret if r not in in_reading_order]
|
|
12779
|
+
return ret
|
|
12676
12780
|
def set_orientation(self, orientation):
|
|
12677
12781
|
"""
|
|
12678
12782
|
Set deskewing angle to given `orientation` number.
|
|
@@ -12835,6 +12939,106 @@ class MusicRegionType(RegionType):
|
|
|
12835
12939
|
pass
|
|
12836
12940
|
def __hash__(self):
|
|
12837
12941
|
return hash(self.id)
|
|
12942
|
+
# pylint: disable=line-too-long,invalid-name,protected-access,missing-module-docstring
|
|
12943
|
+
def _region_class(self, x): # pylint: disable=unused-argument
|
|
12944
|
+
return x.__class__.__name__.replace('RegionType', '')
|
|
12945
|
+
|
|
12946
|
+
def _get_recursive_regions(self, regions, level, classes=None):
|
|
12947
|
+
from .constants import PAGE_REGION_TYPES # pylint: disable=relative-beyond-top-level,import-outside-toplevel
|
|
12948
|
+
if level == 1:
|
|
12949
|
+
# stop recursion, filter classes
|
|
12950
|
+
if classes:
|
|
12951
|
+
return [r for r in regions if self._region_class(r) in classes]
|
|
12952
|
+
if regions and regions[0].__class__.__name__ == 'PageType':
|
|
12953
|
+
regions = regions[1:]
|
|
12954
|
+
return regions
|
|
12955
|
+
# find more regions recursively
|
|
12956
|
+
more_regions = []
|
|
12957
|
+
for region in regions:
|
|
12958
|
+
more_regions.append([])
|
|
12959
|
+
for class_ in PAGE_REGION_TYPES:
|
|
12960
|
+
if class_ == 'Map' and not isinstance(region, PageType): # pylint: disable=undefined-variable
|
|
12961
|
+
# 'Map' is not recursive in 2019 schema
|
|
12962
|
+
continue
|
|
12963
|
+
more_regions[-1] += getattr(region, 'get_{}Region'.format(class_))()
|
|
12964
|
+
if not any(more_regions):
|
|
12965
|
+
return self._get_recursive_regions(regions, 1, classes)
|
|
12966
|
+
ret = []
|
|
12967
|
+
for r, more in zip(regions, more_regions):
|
|
12968
|
+
ret.append(r)
|
|
12969
|
+
ret += self._get_recursive_regions(more, level - 1 if level else 0, classes)
|
|
12970
|
+
return self._get_recursive_regions(ret, 1, classes)
|
|
12971
|
+
|
|
12972
|
+
def _get_recursive_reading_order(self, rogroup):
|
|
12973
|
+
if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)): # pylint: disable=undefined-variable
|
|
12974
|
+
elements = rogroup.get_AllIndexed()
|
|
12975
|
+
if isinstance(rogroup, (UnorderedGroupType, UnorderedGroupIndexedType)): # pylint: disable=undefined-variable
|
|
12976
|
+
elements = (rogroup.get_RegionRef() + rogroup.get_OrderedGroup() + rogroup.get_UnorderedGroup())
|
|
12977
|
+
regionrefs = list()
|
|
12978
|
+
for elem in elements:
|
|
12979
|
+
regionrefs.append(elem.get_regionRef())
|
|
12980
|
+
if not isinstance(elem, (RegionRefType, RegionRefIndexedType)): # pylint: disable=undefined-variable
|
|
12981
|
+
regionrefs.extend(self._get_recursive_reading_order(elem))
|
|
12982
|
+
return regionrefs
|
|
12983
|
+
|
|
12984
|
+
def get_AllRegions(self, classes=None, order='document', depth=0):
|
|
12985
|
+
"""
|
|
12986
|
+
Get all the ``*Region`` elements, or only those provided by `classes`.
|
|
12987
|
+
Return in document order, unless the top element is ``Page`` and
|
|
12988
|
+
`order` is ``reading-order``.
|
|
12989
|
+
|
|
12990
|
+
Arguments:
|
|
12991
|
+
classes (list): Classes of regions that shall be returned, \
|
|
12992
|
+
e.g. ``['Text', 'Image']``
|
|
12993
|
+
order ("document"|"reading-order"|"reading-order-only"): Whether to \
|
|
12994
|
+
return regions sorted by document order (``document``, default) or by
|
|
12995
|
+
reading order with regions not in the reading order at the end of the
|
|
12996
|
+
returned list (``reading-order``) or regions not in the reading order
|
|
12997
|
+
omitted (``reading-order-only``). The latter two are only available
|
|
12998
|
+
on page level.
|
|
12999
|
+
depth (int): Recursive depth to look for regions at, set to `0` for \
|
|
13000
|
+
all regions at any depth. Default: 0
|
|
13001
|
+
|
|
13002
|
+
Returns:
|
|
13003
|
+
a list of :py:class:`TextRegionType`, :py:class:`ImageRegionType`, \
|
|
13004
|
+
:py:class:`LineDrawingRegionType`, :py:class:`GraphicRegionType`, \
|
|
13005
|
+
:py:class:`TableRegionType`, :py:class:`ChartRegionType`, \
|
|
13006
|
+
:py:class:`MapRegionType`, :py:class:`SeparatorRegionType`, \
|
|
13007
|
+
:py:class:`MathsRegionType`, :py:class:`ChemRegionType`, \
|
|
13008
|
+
:py:class:`MusicRegionType`, :py:class:`AdvertRegionType`, \
|
|
13009
|
+
:py:class:`NoiseRegionType`, :py:class:`UnknownRegionType`, \
|
|
13010
|
+
and/or :py:class:`CustomRegionType`
|
|
13011
|
+
|
|
13012
|
+
For example, to get all text anywhere on the page in reading order, use:
|
|
13013
|
+
::
|
|
13014
|
+
'\\n'.join(line.get_TextEquiv()[0].Unicode
|
|
13015
|
+
for region in page.get_AllRegions(classes=['Text'], depth=0, order='reading-order')
|
|
13016
|
+
for line in region.get_TextLine())
|
|
13017
|
+
"""
|
|
13018
|
+
if order not in ['document', 'reading-order', 'reading-order-only']:
|
|
13019
|
+
raise Exception("Argument 'order' must be either 'document', 'reading-order' or 'reading-order-only', not '{}'".format(order))
|
|
13020
|
+
if depth < 0:
|
|
13021
|
+
raise Exception("Argument 'depth' must be an integer greater-or-equal 0, not '{}'".format(depth))
|
|
13022
|
+
ret = self._get_recursive_regions([self], depth + 1 if depth else 0, classes)
|
|
13023
|
+
if self.__class__.__name__ == 'PageType' and order.startswith('reading-order'):
|
|
13024
|
+
reading_order = self.get_ReadingOrder()
|
|
13025
|
+
if reading_order:
|
|
13026
|
+
reading_order = reading_order.get_OrderedGroup() or reading_order.get_UnorderedGroup()
|
|
13027
|
+
if reading_order:
|
|
13028
|
+
reading_order = self._get_recursive_reading_order(reading_order)
|
|
13029
|
+
if reading_order:
|
|
13030
|
+
id2region = {region.id: region for region in ret}
|
|
13031
|
+
in_reading_order = [id2region[region_id] for region_id in reading_order if region_id in id2region]
|
|
13032
|
+
# print("ret: {} / in_ro: {} / not-in-ro: {}".format(
|
|
13033
|
+
# len(ret),
|
|
13034
|
+
# len([id2region[region_id] for region_id in reading_order if region_id in id2region]),
|
|
13035
|
+
# len([r for r in ret if r not in in_reading_order])
|
|
13036
|
+
# ))
|
|
13037
|
+
if order == 'reading-order-only':
|
|
13038
|
+
ret = in_reading_order
|
|
13039
|
+
else:
|
|
13040
|
+
ret = in_reading_order + [r for r in ret if r not in in_reading_order]
|
|
13041
|
+
return ret
|
|
12838
13042
|
def set_orientation(self, orientation):
|
|
12839
13043
|
"""
|
|
12840
13044
|
Set deskewing angle to given `orientation` number.
|
|
@@ -12965,69 +13169,169 @@ class MapRegionType(RegionType):
|
|
|
12965
13169
|
pass
|
|
12966
13170
|
def __hash__(self):
|
|
12967
13171
|
return hash(self.id)
|
|
12968
|
-
|
|
12969
|
-
|
|
12970
|
-
|
|
12971
|
-
Moreover, invalidate self's ``pc:AlternativeImage``s
|
|
12972
|
-
(because they will have been rotated and enlarged
|
|
12973
|
-
with the angle of the previous value).
|
|
12974
|
-
"""
|
|
12975
|
-
if hasattr(self, 'invalidate_AlternativeImage'):
|
|
12976
|
-
# PageType, RegionType:
|
|
12977
|
-
self.invalidate_AlternativeImage(feature_selector='deskewed')
|
|
12978
|
-
self.orientation = orientation
|
|
12979
|
-
# end class MapRegionType
|
|
12980
|
-
|
|
12981
|
-
|
|
12982
|
-
class ChemRegionType(RegionType):
|
|
12983
|
-
"""ChemRegionType --
|
|
12984
|
-
Regions containing chemical formulas.
|
|
12985
|
-
|
|
12986
|
-
* orientation --
|
|
12987
|
-
The angle the rectangle encapsulating a
|
|
12988
|
-
region has to be rotated in clockwise
|
|
12989
|
-
direction in order to correct the present
|
|
12990
|
-
skew (negative values indicate
|
|
12991
|
-
anti-clockwise rotation). Range:
|
|
12992
|
-
-179.999,180
|
|
12993
|
-
|
|
12994
|
-
* bgColour --
|
|
12995
|
-
The background colour of the region
|
|
13172
|
+
# pylint: disable=line-too-long,invalid-name,protected-access,missing-module-docstring
|
|
13173
|
+
def _region_class(self, x): # pylint: disable=unused-argument
|
|
13174
|
+
return x.__class__.__name__.replace('RegionType', '')
|
|
12996
13175
|
|
|
12997
|
-
|
|
12998
|
-
|
|
12999
|
-
|
|
13000
|
-
|
|
13001
|
-
|
|
13002
|
-
|
|
13003
|
-
|
|
13004
|
-
|
|
13005
|
-
|
|
13006
|
-
|
|
13007
|
-
|
|
13008
|
-
|
|
13009
|
-
|
|
13010
|
-
|
|
13011
|
-
|
|
13012
|
-
|
|
13013
|
-
|
|
13014
|
-
|
|
13015
|
-
|
|
13016
|
-
|
|
13017
|
-
|
|
13018
|
-
|
|
13019
|
-
|
|
13020
|
-
if
|
|
13021
|
-
|
|
13022
|
-
|
|
13023
|
-
|
|
13024
|
-
|
|
13025
|
-
|
|
13026
|
-
|
|
13027
|
-
|
|
13028
|
-
|
|
13029
|
-
|
|
13030
|
-
|
|
13176
|
+
def _get_recursive_regions(self, regions, level, classes=None):
|
|
13177
|
+
from .constants import PAGE_REGION_TYPES # pylint: disable=relative-beyond-top-level,import-outside-toplevel
|
|
13178
|
+
if level == 1:
|
|
13179
|
+
# stop recursion, filter classes
|
|
13180
|
+
if classes:
|
|
13181
|
+
return [r for r in regions if self._region_class(r) in classes]
|
|
13182
|
+
if regions and regions[0].__class__.__name__ == 'PageType':
|
|
13183
|
+
regions = regions[1:]
|
|
13184
|
+
return regions
|
|
13185
|
+
# find more regions recursively
|
|
13186
|
+
more_regions = []
|
|
13187
|
+
for region in regions:
|
|
13188
|
+
more_regions.append([])
|
|
13189
|
+
for class_ in PAGE_REGION_TYPES:
|
|
13190
|
+
if class_ == 'Map' and not isinstance(region, PageType): # pylint: disable=undefined-variable
|
|
13191
|
+
# 'Map' is not recursive in 2019 schema
|
|
13192
|
+
continue
|
|
13193
|
+
more_regions[-1] += getattr(region, 'get_{}Region'.format(class_))()
|
|
13194
|
+
if not any(more_regions):
|
|
13195
|
+
return self._get_recursive_regions(regions, 1, classes)
|
|
13196
|
+
ret = []
|
|
13197
|
+
for r, more in zip(regions, more_regions):
|
|
13198
|
+
ret.append(r)
|
|
13199
|
+
ret += self._get_recursive_regions(more, level - 1 if level else 0, classes)
|
|
13200
|
+
return self._get_recursive_regions(ret, 1, classes)
|
|
13201
|
+
|
|
13202
|
+
def _get_recursive_reading_order(self, rogroup):
|
|
13203
|
+
if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)): # pylint: disable=undefined-variable
|
|
13204
|
+
elements = rogroup.get_AllIndexed()
|
|
13205
|
+
if isinstance(rogroup, (UnorderedGroupType, UnorderedGroupIndexedType)): # pylint: disable=undefined-variable
|
|
13206
|
+
elements = (rogroup.get_RegionRef() + rogroup.get_OrderedGroup() + rogroup.get_UnorderedGroup())
|
|
13207
|
+
regionrefs = list()
|
|
13208
|
+
for elem in elements:
|
|
13209
|
+
regionrefs.append(elem.get_regionRef())
|
|
13210
|
+
if not isinstance(elem, (RegionRefType, RegionRefIndexedType)): # pylint: disable=undefined-variable
|
|
13211
|
+
regionrefs.extend(self._get_recursive_reading_order(elem))
|
|
13212
|
+
return regionrefs
|
|
13213
|
+
|
|
13214
|
+
def get_AllRegions(self, classes=None, order='document', depth=0):
|
|
13215
|
+
"""
|
|
13216
|
+
Get all the ``*Region`` elements, or only those provided by `classes`.
|
|
13217
|
+
Return in document order, unless the top element is ``Page`` and
|
|
13218
|
+
`order` is ``reading-order``.
|
|
13219
|
+
|
|
13220
|
+
Arguments:
|
|
13221
|
+
classes (list): Classes of regions that shall be returned, \
|
|
13222
|
+
e.g. ``['Text', 'Image']``
|
|
13223
|
+
order ("document"|"reading-order"|"reading-order-only"): Whether to \
|
|
13224
|
+
return regions sorted by document order (``document``, default) or by
|
|
13225
|
+
reading order with regions not in the reading order at the end of the
|
|
13226
|
+
returned list (``reading-order``) or regions not in the reading order
|
|
13227
|
+
omitted (``reading-order-only``). The latter two are only available
|
|
13228
|
+
on page level.
|
|
13229
|
+
depth (int): Recursive depth to look for regions at, set to `0` for \
|
|
13230
|
+
all regions at any depth. Default: 0
|
|
13231
|
+
|
|
13232
|
+
Returns:
|
|
13233
|
+
a list of :py:class:`TextRegionType`, :py:class:`ImageRegionType`, \
|
|
13234
|
+
:py:class:`LineDrawingRegionType`, :py:class:`GraphicRegionType`, \
|
|
13235
|
+
:py:class:`TableRegionType`, :py:class:`ChartRegionType`, \
|
|
13236
|
+
:py:class:`MapRegionType`, :py:class:`SeparatorRegionType`, \
|
|
13237
|
+
:py:class:`MathsRegionType`, :py:class:`ChemRegionType`, \
|
|
13238
|
+
:py:class:`MusicRegionType`, :py:class:`AdvertRegionType`, \
|
|
13239
|
+
:py:class:`NoiseRegionType`, :py:class:`UnknownRegionType`, \
|
|
13240
|
+
and/or :py:class:`CustomRegionType`
|
|
13241
|
+
|
|
13242
|
+
For example, to get all text anywhere on the page in reading order, use:
|
|
13243
|
+
::
|
|
13244
|
+
'\\n'.join(line.get_TextEquiv()[0].Unicode
|
|
13245
|
+
for region in page.get_AllRegions(classes=['Text'], depth=0, order='reading-order')
|
|
13246
|
+
for line in region.get_TextLine())
|
|
13247
|
+
"""
|
|
13248
|
+
if order not in ['document', 'reading-order', 'reading-order-only']:
|
|
13249
|
+
raise Exception("Argument 'order' must be either 'document', 'reading-order' or 'reading-order-only', not '{}'".format(order))
|
|
13250
|
+
if depth < 0:
|
|
13251
|
+
raise Exception("Argument 'depth' must be an integer greater-or-equal 0, not '{}'".format(depth))
|
|
13252
|
+
ret = self._get_recursive_regions([self], depth + 1 if depth else 0, classes)
|
|
13253
|
+
if self.__class__.__name__ == 'PageType' and order.startswith('reading-order'):
|
|
13254
|
+
reading_order = self.get_ReadingOrder()
|
|
13255
|
+
if reading_order:
|
|
13256
|
+
reading_order = reading_order.get_OrderedGroup() or reading_order.get_UnorderedGroup()
|
|
13257
|
+
if reading_order:
|
|
13258
|
+
reading_order = self._get_recursive_reading_order(reading_order)
|
|
13259
|
+
if reading_order:
|
|
13260
|
+
id2region = {region.id: region for region in ret}
|
|
13261
|
+
in_reading_order = [id2region[region_id] for region_id in reading_order if region_id in id2region]
|
|
13262
|
+
# print("ret: {} / in_ro: {} / not-in-ro: {}".format(
|
|
13263
|
+
# len(ret),
|
|
13264
|
+
# len([id2region[region_id] for region_id in reading_order if region_id in id2region]),
|
|
13265
|
+
# len([r for r in ret if r not in in_reading_order])
|
|
13266
|
+
# ))
|
|
13267
|
+
if order == 'reading-order-only':
|
|
13268
|
+
ret = in_reading_order
|
|
13269
|
+
else:
|
|
13270
|
+
ret = in_reading_order + [r for r in ret if r not in in_reading_order]
|
|
13271
|
+
return ret
|
|
13272
|
+
def set_orientation(self, orientation):
|
|
13273
|
+
"""
|
|
13274
|
+
Set deskewing angle to given `orientation` number.
|
|
13275
|
+
Moreover, invalidate self's ``pc:AlternativeImage``s
|
|
13276
|
+
(because they will have been rotated and enlarged
|
|
13277
|
+
with the angle of the previous value).
|
|
13278
|
+
"""
|
|
13279
|
+
if hasattr(self, 'invalidate_AlternativeImage'):
|
|
13280
|
+
# PageType, RegionType:
|
|
13281
|
+
self.invalidate_AlternativeImage(feature_selector='deskewed')
|
|
13282
|
+
self.orientation = orientation
|
|
13283
|
+
# end class MapRegionType
|
|
13284
|
+
|
|
13285
|
+
|
|
13286
|
+
class ChemRegionType(RegionType):
|
|
13287
|
+
"""ChemRegionType --
|
|
13288
|
+
Regions containing chemical formulas.
|
|
13289
|
+
|
|
13290
|
+
* orientation --
|
|
13291
|
+
The angle the rectangle encapsulating a
|
|
13292
|
+
region has to be rotated in clockwise
|
|
13293
|
+
direction in order to correct the present
|
|
13294
|
+
skew (negative values indicate
|
|
13295
|
+
anti-clockwise rotation). Range:
|
|
13296
|
+
-179.999,180
|
|
13297
|
+
|
|
13298
|
+
* bgColour --
|
|
13299
|
+
The background colour of the region
|
|
13300
|
+
|
|
13301
|
+
"""
|
|
13302
|
+
__hash__ = GeneratedsSuper.__hash__
|
|
13303
|
+
member_data_items_ = [
|
|
13304
|
+
MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional', 'name': 'orientation'}),
|
|
13305
|
+
MemberSpec_('bgColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional', 'name': 'bgColour'}),
|
|
13306
|
+
]
|
|
13307
|
+
subclass = None
|
|
13308
|
+
superclass = RegionType
|
|
13309
|
+
def __init__(self, id=None, custom=None, comments=None, continuation=None, AlternativeImage=None, Coords=None, UserDefined=None, Labels=None, Roles=None, TextRegion=None, ImageRegion=None, LineDrawingRegion=None, GraphicRegion=None, TableRegion=None, ChartRegion=None, SeparatorRegion=None, MathsRegion=None, ChemRegion=None, MusicRegion=None, AdvertRegion=None, NoiseRegion=None, UnknownRegion=None, CustomRegion=None, orientation=None, bgColour=None, gds_collector_=None, **kwargs_):
|
|
13310
|
+
self.gds_collector_ = gds_collector_
|
|
13311
|
+
self.gds_elementtree_node_ = None
|
|
13312
|
+
self.original_tagname_ = None
|
|
13313
|
+
self.parent_object_ = kwargs_.get('parent_object_')
|
|
13314
|
+
self.ns_prefix_ = "pc"
|
|
13315
|
+
super(globals().get("ChemRegionType"), self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_)
|
|
13316
|
+
self.orientation = _cast(float, orientation)
|
|
13317
|
+
self.orientation_nsprefix_ = "pc"
|
|
13318
|
+
self.bgColour = _cast(None, bgColour)
|
|
13319
|
+
self.bgColour_nsprefix_ = "pc"
|
|
13320
|
+
def factory(*args_, **kwargs_):
|
|
13321
|
+
if CurrentSubclassModule_ is not None:
|
|
13322
|
+
subclass = getSubclassFromModule_(
|
|
13323
|
+
CurrentSubclassModule_, ChemRegionType)
|
|
13324
|
+
if subclass is not None:
|
|
13325
|
+
return subclass(*args_, **kwargs_)
|
|
13326
|
+
if ChemRegionType.subclass:
|
|
13327
|
+
return ChemRegionType.subclass(*args_, **kwargs_)
|
|
13328
|
+
else:
|
|
13329
|
+
return ChemRegionType(*args_, **kwargs_)
|
|
13330
|
+
factory = staticmethod(factory)
|
|
13331
|
+
def get_ns_prefix_(self):
|
|
13332
|
+
return self.ns_prefix_
|
|
13333
|
+
def set_ns_prefix_(self, ns_prefix):
|
|
13334
|
+
self.ns_prefix_ = ns_prefix
|
|
13031
13335
|
def get_orientation(self):
|
|
13032
13336
|
return self.orientation
|
|
13033
13337
|
def set_orientation(self, orientation):
|
|
@@ -13128,6 +13432,106 @@ class ChemRegionType(RegionType):
|
|
|
13128
13432
|
pass
|
|
13129
13433
|
def __hash__(self):
|
|
13130
13434
|
return hash(self.id)
|
|
13435
|
+
# pylint: disable=line-too-long,invalid-name,protected-access,missing-module-docstring
|
|
13436
|
+
def _region_class(self, x): # pylint: disable=unused-argument
|
|
13437
|
+
return x.__class__.__name__.replace('RegionType', '')
|
|
13438
|
+
|
|
13439
|
+
def _get_recursive_regions(self, regions, level, classes=None):
|
|
13440
|
+
from .constants import PAGE_REGION_TYPES # pylint: disable=relative-beyond-top-level,import-outside-toplevel
|
|
13441
|
+
if level == 1:
|
|
13442
|
+
# stop recursion, filter classes
|
|
13443
|
+
if classes:
|
|
13444
|
+
return [r for r in regions if self._region_class(r) in classes]
|
|
13445
|
+
if regions and regions[0].__class__.__name__ == 'PageType':
|
|
13446
|
+
regions = regions[1:]
|
|
13447
|
+
return regions
|
|
13448
|
+
# find more regions recursively
|
|
13449
|
+
more_regions = []
|
|
13450
|
+
for region in regions:
|
|
13451
|
+
more_regions.append([])
|
|
13452
|
+
for class_ in PAGE_REGION_TYPES:
|
|
13453
|
+
if class_ == 'Map' and not isinstance(region, PageType): # pylint: disable=undefined-variable
|
|
13454
|
+
# 'Map' is not recursive in 2019 schema
|
|
13455
|
+
continue
|
|
13456
|
+
more_regions[-1] += getattr(region, 'get_{}Region'.format(class_))()
|
|
13457
|
+
if not any(more_regions):
|
|
13458
|
+
return self._get_recursive_regions(regions, 1, classes)
|
|
13459
|
+
ret = []
|
|
13460
|
+
for r, more in zip(regions, more_regions):
|
|
13461
|
+
ret.append(r)
|
|
13462
|
+
ret += self._get_recursive_regions(more, level - 1 if level else 0, classes)
|
|
13463
|
+
return self._get_recursive_regions(ret, 1, classes)
|
|
13464
|
+
|
|
13465
|
+
def _get_recursive_reading_order(self, rogroup):
|
|
13466
|
+
if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)): # pylint: disable=undefined-variable
|
|
13467
|
+
elements = rogroup.get_AllIndexed()
|
|
13468
|
+
if isinstance(rogroup, (UnorderedGroupType, UnorderedGroupIndexedType)): # pylint: disable=undefined-variable
|
|
13469
|
+
elements = (rogroup.get_RegionRef() + rogroup.get_OrderedGroup() + rogroup.get_UnorderedGroup())
|
|
13470
|
+
regionrefs = list()
|
|
13471
|
+
for elem in elements:
|
|
13472
|
+
regionrefs.append(elem.get_regionRef())
|
|
13473
|
+
if not isinstance(elem, (RegionRefType, RegionRefIndexedType)): # pylint: disable=undefined-variable
|
|
13474
|
+
regionrefs.extend(self._get_recursive_reading_order(elem))
|
|
13475
|
+
return regionrefs
|
|
13476
|
+
|
|
13477
|
+
def get_AllRegions(self, classes=None, order='document', depth=0):
    """
    Get all the ``*Region`` elements, or only those provided by `classes`.
    Return in document order, unless the top element is ``Page`` and
    `order` is ``reading-order`` or ``reading-order-only``.

    Arguments:
        classes (list): Classes of regions that shall be returned, \
            e.g. ``['Text', 'Image']``
        order ("document"|"reading-order"|"reading-order-only"): Whether to \
            return regions sorted by document order (``document``, default) or by
            reading order with regions not in the reading order at the end of the
            returned list (``reading-order``) or regions not in the reading order
            omitted (``reading-order-only``). The latter two are only available
            on page level.
        depth (int): Recursive depth to look for regions at, set to `0` for \
            all regions at any depth. Default: 0

    Returns:
        a list of :py:class:`TextRegionType`, :py:class:`ImageRegionType`, \
            :py:class:`LineDrawingRegionType`, :py:class:`GraphicRegionType`, \
            :py:class:`TableRegionType`, :py:class:`ChartRegionType`, \
            :py:class:`MapRegionType`, :py:class:`SeparatorRegionType`, \
            :py:class:`MathsRegionType`, :py:class:`ChemRegionType`, \
            :py:class:`MusicRegionType`, :py:class:`AdvertRegionType`, \
            :py:class:`NoiseRegionType`, :py:class:`UnknownRegionType`, \
            and/or :py:class:`CustomRegionType`

    Raises:
        ValueError: if `order` is not one of the supported values, or if
            `depth` is negative

    For example, to get all text anywhere on the page in reading order, use:
    ::
        '\\n'.join(line.get_TextEquiv()[0].Unicode
                   for region in page.get_AllRegions(classes=['Text'], depth=0, order='reading-order')
                   for line in region.get_TextLine())
    """
    # ValueError is a subclass of Exception, so callers catching the
    # formerly raised bare Exception keep working.
    if order not in ('document', 'reading-order', 'reading-order-only'):
        raise ValueError("Argument 'order' must be either 'document', 'reading-order' or 'reading-order-only', not '{}'".format(order))
    if depth < 0:
        raise ValueError("Argument 'depth' must be an integer greater-or-equal 0, not '{}'".format(depth))
    # the helper counts the start element as one level, hence depth + 1;
    # 0 keeps meaning "unlimited"
    ret = self._get_recursive_regions([self], depth + 1 if depth else 0, classes)
    # reading order is only honoured when called on the page itself
    if self.__class__.__name__ != 'PageType' or not order.startswith('reading-order'):
        return ret
    reading_order = self.get_ReadingOrder()
    if not reading_order:
        return ret
    ro_group = reading_order.get_OrderedGroup() or reading_order.get_UnorderedGroup()
    if not ro_group:
        return ret
    region_ids = self._get_recursive_reading_order(ro_group)
    if not region_ids:
        return ret
    id2region = {region.id: region for region in ret}
    # references that match none of the collected regions are ignored
    in_reading_order = [id2region[region_id] for region_id in region_ids if region_id in id2region]
    if order == 'reading-order-only':
        return in_reading_order
    return in_reading_order + [region for region in ret if region not in in_reading_order]
|
|
13131
13535
|
def set_orientation(self, orientation):
|
|
13132
13536
|
"""
|
|
13133
13537
|
Set deskewing angle to given `orientation` number.
|
|
@@ -13291,6 +13695,106 @@ class MathsRegionType(RegionType):
|
|
|
13291
13695
|
pass
|
|
13292
13696
|
def __hash__(self):
|
|
13293
13697
|
return hash(self.id)
|
|
13698
|
+
# pylint: disable=line-too-long,invalid-name,protected-access,missing-module-docstring
|
|
13699
|
+
def _region_class(self, x): # pylint: disable=unused-argument
|
|
13700
|
+
return x.__class__.__name__.replace('RegionType', '')
|
|
13701
|
+
|
|
13702
|
+
def _get_recursive_regions(self, regions, level, classes=None):
    """
    Collect `regions` together with the regions nested inside them.

    Arguments:
        regions (list): elements to start from
        level (int): remaining number of levels to descend; ``1`` stops \
            the descent, ``0`` never reaches that stop value, so the \
            descent continues until no element has nested regions
        classes (list): if non-empty, keep only regions whose class name \
            (without ``RegionType``) is listed

    Returns:
        a flat list in which every element is followed by the regions \
        found below it
    """
    from .constants import PAGE_REGION_TYPES  # pylint: disable=relative-beyond-top-level,import-outside-toplevel
    if level == 1:
        # bottom of the descent: apply the class filter ...
        if classes:
            return [candidate for candidate in regions if self._region_class(candidate) in classes]
        # ... or, without a filter, only drop a leading page element
        if regions and regions[0].__class__.__name__ == 'PageType':
            return regions[1:]
        return regions
    # gather the direct child regions, one list per parent
    children_per_parent = []
    for parent in regions:
        children = []
        for region_type in PAGE_REGION_TYPES:
            # 'Map' is not recursive in 2019 schema
            if region_type == 'Map' and not isinstance(parent, PageType):  # pylint: disable=undefined-variable
                continue
            children.extend(getattr(parent, 'get_{}Region'.format(region_type))())
        children_per_parent.append(children)
    if not any(children_per_parent):
        # nothing nested anywhere: finish as if the last level was reached
        return self._get_recursive_regions(regions, 1, classes)
    next_level = level - 1 if level else 0
    collected = []
    for parent, children in zip(regions, children_per_parent):
        collected.append(parent)
        collected.extend(self._get_recursive_regions(children, next_level, classes))
    return self._get_recursive_regions(collected, 1, classes)
|
|
13727
|
+
|
|
13728
|
+
def _get_recursive_reading_order(self, rogroup):
    """
    Flatten a reading-order group into the list of its ``regionRef`` values.

    The group is walked depth-first: every member contributes its own
    ``regionRef``, and every member that is not a plain region reference
    is descended into right after.

    Arguments:
        rogroup: an ordered or unordered reading-order group \
            (indexed or not)

    Returns:
        a list of the ``regionRef`` values found, in traversal order. \
        Values are appended as returned by ``get_regionRef()``, without \
        filtering.

    Raises:
        TypeError: if `rogroup` is neither an ordered nor an unordered group
    """
    if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)):  # pylint: disable=undefined-variable
        elements = rogroup.get_AllIndexed()
    elif isinstance(rogroup, (UnorderedGroupType, UnorderedGroupIndexedType)):  # pylint: disable=undefined-variable
        elements = (rogroup.get_RegionRef() + rogroup.get_OrderedGroup() + rogroup.get_UnorderedGroup())
    else:
        # Without this branch `elements` stayed unbound and the loop below
        # failed with an opaque UnboundLocalError.
        raise TypeError("Expected an ordered or unordered reading order group, not '{}'".format(rogroup.__class__.__name__))
    regionrefs = []
    for elem in elements:
        regionrefs.append(elem.get_regionRef())
        # anything that is not a plain region reference is a nested group
        if not isinstance(elem, (RegionRefType, RegionRefIndexedType)):  # pylint: disable=undefined-variable
            regionrefs.extend(self._get_recursive_reading_order(elem))
    return regionrefs
|
|
13739
|
+
|
|
13740
|
+
def get_AllRegions(self, classes=None, order='document', depth=0):
    """
    Get all the ``*Region`` elements, or only those provided by `classes`.
    Return in document order, unless the top element is ``Page`` and
    `order` is ``reading-order`` or ``reading-order-only``.

    Arguments:
        classes (list): Classes of regions that shall be returned, \
            e.g. ``['Text', 'Image']``
        order ("document"|"reading-order"|"reading-order-only"): Whether to \
            return regions sorted by document order (``document``, default) or by
            reading order with regions not in the reading order at the end of the
            returned list (``reading-order``) or regions not in the reading order
            omitted (``reading-order-only``). The latter two are only available
            on page level.
        depth (int): Recursive depth to look for regions at, set to `0` for \
            all regions at any depth. Default: 0

    Returns:
        a list of :py:class:`TextRegionType`, :py:class:`ImageRegionType`, \
            :py:class:`LineDrawingRegionType`, :py:class:`GraphicRegionType`, \
            :py:class:`TableRegionType`, :py:class:`ChartRegionType`, \
            :py:class:`MapRegionType`, :py:class:`SeparatorRegionType`, \
            :py:class:`MathsRegionType`, :py:class:`ChemRegionType`, \
            :py:class:`MusicRegionType`, :py:class:`AdvertRegionType`, \
            :py:class:`NoiseRegionType`, :py:class:`UnknownRegionType`, \
            and/or :py:class:`CustomRegionType`

    Raises:
        ValueError: if `order` is not one of the supported values, or if
            `depth` is negative

    For example, to get all text anywhere on the page in reading order, use:
    ::
        '\\n'.join(line.get_TextEquiv()[0].Unicode
                   for region in page.get_AllRegions(classes=['Text'], depth=0, order='reading-order')
                   for line in region.get_TextLine())
    """
    # ValueError is a subclass of Exception, so callers catching the
    # formerly raised bare Exception keep working.
    if order not in ('document', 'reading-order', 'reading-order-only'):
        raise ValueError("Argument 'order' must be either 'document', 'reading-order' or 'reading-order-only', not '{}'".format(order))
    if depth < 0:
        raise ValueError("Argument 'depth' must be an integer greater-or-equal 0, not '{}'".format(depth))
    # the helper counts the start element as one level, hence depth + 1;
    # 0 keeps meaning "unlimited"
    ret = self._get_recursive_regions([self], depth + 1 if depth else 0, classes)
    # reading order is only honoured when called on the page itself
    if self.__class__.__name__ != 'PageType' or not order.startswith('reading-order'):
        return ret
    reading_order = self.get_ReadingOrder()
    if not reading_order:
        return ret
    ro_group = reading_order.get_OrderedGroup() or reading_order.get_UnorderedGroup()
    if not ro_group:
        return ret
    region_ids = self._get_recursive_reading_order(ro_group)
    if not region_ids:
        return ret
    id2region = {region.id: region for region in ret}
    # references that match none of the collected regions are ignored
    in_reading_order = [id2region[region_id] for region_id in region_ids if region_id in id2region]
    if order == 'reading-order-only':
        return in_reading_order
    return in_reading_order + [region for region in ret if region not in in_reading_order]
|
|
13294
13798
|
def set_orientation(self, orientation):
|
|
13295
13799
|
"""
|
|
13296
13800
|
Set deskewing angle to given `orientation` number.
|
|
@@ -13455,6 +13959,106 @@ class SeparatorRegionType(RegionType):
|
|
|
13455
13959
|
pass
|
|
13456
13960
|
def __hash__(self):
|
|
13457
13961
|
return hash(self.id)
|
|
13962
|
+
# pylint: disable=line-too-long,invalid-name,protected-access,missing-module-docstring
|
|
13963
|
+
def _region_class(self, x): # pylint: disable=unused-argument
|
|
13964
|
+
return x.__class__.__name__.replace('RegionType', '')
|
|
13965
|
+
|
|
13966
|
+
def _get_recursive_regions(self, regions, level, classes=None):
    """
    Collect `regions` together with the regions nested inside them.

    Arguments:
        regions (list): elements to start from
        level (int): remaining number of levels to descend; ``1`` stops \
            the descent, ``0`` never reaches that stop value, so the \
            descent continues until no element has nested regions
        classes (list): if non-empty, keep only regions whose class name \
            (without ``RegionType``) is listed

    Returns:
        a flat list in which every element is followed by the regions \
        found below it
    """
    from .constants import PAGE_REGION_TYPES  # pylint: disable=relative-beyond-top-level,import-outside-toplevel
    if level == 1:
        # bottom of the descent: apply the class filter ...
        if classes:
            return [candidate for candidate in regions if self._region_class(candidate) in classes]
        # ... or, without a filter, only drop a leading page element
        if regions and regions[0].__class__.__name__ == 'PageType':
            return regions[1:]
        return regions
    # gather the direct child regions, one list per parent
    children_per_parent = []
    for parent in regions:
        children = []
        for region_type in PAGE_REGION_TYPES:
            # 'Map' is not recursive in 2019 schema
            if region_type == 'Map' and not isinstance(parent, PageType):  # pylint: disable=undefined-variable
                continue
            children.extend(getattr(parent, 'get_{}Region'.format(region_type))())
        children_per_parent.append(children)
    if not any(children_per_parent):
        # nothing nested anywhere: finish as if the last level was reached
        return self._get_recursive_regions(regions, 1, classes)
    next_level = level - 1 if level else 0
    collected = []
    for parent, children in zip(regions, children_per_parent):
        collected.append(parent)
        collected.extend(self._get_recursive_regions(children, next_level, classes))
    return self._get_recursive_regions(collected, 1, classes)
|
|
13991
|
+
|
|
13992
|
+
def _get_recursive_reading_order(self, rogroup):
    """
    Flatten a reading-order group into the list of its ``regionRef`` values.

    The group is walked depth-first: every member contributes its own
    ``regionRef``, and every member that is not a plain region reference
    is descended into right after.

    Arguments:
        rogroup: an ordered or unordered reading-order group \
            (indexed or not)

    Returns:
        a list of the ``regionRef`` values found, in traversal order. \
        Values are appended as returned by ``get_regionRef()``, without \
        filtering.

    Raises:
        TypeError: if `rogroup` is neither an ordered nor an unordered group
    """
    if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)):  # pylint: disable=undefined-variable
        elements = rogroup.get_AllIndexed()
    elif isinstance(rogroup, (UnorderedGroupType, UnorderedGroupIndexedType)):  # pylint: disable=undefined-variable
        elements = (rogroup.get_RegionRef() + rogroup.get_OrderedGroup() + rogroup.get_UnorderedGroup())
    else:
        # Without this branch `elements` stayed unbound and the loop below
        # failed with an opaque UnboundLocalError.
        raise TypeError("Expected an ordered or unordered reading order group, not '{}'".format(rogroup.__class__.__name__))
    regionrefs = []
    for elem in elements:
        regionrefs.append(elem.get_regionRef())
        # anything that is not a plain region reference is a nested group
        if not isinstance(elem, (RegionRefType, RegionRefIndexedType)):  # pylint: disable=undefined-variable
            regionrefs.extend(self._get_recursive_reading_order(elem))
    return regionrefs
|
|
14003
|
+
|
|
14004
|
+
def get_AllRegions(self, classes=None, order='document', depth=0):
    """
    Get all the ``*Region`` elements, or only those provided by `classes`.
    Return in document order, unless the top element is ``Page`` and
    `order` is ``reading-order`` or ``reading-order-only``.

    Arguments:
        classes (list): Classes of regions that shall be returned, \
            e.g. ``['Text', 'Image']``
        order ("document"|"reading-order"|"reading-order-only"): Whether to \
            return regions sorted by document order (``document``, default) or by
            reading order with regions not in the reading order at the end of the
            returned list (``reading-order``) or regions not in the reading order
            omitted (``reading-order-only``). The latter two are only available
            on page level.
        depth (int): Recursive depth to look for regions at, set to `0` for \
            all regions at any depth. Default: 0

    Returns:
        a list of :py:class:`TextRegionType`, :py:class:`ImageRegionType`, \
            :py:class:`LineDrawingRegionType`, :py:class:`GraphicRegionType`, \
            :py:class:`TableRegionType`, :py:class:`ChartRegionType`, \
            :py:class:`MapRegionType`, :py:class:`SeparatorRegionType`, \
            :py:class:`MathsRegionType`, :py:class:`ChemRegionType`, \
            :py:class:`MusicRegionType`, :py:class:`AdvertRegionType`, \
            :py:class:`NoiseRegionType`, :py:class:`UnknownRegionType`, \
            and/or :py:class:`CustomRegionType`

    Raises:
        ValueError: if `order` is not one of the supported values, or if
            `depth` is negative

    For example, to get all text anywhere on the page in reading order, use:
    ::
        '\\n'.join(line.get_TextEquiv()[0].Unicode
                   for region in page.get_AllRegions(classes=['Text'], depth=0, order='reading-order')
                   for line in region.get_TextLine())
    """
    # ValueError is a subclass of Exception, so callers catching the
    # formerly raised bare Exception keep working.
    if order not in ('document', 'reading-order', 'reading-order-only'):
        raise ValueError("Argument 'order' must be either 'document', 'reading-order' or 'reading-order-only', not '{}'".format(order))
    if depth < 0:
        raise ValueError("Argument 'depth' must be an integer greater-or-equal 0, not '{}'".format(depth))
    # the helper counts the start element as one level, hence depth + 1;
    # 0 keeps meaning "unlimited"
    ret = self._get_recursive_regions([self], depth + 1 if depth else 0, classes)
    # reading order is only honoured when called on the page itself
    if self.__class__.__name__ != 'PageType' or not order.startswith('reading-order'):
        return ret
    reading_order = self.get_ReadingOrder()
    if not reading_order:
        return ret
    ro_group = reading_order.get_OrderedGroup() or reading_order.get_UnorderedGroup()
    if not ro_group:
        return ret
    region_ids = self._get_recursive_reading_order(ro_group)
    if not region_ids:
        return ret
    id2region = {region.id: region for region in ret}
    # references that match none of the collected regions are ignored
    in_reading_order = [id2region[region_id] for region_id in region_ids if region_id in id2region]
    if order == 'reading-order-only':
        return in_reading_order
    return in_reading_order + [region for region in ret if region not in in_reading_order]
|
|
13458
14062
|
def set_orientation(self, orientation):
|
|
13459
14063
|
"""
|
|
13460
14064
|
Set deskewing angle to given `orientation` number.
|
|
@@ -13696,6 +14300,106 @@ class ChartRegionType(RegionType):
|
|
|
13696
14300
|
pass
|
|
13697
14301
|
def __hash__(self):
|
|
13698
14302
|
return hash(self.id)
|
|
14303
|
+
# pylint: disable=line-too-long,invalid-name,protected-access,missing-module-docstring
|
|
14304
|
+
def _region_class(self, x): # pylint: disable=unused-argument
|
|
14305
|
+
return x.__class__.__name__.replace('RegionType', '')
|
|
14306
|
+
|
|
14307
|
+
def _get_recursive_regions(self, regions, level, classes=None):
    """
    Collect `regions` together with the regions nested inside them.

    Arguments:
        regions (list): elements to start from
        level (int): remaining number of levels to descend; ``1`` stops \
            the descent, ``0`` never reaches that stop value, so the \
            descent continues until no element has nested regions
        classes (list): if non-empty, keep only regions whose class name \
            (without ``RegionType``) is listed

    Returns:
        a flat list in which every element is followed by the regions \
        found below it
    """
    from .constants import PAGE_REGION_TYPES  # pylint: disable=relative-beyond-top-level,import-outside-toplevel
    if level == 1:
        # bottom of the descent: apply the class filter ...
        if classes:
            return [candidate for candidate in regions if self._region_class(candidate) in classes]
        # ... or, without a filter, only drop a leading page element
        if regions and regions[0].__class__.__name__ == 'PageType':
            return regions[1:]
        return regions
    # gather the direct child regions, one list per parent
    children_per_parent = []
    for parent in regions:
        children = []
        for region_type in PAGE_REGION_TYPES:
            # 'Map' is not recursive in 2019 schema
            if region_type == 'Map' and not isinstance(parent, PageType):  # pylint: disable=undefined-variable
                continue
            children.extend(getattr(parent, 'get_{}Region'.format(region_type))())
        children_per_parent.append(children)
    if not any(children_per_parent):
        # nothing nested anywhere: finish as if the last level was reached
        return self._get_recursive_regions(regions, 1, classes)
    next_level = level - 1 if level else 0
    collected = []
    for parent, children in zip(regions, children_per_parent):
        collected.append(parent)
        collected.extend(self._get_recursive_regions(children, next_level, classes))
    return self._get_recursive_regions(collected, 1, classes)
|
|
14332
|
+
|
|
14333
|
+
def _get_recursive_reading_order(self, rogroup):
    """
    Flatten a reading-order group into the list of its ``regionRef`` values.

    The group is walked depth-first: every member contributes its own
    ``regionRef``, and every member that is not a plain region reference
    is descended into right after.

    Arguments:
        rogroup: an ordered or unordered reading-order group \
            (indexed or not)

    Returns:
        a list of the ``regionRef`` values found, in traversal order. \
        Values are appended as returned by ``get_regionRef()``, without \
        filtering.

    Raises:
        TypeError: if `rogroup` is neither an ordered nor an unordered group
    """
    if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)):  # pylint: disable=undefined-variable
        elements = rogroup.get_AllIndexed()
    elif isinstance(rogroup, (UnorderedGroupType, UnorderedGroupIndexedType)):  # pylint: disable=undefined-variable
        elements = (rogroup.get_RegionRef() + rogroup.get_OrderedGroup() + rogroup.get_UnorderedGroup())
    else:
        # Without this branch `elements` stayed unbound and the loop below
        # failed with an opaque UnboundLocalError.
        raise TypeError("Expected an ordered or unordered reading order group, not '{}'".format(rogroup.__class__.__name__))
    regionrefs = []
    for elem in elements:
        regionrefs.append(elem.get_regionRef())
        # anything that is not a plain region reference is a nested group
        if not isinstance(elem, (RegionRefType, RegionRefIndexedType)):  # pylint: disable=undefined-variable
            regionrefs.extend(self._get_recursive_reading_order(elem))
    return regionrefs
|
|
14344
|
+
|
|
14345
|
+
def get_AllRegions(self, classes=None, order='document', depth=0):
    """
    Get all the ``*Region`` elements, or only those provided by `classes`.
    Return in document order, unless the top element is ``Page`` and
    `order` is ``reading-order`` or ``reading-order-only``.

    Arguments:
        classes (list): Classes of regions that shall be returned, \
            e.g. ``['Text', 'Image']``
        order ("document"|"reading-order"|"reading-order-only"): Whether to \
            return regions sorted by document order (``document``, default) or by
            reading order with regions not in the reading order at the end of the
            returned list (``reading-order``) or regions not in the reading order
            omitted (``reading-order-only``). The latter two are only available
            on page level.
        depth (int): Recursive depth to look for regions at, set to `0` for \
            all regions at any depth. Default: 0

    Returns:
        a list of :py:class:`TextRegionType`, :py:class:`ImageRegionType`, \
            :py:class:`LineDrawingRegionType`, :py:class:`GraphicRegionType`, \
            :py:class:`TableRegionType`, :py:class:`ChartRegionType`, \
            :py:class:`MapRegionType`, :py:class:`SeparatorRegionType`, \
            :py:class:`MathsRegionType`, :py:class:`ChemRegionType`, \
            :py:class:`MusicRegionType`, :py:class:`AdvertRegionType`, \
            :py:class:`NoiseRegionType`, :py:class:`UnknownRegionType`, \
            and/or :py:class:`CustomRegionType`

    Raises:
        ValueError: if `order` is not one of the supported values, or if
            `depth` is negative

    For example, to get all text anywhere on the page in reading order, use:
    ::
        '\\n'.join(line.get_TextEquiv()[0].Unicode
                   for region in page.get_AllRegions(classes=['Text'], depth=0, order='reading-order')
                   for line in region.get_TextLine())
    """
    # ValueError is a subclass of Exception, so callers catching the
    # formerly raised bare Exception keep working.
    if order not in ('document', 'reading-order', 'reading-order-only'):
        raise ValueError("Argument 'order' must be either 'document', 'reading-order' or 'reading-order-only', not '{}'".format(order))
    if depth < 0:
        raise ValueError("Argument 'depth' must be an integer greater-or-equal 0, not '{}'".format(depth))
    # the helper counts the start element as one level, hence depth + 1;
    # 0 keeps meaning "unlimited"
    ret = self._get_recursive_regions([self], depth + 1 if depth else 0, classes)
    # reading order is only honoured when called on the page itself
    if self.__class__.__name__ != 'PageType' or not order.startswith('reading-order'):
        return ret
    reading_order = self.get_ReadingOrder()
    if not reading_order:
        return ret
    ro_group = reading_order.get_OrderedGroup() or reading_order.get_UnorderedGroup()
    if not ro_group:
        return ret
    region_ids = self._get_recursive_reading_order(ro_group)
    if not region_ids:
        return ret
    id2region = {region.id: region for region in ret}
    # references that match none of the collected regions are ignored
    in_reading_order = [id2region[region_id] for region_id in region_ids if region_id in id2region]
    if order == 'reading-order-only':
        return in_reading_order
    return in_reading_order + [region for region in ret if region not in in_reading_order]
|
|
13699
14403
|
def set_orientation(self, orientation):
|
|
13700
14404
|
"""
|
|
13701
14405
|
Set deskewing angle to given `orientation` number.
|
|
@@ -13991,6 +14695,106 @@ class TableRegionType(RegionType):
|
|
|
13991
14695
|
super(TableRegionType, self)._buildChildren(child_, node, nodeName_, True)
|
|
13992
14696
|
def __hash__(self):
|
|
13993
14697
|
return hash(self.id)
|
|
14698
|
+
# pylint: disable=line-too-long,invalid-name,protected-access,missing-module-docstring
|
|
14699
|
+
def _region_class(self, x): # pylint: disable=unused-argument
|
|
14700
|
+
return x.__class__.__name__.replace('RegionType', '')
|
|
14701
|
+
|
|
14702
|
+
def _get_recursive_regions(self, regions, level, classes=None):
    """
    Collect `regions` together with the regions nested inside them.

    Arguments:
        regions (list): elements to start from
        level (int): remaining number of levels to descend; ``1`` stops \
            the descent, ``0`` never reaches that stop value, so the \
            descent continues until no element has nested regions
        classes (list): if non-empty, keep only regions whose class name \
            (without ``RegionType``) is listed

    Returns:
        a flat list in which every element is followed by the regions \
        found below it
    """
    from .constants import PAGE_REGION_TYPES  # pylint: disable=relative-beyond-top-level,import-outside-toplevel
    if level == 1:
        # bottom of the descent: apply the class filter ...
        if classes:
            return [candidate for candidate in regions if self._region_class(candidate) in classes]
        # ... or, without a filter, only drop a leading page element
        if regions and regions[0].__class__.__name__ == 'PageType':
            return regions[1:]
        return regions
    # gather the direct child regions, one list per parent
    children_per_parent = []
    for parent in regions:
        children = []
        for region_type in PAGE_REGION_TYPES:
            # 'Map' is not recursive in 2019 schema
            if region_type == 'Map' and not isinstance(parent, PageType):  # pylint: disable=undefined-variable
                continue
            children.extend(getattr(parent, 'get_{}Region'.format(region_type))())
        children_per_parent.append(children)
    if not any(children_per_parent):
        # nothing nested anywhere: finish as if the last level was reached
        return self._get_recursive_regions(regions, 1, classes)
    next_level = level - 1 if level else 0
    collected = []
    for parent, children in zip(regions, children_per_parent):
        collected.append(parent)
        collected.extend(self._get_recursive_regions(children, next_level, classes))
    return self._get_recursive_regions(collected, 1, classes)
|
|
14727
|
+
|
|
14728
|
+
def _get_recursive_reading_order(self, rogroup):
    """
    Flatten a reading-order group into the list of its ``regionRef`` values.

    The group is walked depth-first: every member contributes its own
    ``regionRef``, and every member that is not a plain region reference
    is descended into right after.

    Arguments:
        rogroup: an ordered or unordered reading-order group \
            (indexed or not)

    Returns:
        a list of the ``regionRef`` values found, in traversal order. \
        Values are appended as returned by ``get_regionRef()``, without \
        filtering.

    Raises:
        TypeError: if `rogroup` is neither an ordered nor an unordered group
    """
    if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)):  # pylint: disable=undefined-variable
        elements = rogroup.get_AllIndexed()
    elif isinstance(rogroup, (UnorderedGroupType, UnorderedGroupIndexedType)):  # pylint: disable=undefined-variable
        elements = (rogroup.get_RegionRef() + rogroup.get_OrderedGroup() + rogroup.get_UnorderedGroup())
    else:
        # Without this branch `elements` stayed unbound and the loop below
        # failed with an opaque UnboundLocalError.
        raise TypeError("Expected an ordered or unordered reading order group, not '{}'".format(rogroup.__class__.__name__))
    regionrefs = []
    for elem in elements:
        regionrefs.append(elem.get_regionRef())
        # anything that is not a plain region reference is a nested group
        if not isinstance(elem, (RegionRefType, RegionRefIndexedType)):  # pylint: disable=undefined-variable
            regionrefs.extend(self._get_recursive_reading_order(elem))
    return regionrefs
|
|
14739
|
+
|
|
14740
|
+
def get_AllRegions(self, classes=None, order='document', depth=0):
    """
    Get all the ``*Region`` elements, or only those provided by `classes`.
    Return in document order, unless the top element is ``Page`` and
    `order` is ``reading-order`` or ``reading-order-only``.

    Arguments:
        classes (list): Classes of regions that shall be returned, \
            e.g. ``['Text', 'Image']``
        order ("document"|"reading-order"|"reading-order-only"): Whether to \
            return regions sorted by document order (``document``, default) or by
            reading order with regions not in the reading order at the end of the
            returned list (``reading-order``) or regions not in the reading order
            omitted (``reading-order-only``). The latter two are only available
            on page level.
        depth (int): Recursive depth to look for regions at, set to `0` for \
            all regions at any depth. Default: 0

    Returns:
        a list of :py:class:`TextRegionType`, :py:class:`ImageRegionType`, \
            :py:class:`LineDrawingRegionType`, :py:class:`GraphicRegionType`, \
            :py:class:`TableRegionType`, :py:class:`ChartRegionType`, \
            :py:class:`MapRegionType`, :py:class:`SeparatorRegionType`, \
            :py:class:`MathsRegionType`, :py:class:`ChemRegionType`, \
            :py:class:`MusicRegionType`, :py:class:`AdvertRegionType`, \
            :py:class:`NoiseRegionType`, :py:class:`UnknownRegionType`, \
            and/or :py:class:`CustomRegionType`

    Raises:
        ValueError: if `order` is not one of the supported values, or if
            `depth` is negative

    For example, to get all text anywhere on the page in reading order, use:
    ::
        '\\n'.join(line.get_TextEquiv()[0].Unicode
                   for region in page.get_AllRegions(classes=['Text'], depth=0, order='reading-order')
                   for line in region.get_TextLine())
    """
    # ValueError is a subclass of Exception, so callers catching the
    # formerly raised bare Exception keep working.
    if order not in ('document', 'reading-order', 'reading-order-only'):
        raise ValueError("Argument 'order' must be either 'document', 'reading-order' or 'reading-order-only', not '{}'".format(order))
    if depth < 0:
        raise ValueError("Argument 'depth' must be an integer greater-or-equal 0, not '{}'".format(depth))
    # the helper counts the start element as one level, hence depth + 1;
    # 0 keeps meaning "unlimited"
    ret = self._get_recursive_regions([self], depth + 1 if depth else 0, classes)
    # reading order is only honoured when called on the page itself
    if self.__class__.__name__ != 'PageType' or not order.startswith('reading-order'):
        return ret
    reading_order = self.get_ReadingOrder()
    if not reading_order:
        return ret
    ro_group = reading_order.get_OrderedGroup() or reading_order.get_UnorderedGroup()
    if not ro_group:
        return ret
    region_ids = self._get_recursive_reading_order(ro_group)
    if not region_ids:
        return ret
    id2region = {region.id: region for region in ret}
    # references that match none of the collected regions are ignored
    in_reading_order = [id2region[region_id] for region_id in region_ids if region_id in id2region]
    if order == 'reading-order-only':
        return in_reading_order
    return in_reading_order + [region for region in ret if region not in in_reading_order]
|
|
13994
14798
|
def set_orientation(self, orientation):
|
|
13995
14799
|
"""
|
|
13996
14800
|
Set deskewing angle to given `orientation` number.
|
|
@@ -14199,6 +15003,106 @@ class GraphicRegionType(RegionType):
|
|
|
14199
15003
|
pass
|
|
14200
15004
|
def __hash__(self):
|
|
14201
15005
|
return hash(self.id)
|
|
15006
|
+
# pylint: disable=line-too-long,invalid-name,protected-access,missing-module-docstring
|
|
15007
|
+
def _region_class(self, x): # pylint: disable=unused-argument
|
|
15008
|
+
return x.__class__.__name__.replace('RegionType', '')
|
|
15009
|
+
|
|
15010
|
+
def _get_recursive_regions(self, regions, level, classes=None):
    """
    Collect `regions` together with the regions nested inside them.

    Arguments:
        regions (list): elements to start from
        level (int): remaining number of levels to descend; ``1`` stops \
            the descent, ``0`` never reaches that stop value, so the \
            descent continues until no element has nested regions
        classes (list): if non-empty, keep only regions whose class name \
            (without ``RegionType``) is listed

    Returns:
        a flat list in which every element is followed by the regions \
        found below it
    """
    from .constants import PAGE_REGION_TYPES  # pylint: disable=relative-beyond-top-level,import-outside-toplevel
    if level == 1:
        # bottom of the descent: apply the class filter ...
        if classes:
            return [candidate for candidate in regions if self._region_class(candidate) in classes]
        # ... or, without a filter, only drop a leading page element
        if regions and regions[0].__class__.__name__ == 'PageType':
            return regions[1:]
        return regions
    # gather the direct child regions, one list per parent
    children_per_parent = []
    for parent in regions:
        children = []
        for region_type in PAGE_REGION_TYPES:
            # 'Map' is not recursive in 2019 schema
            if region_type == 'Map' and not isinstance(parent, PageType):  # pylint: disable=undefined-variable
                continue
            children.extend(getattr(parent, 'get_{}Region'.format(region_type))())
        children_per_parent.append(children)
    if not any(children_per_parent):
        # nothing nested anywhere: finish as if the last level was reached
        return self._get_recursive_regions(regions, 1, classes)
    next_level = level - 1 if level else 0
    collected = []
    for parent, children in zip(regions, children_per_parent):
        collected.append(parent)
        collected.extend(self._get_recursive_regions(children, next_level, classes))
    return self._get_recursive_regions(collected, 1, classes)
|
|
15035
|
+
|
|
15036
|
+
def _get_recursive_reading_order(self, rogroup):
    """
    Flatten a reading-order group into the list of its ``regionRef`` values.

    The group is walked depth-first: every member contributes its own
    ``regionRef``, and every member that is not a plain region reference
    is descended into right after.

    Arguments:
        rogroup: an ordered or unordered reading-order group \
            (indexed or not)

    Returns:
        a list of the ``regionRef`` values found, in traversal order. \
        Values are appended as returned by ``get_regionRef()``, without \
        filtering.

    Raises:
        TypeError: if `rogroup` is neither an ordered nor an unordered group
    """
    if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)):  # pylint: disable=undefined-variable
        elements = rogroup.get_AllIndexed()
    elif isinstance(rogroup, (UnorderedGroupType, UnorderedGroupIndexedType)):  # pylint: disable=undefined-variable
        elements = (rogroup.get_RegionRef() + rogroup.get_OrderedGroup() + rogroup.get_UnorderedGroup())
    else:
        # Without this branch `elements` stayed unbound and the loop below
        # failed with an opaque UnboundLocalError.
        raise TypeError("Expected an ordered or unordered reading order group, not '{}'".format(rogroup.__class__.__name__))
    regionrefs = []
    for elem in elements:
        regionrefs.append(elem.get_regionRef())
        # anything that is not a plain region reference is a nested group
        if not isinstance(elem, (RegionRefType, RegionRefIndexedType)):  # pylint: disable=undefined-variable
            regionrefs.extend(self._get_recursive_reading_order(elem))
    return regionrefs
|
|
15047
|
+
|
|
15048
|
+
def get_AllRegions(self, classes=None, order='document', depth=0):
|
|
15049
|
+
"""
|
|
15050
|
+
Get all the ``*Region`` elements, or only those provided by `classes`.
|
|
15051
|
+
Return in document order, unless the top element is ``Page`` and
|
|
15052
|
+
`order` is ``reading-order``.
|
|
15053
|
+
|
|
15054
|
+
Arguments:
|
|
15055
|
+
classes (list): Classes of regions that shall be returned, \
|
|
15056
|
+
e.g. ``['Text', 'Image']``
|
|
15057
|
+
order ("document"|"reading-order"|"reading-order-only"): Whether to \
|
|
15058
|
+
return regions sorted by document order (``document``, default) or by
|
|
15059
|
+
reading order with regions not in the reading order at the end of the
|
|
15060
|
+
returned list (``reading-order``) or regions not in the reading order
|
|
15061
|
+
omitted (``reading-order-only``). The latter two are only available
|
|
15062
|
+
on page level.
|
|
15063
|
+
depth (int): Recursive depth to look for regions at, set to `0` for \
|
|
15064
|
+
all regions at any depth. Default: 0
|
|
15065
|
+
|
|
15066
|
+
Returns:
|
|
15067
|
+
a list of :py:class:`TextRegionType`, :py:class:`ImageRegionType`, \
|
|
15068
|
+
:py:class:`LineDrawingRegionType`, :py:class:`GraphicRegionType`, \
|
|
15069
|
+
:py:class:`TableRegionType`, :py:class:`ChartRegionType`, \
|
|
15070
|
+
:py:class:`MapRegionType`, :py:class:`SeparatorRegionType`, \
|
|
15071
|
+
:py:class:`MathsRegionType`, :py:class:`ChemRegionType`, \
|
|
15072
|
+
:py:class:`MusicRegionType`, :py:class:`AdvertRegionType`, \
|
|
15073
|
+
:py:class:`NoiseRegionType`, :py:class:`UnknownRegionType`, \
|
|
15074
|
+
and/or :py:class:`CustomRegionType`
|
|
15075
|
+
|
|
15076
|
+
For example, to get all text anywhere on the page in reading order, use:
|
|
15077
|
+
::
|
|
15078
|
+
'\\n'.join(line.get_TextEquiv()[0].Unicode
|
|
15079
|
+
for region in page.get_AllRegions(classes=['Text'], depth=0, order='reading-order')
|
|
15080
|
+
for line in region.get_TextLine())
|
|
15081
|
+
"""
|
|
15082
|
+
if order not in ['document', 'reading-order', 'reading-order-only']:
|
|
15083
|
+
raise Exception("Argument 'order' must be either 'document', 'reading-order' or 'reading-order-only', not '{}'".format(order))
|
|
15084
|
+
if depth < 0:
|
|
15085
|
+
raise Exception("Argument 'depth' must be an integer greater-or-equal 0, not '{}'".format(depth))
|
|
15086
|
+
ret = self._get_recursive_regions([self], depth + 1 if depth else 0, classes)
|
|
15087
|
+
if self.__class__.__name__ == 'PageType' and order.startswith('reading-order'):
|
|
15088
|
+
reading_order = self.get_ReadingOrder()
|
|
15089
|
+
if reading_order:
|
|
15090
|
+
reading_order = reading_order.get_OrderedGroup() or reading_order.get_UnorderedGroup()
|
|
15091
|
+
if reading_order:
|
|
15092
|
+
reading_order = self._get_recursive_reading_order(reading_order)
|
|
15093
|
+
if reading_order:
|
|
15094
|
+
id2region = {region.id: region for region in ret}
|
|
15095
|
+
in_reading_order = [id2region[region_id] for region_id in reading_order if region_id in id2region]
|
|
15096
|
+
# print("ret: {} / in_ro: {} / not-in-ro: {}".format(
|
|
15097
|
+
# len(ret),
|
|
15098
|
+
# len([id2region[region_id] for region_id in reading_order if region_id in id2region]),
|
|
15099
|
+
# len([r for r in ret if r not in in_reading_order])
|
|
15100
|
+
# ))
|
|
15101
|
+
if order == 'reading-order-only':
|
|
15102
|
+
ret = in_reading_order
|
|
15103
|
+
else:
|
|
15104
|
+
ret = in_reading_order + [r for r in ret if r not in in_reading_order]
|
|
15105
|
+
return ret
|
|
14202
15106
|
def set_orientation(self, orientation):
|
|
14203
15107
|
"""
|
|
14204
15108
|
Set deskewing angle to given `orientation` number.
|
|
@@ -14407,6 +15311,106 @@ class LineDrawingRegionType(RegionType):
|
|
|
14407
15311
|
pass
|
|
14408
15312
|
def __hash__(self):
|
|
14409
15313
|
return hash(self.id)
|
|
15314
|
+
# pylint: disable=line-too-long,invalid-name,protected-access,missing-module-docstring
|
|
15315
|
+
def _region_class(self, x): # pylint: disable=unused-argument
|
|
15316
|
+
return x.__class__.__name__.replace('RegionType', '')
|
|
15317
|
+
|
|
15318
|
+
def _get_recursive_regions(self, regions, level, classes=None):
|
|
15319
|
+
from .constants import PAGE_REGION_TYPES # pylint: disable=relative-beyond-top-level,import-outside-toplevel
|
|
15320
|
+
if level == 1:
|
|
15321
|
+
# stop recursion, filter classes
|
|
15322
|
+
if classes:
|
|
15323
|
+
return [r for r in regions if self._region_class(r) in classes]
|
|
15324
|
+
if regions and regions[0].__class__.__name__ == 'PageType':
|
|
15325
|
+
regions = regions[1:]
|
|
15326
|
+
return regions
|
|
15327
|
+
# find more regions recursively
|
|
15328
|
+
more_regions = []
|
|
15329
|
+
for region in regions:
|
|
15330
|
+
more_regions.append([])
|
|
15331
|
+
for class_ in PAGE_REGION_TYPES:
|
|
15332
|
+
if class_ == 'Map' and not isinstance(region, PageType): # pylint: disable=undefined-variable
|
|
15333
|
+
# 'Map' is not recursive in 2019 schema
|
|
15334
|
+
continue
|
|
15335
|
+
more_regions[-1] += getattr(region, 'get_{}Region'.format(class_))()
|
|
15336
|
+
if not any(more_regions):
|
|
15337
|
+
return self._get_recursive_regions(regions, 1, classes)
|
|
15338
|
+
ret = []
|
|
15339
|
+
for r, more in zip(regions, more_regions):
|
|
15340
|
+
ret.append(r)
|
|
15341
|
+
ret += self._get_recursive_regions(more, level - 1 if level else 0, classes)
|
|
15342
|
+
return self._get_recursive_regions(ret, 1, classes)
|
|
15343
|
+
|
|
15344
|
+
def _get_recursive_reading_order(self, rogroup):
|
|
15345
|
+
if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)): # pylint: disable=undefined-variable
|
|
15346
|
+
elements = rogroup.get_AllIndexed()
|
|
15347
|
+
if isinstance(rogroup, (UnorderedGroupType, UnorderedGroupIndexedType)): # pylint: disable=undefined-variable
|
|
15348
|
+
elements = (rogroup.get_RegionRef() + rogroup.get_OrderedGroup() + rogroup.get_UnorderedGroup())
|
|
15349
|
+
regionrefs = list()
|
|
15350
|
+
for elem in elements:
|
|
15351
|
+
regionrefs.append(elem.get_regionRef())
|
|
15352
|
+
if not isinstance(elem, (RegionRefType, RegionRefIndexedType)): # pylint: disable=undefined-variable
|
|
15353
|
+
regionrefs.extend(self._get_recursive_reading_order(elem))
|
|
15354
|
+
return regionrefs
|
|
15355
|
+
|
|
15356
|
+
def get_AllRegions(self, classes=None, order='document', depth=0):
|
|
15357
|
+
"""
|
|
15358
|
+
Get all the ``*Region`` elements, or only those provided by `classes`.
|
|
15359
|
+
Return in document order, unless the top element is ``Page`` and
|
|
15360
|
+
`order` is ``reading-order``.
|
|
15361
|
+
|
|
15362
|
+
Arguments:
|
|
15363
|
+
classes (list): Classes of regions that shall be returned, \
|
|
15364
|
+
e.g. ``['Text', 'Image']``
|
|
15365
|
+
order ("document"|"reading-order"|"reading-order-only"): Whether to \
|
|
15366
|
+
return regions sorted by document order (``document``, default) or by
|
|
15367
|
+
reading order with regions not in the reading order at the end of the
|
|
15368
|
+
returned list (``reading-order``) or regions not in the reading order
|
|
15369
|
+
omitted (``reading-order-only``). The latter two are only available
|
|
15370
|
+
on page level.
|
|
15371
|
+
depth (int): Recursive depth to look for regions at, set to `0` for \
|
|
15372
|
+
all regions at any depth. Default: 0
|
|
15373
|
+
|
|
15374
|
+
Returns:
|
|
15375
|
+
a list of :py:class:`TextRegionType`, :py:class:`ImageRegionType`, \
|
|
15376
|
+
:py:class:`LineDrawingRegionType`, :py:class:`GraphicRegionType`, \
|
|
15377
|
+
:py:class:`TableRegionType`, :py:class:`ChartRegionType`, \
|
|
15378
|
+
:py:class:`MapRegionType`, :py:class:`SeparatorRegionType`, \
|
|
15379
|
+
:py:class:`MathsRegionType`, :py:class:`ChemRegionType`, \
|
|
15380
|
+
:py:class:`MusicRegionType`, :py:class:`AdvertRegionType`, \
|
|
15381
|
+
:py:class:`NoiseRegionType`, :py:class:`UnknownRegionType`, \
|
|
15382
|
+
and/or :py:class:`CustomRegionType`
|
|
15383
|
+
|
|
15384
|
+
For example, to get all text anywhere on the page in reading order, use:
|
|
15385
|
+
::
|
|
15386
|
+
'\\n'.join(line.get_TextEquiv()[0].Unicode
|
|
15387
|
+
for region in page.get_AllRegions(classes=['Text'], depth=0, order='reading-order')
|
|
15388
|
+
for line in region.get_TextLine())
|
|
15389
|
+
"""
|
|
15390
|
+
if order not in ['document', 'reading-order', 'reading-order-only']:
|
|
15391
|
+
raise Exception("Argument 'order' must be either 'document', 'reading-order' or 'reading-order-only', not '{}'".format(order))
|
|
15392
|
+
if depth < 0:
|
|
15393
|
+
raise Exception("Argument 'depth' must be an integer greater-or-equal 0, not '{}'".format(depth))
|
|
15394
|
+
ret = self._get_recursive_regions([self], depth + 1 if depth else 0, classes)
|
|
15395
|
+
if self.__class__.__name__ == 'PageType' and order.startswith('reading-order'):
|
|
15396
|
+
reading_order = self.get_ReadingOrder()
|
|
15397
|
+
if reading_order:
|
|
15398
|
+
reading_order = reading_order.get_OrderedGroup() or reading_order.get_UnorderedGroup()
|
|
15399
|
+
if reading_order:
|
|
15400
|
+
reading_order = self._get_recursive_reading_order(reading_order)
|
|
15401
|
+
if reading_order:
|
|
15402
|
+
id2region = {region.id: region for region in ret}
|
|
15403
|
+
in_reading_order = [id2region[region_id] for region_id in reading_order if region_id in id2region]
|
|
15404
|
+
# print("ret: {} / in_ro: {} / not-in-ro: {}".format(
|
|
15405
|
+
# len(ret),
|
|
15406
|
+
# len([id2region[region_id] for region_id in reading_order if region_id in id2region]),
|
|
15407
|
+
# len([r for r in ret if r not in in_reading_order])
|
|
15408
|
+
# ))
|
|
15409
|
+
if order == 'reading-order-only':
|
|
15410
|
+
ret = in_reading_order
|
|
15411
|
+
else:
|
|
15412
|
+
ret = in_reading_order + [r for r in ret if r not in in_reading_order]
|
|
15413
|
+
return ret
|
|
14410
15414
|
def set_orientation(self, orientation):
|
|
14411
15415
|
"""
|
|
14412
15416
|
Set deskewing angle to given `orientation` number.
|
|
@@ -14628,6 +15632,106 @@ class ImageRegionType(RegionType):
|
|
|
14628
15632
|
pass
|
|
14629
15633
|
def __hash__(self):
|
|
14630
15634
|
return hash(self.id)
|
|
15635
|
+
# pylint: disable=line-too-long,invalid-name,protected-access,missing-module-docstring
|
|
15636
|
+
def _region_class(self, x): # pylint: disable=unused-argument
|
|
15637
|
+
return x.__class__.__name__.replace('RegionType', '')
|
|
15638
|
+
|
|
15639
|
+
def _get_recursive_regions(self, regions, level, classes=None):
|
|
15640
|
+
from .constants import PAGE_REGION_TYPES # pylint: disable=relative-beyond-top-level,import-outside-toplevel
|
|
15641
|
+
if level == 1:
|
|
15642
|
+
# stop recursion, filter classes
|
|
15643
|
+
if classes:
|
|
15644
|
+
return [r for r in regions if self._region_class(r) in classes]
|
|
15645
|
+
if regions and regions[0].__class__.__name__ == 'PageType':
|
|
15646
|
+
regions = regions[1:]
|
|
15647
|
+
return regions
|
|
15648
|
+
# find more regions recursively
|
|
15649
|
+
more_regions = []
|
|
15650
|
+
for region in regions:
|
|
15651
|
+
more_regions.append([])
|
|
15652
|
+
for class_ in PAGE_REGION_TYPES:
|
|
15653
|
+
if class_ == 'Map' and not isinstance(region, PageType): # pylint: disable=undefined-variable
|
|
15654
|
+
# 'Map' is not recursive in 2019 schema
|
|
15655
|
+
continue
|
|
15656
|
+
more_regions[-1] += getattr(region, 'get_{}Region'.format(class_))()
|
|
15657
|
+
if not any(more_regions):
|
|
15658
|
+
return self._get_recursive_regions(regions, 1, classes)
|
|
15659
|
+
ret = []
|
|
15660
|
+
for r, more in zip(regions, more_regions):
|
|
15661
|
+
ret.append(r)
|
|
15662
|
+
ret += self._get_recursive_regions(more, level - 1 if level else 0, classes)
|
|
15663
|
+
return self._get_recursive_regions(ret, 1, classes)
|
|
15664
|
+
|
|
15665
|
+
def _get_recursive_reading_order(self, rogroup):
|
|
15666
|
+
if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)): # pylint: disable=undefined-variable
|
|
15667
|
+
elements = rogroup.get_AllIndexed()
|
|
15668
|
+
if isinstance(rogroup, (UnorderedGroupType, UnorderedGroupIndexedType)): # pylint: disable=undefined-variable
|
|
15669
|
+
elements = (rogroup.get_RegionRef() + rogroup.get_OrderedGroup() + rogroup.get_UnorderedGroup())
|
|
15670
|
+
regionrefs = list()
|
|
15671
|
+
for elem in elements:
|
|
15672
|
+
regionrefs.append(elem.get_regionRef())
|
|
15673
|
+
if not isinstance(elem, (RegionRefType, RegionRefIndexedType)): # pylint: disable=undefined-variable
|
|
15674
|
+
regionrefs.extend(self._get_recursive_reading_order(elem))
|
|
15675
|
+
return regionrefs
|
|
15676
|
+
|
|
15677
|
+
def get_AllRegions(self, classes=None, order='document', depth=0):
|
|
15678
|
+
"""
|
|
15679
|
+
Get all the ``*Region`` elements, or only those provided by `classes`.
|
|
15680
|
+
Return in document order, unless the top element is ``Page`` and
|
|
15681
|
+
`order` is ``reading-order``.
|
|
15682
|
+
|
|
15683
|
+
Arguments:
|
|
15684
|
+
classes (list): Classes of regions that shall be returned, \
|
|
15685
|
+
e.g. ``['Text', 'Image']``
|
|
15686
|
+
order ("document"|"reading-order"|"reading-order-only"): Whether to \
|
|
15687
|
+
return regions sorted by document order (``document``, default) or by
|
|
15688
|
+
reading order with regions not in the reading order at the end of the
|
|
15689
|
+
returned list (``reading-order``) or regions not in the reading order
|
|
15690
|
+
omitted (``reading-order-only``). The latter two are only available
|
|
15691
|
+
on page level.
|
|
15692
|
+
depth (int): Recursive depth to look for regions at, set to `0` for \
|
|
15693
|
+
all regions at any depth. Default: 0
|
|
15694
|
+
|
|
15695
|
+
Returns:
|
|
15696
|
+
a list of :py:class:`TextRegionType`, :py:class:`ImageRegionType`, \
|
|
15697
|
+
:py:class:`LineDrawingRegionType`, :py:class:`GraphicRegionType`, \
|
|
15698
|
+
:py:class:`TableRegionType`, :py:class:`ChartRegionType`, \
|
|
15699
|
+
:py:class:`MapRegionType`, :py:class:`SeparatorRegionType`, \
|
|
15700
|
+
:py:class:`MathsRegionType`, :py:class:`ChemRegionType`, \
|
|
15701
|
+
:py:class:`MusicRegionType`, :py:class:`AdvertRegionType`, \
|
|
15702
|
+
:py:class:`NoiseRegionType`, :py:class:`UnknownRegionType`, \
|
|
15703
|
+
and/or :py:class:`CustomRegionType`
|
|
15704
|
+
|
|
15705
|
+
For example, to get all text anywhere on the page in reading order, use:
|
|
15706
|
+
::
|
|
15707
|
+
'\\n'.join(line.get_TextEquiv()[0].Unicode
|
|
15708
|
+
for region in page.get_AllRegions(classes=['Text'], depth=0, order='reading-order')
|
|
15709
|
+
for line in region.get_TextLine())
|
|
15710
|
+
"""
|
|
15711
|
+
if order not in ['document', 'reading-order', 'reading-order-only']:
|
|
15712
|
+
raise Exception("Argument 'order' must be either 'document', 'reading-order' or 'reading-order-only', not '{}'".format(order))
|
|
15713
|
+
if depth < 0:
|
|
15714
|
+
raise Exception("Argument 'depth' must be an integer greater-or-equal 0, not '{}'".format(depth))
|
|
15715
|
+
ret = self._get_recursive_regions([self], depth + 1 if depth else 0, classes)
|
|
15716
|
+
if self.__class__.__name__ == 'PageType' and order.startswith('reading-order'):
|
|
15717
|
+
reading_order = self.get_ReadingOrder()
|
|
15718
|
+
if reading_order:
|
|
15719
|
+
reading_order = reading_order.get_OrderedGroup() or reading_order.get_UnorderedGroup()
|
|
15720
|
+
if reading_order:
|
|
15721
|
+
reading_order = self._get_recursive_reading_order(reading_order)
|
|
15722
|
+
if reading_order:
|
|
15723
|
+
id2region = {region.id: region for region in ret}
|
|
15724
|
+
in_reading_order = [id2region[region_id] for region_id in reading_order if region_id in id2region]
|
|
15725
|
+
# print("ret: {} / in_ro: {} / not-in-ro: {}".format(
|
|
15726
|
+
# len(ret),
|
|
15727
|
+
# len([id2region[region_id] for region_id in reading_order if region_id in id2region]),
|
|
15728
|
+
# len([r for r in ret if r not in in_reading_order])
|
|
15729
|
+
# ))
|
|
15730
|
+
if order == 'reading-order-only':
|
|
15731
|
+
ret = in_reading_order
|
|
15732
|
+
else:
|
|
15733
|
+
ret = in_reading_order + [r for r in ret if r not in in_reading_order]
|
|
15734
|
+
return ret
|
|
14631
15735
|
def set_orientation(self, orientation):
|
|
14632
15736
|
"""
|
|
14633
15737
|
Set deskewing angle to given `orientation` number.
|
|
@@ -15191,6 +16295,106 @@ class TextRegionType(RegionType):
|
|
|
15191
16295
|
super(TextRegionType, self)._buildChildren(child_, node, nodeName_, True)
|
|
15192
16296
|
def __hash__(self):
|
|
15193
16297
|
return hash(self.id)
|
|
16298
|
+
# pylint: disable=line-too-long,invalid-name,protected-access,missing-module-docstring
|
|
16299
|
+
def _region_class(self, x): # pylint: disable=unused-argument
|
|
16300
|
+
return x.__class__.__name__.replace('RegionType', '')
|
|
16301
|
+
|
|
16302
|
+
def _get_recursive_regions(self, regions, level, classes=None):
|
|
16303
|
+
from .constants import PAGE_REGION_TYPES # pylint: disable=relative-beyond-top-level,import-outside-toplevel
|
|
16304
|
+
if level == 1:
|
|
16305
|
+
# stop recursion, filter classes
|
|
16306
|
+
if classes:
|
|
16307
|
+
return [r for r in regions if self._region_class(r) in classes]
|
|
16308
|
+
if regions and regions[0].__class__.__name__ == 'PageType':
|
|
16309
|
+
regions = regions[1:]
|
|
16310
|
+
return regions
|
|
16311
|
+
# find more regions recursively
|
|
16312
|
+
more_regions = []
|
|
16313
|
+
for region in regions:
|
|
16314
|
+
more_regions.append([])
|
|
16315
|
+
for class_ in PAGE_REGION_TYPES:
|
|
16316
|
+
if class_ == 'Map' and not isinstance(region, PageType): # pylint: disable=undefined-variable
|
|
16317
|
+
# 'Map' is not recursive in 2019 schema
|
|
16318
|
+
continue
|
|
16319
|
+
more_regions[-1] += getattr(region, 'get_{}Region'.format(class_))()
|
|
16320
|
+
if not any(more_regions):
|
|
16321
|
+
return self._get_recursive_regions(regions, 1, classes)
|
|
16322
|
+
ret = []
|
|
16323
|
+
for r, more in zip(regions, more_regions):
|
|
16324
|
+
ret.append(r)
|
|
16325
|
+
ret += self._get_recursive_regions(more, level - 1 if level else 0, classes)
|
|
16326
|
+
return self._get_recursive_regions(ret, 1, classes)
|
|
16327
|
+
|
|
16328
|
+
def _get_recursive_reading_order(self, rogroup):
|
|
16329
|
+
if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)): # pylint: disable=undefined-variable
|
|
16330
|
+
elements = rogroup.get_AllIndexed()
|
|
16331
|
+
if isinstance(rogroup, (UnorderedGroupType, UnorderedGroupIndexedType)): # pylint: disable=undefined-variable
|
|
16332
|
+
elements = (rogroup.get_RegionRef() + rogroup.get_OrderedGroup() + rogroup.get_UnorderedGroup())
|
|
16333
|
+
regionrefs = list()
|
|
16334
|
+
for elem in elements:
|
|
16335
|
+
regionrefs.append(elem.get_regionRef())
|
|
16336
|
+
if not isinstance(elem, (RegionRefType, RegionRefIndexedType)): # pylint: disable=undefined-variable
|
|
16337
|
+
regionrefs.extend(self._get_recursive_reading_order(elem))
|
|
16338
|
+
return regionrefs
|
|
16339
|
+
|
|
16340
|
+
def get_AllRegions(self, classes=None, order='document', depth=0):
|
|
16341
|
+
"""
|
|
16342
|
+
Get all the ``*Region`` elements, or only those provided by `classes`.
|
|
16343
|
+
Return in document order, unless the top element is ``Page`` and
|
|
16344
|
+
`order` is ``reading-order``.
|
|
16345
|
+
|
|
16346
|
+
Arguments:
|
|
16347
|
+
classes (list): Classes of regions that shall be returned, \
|
|
16348
|
+
e.g. ``['Text', 'Image']``
|
|
16349
|
+
order ("document"|"reading-order"|"reading-order-only"): Whether to \
|
|
16350
|
+
return regions sorted by document order (``document``, default) or by
|
|
16351
|
+
reading order with regions not in the reading order at the end of the
|
|
16352
|
+
returned list (``reading-order``) or regions not in the reading order
|
|
16353
|
+
omitted (``reading-order-only``). The latter two are only available
|
|
16354
|
+
on page level.
|
|
16355
|
+
depth (int): Recursive depth to look for regions at, set to `0` for \
|
|
16356
|
+
all regions at any depth. Default: 0
|
|
16357
|
+
|
|
16358
|
+
Returns:
|
|
16359
|
+
a list of :py:class:`TextRegionType`, :py:class:`ImageRegionType`, \
|
|
16360
|
+
:py:class:`LineDrawingRegionType`, :py:class:`GraphicRegionType`, \
|
|
16361
|
+
:py:class:`TableRegionType`, :py:class:`ChartRegionType`, \
|
|
16362
|
+
:py:class:`MapRegionType`, :py:class:`SeparatorRegionType`, \
|
|
16363
|
+
:py:class:`MathsRegionType`, :py:class:`ChemRegionType`, \
|
|
16364
|
+
:py:class:`MusicRegionType`, :py:class:`AdvertRegionType`, \
|
|
16365
|
+
:py:class:`NoiseRegionType`, :py:class:`UnknownRegionType`, \
|
|
16366
|
+
and/or :py:class:`CustomRegionType`
|
|
16367
|
+
|
|
16368
|
+
For example, to get all text anywhere on the page in reading order, use:
|
|
16369
|
+
::
|
|
16370
|
+
'\\n'.join(line.get_TextEquiv()[0].Unicode
|
|
16371
|
+
for region in page.get_AllRegions(classes=['Text'], depth=0, order='reading-order')
|
|
16372
|
+
for line in region.get_TextLine())
|
|
16373
|
+
"""
|
|
16374
|
+
if order not in ['document', 'reading-order', 'reading-order-only']:
|
|
16375
|
+
raise Exception("Argument 'order' must be either 'document', 'reading-order' or 'reading-order-only', not '{}'".format(order))
|
|
16376
|
+
if depth < 0:
|
|
16377
|
+
raise Exception("Argument 'depth' must be an integer greater-or-equal 0, not '{}'".format(depth))
|
|
16378
|
+
ret = self._get_recursive_regions([self], depth + 1 if depth else 0, classes)
|
|
16379
|
+
if self.__class__.__name__ == 'PageType' and order.startswith('reading-order'):
|
|
16380
|
+
reading_order = self.get_ReadingOrder()
|
|
16381
|
+
if reading_order:
|
|
16382
|
+
reading_order = reading_order.get_OrderedGroup() or reading_order.get_UnorderedGroup()
|
|
16383
|
+
if reading_order:
|
|
16384
|
+
reading_order = self._get_recursive_reading_order(reading_order)
|
|
16385
|
+
if reading_order:
|
|
16386
|
+
id2region = {region.id: region for region in ret}
|
|
16387
|
+
in_reading_order = [id2region[region_id] for region_id in reading_order if region_id in id2region]
|
|
16388
|
+
# print("ret: {} / in_ro: {} / not-in-ro: {}".format(
|
|
16389
|
+
# len(ret),
|
|
16390
|
+
# len([id2region[region_id] for region_id in reading_order if region_id in id2region]),
|
|
16391
|
+
# len([r for r in ret if r not in in_reading_order])
|
|
16392
|
+
# ))
|
|
16393
|
+
if order == 'reading-order-only':
|
|
16394
|
+
ret = in_reading_order
|
|
16395
|
+
else:
|
|
16396
|
+
ret = in_reading_order + [r for r in ret if r not in in_reading_order]
|
|
16397
|
+
return ret
|
|
15194
16398
|
def set_orientation(self, orientation):
|
|
15195
16399
|
"""
|
|
15196
16400
|
Set deskewing angle to given `orientation` number.
|