pdf2docx-plus 0.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdf2docx_plus/__init__.py +41 -0
- pdf2docx_plus/_vendored/__init__.py +6 -0
- pdf2docx_plus/_vendored/pdf2docx/__init__.py +3 -0
- pdf2docx_plus/_vendored/pdf2docx/common/Block.py +144 -0
- pdf2docx_plus/_vendored/pdf2docx/common/Collection.py +359 -0
- pdf2docx_plus/_vendored/pdf2docx/common/Element.py +312 -0
- pdf2docx_plus/_vendored/pdf2docx/common/__init__.py +0 -0
- pdf2docx_plus/_vendored/pdf2docx/common/algorithm.py +403 -0
- pdf2docx_plus/_vendored/pdf2docx/common/constants.py +90 -0
- pdf2docx_plus/_vendored/pdf2docx/common/docx.py +591 -0
- pdf2docx_plus/_vendored/pdf2docx/common/share.py +310 -0
- pdf2docx_plus/_vendored/pdf2docx/converter.py +481 -0
- pdf2docx_plus/_vendored/pdf2docx/font/Fonts.py +240 -0
- pdf2docx_plus/_vendored/pdf2docx/font/__init__.py +0 -0
- pdf2docx_plus/_vendored/pdf2docx/gui/App.py +37 -0
- pdf2docx_plus/_vendored/pdf2docx/gui/MainFrame.py +147 -0
- pdf2docx_plus/_vendored/pdf2docx/gui/__init__.py +0 -0
- pdf2docx_plus/_vendored/pdf2docx/image/Image.py +94 -0
- pdf2docx_plus/_vendored/pdf2docx/image/ImageBlock.py +81 -0
- pdf2docx_plus/_vendored/pdf2docx/image/ImageSpan.py +27 -0
- pdf2docx_plus/_vendored/pdf2docx/image/ImagesExtractor.py +496 -0
- pdf2docx_plus/_vendored/pdf2docx/image/__init__.py +0 -0
- pdf2docx_plus/_vendored/pdf2docx/layout/Blocks.py +650 -0
- pdf2docx_plus/_vendored/pdf2docx/layout/Column.py +49 -0
- pdf2docx_plus/_vendored/pdf2docx/layout/Layout.py +177 -0
- pdf2docx_plus/_vendored/pdf2docx/layout/Section.py +97 -0
- pdf2docx_plus/_vendored/pdf2docx/layout/Sections.py +91 -0
- pdf2docx_plus/_vendored/pdf2docx/layout/__init__.py +0 -0
- pdf2docx_plus/_vendored/pdf2docx/main.py +135 -0
- pdf2docx_plus/_vendored/pdf2docx/page/BasePage.py +27 -0
- pdf2docx_plus/_vendored/pdf2docx/page/Page.py +211 -0
- pdf2docx_plus/_vendored/pdf2docx/page/Pages.py +90 -0
- pdf2docx_plus/_vendored/pdf2docx/page/RawPage.py +279 -0
- pdf2docx_plus/_vendored/pdf2docx/page/RawPageFactory.py +23 -0
- pdf2docx_plus/_vendored/pdf2docx/page/RawPageFitz.py +164 -0
- pdf2docx_plus/_vendored/pdf2docx/page/__init__.py +0 -0
- pdf2docx_plus/_vendored/pdf2docx/shape/Path.py +405 -0
- pdf2docx_plus/_vendored/pdf2docx/shape/Paths.py +142 -0
- pdf2docx_plus/_vendored/pdf2docx/shape/Shape.py +365 -0
- pdf2docx_plus/_vendored/pdf2docx/shape/Shapes.py +241 -0
- pdf2docx_plus/_vendored/pdf2docx/shape/__init__.py +0 -0
- pdf2docx_plus/_vendored/pdf2docx/table/Border.py +419 -0
- pdf2docx_plus/_vendored/pdf2docx/table/Cell.py +165 -0
- pdf2docx_plus/_vendored/pdf2docx/table/Cells.py +27 -0
- pdf2docx_plus/_vendored/pdf2docx/table/Row.py +78 -0
- pdf2docx_plus/_vendored/pdf2docx/table/Rows.py +25 -0
- pdf2docx_plus/_vendored/pdf2docx/table/TableBlock.py +174 -0
- pdf2docx_plus/_vendored/pdf2docx/table/TableStructure.py +634 -0
- pdf2docx_plus/_vendored/pdf2docx/table/TablesConstructor.py +382 -0
- pdf2docx_plus/_vendored/pdf2docx/table/__init__.py +0 -0
- pdf2docx_plus/_vendored/pdf2docx/text/Char.py +65 -0
- pdf2docx_plus/_vendored/pdf2docx/text/Line.py +179 -0
- pdf2docx_plus/_vendored/pdf2docx/text/Lines.py +281 -0
- pdf2docx_plus/_vendored/pdf2docx/text/Spans.py +59 -0
- pdf2docx_plus/_vendored/pdf2docx/text/TextBlock.py +471 -0
- pdf2docx_plus/_vendored/pdf2docx/text/TextSpan.py +439 -0
- pdf2docx_plus/_vendored/pdf2docx/text/__init__.py +0 -0
- pdf2docx_plus/api.py +870 -0
- pdf2docx_plus/backends/__init__.py +124 -0
- pdf2docx_plus/cli.py +145 -0
- pdf2docx_plus/consolidate.py +73 -0
- pdf2docx_plus/emit/__init__.py +60 -0
- pdf2docx_plus/emit/headers_footers.py +111 -0
- pdf2docx_plus/emit/lists.py +229 -0
- pdf2docx_plus/emit/page_breaks.py +57 -0
- pdf2docx_plus/emit/page_footer.py +259 -0
- pdf2docx_plus/emit/sections.py +252 -0
- pdf2docx_plus/emit/table_fit.py +254 -0
- pdf2docx_plus/emit/tables_cleanup.py +302 -0
- pdf2docx_plus/emit/whitespace.py +55 -0
- pdf2docx_plus/emit/word_spacing.py +119 -0
- pdf2docx_plus/errors.py +53 -0
- pdf2docx_plus/fidelity/__init__.py +25 -0
- pdf2docx_plus/fidelity/crashguards.py +217 -0
- pdf2docx_plus/fidelity/hyperlink.py +56 -0
- pdf2docx_plus/fidelity/styles.py +31 -0
- pdf2docx_plus/fidelity/text.py +38 -0
- pdf2docx_plus/fidelity/tty.py +22 -0
- pdf2docx_plus/hooks/__init__.py +29 -0
- pdf2docx_plus/hooks/formula_ocr.py +82 -0
- pdf2docx_plus/hooks/layout_detection.py +43 -0
- pdf2docx_plus/hooks/ocr.py +38 -0
- pdf2docx_plus/hooks/table_transformer.py +107 -0
- pdf2docx_plus/images/__init__.py +40 -0
- pdf2docx_plus/images/recovery.py +285 -0
- pdf2docx_plus/layout/__init__.py +20 -0
- pdf2docx_plus/layout/hf_detect.py +158 -0
- pdf2docx_plus/layout/lists.py +103 -0
- pdf2docx_plus/layout/scanned.py +76 -0
- pdf2docx_plus/logging.py +43 -0
- pdf2docx_plus/plugins/__init__.py +36 -0
- pdf2docx_plus/plugins/base.py +62 -0
- pdf2docx_plus/plugins/registry.py +45 -0
- pdf2docx_plus/py.typed +0 -0
- pdf2docx_plus/server.py +90 -0
- pdf2docx_plus/styles/__init__.py +144 -0
- pdf2docx_plus/tables/__init__.py +19 -0
- pdf2docx_plus/tables/float_images.py +97 -0
- pdf2docx_plus/tables/stitch.py +219 -0
- pdf2docx_plus/version.py +1 -0
- pdf2docx_plus-0.6.1.dist-info/METADATA +236 -0
- pdf2docx_plus-0.6.1.dist-info/RECORD +105 -0
- pdf2docx_plus-0.6.1.dist-info/WHEEL +4 -0
- pdf2docx_plus-0.6.1.dist-info/entry_points.txt +2 -0
- pdf2docx_plus-0.6.1.dist-info/licenses/LICENSE +7 -0
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""pdf2docx-plus: hardened PDF -> DOCX converter.
|
|
2
|
+
|
|
3
|
+
Public API:
|
|
4
|
+
|
|
5
|
+
from pdf2docx_plus import Converter, convert, ConversionResult
|
|
6
|
+
|
|
7
|
+
result = convert("in.pdf", "out.docx", timeout_s=60)
|
|
8
|
+
print(result.pages_ok, result.pages_failed, result.elapsed_s)
|
|
9
|
+
|
|
10
|
+
Lower-level facade:
|
|
11
|
+
|
|
12
|
+
with Converter("in.pdf") as cv:
|
|
13
|
+
cv.convert("out.docx", pages=[0, 1, 2])
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
from .api import ConversionResult, Converter, convert, extract_tables
|
|
19
|
+
from .errors import (
|
|
20
|
+
ConversionError,
|
|
21
|
+
InputError,
|
|
22
|
+
MakeDocxError,
|
|
23
|
+
ParseError,
|
|
24
|
+
PasswordRequired,
|
|
25
|
+
TimeoutExceeded,
|
|
26
|
+
)
|
|
27
|
+
from .version import __version__
|
|
28
|
+
|
|
29
|
+
# Explicit public API of the package: these are the names exposed by
# ``from pdf2docx_plus import *``.
__all__ = [
    "ConversionError",
    "ConversionResult",
    "Converter",
    "InputError",
    "MakeDocxError",
    "ParseError",
    "PasswordRequired",
    "TimeoutExceeded",
    "__version__",
    "convert",
    "extract_tables",
]
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
|
|
3
|
+
'''Base class for text/image/table blocks.
|
|
4
|
+
'''
|
|
5
|
+
|
|
6
|
+
from .share import BlockType, TextAlignment
|
|
7
|
+
from .Element import Element
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class Block(Element):
    '''Base class for text/image/table blocks.

    Keeps the block type together with the horizontal and vertical spacing
    properties read from ``raw``; :meth:`store` writes the same properties back.

    Attributes:
        raw (dict): initialize object from raw properties.
        parent (optional): parent object that this block belongs to.
    '''
    def __init__(self, raw:dict=None, parent=None):
        # normalize a missing ``raw`` so every lookup below can use ``.get``
        props = {} if raw is None else raw

        # no concrete type until one of the ``set_*_block`` methods is called
        self._type = BlockType.UNDEFINED

        # horizontal spacing
        self.alignment = self._get_alignment(props.get('alignment', 0))
        self.left_space = props.get('left_space', 0.0)
        self.right_space = props.get('right_space', 0.0)
        self.first_line_space = props.get('first_line_space', 0.0)

        # RELATIVE position of tab stops
        self.tab_stops = props.get('tab_stops', [])

        # vertical spacing
        self.before_space = props.get('before_space', 0.0)
        self.after_space = props.get('after_space', 0.0)
        self.line_space = props.get('line_space', 0.0)
        self.line_space_type = props.get('line_space_type', 1) # 0-exactly, 1-relatively

        # the base class is initialized last, with the normalized dict
        super().__init__(props, parent)


    @property
    def is_text_block(self):
        '''Whether text block.'''
        return self._type == BlockType.TEXT

    @property
    def is_inline_image_block(self):
        '''Whether inline image block.'''
        return self._type == BlockType.IMAGE

    @property
    def is_float_image_block(self):
        '''Whether float image block.'''
        return self._type == BlockType.FLOAT_IMAGE

    @property
    def is_image_block(self):
        '''Whether inline or float image block.'''
        return self.is_inline_image_block or self.is_float_image_block

    @property
    def is_text_image_block(self):
        '''Whether text block or inline image block.'''
        return self.is_text_block or self.is_inline_image_block

    @property
    def is_lattice_table_block(self):
        '''Whether lattice table (explicit table borders) block.'''
        return self._type == BlockType.LATTICE_TABLE

    @property
    def is_stream_table_block(self):
        '''Whether stream table (implied by table content) block.'''
        return self._type == BlockType.STREAM_TABLE

    @property
    def is_table_block(self):
        '''Whether table (lattice or stream) block.'''
        return self.is_lattice_table_block or self.is_stream_table_block

    def set_text_block(self):
        '''Mark this block as a text block.'''
        self._type = BlockType.TEXT

    def set_inline_image_block(self):
        '''Mark this block as an inline image block.'''
        self._type = BlockType.IMAGE

    def set_float_image_block(self):
        '''Mark this block as a float image block.'''
        self._type = BlockType.FLOAT_IMAGE

    def set_lattice_table_block(self):
        '''Mark this block as a lattice table block.'''
        self._type = BlockType.LATTICE_TABLE

    def set_stream_table_block(self):
        '''Mark this block as a stream table block.'''
        self._type = BlockType.STREAM_TABLE

    def _get_alignment(self, mode:int):
        '''Map a stored alignment value to its ``TextAlignment`` member.

        Args:
            mode (int): value compared against each member's ``value``.

        Returns:
            TextAlignment: the first member whose value equals ``mode``;
            ``TextAlignment.LEFT`` when no member matches.
        '''
        matches = (t for t in TextAlignment if t.value == mode)
        return next(matches, TextAlignment.LEFT)

    def parse_horizontal_spacing(self, bbox, *args):
        """Set left alignment, and calculate left space.

        Override by :obj:`pdf2docx.text.TextBlock`.

        Args:
            bbox (fitz.rect): boundary box of this block.
        """
        # NOTE: in PyMuPDF CS, horizontal text direction is same with positive x-axis,
        # while vertical text is on the contrary, so use f = -1 here
        if self.is_horizontal_text:
            idx, f = 0, 1.0
        else:
            idx, f = 3, -1.0

        self.alignment = TextAlignment.LEFT
        self.left_space = (self.bbox[idx] - bbox[idx]) * f


    def store(self):
        '''Store attributes in json format.

        Returns:
            dict: the result of the base class ``store()`` extended with the
            block type, alignment, spacing values and tab stops.
        '''
        res = super().store()
        block_props = {
            'type'             : self._type.value,
            'alignment'        : self.alignment.value,
            'left_space'       : self.left_space,
            'right_space'      : self.right_space,
            'first_line_space' : self.first_line_space,
            'before_space'     : self.before_space,
            'after_space'      : self.after_space,
            'line_space'       : self.line_space,
            'line_space_type'  : self.line_space_type,
            'tab_stops'        : self.tab_stops
        }
        res.update(block_props)
        return res


    def make_docx(self, *args, **kwargs):
        """Create associated docx element.

        Raises:
            NotImplementedError
        """
        raise NotImplementedError
|
|
@@ -0,0 +1,359 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
|
|
3
|
+
'''A group of instances, e.g. Blocks, Lines, Spans, Shapes.
|
|
4
|
+
'''
|
|
5
|
+
|
|
6
|
+
import fitz
|
|
7
|
+
from .Element import Element
|
|
8
|
+
from .share import (IText, TextDirection)
|
|
9
|
+
from .algorithm import (solve_rects_intersection, graph_bfs)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class BaseCollection:
    '''Base collection representing a list of instances.'''

    def __init__(self, instances:list=None, parent=None):
        '''Init collection from a list of instances.

        Args:
            instances (list, optional): initial instances; falsy items are skipped.
            parent (optional): object exposed through the ``parent`` property.
        '''
        self._parent = parent
        self._instances = []
        # Note to exclude empty instance by default: ``extend`` goes through ``append``
        self.extend(instances or [])

    def __getitem__(self, idx):
        '''Return the item (or slice) at ``idx``.

        Raises:
            IndexError: with a collection specific message when ``idx`` is out of range.
        '''
        try:
            return self._instances[idx]
        except IndexError:
            raise IndexError(f'Collection index {idx} out of range.')

    def __iter__(self):
        '''Iterate over the contained instances.'''
        return iter(self._instances)

    def __len__(self):
        '''Count of contained instances.'''
        return len(self._instances)

    @property
    def parent(self):
        '''Parent object given at construction.'''
        return self._parent


    @property
    def bbox(self):
        '''bbox of combined collection.'''
        combined = fitz.Rect()
        for instance in self._instances:
            combined |= instance.bbox
        # NOTE: round to avoid digital error
        rounded = [round(x, 1) for x in combined]
        return fitz.Rect(rounded)


    def append(self, instance):
        '''Append ``instance`` unless it is falsy.'''
        if instance:
            self._instances.append(instance)


    def extend(self, instances:list):
        '''Append every item of ``instances``; a falsy ``instances`` is ignored.'''
        if not instances:
            return
        for instance in instances:
            self.append(instance)


    def reset(self, instances:list=None):
        """Reset instances list.

        Args:
            instances (list, optional): reset to target instances. Defaults to None.

        Returns:
            BaseCollection: self
        """
        self._instances = []
        self.extend(instances or [])
        return self


    def store(self):
        '''Store attributes in json format.'''
        return [instance.store() for instance in self._instances]


    def restore(self, *args, **kwargs):
        '''Construct Collection from a list of dict.

        Raises:
            NotImplementedError
        '''
        raise NotImplementedError
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
class Collection(BaseCollection, IText):
    '''Collection of instance focusing on grouping and sorting elements.'''

    @property
    def text_direction(self):
        '''Get text direction. All instances must have same text direction.

        Returns ``TextDirection.MIX`` when the instances do not share exactly
        one direction (this includes an empty collection).
        '''
        directions = {instance.text_direction for instance in self._instances}
        if len(directions) == 1:
            return next(iter(directions))
        return TextDirection.MIX


    def group(self, fun):
        """Group instances according to user defined criterion.

        Args:
            fun (function): with 2 arguments representing 2 instances (Element) and return bool.

        Returns:
            list: a list of grouped ``Collection`` instances.

        Examples 1::

            # group instances intersected with each other
            fun = lambda a,b: a.bbox & b.bbox

        Examples 2::

            # group instances aligned horizontally
            fun = lambda a,b: a.horizontally_align_with(b)

        .. note::
            It's equal to a GRAPH searching problem, build adjacent list, and then search graph
            to find all connected components.
        """
        # build adjacent list:
        # the i-th item is a set of indexes, which connected to the i-th instance.
        # NOTE: O(n^2) method, but it's acceptable (~0.2s) when n<1000 which is satisfied by page blocks
        num = len(self._instances)
        index_groups = [set() for _ in range(num)] # type: list[set]
        for i, instance in enumerate(self._instances):
            # connections of current instance to all instances after it;
            # the relation is recorded in both directions
            for j in range(i+1, num):
                if fun(instance, self._instances[j]):
                    index_groups[i].add(j)
                    index_groups[j].add(i)

        # search graph -> grouped index of instance
        groups = graph_bfs(index_groups)
        return [self.__class__([self._instances[i] for i in group]) for group in groups]


    def group_by_connectivity(self, dx:float, dy:float):
        """Collect connected instances into same group.

        Args:
            dx (float): x-tolerances to define connectivity
            dy (float): y-tolerances to define connectivity

        Returns:
            list: a list of grouped ``Collection`` instances.

        .. note::
            * It's equal to a GRAPH traversing problem, which the critical point in
              building the adjacent list, especially a large number of vertex (paths).

            * Checking intersections between paths is actually a Rectangle-Intersection
              problem, studied already in many literatures.
        """
        # build the graph -> adjacent list:
        # the i-th item is a set of indexes, which connected to the i-th instance
        num = len(self._instances)
        index_groups = [set() for _ in range(num)] # type: list[set]

        # solve rectangle intersection problem:
        # each instance contributes two entries (even index -> left edge x0,
        # odd index -> right edge x1) sharing the same tolerance-expanded rect
        i_rect_x = []
        d_rect = (-dx, -dy, dx, dy)
        for k, rect in enumerate(self._instances):
            points = [a+b for a,b in zip(rect.bbox, d_rect)] # consider tolerance
            i_rect_x.append((2*k, points, points[0]))
            i_rect_x.append((2*k+1, points, points[2]))
        i_rect_x.sort(key=lambda item: item[-1])
        solve_rects_intersection(i_rect_x, 2*num, index_groups)

        # search graph -> grouped index of instance
        groups = graph_bfs(index_groups)
        return [self.__class__([self._instances[i] for i in group]) for group in groups]


    def group_by_columns(self, factor:float=0.0, sorted:bool=True, text_direction:bool=False):
        '''Group elements into columns based on the bbox.

        Args:
            factor (float): forwarded to each element's ``vertically_align_with``.
            sorted (bool): sort the resulting groups by their bbox when True.
            text_direction (bool): forwarded to ``vertically_align_with``; also
                selects the sorting coordinate for vertical text.

        Returns:
            list: a list of grouped ``Collection`` instances.
        '''
        # split in columns
        fun = lambda a,b: a.vertically_align_with(b, factor=factor, text_direction=text_direction)
        groups = self.group(fun)

        # increase in x-direction if sort
        if sorted:
            idx = 3 if text_direction and self.is_vertical_text else 0
            groups.sort(key=lambda group: group.bbox[idx])

        return groups


    def group_by_rows(self, factor:float=0.0, sorted:bool=True, text_direction:bool=False):
        '''Group elements into rows based on the bbox.

        Args:
            factor (float): forwarded to each element's ``horizontally_align_with``.
            sorted (bool): sort the resulting groups by their bbox when True.
            text_direction (bool): forwarded to ``horizontally_align_with``; also
                selects the sorting coordinate for vertical text.

        Returns:
            list: a list of grouped ``Collection`` instances.
        '''
        # split in rows
        fun = lambda a,b: a.horizontally_align_with(b, factor=factor, text_direction=text_direction)
        groups = self.group(fun)

        # increase in y-direction if sort
        if sorted:
            idx = 0 if text_direction and self.is_vertical_text else 1
            groups.sort(key=lambda group: group.bbox[idx])

        return groups


    def group_by_physical_rows(self, sorted:bool=False, text_direction:bool=False):
        '''Group lines into physical rows.

        Two instances are connected when ``a.in_same_row(b)`` is true.

        Args:
            sorted (bool): sort the resulting groups by their bbox when True.
            text_direction (bool): selects the sorting coordinate for vertical text.

        Returns:
            list: a list of grouped ``Collection`` instances.
        '''
        fun = lambda a,b: a.in_same_row(b)
        groups = self.group(fun)

        # increase in y-direction if sort
        if sorted:
            idx = 0 if text_direction and self.is_vertical_text else 1
            groups.sort(key=lambda group: group.bbox[idx])

        return groups


    def sort_in_reading_order(self):
        '''Sort collection instances in reading order (considering text direction), e.g.
        for normal reading direction: from top to bottom, from left to right.

        Returns:
            Collection: self
        '''
        if self.is_horizontal_text:
            self._instances.sort(key=lambda e: (e.bbox.y0, e.bbox.x0, e.bbox.x1))
        else:
            self._instances.sort(key=lambda e: (e.bbox.x0, e.bbox.y1, e.bbox.y0))
        return self


    def sort_in_line_order(self):
        '''Sort collection instances in a physical with text direction considered, e.g.
        for normal reading direction: from left to right.

        Returns:
            Collection: self
        '''
        if not self.is_vertical_text:
            self._instances.sort(key=lambda e: (e.bbox.x0, e.bbox.y0, e.bbox.x1))
        else:
            self._instances.sort(key=lambda e: (e.bbox.y1, e.bbox.x0, e.bbox.y0))
        return self


    def sort_in_reading_order_plus(self):
        '''Sort instances in reading order, especially for instances in same row. Taking
        natural reading direction for example: reading order for rows, from left to right
        for instances in row. In the following example, A comes before B::

                         +-----------+
            +---------+  |           |
            |   A     |  |     B     |
            +---------+  +-----------+

        Steps:

            * Sort elements in reading order, i.e. from top to bottom, from left to right.
            * Group elements in row.
            * Sort elements in row: from left to right.

        Returns:
            Collection: self, like the other ``sort_*`` methods.
        '''
        instances = []
        for row in self.group_by_physical_rows(sorted=True, text_direction=True):
            row.sort_in_line_order()
            instances.extend(row)
        self.reset(instances)
        return self
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
class ElementCollection(Collection):
    '''Collection of ``Element`` instances.'''

    def _update_bbox(self, e:Element):
        '''Update parent bbox.'''
        # Note: `if self._parent` does not work here, so compare against None explicitly
        if self._parent is not None:
            self._parent.union_bbox(e)


    def append(self, e:Element):
        """Append an instance, update parent's bbox accordingly and set the parent of the added instance.

        Args:
            e (Element): instance to append. Falsy instances are ignored.
        """
        if not e: return
        self._instances.append(e)
        self._update_bbox(e)

        # set parent
        if self._parent is not None: e.parent = self._parent


    def insert(self, nth:int, e:Element):
        """Insert a Element, update parent's bbox accordingly and set the parent of the inserted instance.

        Args:
            nth (int): the position to insert.
            e (Element): the instance to insert. Falsy instances are ignored.
        """
        if not e: return
        self._instances.insert(nth, e)
        self._update_bbox(e)

        # set parent -- only when this collection has one, consistent with
        # ``append``, so a parentless collection does not reset ``e.parent`` to None
        if self._parent is not None: e.parent = self._parent


    def pop(self, nth:int):
        """Delete the ``nth`` instance.

        Args:
            nth (int): the position to remove.

        Returns:
            Element: the removed instance.
        """
        return self._instances.pop(nth)


    def is_flow_layout(self, line_separate_threshold:float, cell_layout=False):
        '''Whether contained elements are in flow layout or not.

        Args:
            line_separate_threshold (float): two neighbouring instances of a physical
                row that are at least this far apart make the layout non-flow.
            cell_layout (bool): when False, vertical text is never flow layout.

        Returns:
            bool: True for flow layout.
        '''
        # float layout if vertical text but not cell layout, since vertical text
        # will be simulated with stream table
        if not cell_layout and self.is_vertical_text:
            return False

        # flow layout if single column only
        if len(self)<=1: return True
        if len(self.group_by_columns())>1: return False

        # group in physical row and check distance between lines
        idx0, idx1 = (0, 2) if self.is_horizontal_text else (3, 1)
        for row in self.group_by_physical_rows(text_direction=True):
            for i in range(1, len(row)):
                dis = abs(row[i].bbox[idx0]-row[i-1].bbox[idx1])
                if dis >= line_separate_threshold: return False

        return True


    def contained_in_bbox(self, bbox):
        '''Filter instances contained in target bbox.

        Args:
            bbox (fitz.Rect): target boundary box.

        Returns:
            A new collection of the same class holding the instances whose
            bbox is contained in ``bbox``.
        '''
        instances = [e for e in self._instances if bbox.contains(e.bbox)]
        return self.__class__(instances)


    def split_with_intersection(self, bbox:fitz.Rect, threshold:float=1e-3):
        """Split instances into two groups: one intersects with ``bbox``, the other not.

        Args:
            bbox (fitz.Rect): target rect box.
            threshold (float): It's intersected when the overlap rate (intersection area
                divided by the instance area, rounded to 2 decimals) reaches this
                threshold. Defaults to 1e-3.

        Returns:
            tuple: two group in original class type.
        """
        intersections, no_intersections = [], []
        for instance in self._instances:
            # A contains B => A & B = B
            intersection = instance.bbox & bbox
            if intersection.is_empty:
                no_intersections.append(instance)
                continue

            # NOTE(review): the ratio is rounded to 2 decimals before comparison, so
            # with the default threshold very small overlaps round down to 0.0 and
            # count as not intersected -- kept as is to preserve behavior
            factor = round(intersection.get_area()/instance.bbox.get_area(), 2)
            target = intersections if factor >= threshold else no_intersections
            target.append(instance)

        return self.__class__(intersections), self.__class__(no_intersections)
|