pdf2docx-plus 0.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105) hide show
  1. pdf2docx_plus/__init__.py +41 -0
  2. pdf2docx_plus/_vendored/__init__.py +6 -0
  3. pdf2docx_plus/_vendored/pdf2docx/__init__.py +3 -0
  4. pdf2docx_plus/_vendored/pdf2docx/common/Block.py +144 -0
  5. pdf2docx_plus/_vendored/pdf2docx/common/Collection.py +359 -0
  6. pdf2docx_plus/_vendored/pdf2docx/common/Element.py +312 -0
  7. pdf2docx_plus/_vendored/pdf2docx/common/__init__.py +0 -0
  8. pdf2docx_plus/_vendored/pdf2docx/common/algorithm.py +403 -0
  9. pdf2docx_plus/_vendored/pdf2docx/common/constants.py +90 -0
  10. pdf2docx_plus/_vendored/pdf2docx/common/docx.py +591 -0
  11. pdf2docx_plus/_vendored/pdf2docx/common/share.py +310 -0
  12. pdf2docx_plus/_vendored/pdf2docx/converter.py +481 -0
  13. pdf2docx_plus/_vendored/pdf2docx/font/Fonts.py +240 -0
  14. pdf2docx_plus/_vendored/pdf2docx/font/__init__.py +0 -0
  15. pdf2docx_plus/_vendored/pdf2docx/gui/App.py +37 -0
  16. pdf2docx_plus/_vendored/pdf2docx/gui/MainFrame.py +147 -0
  17. pdf2docx_plus/_vendored/pdf2docx/gui/__init__.py +0 -0
  18. pdf2docx_plus/_vendored/pdf2docx/image/Image.py +94 -0
  19. pdf2docx_plus/_vendored/pdf2docx/image/ImageBlock.py +81 -0
  20. pdf2docx_plus/_vendored/pdf2docx/image/ImageSpan.py +27 -0
  21. pdf2docx_plus/_vendored/pdf2docx/image/ImagesExtractor.py +496 -0
  22. pdf2docx_plus/_vendored/pdf2docx/image/__init__.py +0 -0
  23. pdf2docx_plus/_vendored/pdf2docx/layout/Blocks.py +650 -0
  24. pdf2docx_plus/_vendored/pdf2docx/layout/Column.py +49 -0
  25. pdf2docx_plus/_vendored/pdf2docx/layout/Layout.py +177 -0
  26. pdf2docx_plus/_vendored/pdf2docx/layout/Section.py +97 -0
  27. pdf2docx_plus/_vendored/pdf2docx/layout/Sections.py +91 -0
  28. pdf2docx_plus/_vendored/pdf2docx/layout/__init__.py +0 -0
  29. pdf2docx_plus/_vendored/pdf2docx/main.py +135 -0
  30. pdf2docx_plus/_vendored/pdf2docx/page/BasePage.py +27 -0
  31. pdf2docx_plus/_vendored/pdf2docx/page/Page.py +211 -0
  32. pdf2docx_plus/_vendored/pdf2docx/page/Pages.py +90 -0
  33. pdf2docx_plus/_vendored/pdf2docx/page/RawPage.py +279 -0
  34. pdf2docx_plus/_vendored/pdf2docx/page/RawPageFactory.py +23 -0
  35. pdf2docx_plus/_vendored/pdf2docx/page/RawPageFitz.py +164 -0
  36. pdf2docx_plus/_vendored/pdf2docx/page/__init__.py +0 -0
  37. pdf2docx_plus/_vendored/pdf2docx/shape/Path.py +405 -0
  38. pdf2docx_plus/_vendored/pdf2docx/shape/Paths.py +142 -0
  39. pdf2docx_plus/_vendored/pdf2docx/shape/Shape.py +365 -0
  40. pdf2docx_plus/_vendored/pdf2docx/shape/Shapes.py +241 -0
  41. pdf2docx_plus/_vendored/pdf2docx/shape/__init__.py +0 -0
  42. pdf2docx_plus/_vendored/pdf2docx/table/Border.py +419 -0
  43. pdf2docx_plus/_vendored/pdf2docx/table/Cell.py +165 -0
  44. pdf2docx_plus/_vendored/pdf2docx/table/Cells.py +27 -0
  45. pdf2docx_plus/_vendored/pdf2docx/table/Row.py +78 -0
  46. pdf2docx_plus/_vendored/pdf2docx/table/Rows.py +25 -0
  47. pdf2docx_plus/_vendored/pdf2docx/table/TableBlock.py +174 -0
  48. pdf2docx_plus/_vendored/pdf2docx/table/TableStructure.py +634 -0
  49. pdf2docx_plus/_vendored/pdf2docx/table/TablesConstructor.py +382 -0
  50. pdf2docx_plus/_vendored/pdf2docx/table/__init__.py +0 -0
  51. pdf2docx_plus/_vendored/pdf2docx/text/Char.py +65 -0
  52. pdf2docx_plus/_vendored/pdf2docx/text/Line.py +179 -0
  53. pdf2docx_plus/_vendored/pdf2docx/text/Lines.py +281 -0
  54. pdf2docx_plus/_vendored/pdf2docx/text/Spans.py +59 -0
  55. pdf2docx_plus/_vendored/pdf2docx/text/TextBlock.py +471 -0
  56. pdf2docx_plus/_vendored/pdf2docx/text/TextSpan.py +439 -0
  57. pdf2docx_plus/_vendored/pdf2docx/text/__init__.py +0 -0
  58. pdf2docx_plus/api.py +870 -0
  59. pdf2docx_plus/backends/__init__.py +124 -0
  60. pdf2docx_plus/cli.py +145 -0
  61. pdf2docx_plus/consolidate.py +73 -0
  62. pdf2docx_plus/emit/__init__.py +60 -0
  63. pdf2docx_plus/emit/headers_footers.py +111 -0
  64. pdf2docx_plus/emit/lists.py +229 -0
  65. pdf2docx_plus/emit/page_breaks.py +57 -0
  66. pdf2docx_plus/emit/page_footer.py +259 -0
  67. pdf2docx_plus/emit/sections.py +252 -0
  68. pdf2docx_plus/emit/table_fit.py +254 -0
  69. pdf2docx_plus/emit/tables_cleanup.py +302 -0
  70. pdf2docx_plus/emit/whitespace.py +55 -0
  71. pdf2docx_plus/emit/word_spacing.py +119 -0
  72. pdf2docx_plus/errors.py +53 -0
  73. pdf2docx_plus/fidelity/__init__.py +25 -0
  74. pdf2docx_plus/fidelity/crashguards.py +217 -0
  75. pdf2docx_plus/fidelity/hyperlink.py +56 -0
  76. pdf2docx_plus/fidelity/styles.py +31 -0
  77. pdf2docx_plus/fidelity/text.py +38 -0
  78. pdf2docx_plus/fidelity/tty.py +22 -0
  79. pdf2docx_plus/hooks/__init__.py +29 -0
  80. pdf2docx_plus/hooks/formula_ocr.py +82 -0
  81. pdf2docx_plus/hooks/layout_detection.py +43 -0
  82. pdf2docx_plus/hooks/ocr.py +38 -0
  83. pdf2docx_plus/hooks/table_transformer.py +107 -0
  84. pdf2docx_plus/images/__init__.py +40 -0
  85. pdf2docx_plus/images/recovery.py +285 -0
  86. pdf2docx_plus/layout/__init__.py +20 -0
  87. pdf2docx_plus/layout/hf_detect.py +158 -0
  88. pdf2docx_plus/layout/lists.py +103 -0
  89. pdf2docx_plus/layout/scanned.py +76 -0
  90. pdf2docx_plus/logging.py +43 -0
  91. pdf2docx_plus/plugins/__init__.py +36 -0
  92. pdf2docx_plus/plugins/base.py +62 -0
  93. pdf2docx_plus/plugins/registry.py +45 -0
  94. pdf2docx_plus/py.typed +0 -0
  95. pdf2docx_plus/server.py +90 -0
  96. pdf2docx_plus/styles/__init__.py +144 -0
  97. pdf2docx_plus/tables/__init__.py +19 -0
  98. pdf2docx_plus/tables/float_images.py +97 -0
  99. pdf2docx_plus/tables/stitch.py +219 -0
  100. pdf2docx_plus/version.py +1 -0
  101. pdf2docx_plus-0.6.1.dist-info/METADATA +236 -0
  102. pdf2docx_plus-0.6.1.dist-info/RECORD +105 -0
  103. pdf2docx_plus-0.6.1.dist-info/WHEEL +4 -0
  104. pdf2docx_plus-0.6.1.dist-info/entry_points.txt +2 -0
  105. pdf2docx_plus-0.6.1.dist-info/licenses/LICENSE +7 -0
@@ -0,0 +1,41 @@
1
+ """pdf2docx-plus: hardened PDF -> DOCX converter.
2
+
3
+ Public API:
4
+
5
+ from pdf2docx_plus import Converter, convert, ConversionResult
6
+
7
+ result = convert("in.pdf", "out.docx", timeout_s=60)
8
+ print(result.pages_ok, result.pages_failed, result.elapsed_s)
9
+
10
+ Lower-level facade:
11
+
12
+ with Converter("in.pdf") as cv:
13
+ cv.convert("out.docx", pages=[0, 1, 2])
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ from .api import ConversionResult, Converter, convert, extract_tables
19
+ from .errors import (
20
+ ConversionError,
21
+ InputError,
22
+ MakeDocxError,
23
+ ParseError,
24
+ PasswordRequired,
25
+ TimeoutExceeded,
26
+ )
27
+ from .version import __version__
28
+
29
+ __all__ = [
30
+ "ConversionError",
31
+ "ConversionResult",
32
+ "Converter",
33
+ "InputError",
34
+ "MakeDocxError",
35
+ "ParseError",
36
+ "PasswordRequired",
37
+ "TimeoutExceeded",
38
+ "__version__",
39
+ "convert",
40
+ "extract_tables",
41
+ ]
@@ -0,0 +1,6 @@
1
+ """Vendored third-party packages.
2
+
3
+ These packages are shipped inside pdf2docx_plus to isolate them from
4
+ whatever else the user has installed. Do not import from here directly
5
+ from application code; use the public pdf2docx_plus API instead.
6
+ """
@@ -0,0 +1,3 @@
1
+ from .converter import Converter
2
+ from .page.Page import Page
3
+ from .main import parse
@@ -0,0 +1,144 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ '''Base class for text/image/table blocks.
4
+ '''
5
+
6
+ from .share import BlockType, TextAlignment
7
+ from .Element import Element
8
+
9
+
10
class Block(Element):
    '''Common base of the text, image and table blocks.

    Besides the block type, it stores the paragraph-like layout attributes
    shared by every block: alignment, horizontal indentation, tab stops and
    vertical spacing.

    Attributes:
        raw (dict): raw properties the object is initialized from.
        parent (optional): parent object that this block belongs to.
    '''
    def __init__(self, raw:dict=None, parent=None):
        # the concrete type is assigned later through one of the ``set_*`` methods
        self._type = BlockType.UNDEFINED

        props = {} if raw is None else raw
        read = props.get

        # horizontal spacing
        self.alignment = self._get_alignment(read('alignment', 0))
        self.left_space = read('left_space', 0.0)
        self.right_space = read('right_space', 0.0)
        self.first_line_space = read('first_line_space', 0.0)

        # tab stops, stored as RELATIVE positions
        self.tab_stops = read('tab_stops', [])

        # vertical spacing
        self.before_space = read('before_space', 0.0)
        self.after_space = read('after_space', 0.0)
        self.line_space = read('line_space', 0.0)
        self.line_space_type = read('line_space_type', 1) # 0-exactly, 1-relatively

        # base class initialization comes last, as in the original layout
        super().__init__(props, parent)


    @property
    def is_text_block(self):
        '''True if the block type is TEXT.'''
        return self._type == BlockType.TEXT

    @property
    def is_inline_image_block(self):
        '''True if the block type is IMAGE, i.e. an inline image.'''
        return self._type == BlockType.IMAGE

    @property
    def is_float_image_block(self):
        '''True if the block type is FLOAT_IMAGE.'''
        return self._type == BlockType.FLOAT_IMAGE

    @property
    def is_image_block(self):
        '''True for an image block, either inline or float.'''
        return self.is_inline_image_block or self.is_float_image_block

    @property
    def is_text_image_block(self):
        '''True for a text block or an inline image block.'''
        return self.is_text_block or self.is_inline_image_block

    @property
    def is_lattice_table_block(self):
        '''True if the block is a lattice table (explicit table borders).'''
        return self._type == BlockType.LATTICE_TABLE

    @property
    def is_stream_table_block(self):
        '''True if the block is a stream table (implied by table content).'''
        return self._type == BlockType.STREAM_TABLE

    @property
    def is_table_block(self):
        '''True for a table block, either lattice or stream.'''
        return self.is_lattice_table_block or self.is_stream_table_block

    def set_text_block(self):
        '''Mark this block as a text block.'''
        self._type = BlockType.TEXT

    def set_inline_image_block(self):
        '''Mark this block as an inline image block.'''
        self._type = BlockType.IMAGE

    def set_float_image_block(self):
        '''Mark this block as a float image block.'''
        self._type = BlockType.FLOAT_IMAGE

    def set_lattice_table_block(self):
        '''Mark this block as a lattice table block.'''
        self._type = BlockType.LATTICE_TABLE

    def set_stream_table_block(self):
        '''Mark this block as a stream table block.'''
        self._type = BlockType.STREAM_TABLE

    def _get_alignment(self, mode:int):
        '''Map a raw alignment value to its ``TextAlignment`` member.

        Falls back to ``TextAlignment.LEFT`` when no member has the value ``mode``.
        '''
        candidates = (member for member in TextAlignment if member.value == mode)
        return next(candidates, TextAlignment.LEFT)

    def parse_horizontal_spacing(self, bbox, *args):
        """Force left alignment and compute the left space relative to ``bbox``.

        Override by :obj:`pdf2docx.text.TextBlock`.

        Args:
            bbox (fitz.rect): boundary box of this block.
        """
        # NOTE: in PyMuPDF CS, horizontal text direction is same with positive x-axis,
        # while vertical text is on the contrary, so the sign is flipped for vertical text
        if self.is_horizontal_text:
            idx, sign = 0, 1.0
        else:
            idx, sign = 3, -1.0
        self.alignment = TextAlignment.LEFT
        self.left_space = (self.bbox[idx] - bbox[idx]) * sign


    def store(self):
        '''Store attributes in json format: base attributes plus type and spacing.'''
        res = super().store()
        block_fields = {
            'type'            : self._type.value,
            'alignment'       : self.alignment.value,
            'left_space'      : self.left_space,
            'right_space'     : self.right_space,
            'first_line_space': self.first_line_space,
            'before_space'    : self.before_space,
            'after_space'     : self.after_space,
            'line_space'      : self.line_space,
            'line_space_type' : self.line_space_type,
            'tab_stops'       : self.tab_stops,
        }
        res.update(block_fields)
        return res


    def make_docx(self, *args, **kwargs):
        """Create associated docx element; must be provided by subclasses.

        Raises:
            NotImplementedError
        """
        raise NotImplementedError
@@ -0,0 +1,359 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ '''A group of instances, e.g. Blocks, Lines, Spans, Shapes.
4
+ '''
5
+
6
+ import fitz
7
+ from .Element import Element
8
+ from .share import (IText, TextDirection)
9
+ from .algorithm import (solve_rects_intersection, graph_bfs)
10
+
11
+
12
class BaseCollection:
    '''Thin list-like container holding a group of instances.'''
    def __init__(self, instances:list=None, parent=None):
        '''Create the collection from ``instances``; falsy instances are dropped.'''
        self._parent = parent
        self._instances = []
        # go through extend()/append() so the filtering of empty instances applies
        self.extend(instances if instances else [])

    def __getitem__(self, idx):
        try:
            return self._instances[idx]
        except IndexError:
            # re-raise with a message naming the collection rather than the list
            msg = f'Collection index {idx} out of range.'
            raise IndexError(msg)

    def __iter__(self): return (item for item in self._instances)

    def __len__(self): return len(self._instances)

    @property
    def parent(self): return self._parent


    @property
    def bbox(self):
        '''Union of the bboxes of all contained instances.'''
        combined = fitz.Rect()
        for item in self._instances:
            combined |= item.bbox
        # NOTE: coordinates are rounded to one decimal to avoid digital error
        rounded = [round(coord, 1) for coord in combined]
        return fitz.Rect(rounded)


    def append(self, instance):
        '''Add ``instance`` to the end; a falsy instance is silently ignored.'''
        if instance:
            self._instances.append(instance)


    def extend(self, instances:list):
        '''Append each of ``instances`` in turn; nothing happens for a falsy argument.'''
        for item in instances or ():
            self.append(item)


    def reset(self, instances:list=None):
        """Drop the current content and refill the collection.

        Args:
            instances (list, optional): new content. Defaults to None, i.e. empty.

        Returns:
            BaseCollection: self
        """
        self._instances = []
        self.extend(instances if instances else [])
        return self


    def store(self):
        '''Store attributes in json format: the ``store()`` result of every instance.'''
        return [item.store() for item in self._instances]


    def restore(self, *args, **kwargs):
        '''Construct Collection from a list of dict; not implemented in this base class.'''
        raise NotImplementedError
78
+
79
+
80
class Collection(BaseCollection, IText):
    '''Collection specialised in grouping and ordering its instances.'''

    @property
    def text_direction(self):
        '''Text direction shared by all instances.

        ``TextDirection.MIX`` is returned unless exactly one distinct direction
        is found among the instances.
        '''
        directions = {item.text_direction for item in self._instances}
        if len(directions) == 1:
            return next(iter(directions))
        return TextDirection.MIX


    def group(self, fun):
        """Split instances into groups of mutually connected members.

        Args:
            fun (function): takes 2 instances (Element) and returns a truthy value
                when the two are connected.

        Returns:
            list: a list of grouped ``Collection`` instances.

        Examples 1::

            # group instances intersected with each other
            fun = lambda a,b: a.bbox & b.bbox

        Examples 2::

            # group instances aligned horizontally
            fun = lambda a,b: a.horizontally_align_with(b)

        .. note::
            It's equal to a GRAPH searching problem: build the adjacent list, then
            search the graph to find all connected components.
        """
        # adjacent list: the k-th item is the set of indexes connected to the k-th instance.
        # NOTE: O(n^2) method, but it's acceptable (~0.2s) when n<1000 which is satisfied by page blocks
        items = self._instances
        count = len(items)
        adjacency = [set() for _ in range(count)] # type: list[set]
        for left in range(count):
            # each pair is tested once: current instance against those after it
            for right in range(left + 1, count):
                if fun(items[left], items[right]):
                    adjacency[left].add(right)
                    adjacency[right].add(left)

        # connected components -> one collection (of the same class) per component
        cls = self.__class__
        return [cls([items[k] for k in component]) for component in graph_bfs(adjacency)]


    def group_by_connectivity(self, dx:float, dy:float):
        """Group instances whose bboxes touch once expanded by a tolerance.

        Args:
            dx (float): x-tolerances to define connectivity
            dy (float): y-tolerances to define connectivity

        Returns:
            list: a list of grouped ``Collection`` instances.

        .. note::
            * It's equal to a GRAPH traversing problem, which the critical point in
              building the adjacent list, especially a large number of vertex (paths).

            * Checking intersections between paths is actually a Rectangle-Intersection
              problem, studied already in many literatures.
        """
        # adjacent list: the k-th item is the set of indexes connected to the k-th instance
        count = len(self._instances)
        adjacency = [set() for _ in range(count)] # type: list[set]

        # Rectangle-intersection input: two entries per instance, tagged 2k / 2k+1 and
        # keyed by the left / right x-coordinate of the expanded bbox.
        tolerance = (-dx, -dy, dx, dy)
        entries = []
        for k, item in enumerate(self._instances):
            points = [coord + delta for coord, delta in zip(item.bbox, tolerance)]
            entries.append((2*k, points, points[0]))
            entries.append((2*k + 1, points, points[2]))
        entries.sort(key=lambda entry: entry[-1])
        solve_rects_intersection(entries, 2*count, adjacency)

        # connected components -> one collection (of the same class) per component
        cls = self.__class__
        return [cls([self._instances[k] for k in component])
                    for component in graph_bfs(adjacency)]


    def group_by_columns(self, factor:float=0.0, sorted:bool=True, text_direction:bool=False):
        '''Group elements into columns, i.e. elements vertically aligned with each other.

        With ``sorted`` set, columns are ordered by increasing ``bbox[0]``, or
        ``bbox[3]`` when ``text_direction`` is set and the text is vertical.
        '''
        def same_column(a, b):
            return a.vertically_align_with(b, factor=factor, text_direction=text_direction)

        columns = self.group(same_column)

        if sorted:
            key_idx = 3 if text_direction and self.is_vertical_text else 0
            columns.sort(key=lambda column: column.bbox[key_idx])

        return columns


    def group_by_rows(self, factor:float=0.0, sorted:bool=True, text_direction:bool=False):
        '''Group elements into rows, i.e. elements horizontally aligned with each other.

        With ``sorted`` set, rows are ordered by increasing ``bbox[1]``, or
        ``bbox[0]`` when ``text_direction`` is set and the text is vertical.
        '''
        def same_row(a, b):
            return a.horizontally_align_with(b, factor=factor, text_direction=text_direction)

        rows = self.group(same_row)

        if sorted:
            key_idx = 0 if text_direction and self.is_vertical_text else 1
            rows.sort(key=lambda row: row.bbox[key_idx])

        return rows


    def group_by_physical_rows(self, sorted:bool=False, text_direction:bool=False):
        '''Group lines into physical rows, as decided by ``in_same_row``.

        ``text_direction`` only affects the optional sorting: rows are ordered by
        ``bbox[1]``, or ``bbox[0]`` when it is set and the text is vertical.
        '''
        def same_physical_row(a, b):
            return a.in_same_row(b)

        rows = self.group(same_physical_row)

        if sorted:
            key_idx = 0 if text_direction and self.is_vertical_text else 1
            rows.sort(key=lambda row: row.bbox[key_idx])

        return rows


    def sort_in_reading_order(self):
        '''Sort instances in place in reading order (considering text direction), e.g.
        for normal reading direction: from top to bottom, from left to right.

        Returns the collection itself.
        '''
        if self.is_horizontal_text:
            def reading_key(e): return (e.bbox.y0, e.bbox.x0, e.bbox.x1)
        else:
            def reading_key(e): return (e.bbox.x0, e.bbox.y1, e.bbox.y0)
        self._instances.sort(key=reading_key)
        return self


    def sort_in_line_order(self):
        '''Sort instances in place within a physical row, text direction considered,
        e.g. for normal reading direction: from left to right.

        Returns the collection itself.
        '''
        if self.is_vertical_text:
            def line_key(e): return (e.bbox.y1, e.bbox.x0, e.bbox.y0)
        else:
            def line_key(e): return (e.bbox.x0, e.bbox.y0, e.bbox.x1)
        self._instances.sort(key=line_key)
        return self


    def sort_in_reading_order_plus(self):
        '''Sort instances in reading order, especially for instances in same row. Taking
        natural reading direction for example: reading order for rows, from left to right
        for instances in row. In the following example, A comes before B::

                         +-----------+
            +---------+  |           |
            |   A     |  |     B     |
            +---------+  +-----------+

        Steps:

        * Group elements into physical rows, rows sorted in reading order.
        * Sort elements in each row in line order, e.g. from left to right.
        * Replace the content of this collection with the concatenated rows.
        '''
        rows = self.group_by_physical_rows(sorted=True, text_direction=True)
        ordered = []
        for row in rows:
            row.sort_in_line_order()
            ordered += list(row)
        self.reset(ordered)
254
+
255
+
256
+
257
class ElementCollection(Collection):
    '''Collection of ``Element`` instances.

    When the collection has a parent, adding an element also unions the element
    into the parent's bbox and sets the element's ``parent`` to that parent.
    '''

    def _update_bbox(self, e:Element):
        '''Union ``e`` into the parent's bbox, if this collection has a parent.'''
        # Note: compare with None explicitly, `if self._parent` does not work here
        if self._parent is not None:
            self._parent.union_bbox(e)


    def append(self, e:Element):
        """Append an instance, update parent's bbox accordingly and set the parent of the added instance.

        A falsy instance is ignored. The instance's parent is left untouched when
        this collection has no parent.

        Args:
            e (Element): instance to append.
        """
        if not e: return
        self._instances.append(e)
        self._update_bbox(e)

        # set parent
        if self._parent is not None: e.parent = self._parent


    def insert(self, nth:int, e:Element):
        """Insert an Element, update parent's bbox accordingly and set the parent of the inserted instance.

        A falsy instance is ignored. As in ``append``, the instance's parent is
        left untouched when this collection has no parent, so inserting into a
        parent-less collection no longer clears an existing parent.

        Args:
            nth (int): the position to insert.
            e (Element): the instance to insert.
        """
        if not e: return
        self._instances.insert(nth, e)
        self._update_bbox(e)

        # set parent (same guard as in append)
        if self._parent is not None: e.parent = self._parent


    def pop(self, nth:int):
        """Delete the ``nth`` instance.

        The parent's bbox is not recalculated.

        Args:
            nth (int): the position to remove.

        Returns:
            Element: the removed instance.
        """
        return self._instances.pop(nth)


    def is_flow_layout(self, line_separate_threshold:float, cell_layout=False):
        '''Whether contained elements are in flow layout or not.

        Args:
            line_separate_threshold (float): two neighbouring elements of a physical
                row separated by this distance or more break the flow layout.
            cell_layout (bool): whether the elements belong to a cell layout.

        Returns:
            bool: True for flow layout, False otherwise.
        '''
        # float layout if vertical text but not cell layout, since vertical text
        # will be simulated with stream table
        if not cell_layout and self.is_vertical_text:
            return False

        # flow layout if single column only
        if len(self) <= 1: return True
        if len(self.group_by_columns()) > 1: return False

        # group in physical row and check the gap between consecutive elements:
        # start edge of the current element against end edge of the previous one
        idx0, idx1 = (0, 2) if self.is_horizontal_text else (3, 1)
        for row in self.group_by_physical_rows(text_direction=True):
            for i in range(1, len(row)):
                dis = abs(row[i].bbox[idx0] - row[i-1].bbox[idx1])
                if dis >= line_separate_threshold: return False

        return True


    def contained_in_bbox(self, bbox):
        '''Filter instances contained in target bbox.

        Args:
            bbox (fitz.Rect): target boundary box.

        Returns:
            A new collection of the same class holding the instances whose bbox
            is contained in ``bbox``.
        '''
        instances = [e for e in self._instances if bbox.contains(e.bbox)]
        return self.__class__(instances)


    def split_with_intersection(self, bbox:fitz.Rect, threshold:float=1e-3):
        """Split instances into two groups: one intersects with ``bbox``, the other not.

        Args:
            bbox (fitz.Rect): target rect box.
            threshold (float): an instance counts as intersected when the overlap rate,
                i.e. intersection area divided by the instance's own bbox area and
                rounded to 2 decimals, reaches this threshold. Defaults to 1e-3.

        Returns:
            tuple: two group in original class type, ``(intersected, not intersected)``.
        """
        intersections, no_intersections = [], []
        for instance in self._instances:
            # A contains B => A & B = B
            intersection = instance.bbox & bbox
            if intersection.is_empty:
                no_intersections.append(instance)
                continue

            # NOTE(review): the rate is rounded to 2 decimals before the comparison,
            # so rates below 0.005 become 0.0 - confirm this is intended with the
            # default threshold of 1e-3.
            factor = round(intersection.get_area()/instance.bbox.get_area(), 2)
            target = intersections if factor >= threshold else no_intersections
            target.append(instance)

        return self.__class__(intersections), self.__class__(no_intersections)