deepdoctection 0.32__py3-none-any.whl → 0.34__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic. Click here for more details.
- deepdoctection/__init__.py +8 -25
- deepdoctection/analyzer/dd.py +84 -71
- deepdoctection/dataflow/common.py +9 -5
- deepdoctection/dataflow/custom.py +5 -5
- deepdoctection/dataflow/custom_serialize.py +75 -18
- deepdoctection/dataflow/parallel_map.py +3 -3
- deepdoctection/dataflow/serialize.py +4 -4
- deepdoctection/dataflow/stats.py +3 -3
- deepdoctection/datapoint/annotation.py +78 -56
- deepdoctection/datapoint/box.py +7 -7
- deepdoctection/datapoint/convert.py +6 -6
- deepdoctection/datapoint/image.py +157 -75
- deepdoctection/datapoint/view.py +175 -151
- deepdoctection/datasets/adapter.py +30 -24
- deepdoctection/datasets/base.py +10 -10
- deepdoctection/datasets/dataflow_builder.py +3 -3
- deepdoctection/datasets/info.py +23 -25
- deepdoctection/datasets/instances/doclaynet.py +48 -49
- deepdoctection/datasets/instances/fintabnet.py +44 -45
- deepdoctection/datasets/instances/funsd.py +23 -23
- deepdoctection/datasets/instances/iiitar13k.py +8 -8
- deepdoctection/datasets/instances/layouttest.py +2 -2
- deepdoctection/datasets/instances/publaynet.py +3 -3
- deepdoctection/datasets/instances/pubtables1m.py +18 -18
- deepdoctection/datasets/instances/pubtabnet.py +30 -29
- deepdoctection/datasets/instances/rvlcdip.py +28 -29
- deepdoctection/datasets/instances/xfund.py +51 -30
- deepdoctection/datasets/save.py +6 -6
- deepdoctection/eval/accmetric.py +32 -33
- deepdoctection/eval/base.py +8 -9
- deepdoctection/eval/cocometric.py +13 -12
- deepdoctection/eval/eval.py +32 -26
- deepdoctection/eval/tedsmetric.py +16 -12
- deepdoctection/eval/tp_eval_callback.py +7 -16
- deepdoctection/extern/base.py +339 -134
- deepdoctection/extern/d2detect.py +69 -89
- deepdoctection/extern/deskew.py +11 -10
- deepdoctection/extern/doctrocr.py +81 -64
- deepdoctection/extern/fastlang.py +23 -16
- deepdoctection/extern/hfdetr.py +53 -38
- deepdoctection/extern/hflayoutlm.py +216 -155
- deepdoctection/extern/hflm.py +35 -30
- deepdoctection/extern/model.py +433 -255
- deepdoctection/extern/pdftext.py +15 -15
- deepdoctection/extern/pt/ptutils.py +4 -2
- deepdoctection/extern/tessocr.py +39 -38
- deepdoctection/extern/texocr.py +14 -16
- deepdoctection/extern/tp/tfutils.py +16 -2
- deepdoctection/extern/tp/tpcompat.py +11 -7
- deepdoctection/extern/tp/tpfrcnn/config/config.py +4 -4
- deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +1 -1
- deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +5 -5
- deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +6 -6
- deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +4 -4
- deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +5 -3
- deepdoctection/extern/tp/tpfrcnn/preproc.py +5 -5
- deepdoctection/extern/tpdetect.py +40 -45
- deepdoctection/mapper/cats.py +36 -40
- deepdoctection/mapper/cocostruct.py +16 -12
- deepdoctection/mapper/d2struct.py +22 -22
- deepdoctection/mapper/hfstruct.py +7 -7
- deepdoctection/mapper/laylmstruct.py +22 -24
- deepdoctection/mapper/maputils.py +9 -10
- deepdoctection/mapper/match.py +33 -2
- deepdoctection/mapper/misc.py +6 -7
- deepdoctection/mapper/pascalstruct.py +4 -4
- deepdoctection/mapper/prodigystruct.py +6 -6
- deepdoctection/mapper/pubstruct.py +84 -92
- deepdoctection/mapper/tpstruct.py +3 -3
- deepdoctection/mapper/xfundstruct.py +33 -33
- deepdoctection/pipe/anngen.py +39 -14
- deepdoctection/pipe/base.py +68 -99
- deepdoctection/pipe/common.py +181 -85
- deepdoctection/pipe/concurrency.py +14 -10
- deepdoctection/pipe/doctectionpipe.py +24 -21
- deepdoctection/pipe/language.py +20 -25
- deepdoctection/pipe/layout.py +18 -16
- deepdoctection/pipe/lm.py +49 -47
- deepdoctection/pipe/order.py +63 -65
- deepdoctection/pipe/refine.py +102 -109
- deepdoctection/pipe/segment.py +157 -162
- deepdoctection/pipe/sub_layout.py +50 -40
- deepdoctection/pipe/text.py +37 -36
- deepdoctection/pipe/transform.py +19 -16
- deepdoctection/train/d2_frcnn_train.py +27 -25
- deepdoctection/train/hf_detr_train.py +22 -18
- deepdoctection/train/hf_layoutlm_train.py +49 -48
- deepdoctection/train/tp_frcnn_train.py +10 -11
- deepdoctection/utils/concurrency.py +1 -1
- deepdoctection/utils/context.py +13 -6
- deepdoctection/utils/develop.py +4 -4
- deepdoctection/utils/env_info.py +52 -14
- deepdoctection/utils/file_utils.py +6 -11
- deepdoctection/utils/fs.py +41 -14
- deepdoctection/utils/identifier.py +2 -2
- deepdoctection/utils/logger.py +15 -15
- deepdoctection/utils/metacfg.py +7 -7
- deepdoctection/utils/pdf_utils.py +39 -14
- deepdoctection/utils/settings.py +188 -182
- deepdoctection/utils/tqdm.py +1 -1
- deepdoctection/utils/transform.py +14 -9
- deepdoctection/utils/types.py +104 -0
- deepdoctection/utils/utils.py +7 -7
- deepdoctection/utils/viz.py +70 -69
- {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/METADATA +7 -4
- deepdoctection-0.34.dist-info/RECORD +146 -0
- {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/WHEEL +1 -1
- deepdoctection/utils/detection_types.py +0 -68
- deepdoctection-0.32.dist-info/RECORD +0 -146
- {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/LICENSE +0 -0
- {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/top_level.txt +0 -0
deepdoctection/datapoint/view.py
CHANGED
|
@@ -22,11 +22,10 @@ simplify consumption
|
|
|
22
22
|
from __future__ import annotations
|
|
23
23
|
|
|
24
24
|
from copy import copy
|
|
25
|
-
from typing import Any,
|
|
25
|
+
from typing import Any, Mapping, Optional, Sequence, Type, TypedDict, Union, no_type_check
|
|
26
26
|
|
|
27
27
|
import numpy as np
|
|
28
28
|
|
|
29
|
-
from ..utils.detection_types import ImageType, JsonDict, Pathlike
|
|
30
29
|
from ..utils.error import AnnotationError, ImageError
|
|
31
30
|
from ..utils.logger import LoggingRecord, logger
|
|
32
31
|
from ..utils.settings import (
|
|
@@ -35,13 +34,15 @@ from ..utils.settings import (
|
|
|
35
34
|
ObjectTypes,
|
|
36
35
|
PageType,
|
|
37
36
|
Relationships,
|
|
37
|
+
SummaryType,
|
|
38
38
|
TableType,
|
|
39
39
|
TokenClasses,
|
|
40
40
|
WordType,
|
|
41
41
|
get_type,
|
|
42
42
|
)
|
|
43
|
+
from ..utils.types import HTML, AnnotationDict, Chunks, ImageDict, PathLikeOrStr, PixelValues, Text_, csv
|
|
43
44
|
from ..utils.viz import draw_boxes, interactive_imshow, viz_handler
|
|
44
|
-
from .annotation import ContainerAnnotation, ImageAnnotation,
|
|
45
|
+
from .annotation import CategoryAnnotation, ContainerAnnotation, ImageAnnotation, ann_from_dict
|
|
45
46
|
from .box import BoundingBox, crop_box_from_image
|
|
46
47
|
from .image import Image
|
|
47
48
|
|
|
@@ -68,7 +69,7 @@ class ImageAnnotationBaseView(ImageAnnotation):
|
|
|
68
69
|
base_page: Page
|
|
69
70
|
|
|
70
71
|
@property
|
|
71
|
-
def bbox(self) ->
|
|
72
|
+
def bbox(self) -> list[float]:
|
|
72
73
|
"""
|
|
73
74
|
Get the bounding box as list and in absolute coordinates of the base page.
|
|
74
75
|
"""
|
|
@@ -79,7 +80,7 @@ class ImageAnnotationBaseView(ImageAnnotation):
|
|
|
79
80
|
bounding_box = bounding_box.transform(self.base_page.width, self.base_page.height, absolute_coords=True)
|
|
80
81
|
return bounding_box.to_list(mode="xyxy")
|
|
81
82
|
|
|
82
|
-
def viz(self, interactive: bool = False) -> Optional[
|
|
83
|
+
def viz(self, interactive: bool = False) -> Optional[PixelValues]:
|
|
83
84
|
"""
|
|
84
85
|
Display the annotation (without any sub-layout elements).
|
|
85
86
|
|
|
@@ -100,7 +101,7 @@ class ImageAnnotationBaseView(ImageAnnotation):
|
|
|
100
101
|
return np_image
|
|
101
102
|
raise AnnotationError(f"base_page.image is None for {self.annotation_id}")
|
|
102
103
|
|
|
103
|
-
def __getattr__(self, item: str) -> Optional[Union[str, int,
|
|
104
|
+
def __getattr__(self, item: str) -> Optional[Union[str, int, list[str]]]:
|
|
104
105
|
"""
|
|
105
106
|
Get attributes defined by registered `self.get_attribute_names()` in a multi step process:
|
|
106
107
|
|
|
@@ -124,19 +125,18 @@ class ImageAnnotationBaseView(ImageAnnotation):
|
|
|
124
125
|
return sub_cat.category_name
|
|
125
126
|
if isinstance(sub_cat, ContainerAnnotation):
|
|
126
127
|
return sub_cat.value
|
|
127
|
-
return
|
|
128
|
+
return sub_cat.category_id
|
|
128
129
|
if self.image is not None:
|
|
129
|
-
if self.image.summary
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
return int(sub_cat.category_id)
|
|
130
|
+
if item in self.image.summary.sub_categories:
|
|
131
|
+
sub_cat = self.get_summary(get_type(item))
|
|
132
|
+
if item != sub_cat.category_name:
|
|
133
|
+
return sub_cat.category_name
|
|
134
|
+
if isinstance(sub_cat, ContainerAnnotation):
|
|
135
|
+
return sub_cat.value
|
|
136
|
+
return sub_cat.category_id
|
|
137
137
|
return None
|
|
138
138
|
|
|
139
|
-
def get_attribute_names(self) ->
|
|
139
|
+
def get_attribute_names(self) -> set[str]:
|
|
140
140
|
"""
|
|
141
141
|
:return: A set of registered attributes. When sub classing modify this method accordingly.
|
|
142
142
|
"""
|
|
@@ -144,12 +144,11 @@ class ImageAnnotationBaseView(ImageAnnotation):
|
|
|
144
144
|
# sub categories and summary sub categories are valid attribute names
|
|
145
145
|
attribute_names = {"bbox", "np_image"}.union({cat.value for cat in self.sub_categories})
|
|
146
146
|
if self.image:
|
|
147
|
-
|
|
148
|
-
attribute_names = attribute_names.union({cat.value for cat in self.image.summary.sub_categories.keys()})
|
|
147
|
+
attribute_names = attribute_names.union({cat.value for cat in self.image.summary.sub_categories.keys()})
|
|
149
148
|
return attribute_names
|
|
150
149
|
|
|
151
150
|
@classmethod
|
|
152
|
-
def from_dict(cls, **kwargs:
|
|
151
|
+
def from_dict(cls, **kwargs: AnnotationDict) -> ImageAnnotationBaseView:
|
|
153
152
|
"""
|
|
154
153
|
Identical to its base class method for having correct return types. If the base class changes, please
|
|
155
154
|
change this method as well.
|
|
@@ -165,8 +164,8 @@ class Word(ImageAnnotationBaseView):
|
|
|
165
164
|
Word specific subclass of `ImageAnnotationBaseView` modelled by `WordType`.
|
|
166
165
|
"""
|
|
167
166
|
|
|
168
|
-
def get_attribute_names(self) ->
|
|
169
|
-
return set(WordType).union(super().get_attribute_names()).union({Relationships.
|
|
167
|
+
def get_attribute_names(self) -> set[str]:
|
|
168
|
+
return set(WordType).union(super().get_attribute_names()).union({Relationships.READING_ORDER})
|
|
170
169
|
|
|
171
170
|
|
|
172
171
|
class Layout(ImageAnnotationBaseView):
|
|
@@ -181,13 +180,13 @@ class Layout(ImageAnnotationBaseView):
|
|
|
181
180
|
text_container: Optional[ObjectTypes] = None
|
|
182
181
|
|
|
183
182
|
@property
|
|
184
|
-
def words(self) ->
|
|
183
|
+
def words(self) -> list[ImageAnnotationBaseView]:
|
|
185
184
|
"""
|
|
186
185
|
Get a list of `ImageAnnotationBaseView` objects with `LayoutType` defined by `text_container`.
|
|
187
186
|
It will only select those among all annotations that have an entry in `Relationships.child` .
|
|
188
187
|
"""
|
|
189
188
|
if self.category_name != self.text_container:
|
|
190
|
-
text_ids = self.get_relationship(Relationships.
|
|
189
|
+
text_ids = self.get_relationship(Relationships.CHILD)
|
|
191
190
|
return self.base_page.get_annotation(annotation_ids=text_ids, category_names=self.text_container)
|
|
192
191
|
return [self]
|
|
193
192
|
|
|
@@ -199,17 +198,25 @@ class Layout(ImageAnnotationBaseView):
|
|
|
199
198
|
words = self.get_ordered_words()
|
|
200
199
|
return " ".join([word.characters for word in words]) # type: ignore
|
|
201
200
|
|
|
202
|
-
def get_ordered_words(self) ->
|
|
201
|
+
def get_ordered_words(self) -> list[ImageAnnotationBaseView]:
|
|
203
202
|
"""Returns a list of words order by reading order. Words with no reading order will not be returned"""
|
|
204
203
|
words_with_reading_order = [word for word in self.words if word.reading_order is not None]
|
|
205
204
|
words_with_reading_order.sort(key=lambda x: x.reading_order) # type: ignore
|
|
206
205
|
return words_with_reading_order
|
|
207
206
|
|
|
208
207
|
@property
|
|
209
|
-
def text_(self) ->
|
|
210
|
-
"""Returns a dict
|
|
211
|
-
|
|
212
|
-
"
|
|
208
|
+
def text_(self) -> Text_:
|
|
209
|
+
"""Returns a dict
|
|
210
|
+
|
|
211
|
+
`{"text": text string,
|
|
212
|
+
"text_list": list of single words,
|
|
213
|
+
"ann_ids": word annotation ids`,
|
|
214
|
+
"token_classes": token classes,
|
|
215
|
+
"token_tags": token tags,
|
|
216
|
+
"token_class_ids": token class ids,
|
|
217
|
+
"token_tag_ids": token tag ids}`
|
|
218
|
+
|
|
219
|
+
"""
|
|
213
220
|
words = self.get_ordered_words()
|
|
214
221
|
characters, ann_ids, token_classes, token_tags, token_classes_ids, token_tag_ids = zip(
|
|
215
222
|
*[
|
|
@@ -218,13 +225,11 @@ class Layout(ImageAnnotationBaseView):
|
|
|
218
225
|
word.annotation_id,
|
|
219
226
|
word.token_class,
|
|
220
227
|
word.token_tag,
|
|
221
|
-
(
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
(word.get_sub_category(WordType.token_tag).category_id)
|
|
227
|
-
if WordType.token_tag in word.sub_categories
|
|
228
|
+
word.get_sub_category(WordType.TOKEN_CLASS).category_id
|
|
229
|
+
if WordType.TOKEN_CLASS in word.sub_categories
|
|
230
|
+
else None,
|
|
231
|
+
word.get_sub_category(WordType.TOKEN_TAG).category_id
|
|
232
|
+
if WordType.TOKEN_TAG in word.sub_categories
|
|
228
233
|
else None,
|
|
229
234
|
)
|
|
230
235
|
for word in words
|
|
@@ -240,8 +245,8 @@ class Layout(ImageAnnotationBaseView):
|
|
|
240
245
|
"token_tag_ids": token_tag_ids,
|
|
241
246
|
}
|
|
242
247
|
|
|
243
|
-
def get_attribute_names(self) ->
|
|
244
|
-
return {"words", "text"}.union(super().get_attribute_names()).union({Relationships.
|
|
248
|
+
def get_attribute_names(self) -> set[str]:
|
|
249
|
+
return {"words", "text"}.union(super().get_attribute_names()).union({Relationships.READING_ORDER})
|
|
245
250
|
|
|
246
251
|
def __len__(self) -> int:
|
|
247
252
|
"""len of text counted by number of characters"""
|
|
@@ -253,7 +258,7 @@ class Cell(Layout):
|
|
|
253
258
|
Cell specific subclass of `ImageAnnotationBaseView` modelled by `CellType`.
|
|
254
259
|
"""
|
|
255
260
|
|
|
256
|
-
def get_attribute_names(self) ->
|
|
261
|
+
def get_attribute_names(self) -> set[str]:
|
|
257
262
|
return set(CellType).union(super().get_attribute_names())
|
|
258
263
|
|
|
259
264
|
|
|
@@ -263,52 +268,52 @@ class Table(Layout):
|
|
|
263
268
|
"""
|
|
264
269
|
|
|
265
270
|
@property
|
|
266
|
-
def cells(self) ->
|
|
271
|
+
def cells(self) -> list[ImageAnnotationBaseView]:
|
|
267
272
|
"""
|
|
268
273
|
A list of a table cells.
|
|
269
274
|
"""
|
|
270
|
-
all_relation_ids = self.get_relationship(Relationships.
|
|
275
|
+
all_relation_ids = self.get_relationship(Relationships.CHILD)
|
|
271
276
|
cell_anns = self.base_page.get_annotation(
|
|
272
277
|
annotation_ids=all_relation_ids,
|
|
273
278
|
category_names=[
|
|
274
|
-
LayoutType.
|
|
275
|
-
CellType.
|
|
276
|
-
CellType.
|
|
277
|
-
CellType.
|
|
278
|
-
CellType.
|
|
279
|
-
CellType.
|
|
280
|
-
CellType.
|
|
279
|
+
LayoutType.CELL,
|
|
280
|
+
CellType.HEADER,
|
|
281
|
+
CellType.BODY,
|
|
282
|
+
CellType.PROJECTED_ROW_HEADER,
|
|
283
|
+
CellType.SPANNING,
|
|
284
|
+
CellType.ROW_HEADER,
|
|
285
|
+
CellType.COLUMN_HEADER,
|
|
281
286
|
],
|
|
282
287
|
)
|
|
283
288
|
return cell_anns
|
|
284
289
|
|
|
285
290
|
@property
|
|
286
|
-
def rows(self) ->
|
|
291
|
+
def rows(self) -> list[ImageAnnotationBaseView]:
|
|
287
292
|
"""
|
|
288
293
|
A list of a table rows.
|
|
289
294
|
"""
|
|
290
|
-
all_relation_ids = self.get_relationship(Relationships.
|
|
291
|
-
row_anns = self.base_page.get_annotation(annotation_ids=all_relation_ids, category_names=[LayoutType.
|
|
295
|
+
all_relation_ids = self.get_relationship(Relationships.CHILD)
|
|
296
|
+
row_anns = self.base_page.get_annotation(annotation_ids=all_relation_ids, category_names=[LayoutType.ROW])
|
|
292
297
|
return row_anns
|
|
293
298
|
|
|
294
299
|
@property
|
|
295
|
-
def columns(self) ->
|
|
300
|
+
def columns(self) -> list[ImageAnnotationBaseView]:
|
|
296
301
|
"""
|
|
297
302
|
A list of a table columns.
|
|
298
303
|
"""
|
|
299
|
-
all_relation_ids = self.get_relationship(Relationships.
|
|
300
|
-
col_anns = self.base_page.get_annotation(annotation_ids=all_relation_ids, category_names=[LayoutType.
|
|
304
|
+
all_relation_ids = self.get_relationship(Relationships.CHILD)
|
|
305
|
+
col_anns = self.base_page.get_annotation(annotation_ids=all_relation_ids, category_names=[LayoutType.COLUMN])
|
|
301
306
|
return col_anns
|
|
302
307
|
|
|
303
308
|
@property
|
|
304
|
-
def html(self) ->
|
|
309
|
+
def html(self) -> HTML:
|
|
305
310
|
"""
|
|
306
311
|
The html representation of the table
|
|
307
312
|
"""
|
|
308
313
|
|
|
309
314
|
html_list = []
|
|
310
|
-
if TableType.
|
|
311
|
-
ann = self.get_sub_category(TableType.
|
|
315
|
+
if TableType.HTML in self.sub_categories:
|
|
316
|
+
ann = self.get_sub_category(TableType.HTML)
|
|
312
317
|
if isinstance(ann, ContainerAnnotation):
|
|
313
318
|
if isinstance(ann.value, list):
|
|
314
319
|
html_list = copy(ann.value)
|
|
@@ -322,7 +327,7 @@ class Table(Layout):
|
|
|
322
327
|
|
|
323
328
|
return "".join(html_list)
|
|
324
329
|
|
|
325
|
-
def get_attribute_names(self) ->
|
|
330
|
+
def get_attribute_names(self) -> set[str]:
|
|
326
331
|
return (
|
|
327
332
|
set(TableType)
|
|
328
333
|
.union(super().get_attribute_names())
|
|
@@ -330,7 +335,7 @@ class Table(Layout):
|
|
|
330
335
|
)
|
|
331
336
|
|
|
332
337
|
@property
|
|
333
|
-
def csv(self) ->
|
|
338
|
+
def csv(self) -> csv:
|
|
334
339
|
"""Returns a csv-style representation of a table as list of lists of string. Cell content of cell with higher
|
|
335
340
|
row or column spans will be shown at the upper left cell tile. All other tiles covered by the cell will be left
|
|
336
341
|
as blank
|
|
@@ -355,17 +360,17 @@ class Table(Layout):
|
|
|
355
360
|
return super().text
|
|
356
361
|
|
|
357
362
|
@property
|
|
358
|
-
def text_(self) ->
|
|
363
|
+
def text_(self) -> Text_:
|
|
359
364
|
cells = self.cells
|
|
360
365
|
if not cells:
|
|
361
366
|
return super().text_
|
|
362
|
-
text:
|
|
363
|
-
words:
|
|
364
|
-
ann_ids:
|
|
365
|
-
token_classes:
|
|
366
|
-
token_tags:
|
|
367
|
-
token_class_ids:
|
|
368
|
-
token_tag_ids:
|
|
367
|
+
text: list[str] = []
|
|
368
|
+
words: list[str] = []
|
|
369
|
+
ann_ids: list[str] = []
|
|
370
|
+
token_classes: list[str] = []
|
|
371
|
+
token_tags: list[str] = []
|
|
372
|
+
token_class_ids: list[str] = []
|
|
373
|
+
token_tag_ids: list[str] = []
|
|
369
374
|
for cell in cells:
|
|
370
375
|
text.extend(cell.text_["text"]) # type: ignore
|
|
371
376
|
words.extend(cell.text_["words"]) # type: ignore
|
|
@@ -385,12 +390,12 @@ class Table(Layout):
|
|
|
385
390
|
}
|
|
386
391
|
|
|
387
392
|
@property
|
|
388
|
-
def words(self) ->
|
|
393
|
+
def words(self) -> list[ImageAnnotationBaseView]:
|
|
389
394
|
"""
|
|
390
395
|
Get a list of `ImageAnnotationBaseView` objects with `LayoutType` defined by `text_container`.
|
|
391
396
|
It will only select those among all annotations that have an entry in `Relationships.child` .
|
|
392
397
|
"""
|
|
393
|
-
all_words:
|
|
398
|
+
all_words: list[ImageAnnotationBaseView] = []
|
|
394
399
|
cells = self.cells
|
|
395
400
|
if not cells:
|
|
396
401
|
return super().words
|
|
@@ -398,12 +403,12 @@ class Table(Layout):
|
|
|
398
403
|
all_words.extend(cell.words) # type: ignore
|
|
399
404
|
return all_words
|
|
400
405
|
|
|
401
|
-
def get_ordered_words(self) ->
|
|
406
|
+
def get_ordered_words(self) -> list[ImageAnnotationBaseView]:
|
|
402
407
|
"""Returns a list of words order by reading order. Words with no reading order will not be returned"""
|
|
403
408
|
try:
|
|
404
409
|
cells = self.cells
|
|
405
410
|
all_words = []
|
|
406
|
-
cells.sort(key=lambda x: (x.
|
|
411
|
+
cells.sort(key=lambda x: (x.ROW_NUMBER, x.COLUMN_NUMBER))
|
|
407
412
|
for cell in cells:
|
|
408
413
|
all_words.extend(cell.get_ordered_words()) # type: ignore
|
|
409
414
|
return all_words
|
|
@@ -411,27 +416,42 @@ class Table(Layout):
|
|
|
411
416
|
return super().get_ordered_words()
|
|
412
417
|
|
|
413
418
|
|
|
414
|
-
IMAGE_ANNOTATION_TO_LAYOUTS:
|
|
415
|
-
**{i: Layout for i in LayoutType if (i not in {LayoutType.
|
|
416
|
-
LayoutType.
|
|
417
|
-
LayoutType.
|
|
418
|
-
LayoutType.
|
|
419
|
-
LayoutType.
|
|
420
|
-
CellType.
|
|
421
|
-
CellType.
|
|
422
|
-
CellType.
|
|
423
|
-
CellType.
|
|
419
|
+
IMAGE_ANNOTATION_TO_LAYOUTS: dict[ObjectTypes, Type[Union[Layout, Table, Word]]] = {
|
|
420
|
+
**{i: Layout for i in LayoutType if (i not in {LayoutType.TABLE, LayoutType.WORD, LayoutType.CELL})},
|
|
421
|
+
LayoutType.TABLE: Table,
|
|
422
|
+
LayoutType.TABLE_ROTATED: Table,
|
|
423
|
+
LayoutType.WORD: Word,
|
|
424
|
+
LayoutType.CELL: Cell,
|
|
425
|
+
CellType.PROJECTED_ROW_HEADER: Cell,
|
|
426
|
+
CellType.SPANNING: Cell,
|
|
427
|
+
CellType.ROW_HEADER: Cell,
|
|
428
|
+
CellType.COLUMN_HEADER: Cell,
|
|
424
429
|
}
|
|
425
430
|
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
"
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
431
|
+
|
|
432
|
+
class ImageDefaults(TypedDict):
|
|
433
|
+
"""ImageDefaults"""
|
|
434
|
+
|
|
435
|
+
text_container: LayoutType
|
|
436
|
+
floating_text_block_categories: tuple[LayoutType, ...]
|
|
437
|
+
text_block_categories: tuple[LayoutType, ...]
|
|
438
|
+
|
|
439
|
+
|
|
440
|
+
IMAGE_DEFAULTS: ImageDefaults = {
|
|
441
|
+
"text_container": LayoutType.WORD,
|
|
442
|
+
"floating_text_block_categories": (
|
|
443
|
+
LayoutType.TEXT,
|
|
444
|
+
LayoutType.TITLE,
|
|
445
|
+
LayoutType.FIGURE,
|
|
446
|
+
LayoutType.LIST,
|
|
447
|
+
),
|
|
448
|
+
"text_block_categories": (
|
|
449
|
+
LayoutType.TEXT,
|
|
450
|
+
LayoutType.TITLE,
|
|
451
|
+
LayoutType.FIGURE,
|
|
452
|
+
LayoutType.LIST,
|
|
453
|
+
LayoutType.CELL,
|
|
454
|
+
),
|
|
435
455
|
}
|
|
436
456
|
|
|
437
457
|
|
|
@@ -448,7 +468,7 @@ def ann_obj_view_factory(annotation: ImageAnnotation, text_container: ObjectType
|
|
|
448
468
|
|
|
449
469
|
# We need to handle annotations that are text containers like words
|
|
450
470
|
if annotation.category_name == text_container:
|
|
451
|
-
layout_class = IMAGE_ANNOTATION_TO_LAYOUTS[LayoutType.
|
|
471
|
+
layout_class = IMAGE_ANNOTATION_TO_LAYOUTS[LayoutType.WORD]
|
|
452
472
|
else:
|
|
453
473
|
layout_class = IMAGE_ANNOTATION_TO_LAYOUTS[annotation.category_name]
|
|
454
474
|
ann_dict = annotation.as_dict()
|
|
@@ -477,9 +497,9 @@ class Page(Image):
|
|
|
477
497
|
"""
|
|
478
498
|
|
|
479
499
|
text_container: ObjectTypes
|
|
480
|
-
floating_text_block_categories:
|
|
500
|
+
floating_text_block_categories: list[ObjectTypes]
|
|
481
501
|
image_orig: Image
|
|
482
|
-
_attribute_names:
|
|
502
|
+
_attribute_names: set[str] = {
|
|
483
503
|
"text",
|
|
484
504
|
"chunks",
|
|
485
505
|
"tables",
|
|
@@ -500,7 +520,7 @@ class Page(Image):
|
|
|
500
520
|
model_id: Optional[Union[str, Sequence[str]]] = None,
|
|
501
521
|
session_ids: Optional[Union[str, Sequence[str]]] = None,
|
|
502
522
|
ignore_inactive: bool = True,
|
|
503
|
-
) ->
|
|
523
|
+
) -> list[ImageAnnotationBaseView]:
|
|
504
524
|
"""
|
|
505
525
|
Selection of annotations from the annotation container. Filter conditions can be defined by specifying
|
|
506
526
|
the annotation_id or the category name. (Since only image annotations are currently allowed in the container,
|
|
@@ -523,9 +543,9 @@ class Page(Image):
|
|
|
523
543
|
|
|
524
544
|
if category_names is not None:
|
|
525
545
|
category_names = (
|
|
526
|
-
|
|
527
|
-
if isinstance(category_names,
|
|
528
|
-
else
|
|
546
|
+
(get_type(category_names),)
|
|
547
|
+
if isinstance(category_names, str)
|
|
548
|
+
else tuple(get_type(cat_name) for cat_name in category_names)
|
|
529
549
|
)
|
|
530
550
|
ann_ids = [annotation_ids] if isinstance(annotation_ids, str) else annotation_ids
|
|
531
551
|
service_id = [service_id] if isinstance(service_id, str) else service_id
|
|
@@ -533,60 +553,59 @@ class Page(Image):
|
|
|
533
553
|
session_id = [session_ids] if isinstance(session_ids, str) else session_ids
|
|
534
554
|
|
|
535
555
|
if ignore_inactive:
|
|
536
|
-
anns = filter(lambda x: x.active, self.annotations)
|
|
556
|
+
anns: Union[list[ImageAnnotation], filter[ImageAnnotation]] = filter(lambda x: x.active, self.annotations)
|
|
537
557
|
else:
|
|
538
|
-
anns = self.annotations
|
|
558
|
+
anns = self.annotations
|
|
539
559
|
|
|
540
560
|
if category_names is not None:
|
|
541
|
-
anns = filter(lambda x: x.category_name in category_names, anns)
|
|
561
|
+
anns = filter(lambda x: x.category_name in category_names, anns)
|
|
542
562
|
|
|
543
563
|
if ann_ids is not None:
|
|
544
|
-
anns = filter(lambda x: x.annotation_id in ann_ids, anns)
|
|
564
|
+
anns = filter(lambda x: x.annotation_id in ann_ids, anns)
|
|
545
565
|
|
|
546
566
|
if service_id is not None:
|
|
547
|
-
anns = filter(lambda x: x.generating_service in service_id, anns)
|
|
567
|
+
anns = filter(lambda x: x.generating_service in service_id, anns)
|
|
548
568
|
|
|
549
569
|
if model_id is not None:
|
|
550
|
-
anns = filter(lambda x: x.generating_model in model_id, anns)
|
|
570
|
+
anns = filter(lambda x: x.generating_model in model_id, anns)
|
|
551
571
|
|
|
552
572
|
if session_id is not None:
|
|
553
|
-
anns = filter(lambda x: x.session_id in session_id, anns)
|
|
573
|
+
anns = filter(lambda x: x.session_id in session_id, anns)
|
|
554
574
|
|
|
555
|
-
return list(anns) # type:ignore
|
|
575
|
+
return list(anns) # type: ignore
|
|
556
576
|
|
|
557
577
|
def __getattr__(self, item: str) -> Any:
|
|
558
578
|
if item not in self.get_attribute_names():
|
|
559
579
|
raise ImageError(f"Attribute {item} is not supported for {type(self)}")
|
|
560
|
-
if self.summary
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
return int(sub_cat.category_id)
|
|
580
|
+
if item in self.summary.sub_categories:
|
|
581
|
+
sub_cat = self.summary.get_sub_category(get_type(item))
|
|
582
|
+
if item != sub_cat.category_name:
|
|
583
|
+
return sub_cat.category_name
|
|
584
|
+
if isinstance(sub_cat, ContainerAnnotation):
|
|
585
|
+
return sub_cat.value
|
|
586
|
+
return sub_cat.category_id
|
|
568
587
|
return None
|
|
569
588
|
|
|
570
589
|
@property
|
|
571
|
-
def layouts(self) ->
|
|
590
|
+
def layouts(self) -> list[ImageAnnotationBaseView]:
|
|
572
591
|
"""
|
|
573
592
|
A list of a layouts. Layouts are all exactly all floating text block categories
|
|
574
593
|
"""
|
|
575
594
|
return self.get_annotation(category_names=self.floating_text_block_categories)
|
|
576
595
|
|
|
577
596
|
@property
|
|
578
|
-
def words(self) ->
|
|
597
|
+
def words(self) -> list[ImageAnnotationBaseView]:
|
|
579
598
|
"""
|
|
580
599
|
A list of a words. Word are all text containers
|
|
581
600
|
"""
|
|
582
601
|
return self.get_annotation(category_names=self.text_container)
|
|
583
602
|
|
|
584
603
|
@property
|
|
585
|
-
def tables(self) ->
|
|
604
|
+
def tables(self) -> list[ImageAnnotationBaseView]:
|
|
586
605
|
"""
|
|
587
606
|
A list of a tables.
|
|
588
607
|
"""
|
|
589
|
-
return self.get_annotation(category_names=LayoutType.
|
|
608
|
+
return self.get_annotation(category_names=LayoutType.TABLE)
|
|
590
609
|
|
|
591
610
|
@classmethod
|
|
592
611
|
def from_image(
|
|
@@ -612,13 +631,13 @@ class Page(Image):
|
|
|
612
631
|
"""
|
|
613
632
|
|
|
614
633
|
if text_container is None:
|
|
615
|
-
text_container = IMAGE_DEFAULTS["text_container"]
|
|
634
|
+
text_container = IMAGE_DEFAULTS["text_container"]
|
|
616
635
|
|
|
617
636
|
if not floating_text_block_categories:
|
|
618
|
-
floating_text_block_categories =
|
|
637
|
+
floating_text_block_categories = IMAGE_DEFAULTS["floating_text_block_categories"]
|
|
619
638
|
|
|
620
|
-
if include_residual_text_container and LayoutType.
|
|
621
|
-
floating_text_block_categories
|
|
639
|
+
if include_residual_text_container and LayoutType.LINE not in floating_text_block_categories:
|
|
640
|
+
floating_text_block_categories = tuple(floating_text_block_categories) + (LayoutType.LINE,)
|
|
622
641
|
|
|
623
642
|
img_kwargs = image_orig.as_dict()
|
|
624
643
|
page = cls(
|
|
@@ -646,18 +665,23 @@ class Page(Image):
|
|
|
646
665
|
if image_dict:
|
|
647
666
|
image = Image.from_dict(**image_dict)
|
|
648
667
|
layout_ann.image = cls.from_image(
|
|
649
|
-
image,
|
|
668
|
+
image_orig=image,
|
|
669
|
+
text_container=text_container,
|
|
670
|
+
floating_text_block_categories=floating_text_block_categories,
|
|
671
|
+
include_residual_text_container=include_residual_text_container,
|
|
672
|
+
base_page=page,
|
|
650
673
|
)
|
|
651
674
|
layout_ann.base_page = base_page if base_page is not None else page
|
|
652
675
|
page.dump(layout_ann)
|
|
653
676
|
if summary_dict := img_kwargs.get("_summary"):
|
|
654
|
-
page.summary =
|
|
677
|
+
page.summary = CategoryAnnotation.from_dict(**summary_dict)
|
|
678
|
+
page.summary.category_name = SummaryType.SUMMARY
|
|
655
679
|
page.floating_text_block_categories = floating_text_block_categories # type: ignore
|
|
656
|
-
page.text_container = text_container
|
|
680
|
+
page.text_container = text_container
|
|
657
681
|
page.include_residual_text_container = include_residual_text_container
|
|
658
682
|
return page
|
|
659
683
|
|
|
660
|
-
def _order(self, block: str) ->
|
|
684
|
+
def _order(self, block: str) -> list[ImageAnnotationBaseView]:
|
|
661
685
|
blocks_with_order = [layout for layout in getattr(self, block) if layout.reading_order is not None]
|
|
662
686
|
blocks_with_order.sort(key=lambda x: x.reading_order)
|
|
663
687
|
return blocks_with_order
|
|
@@ -678,18 +702,18 @@ class Page(Image):
|
|
|
678
702
|
return self._make_text()
|
|
679
703
|
|
|
680
704
|
@property
|
|
681
|
-
def text_(self) ->
|
|
705
|
+
def text_(self) -> Text_:
|
|
682
706
|
"""Returns a dict `{"text": text string,
|
|
683
707
|
"text_list": list of single words,
|
|
684
708
|
"annotation_ids": word annotation ids`"""
|
|
685
709
|
block_with_order = self._order("layouts")
|
|
686
|
-
text:
|
|
687
|
-
words:
|
|
688
|
-
ann_ids:
|
|
689
|
-
token_classes:
|
|
690
|
-
token_tags:
|
|
691
|
-
token_class_ids:
|
|
692
|
-
token_tag_ids:
|
|
710
|
+
text: list[str] = []
|
|
711
|
+
words: list[str] = []
|
|
712
|
+
ann_ids: list[str] = []
|
|
713
|
+
token_classes: list[str] = []
|
|
714
|
+
token_tags: list[str] = []
|
|
715
|
+
token_class_ids: list[str] = []
|
|
716
|
+
token_tag_ids: list[str] = []
|
|
693
717
|
for block in block_with_order:
|
|
694
718
|
text.append(block.text_["text"]) # type: ignore
|
|
695
719
|
words.extend(block.text_["words"]) # type: ignore
|
|
@@ -708,7 +732,7 @@ class Page(Image):
|
|
|
708
732
|
"token_tag_ids": token_tag_ids,
|
|
709
733
|
}
|
|
710
734
|
|
|
711
|
-
def get_layout_context(self, annotation_id: str, context_size: int = 3) ->
|
|
735
|
+
def get_layout_context(self, annotation_id: str, context_size: int = 3) -> list[ImageAnnotationBaseView]:
|
|
712
736
|
"""For a given `annotation_id` get a list of `ImageAnnotation` that are nearby in terms of reading order.
|
|
713
737
|
For a given context_size it will return all layouts with reading_order between
|
|
714
738
|
reading_order(annoation_id)-context_size and reading_order(annoation_id)-context_size.
|
|
@@ -731,7 +755,7 @@ class Page(Image):
|
|
|
731
755
|
]
|
|
732
756
|
|
|
733
757
|
@property
|
|
734
|
-
def chunks(self) ->
|
|
758
|
+
def chunks(self) -> Chunks:
|
|
735
759
|
"""
|
|
736
760
|
:return: Returns a "chunk" of a layout element or a table as 6-tuple containing
|
|
737
761
|
|
|
@@ -783,7 +807,7 @@ class Page(Image):
|
|
|
783
807
|
ignore_default_token_class: bool = False,
|
|
784
808
|
interactive: bool = False,
|
|
785
809
|
**debug_kwargs: str,
|
|
786
|
-
) -> Optional[
|
|
810
|
+
) -> Optional[PixelValues]:
|
|
787
811
|
"""
|
|
788
812
|
Display a page with detected bounding boxes of various types.
|
|
789
813
|
|
|
@@ -813,7 +837,7 @@ class Page(Image):
|
|
|
813
837
|
:return: If `interactive=False` will return a numpy array.
|
|
814
838
|
"""
|
|
815
839
|
|
|
816
|
-
category_names_list:
|
|
840
|
+
category_names_list: list[Union[str, None]] = []
|
|
817
841
|
box_stack = []
|
|
818
842
|
cells_found = False
|
|
819
843
|
|
|
@@ -836,15 +860,15 @@ class Page(Image):
|
|
|
836
860
|
if show_tables and not debug_kwargs:
|
|
837
861
|
for table in self.tables:
|
|
838
862
|
box_stack.append(table.bbox)
|
|
839
|
-
category_names_list.append(LayoutType.
|
|
863
|
+
category_names_list.append(LayoutType.TABLE.value)
|
|
840
864
|
if show_cells:
|
|
841
865
|
for cell in table.cells:
|
|
842
866
|
if cell.category_name in {
|
|
843
|
-
LayoutType.
|
|
844
|
-
CellType.
|
|
845
|
-
CellType.
|
|
846
|
-
CellType.
|
|
847
|
-
CellType.
|
|
867
|
+
LayoutType.CELL,
|
|
868
|
+
CellType.PROJECTED_ROW_HEADER,
|
|
869
|
+
CellType.SPANNING,
|
|
870
|
+
CellType.ROW_HEADER,
|
|
871
|
+
CellType.COLUMN_HEADER,
|
|
848
872
|
}:
|
|
849
873
|
cells_found = True
|
|
850
874
|
box_stack.append(cell.bbox)
|
|
@@ -872,7 +896,7 @@ class Page(Image):
|
|
|
872
896
|
for table in self.tables:
|
|
873
897
|
all_words.extend(table.words)
|
|
874
898
|
if not all_words:
|
|
875
|
-
all_words = self.get_annotation(category_names=LayoutType.
|
|
899
|
+
all_words = self.get_annotation(category_names=LayoutType.WORD)
|
|
876
900
|
if not ignore_default_token_class:
|
|
877
901
|
for word in all_words:
|
|
878
902
|
box_stack.append(word.bbox)
|
|
@@ -882,7 +906,7 @@ class Page(Image):
|
|
|
882
906
|
category_names_list.append(word.token_tag.value if word.token_tag is not None else None)
|
|
883
907
|
else:
|
|
884
908
|
for word in all_words:
|
|
885
|
-
if word.token_class is not None and word.token_class != TokenClasses.
|
|
909
|
+
if word.token_class is not None and word.token_class != TokenClasses.OTHER:
|
|
886
910
|
box_stack.append(word.bbox)
|
|
887
911
|
if show_token_class:
|
|
888
912
|
category_names_list.append(word.token_class.value if word.token_class is not None else None)
|
|
@@ -915,7 +939,7 @@ class Page(Image):
|
|
|
915
939
|
return None
|
|
916
940
|
|
|
917
941
|
@classmethod
|
|
918
|
-
def get_attribute_names(cls) ->
|
|
942
|
+
def get_attribute_names(cls) -> set[str]:
|
|
919
943
|
"""
|
|
920
944
|
:return: A set of registered attributes.
|
|
921
945
|
"""
|
|
@@ -945,9 +969,9 @@ class Page(Image):
|
|
|
945
969
|
self,
|
|
946
970
|
image_to_json: bool = True,
|
|
947
971
|
highest_hierarchy_only: bool = False,
|
|
948
|
-
path: Optional[
|
|
972
|
+
path: Optional[PathLikeOrStr] = None,
|
|
949
973
|
dry: bool = False,
|
|
950
|
-
) -> Optional[
|
|
974
|
+
) -> Optional[Union[ImageDict, str]]:
|
|
951
975
|
"""
|
|
952
976
|
Export image as dictionary. As numpy array cannot be serialized `image` values will be converted into
|
|
953
977
|
base64 encodings.
|
|
@@ -967,7 +991,7 @@ class Page(Image):
|
|
|
967
991
|
cls,
|
|
968
992
|
file_path: str,
|
|
969
993
|
text_container: Optional[ObjectTypes] = None,
|
|
970
|
-
floating_text_block_categories: Optional[
|
|
994
|
+
floating_text_block_categories: Optional[list[ObjectTypes]] = None,
|
|
971
995
|
include_residual_text_container: bool = True,
|
|
972
996
|
) -> Page:
|
|
973
997
|
"""Reading JSON file and building a `Page` object with given config.
|
|
@@ -981,16 +1005,16 @@ class Page(Image):
|
|
|
981
1005
|
image = Image.from_file(file_path)
|
|
982
1006
|
return cls.from_image(image, text_container, floating_text_block_categories, include_residual_text_container)
|
|
983
1007
|
|
|
984
|
-
def get_token(self) ->
|
|
1008
|
+
def get_token(self) -> list[Mapping[str, str]]:
|
|
985
1009
|
"""Return a list of tuples with word and non default token tags"""
|
|
986
1010
|
block_with_order = self._order("layouts")
|
|
987
1011
|
all_words = []
|
|
988
1012
|
for block in block_with_order:
|
|
989
1013
|
all_words.extend(block.get_ordered_words()) # type: ignore
|
|
990
1014
|
return [
|
|
991
|
-
{"word": word.
|
|
1015
|
+
{"word": word.CHARACTERS, "entity": word.TOKEN_TAG}
|
|
992
1016
|
for word in all_words
|
|
993
|
-
if word.
|
|
1017
|
+
if word.TOKEN_TAG not in (TokenClasses.OTHER, None)
|
|
994
1018
|
]
|
|
995
1019
|
|
|
996
1020
|
def __copy__(self) -> Page:
|