deepdoctection 0.32__py3-none-any.whl → 0.34__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic. Click here for more details.
- deepdoctection/__init__.py +8 -25
- deepdoctection/analyzer/dd.py +84 -71
- deepdoctection/dataflow/common.py +9 -5
- deepdoctection/dataflow/custom.py +5 -5
- deepdoctection/dataflow/custom_serialize.py +75 -18
- deepdoctection/dataflow/parallel_map.py +3 -3
- deepdoctection/dataflow/serialize.py +4 -4
- deepdoctection/dataflow/stats.py +3 -3
- deepdoctection/datapoint/annotation.py +78 -56
- deepdoctection/datapoint/box.py +7 -7
- deepdoctection/datapoint/convert.py +6 -6
- deepdoctection/datapoint/image.py +157 -75
- deepdoctection/datapoint/view.py +175 -151
- deepdoctection/datasets/adapter.py +30 -24
- deepdoctection/datasets/base.py +10 -10
- deepdoctection/datasets/dataflow_builder.py +3 -3
- deepdoctection/datasets/info.py +23 -25
- deepdoctection/datasets/instances/doclaynet.py +48 -49
- deepdoctection/datasets/instances/fintabnet.py +44 -45
- deepdoctection/datasets/instances/funsd.py +23 -23
- deepdoctection/datasets/instances/iiitar13k.py +8 -8
- deepdoctection/datasets/instances/layouttest.py +2 -2
- deepdoctection/datasets/instances/publaynet.py +3 -3
- deepdoctection/datasets/instances/pubtables1m.py +18 -18
- deepdoctection/datasets/instances/pubtabnet.py +30 -29
- deepdoctection/datasets/instances/rvlcdip.py +28 -29
- deepdoctection/datasets/instances/xfund.py +51 -30
- deepdoctection/datasets/save.py +6 -6
- deepdoctection/eval/accmetric.py +32 -33
- deepdoctection/eval/base.py +8 -9
- deepdoctection/eval/cocometric.py +13 -12
- deepdoctection/eval/eval.py +32 -26
- deepdoctection/eval/tedsmetric.py +16 -12
- deepdoctection/eval/tp_eval_callback.py +7 -16
- deepdoctection/extern/base.py +339 -134
- deepdoctection/extern/d2detect.py +69 -89
- deepdoctection/extern/deskew.py +11 -10
- deepdoctection/extern/doctrocr.py +81 -64
- deepdoctection/extern/fastlang.py +23 -16
- deepdoctection/extern/hfdetr.py +53 -38
- deepdoctection/extern/hflayoutlm.py +216 -155
- deepdoctection/extern/hflm.py +35 -30
- deepdoctection/extern/model.py +433 -255
- deepdoctection/extern/pdftext.py +15 -15
- deepdoctection/extern/pt/ptutils.py +4 -2
- deepdoctection/extern/tessocr.py +39 -38
- deepdoctection/extern/texocr.py +14 -16
- deepdoctection/extern/tp/tfutils.py +16 -2
- deepdoctection/extern/tp/tpcompat.py +11 -7
- deepdoctection/extern/tp/tpfrcnn/config/config.py +4 -4
- deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +1 -1
- deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +5 -5
- deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +6 -6
- deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +4 -4
- deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +5 -3
- deepdoctection/extern/tp/tpfrcnn/preproc.py +5 -5
- deepdoctection/extern/tpdetect.py +40 -45
- deepdoctection/mapper/cats.py +36 -40
- deepdoctection/mapper/cocostruct.py +16 -12
- deepdoctection/mapper/d2struct.py +22 -22
- deepdoctection/mapper/hfstruct.py +7 -7
- deepdoctection/mapper/laylmstruct.py +22 -24
- deepdoctection/mapper/maputils.py +9 -10
- deepdoctection/mapper/match.py +33 -2
- deepdoctection/mapper/misc.py +6 -7
- deepdoctection/mapper/pascalstruct.py +4 -4
- deepdoctection/mapper/prodigystruct.py +6 -6
- deepdoctection/mapper/pubstruct.py +84 -92
- deepdoctection/mapper/tpstruct.py +3 -3
- deepdoctection/mapper/xfundstruct.py +33 -33
- deepdoctection/pipe/anngen.py +39 -14
- deepdoctection/pipe/base.py +68 -99
- deepdoctection/pipe/common.py +181 -85
- deepdoctection/pipe/concurrency.py +14 -10
- deepdoctection/pipe/doctectionpipe.py +24 -21
- deepdoctection/pipe/language.py +20 -25
- deepdoctection/pipe/layout.py +18 -16
- deepdoctection/pipe/lm.py +49 -47
- deepdoctection/pipe/order.py +63 -65
- deepdoctection/pipe/refine.py +102 -109
- deepdoctection/pipe/segment.py +157 -162
- deepdoctection/pipe/sub_layout.py +50 -40
- deepdoctection/pipe/text.py +37 -36
- deepdoctection/pipe/transform.py +19 -16
- deepdoctection/train/d2_frcnn_train.py +27 -25
- deepdoctection/train/hf_detr_train.py +22 -18
- deepdoctection/train/hf_layoutlm_train.py +49 -48
- deepdoctection/train/tp_frcnn_train.py +10 -11
- deepdoctection/utils/concurrency.py +1 -1
- deepdoctection/utils/context.py +13 -6
- deepdoctection/utils/develop.py +4 -4
- deepdoctection/utils/env_info.py +52 -14
- deepdoctection/utils/file_utils.py +6 -11
- deepdoctection/utils/fs.py +41 -14
- deepdoctection/utils/identifier.py +2 -2
- deepdoctection/utils/logger.py +15 -15
- deepdoctection/utils/metacfg.py +7 -7
- deepdoctection/utils/pdf_utils.py +39 -14
- deepdoctection/utils/settings.py +188 -182
- deepdoctection/utils/tqdm.py +1 -1
- deepdoctection/utils/transform.py +14 -9
- deepdoctection/utils/types.py +104 -0
- deepdoctection/utils/utils.py +7 -7
- deepdoctection/utils/viz.py +70 -69
- {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/METADATA +7 -4
- deepdoctection-0.34.dist-info/RECORD +146 -0
- {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/WHEEL +1 -1
- deepdoctection/utils/detection_types.py +0 -68
- deepdoctection-0.32.dist-info/RECORD +0 -146
- {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/LICENSE +0 -0
- {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/top_level.txt +0 -0
|
@@ -20,17 +20,16 @@ Module for mapping annotations in pubtabnet style structure
|
|
|
20
20
|
"""
|
|
21
21
|
import itertools
|
|
22
22
|
import os
|
|
23
|
-
from typing import
|
|
23
|
+
from typing import Iterable, Optional, Sequence
|
|
24
24
|
|
|
25
25
|
import numpy as np
|
|
26
26
|
|
|
27
27
|
from ..datapoint import BoundingBox, CategoryAnnotation, ContainerAnnotation, ImageAnnotation
|
|
28
|
-
from ..datapoint.annotation import SummaryAnnotation
|
|
29
28
|
from ..datapoint.convert import convert_pdf_bytes_to_np_array_v2
|
|
30
29
|
from ..datapoint.image import Image
|
|
31
|
-
from ..utils.detection_types import JsonDict
|
|
32
30
|
from ..utils.fs import load_bytes_from_pdf_file, load_image_from_file
|
|
33
|
-
from ..utils.settings import CellType, LayoutType, Relationships, TableType, WordType
|
|
31
|
+
from ..utils.settings import CellType, LayoutType, ObjectTypes, Relationships, SummaryType, TableType, WordType
|
|
32
|
+
from ..utils.types import JsonDict, PubtabnetDict
|
|
34
33
|
from ..utils.utils import is_file_extension
|
|
35
34
|
from .maputils import MappingContextManager, curry, maybe_get_fake_score
|
|
36
35
|
|
|
@@ -52,14 +51,14 @@ def _convert_boxes(dp: JsonDict, height: int) -> JsonDict:
|
|
|
52
51
|
return dp
|
|
53
52
|
|
|
54
53
|
|
|
55
|
-
def _get_table_annotation(dp: JsonDict, category_id:
|
|
54
|
+
def _get_table_annotation(dp: JsonDict, category_id: int) -> ImageAnnotation:
|
|
56
55
|
ulx, uly, lrx, lry = list(map(float, dp["bbox"]))
|
|
57
56
|
bbox = BoundingBox(absolute_coords=True, ulx=ulx, uly=uly, lrx=lrx, lry=lry)
|
|
58
|
-
annotation = ImageAnnotation(category_name=LayoutType.
|
|
57
|
+
annotation = ImageAnnotation(category_name=LayoutType.TABLE, bounding_box=bbox, category_id=category_id)
|
|
59
58
|
return annotation
|
|
60
59
|
|
|
61
60
|
|
|
62
|
-
def _cell_token(html: Sequence[str]) ->
|
|
61
|
+
def _cell_token(html: Sequence[str]) -> list[list[int]]:
|
|
63
62
|
index_rows = [i for i, tag in enumerate(html) if tag == "<tr>"]
|
|
64
63
|
index_cells = [i for i, tag in enumerate(html) if tag in ("<td>", ">")]
|
|
65
64
|
index_rows_tmp = [(index_rows[i], index_rows[i + 1]) for i in range(len(index_rows) - 1)]
|
|
@@ -72,7 +71,7 @@ def _cell_token(html: Sequence[str]) -> List[List[int]]:
|
|
|
72
71
|
return index_cells_tmp
|
|
73
72
|
|
|
74
73
|
|
|
75
|
-
def _item_spans(html: Sequence[str], index_cells: Sequence[Sequence[int]], item: str) ->
|
|
74
|
+
def _item_spans(html: Sequence[str], index_cells: Sequence[Sequence[int]], item: str) -> list[list[int]]:
|
|
76
75
|
item_spans = [
|
|
77
76
|
[
|
|
78
77
|
(
|
|
@@ -102,7 +101,7 @@ def _end_of_header(html: Sequence[str]) -> int:
|
|
|
102
101
|
return 0
|
|
103
102
|
|
|
104
103
|
|
|
105
|
-
def tile_table(row_spans: Sequence[Sequence[int]], col_spans: Sequence[Sequence[int]]) ->
|
|
104
|
+
def tile_table(row_spans: Sequence[Sequence[int]], col_spans: Sequence[Sequence[int]]) -> list[list[int]]:
|
|
106
105
|
"""
|
|
107
106
|
Tiles a table according the row and column span scheme. A table can be represented as a list of list, where each
|
|
108
107
|
inner list has the same length. Each cell with a cell id can be located according to their row and column spans in
|
|
@@ -153,28 +152,25 @@ def tile_table(row_spans: Sequence[Sequence[int]], col_spans: Sequence[Sequence[
|
|
|
153
152
|
return tiling
|
|
154
153
|
|
|
155
154
|
|
|
156
|
-
def _add_items(
|
|
157
|
-
|
|
158
|
-
|
|
155
|
+
def _add_items(
|
|
156
|
+
image: Image, item_type: str, categories_name_as_key: dict[ObjectTypes, int], pubtables_like: bool
|
|
157
|
+
) -> Image:
|
|
158
|
+
item_number = CellType.ROW_NUMBER if item_type == LayoutType.ROW else CellType.COLUMN_NUMBER
|
|
159
|
+
item_span = CellType.ROW_SPAN if item_type == LayoutType.ROW else CellType.COLUMN_SPAN
|
|
159
160
|
|
|
160
|
-
summary_key = TableType.
|
|
161
|
+
summary_key = TableType.NUMBER_OF_ROWS if item_type == LayoutType.ROW else TableType.NUMBER_OF_COLUMNS
|
|
161
162
|
|
|
162
|
-
|
|
163
|
+
category_item = image.summary.get_sub_category(summary_key)
|
|
164
|
+
number_of_items = category_item.category_id
|
|
163
165
|
|
|
164
|
-
|
|
165
|
-
category_item = image.summary.get_sub_category(summary_key)
|
|
166
|
-
number_of_items = int(category_item.category_id)
|
|
167
|
-
|
|
168
|
-
cells = image.get_annotation(category_names=LayoutType.cell)
|
|
166
|
+
cells = image.get_annotation(category_names=LayoutType.CELL)
|
|
169
167
|
table: ImageAnnotation
|
|
170
168
|
|
|
171
169
|
for item_num in range(1, number_of_items + 1):
|
|
172
170
|
cell_item = list(
|
|
173
|
-
filter(
|
|
174
|
-
lambda x: x.get_sub_category(item_number).category_id == str(item_num), cells # pylint: disable=W0640
|
|
175
|
-
)
|
|
171
|
+
filter(lambda x: x.get_sub_category(item_number).category_id == item_num, cells) # pylint: disable=W0640
|
|
176
172
|
)
|
|
177
|
-
cell_item = list(filter(lambda x: x.get_sub_category(item_span).category_id ==
|
|
173
|
+
cell_item = list(filter(lambda x: x.get_sub_category(item_span).category_id == 1, cell_item))
|
|
178
174
|
if cell_item:
|
|
179
175
|
ulx = min(cell.bounding_box.ulx for cell in cell_item if isinstance(cell.bounding_box, BoundingBox))
|
|
180
176
|
|
|
@@ -185,12 +181,12 @@ def _add_items(image: Image, item_type: str, categories_name_as_key: Dict[str, s
|
|
|
185
181
|
lry = max(cell.bounding_box.lry for cell in cell_item if isinstance(cell.bounding_box, BoundingBox))
|
|
186
182
|
|
|
187
183
|
if pubtables_like:
|
|
188
|
-
tables = image.get_annotation(category_names=LayoutType.
|
|
184
|
+
tables = image.get_annotation(category_names=LayoutType.TABLE)
|
|
189
185
|
if not tables:
|
|
190
186
|
raise ValueError("pubtables_like = True requires table")
|
|
191
187
|
table = tables[0]
|
|
192
188
|
|
|
193
|
-
if item_type == LayoutType.
|
|
189
|
+
if item_type == LayoutType.ROW:
|
|
194
190
|
if table.bounding_box:
|
|
195
191
|
ulx = table.bounding_box.ulx + 1.0
|
|
196
192
|
lrx = table.bounding_box.lrx - 1.0
|
|
@@ -200,22 +196,22 @@ def _add_items(image: Image, item_type: str, categories_name_as_key: Dict[str, s
|
|
|
200
196
|
lry = table.bounding_box.lry - 1.0
|
|
201
197
|
|
|
202
198
|
item_ann = ImageAnnotation(
|
|
203
|
-
category_id=categories_name_as_key[TableType.
|
|
204
|
-
category_name=TableType.
|
|
199
|
+
category_id=categories_name_as_key[TableType.ITEM],
|
|
200
|
+
category_name=TableType.ITEM,
|
|
205
201
|
bounding_box=BoundingBox(absolute_coords=True, ulx=ulx, uly=uly, lrx=lrx, lry=lry),
|
|
206
202
|
)
|
|
207
203
|
item_sub_ann = CategoryAnnotation(category_name=item_type)
|
|
208
|
-
item_ann.dump_sub_category(TableType.
|
|
204
|
+
item_ann.dump_sub_category(TableType.ITEM, item_sub_ann, image.image_id)
|
|
209
205
|
image.dump(item_ann)
|
|
210
206
|
|
|
211
207
|
if pubtables_like: # pubtables_like:
|
|
212
|
-
items = image.get_annotation(category_names=TableType.
|
|
213
|
-
item_type_anns = [ann for ann in items if ann.get_sub_category(TableType.
|
|
208
|
+
items = image.get_annotation(category_names=TableType.ITEM)
|
|
209
|
+
item_type_anns = [ann for ann in items if ann.get_sub_category(TableType.ITEM).category_name == item_type]
|
|
214
210
|
item_type_anns.sort(
|
|
215
|
-
key=lambda x: (x.bounding_box.cx if item_type == LayoutType.
|
|
211
|
+
key=lambda x: (x.bounding_box.cx if item_type == LayoutType.COLUMN else x.bounding_box.cy) # type: ignore
|
|
216
212
|
)
|
|
217
213
|
if table.bounding_box:
|
|
218
|
-
tmp_item_xy = table.bounding_box.uly + 1.0 if item_type == LayoutType.
|
|
214
|
+
tmp_item_xy = table.bounding_box.uly + 1.0 if item_type == LayoutType.ROW else table.bounding_box.ulx + 1.0
|
|
219
215
|
for idx, item in enumerate(item_type_anns):
|
|
220
216
|
with MappingContextManager(
|
|
221
217
|
dp_name=image.file_name,
|
|
@@ -230,22 +226,22 @@ def _add_items(image: Image, item_type: str, categories_name_as_key: Dict[str, s
|
|
|
230
226
|
if next_box:
|
|
231
227
|
tmp_next_item_xy = (
|
|
232
228
|
(box.lry + next_box.uly) / 2
|
|
233
|
-
if item_type == LayoutType.
|
|
229
|
+
if item_type == LayoutType.ROW
|
|
234
230
|
else (box.lrx + next_box.ulx) / 2
|
|
235
231
|
)
|
|
236
232
|
else:
|
|
237
233
|
if table.bounding_box:
|
|
238
234
|
tmp_next_item_xy = (
|
|
239
235
|
table.bounding_box.lry - 1.0
|
|
240
|
-
if item_type == LayoutType.
|
|
236
|
+
if item_type == LayoutType.ROW
|
|
241
237
|
else table.bounding_box.lrx - 1.0
|
|
242
238
|
)
|
|
243
239
|
|
|
244
240
|
new_embedding_box = BoundingBox(
|
|
245
|
-
ulx=box.ulx if item_type == LayoutType.
|
|
246
|
-
uly=tmp_item_xy if item_type == LayoutType.
|
|
247
|
-
lrx=box.lrx if item_type == LayoutType.
|
|
248
|
-
lry=tmp_next_item_xy if item_type == LayoutType.
|
|
241
|
+
ulx=box.ulx if item_type == LayoutType.ROW else tmp_item_xy,
|
|
242
|
+
uly=tmp_item_xy if item_type == LayoutType.ROW else box.uly,
|
|
243
|
+
lrx=box.lrx if item_type == LayoutType.ROW else tmp_next_item_xy,
|
|
244
|
+
lry=tmp_next_item_xy if item_type == LayoutType.ROW else box.lry,
|
|
249
245
|
absolute_coords=True,
|
|
250
246
|
)
|
|
251
247
|
item.bounding_box = new_embedding_box
|
|
@@ -255,7 +251,7 @@ def _add_items(image: Image, item_type: str, categories_name_as_key: Dict[str, s
|
|
|
255
251
|
return image
|
|
256
252
|
|
|
257
253
|
|
|
258
|
-
def row_col_cell_ids(tiling:
|
|
254
|
+
def row_col_cell_ids(tiling: list[list[int]]) -> list[tuple[int, int, int]]:
|
|
259
255
|
"""
|
|
260
256
|
Infers absolute rows and columns for every cell from the tiling of a table.
|
|
261
257
|
|
|
@@ -271,7 +267,7 @@ def row_col_cell_ids(tiling: List[List[int]]) -> List[Tuple[int, int, int]]:
|
|
|
271
267
|
return rows_col_cell_ids
|
|
272
268
|
|
|
273
269
|
|
|
274
|
-
def embedding_in_image(dp: Image, html:
|
|
270
|
+
def embedding_in_image(dp: Image, html: list[str], categories_name_as_key: dict[ObjectTypes, int]) -> Image:
|
|
275
271
|
"""
|
|
276
272
|
Generating an image, that resembles the output of an analyzer. The layout of the image is table spanning
|
|
277
273
|
the full page, i.e. there is one table image annotation. Moreover, the table annotation has an image, with cells
|
|
@@ -286,8 +282,8 @@ def embedding_in_image(dp: Image, html: List[str], categories_name_as_key: Dict[
|
|
|
286
282
|
image.image = dp.image
|
|
287
283
|
image.set_width_height(dp.width, dp.height)
|
|
288
284
|
table_ann = ImageAnnotation(
|
|
289
|
-
category_name=LayoutType.
|
|
290
|
-
category_id=categories_name_as_key[LayoutType.
|
|
285
|
+
category_name=LayoutType.TABLE,
|
|
286
|
+
category_id=categories_name_as_key[LayoutType.TABLE],
|
|
291
287
|
bounding_box=BoundingBox(absolute_coords=True, ulx=0.0, uly=0.0, lrx=dp.width, lry=dp.height),
|
|
292
288
|
)
|
|
293
289
|
image.dump(table_ann)
|
|
@@ -297,20 +293,20 @@ def embedding_in_image(dp: Image, html: List[str], categories_name_as_key: Dict[
|
|
|
297
293
|
# node.
|
|
298
294
|
html.insert(0, "<table>")
|
|
299
295
|
html.append("</table>")
|
|
300
|
-
if CellType.
|
|
296
|
+
if CellType.HEADER not in categories_name_as_key:
|
|
301
297
|
html.remove("<thead>")
|
|
302
298
|
html.remove("</thead>")
|
|
303
299
|
if "<tbody>" in html and "</tbody>" in html:
|
|
304
300
|
html.remove("<tbody>")
|
|
305
301
|
html.remove("</tbody>")
|
|
306
302
|
|
|
307
|
-
html_ann = ContainerAnnotation(category_name=TableType.
|
|
308
|
-
table_ann.dump_sub_category(TableType.
|
|
303
|
+
html_ann = ContainerAnnotation(category_name=TableType.HTML, value=html)
|
|
304
|
+
table_ann.dump_sub_category(TableType.HTML, html_ann)
|
|
309
305
|
for ann in dp.get_annotation():
|
|
310
306
|
image.dump(ann)
|
|
311
307
|
assert table_ann.image
|
|
312
308
|
table_ann.image.dump(ann)
|
|
313
|
-
table_ann.dump_relationship(Relationships.
|
|
309
|
+
table_ann.dump_relationship(Relationships.CHILD, ann.annotation_id)
|
|
314
310
|
|
|
315
311
|
return image
|
|
316
312
|
|
|
@@ -329,8 +325,8 @@ def nth_index(iterable: Iterable[str], value: str, n: int) -> Optional[int]:
|
|
|
329
325
|
|
|
330
326
|
|
|
331
327
|
def pub_to_image_uncur( # pylint: disable=R0914
|
|
332
|
-
dp:
|
|
333
|
-
categories_name_as_key:
|
|
328
|
+
dp: PubtabnetDict,
|
|
329
|
+
categories_name_as_key: dict[ObjectTypes, int],
|
|
334
330
|
load_image: bool,
|
|
335
331
|
fake_score: bool,
|
|
336
332
|
rows_and_cols: bool,
|
|
@@ -342,7 +338,7 @@ def pub_to_image_uncur( # pylint: disable=R0914
|
|
|
342
338
|
Map a datapoint of annotation structure as given in the Pubtabnet dataset to an Image structure.
|
|
343
339
|
<https://github.com/ibm-aur-nlp/PubTabNet>
|
|
344
340
|
|
|
345
|
-
:param dp: A datapoint in serialized
|
|
341
|
+
:param dp: A datapoint in serialized Pubtabnet format.
|
|
346
342
|
:param categories_name_as_key: A dict of categories, e.g. DatasetCategories.get_categories(name_as_key=True)
|
|
347
343
|
:param load_image: If `True` it will load image to `Image.image`
|
|
348
344
|
:param fake_score: If dp does not contain a score, a fake score with uniform random variables in (0,1)
|
|
@@ -407,60 +403,56 @@ def pub_to_image_uncur( # pylint: disable=R0914
|
|
|
407
403
|
|
|
408
404
|
table_ann: Optional[ImageAnnotation] = None
|
|
409
405
|
if is_fintabnet: # cannot use for synthetic table ann creation
|
|
410
|
-
table_ann = _get_table_annotation(dp, categories_name_as_key[LayoutType.
|
|
406
|
+
table_ann = _get_table_annotation(dp, categories_name_as_key[LayoutType.TABLE])
|
|
411
407
|
image.dump(table_ann)
|
|
412
408
|
|
|
413
|
-
for idx,
|
|
409
|
+
for idx, (row_col_cell_id, cell, row_span, col_span) in enumerate(
|
|
414
410
|
zip(rows_cols_cell_ids[::-1], dp["html"]["cells"][::-1], row_spans[::-1], col_spans[::-1])
|
|
415
411
|
):
|
|
416
|
-
row_col_cell_id = value[0]
|
|
417
412
|
row_number, col_number, cell_id = row_col_cell_id[0], row_col_cell_id[1], row_col_cell_id[2]
|
|
418
|
-
cell = value[1]
|
|
419
|
-
row_span = value[2]
|
|
420
|
-
col_span = value[3]
|
|
421
413
|
|
|
422
414
|
if "bbox" in cell: # empty cells have no box
|
|
423
415
|
ulx, uly, lrx, lry = list(map(float, cell["bbox"]))
|
|
424
416
|
cell_bounding_box = BoundingBox(absolute_coords=True, ulx=ulx, uly=uly, lrx=lrx, lry=lry)
|
|
425
417
|
cell_ann = ImageAnnotation(
|
|
426
|
-
category_name=LayoutType.
|
|
418
|
+
category_name=LayoutType.CELL,
|
|
427
419
|
bounding_box=cell_bounding_box,
|
|
428
|
-
category_id=categories_name_as_key[LayoutType.
|
|
420
|
+
category_id=categories_name_as_key[LayoutType.CELL],
|
|
429
421
|
score=maybe_get_fake_score(fake_score),
|
|
430
422
|
)
|
|
431
423
|
cell_ann.dump_sub_category(
|
|
432
|
-
CellType.
|
|
433
|
-
CategoryAnnotation(category_name=CellType.
|
|
424
|
+
CellType.ROW_NUMBER,
|
|
425
|
+
CategoryAnnotation(category_name=CellType.ROW_NUMBER, category_id=row_number),
|
|
434
426
|
image.image_id,
|
|
435
427
|
)
|
|
436
428
|
cell_ann.dump_sub_category(
|
|
437
|
-
CellType.
|
|
438
|
-
CategoryAnnotation(category_name=CellType.
|
|
429
|
+
CellType.COLUMN_NUMBER,
|
|
430
|
+
CategoryAnnotation(category_name=CellType.COLUMN_NUMBER, category_id=col_number),
|
|
439
431
|
image.image_id,
|
|
440
432
|
)
|
|
441
433
|
cell_ann.dump_sub_category(
|
|
442
|
-
CellType.
|
|
443
|
-
CategoryAnnotation(category_name=CellType.
|
|
434
|
+
CellType.ROW_SPAN,
|
|
435
|
+
CategoryAnnotation(category_name=CellType.ROW_SPAN, category_id=row_span), # type: ignore
|
|
444
436
|
image.image_id,
|
|
445
437
|
)
|
|
446
438
|
cell_ann.dump_sub_category(
|
|
447
|
-
CellType.
|
|
448
|
-
CategoryAnnotation(category_name=CellType.
|
|
439
|
+
CellType.COLUMN_SPAN,
|
|
440
|
+
CategoryAnnotation(category_name=CellType.COLUMN_SPAN, category_id=col_span), # type: ignore
|
|
449
441
|
image.image_id,
|
|
450
442
|
)
|
|
451
443
|
if (
|
|
452
|
-
|
|
453
|
-
or
|
|
444
|
+
cell_ann.get_sub_category(CellType.ROW_SPAN).category_id > 1
|
|
445
|
+
or cell_ann.get_sub_category(CellType.COLUMN_SPAN).category_id > 1
|
|
454
446
|
):
|
|
455
447
|
cell_ann.dump_sub_category(
|
|
456
|
-
CellType.
|
|
457
|
-
CategoryAnnotation(category_name=CellType.
|
|
448
|
+
CellType.SPANNING,
|
|
449
|
+
CategoryAnnotation(category_name=CellType.SPANNING),
|
|
458
450
|
image.image_id,
|
|
459
451
|
)
|
|
460
452
|
else:
|
|
461
453
|
cell_ann.dump_sub_category(
|
|
462
|
-
CellType.
|
|
463
|
-
CategoryAnnotation(category_name=LayoutType.
|
|
454
|
+
CellType.SPANNING,
|
|
455
|
+
CategoryAnnotation(category_name=LayoutType.CELL),
|
|
464
456
|
image.image_id,
|
|
465
457
|
)
|
|
466
458
|
|
|
@@ -468,13 +460,13 @@ def pub_to_image_uncur( # pylint: disable=R0914
|
|
|
468
460
|
max_cs = max(max_cs, col_span) # type: ignore
|
|
469
461
|
|
|
470
462
|
if _has_header:
|
|
471
|
-
category_name = CellType.
|
|
463
|
+
category_name = CellType.HEADER if cell_id <= end_of_header else CellType.BODY
|
|
472
464
|
cell_ann.dump_sub_category(
|
|
473
|
-
CellType.
|
|
465
|
+
CellType.HEADER, CategoryAnnotation(category_name=category_name), image.image_id
|
|
474
466
|
)
|
|
475
467
|
image.dump(cell_ann)
|
|
476
468
|
if table_ann is not None:
|
|
477
|
-
table_ann.dump_relationship(Relationships.
|
|
469
|
+
table_ann.dump_relationship(Relationships.CHILD, cell_ann.annotation_id)
|
|
478
470
|
|
|
479
471
|
if dd_pipe_like:
|
|
480
472
|
tokens = cell["tokens"]
|
|
@@ -484,47 +476,47 @@ def pub_to_image_uncur( # pylint: disable=R0914
|
|
|
484
476
|
text = "".join(tokens)
|
|
485
477
|
# we are not separating each word but view the full table content as one word
|
|
486
478
|
word = ImageAnnotation(
|
|
487
|
-
category_name=LayoutType.
|
|
488
|
-
category_id=categories_name_as_key[LayoutType.
|
|
479
|
+
category_name=LayoutType.WORD,
|
|
480
|
+
category_id=categories_name_as_key[LayoutType.WORD],
|
|
489
481
|
bounding_box=cell_bounding_box,
|
|
490
482
|
)
|
|
491
|
-
text_container = ContainerAnnotation(category_name=WordType.
|
|
492
|
-
word.dump_sub_category(WordType.
|
|
493
|
-
reading_order = CategoryAnnotation(category_name=Relationships.
|
|
494
|
-
word.dump_sub_category(Relationships.
|
|
483
|
+
text_container = ContainerAnnotation(category_name=WordType.CHARACTERS, value=text)
|
|
484
|
+
word.dump_sub_category(WordType.CHARACTERS, text_container)
|
|
485
|
+
reading_order = CategoryAnnotation(category_name=Relationships.READING_ORDER, category_id=1)
|
|
486
|
+
word.dump_sub_category(Relationships.READING_ORDER, reading_order)
|
|
495
487
|
image.dump(word)
|
|
496
|
-
cell_ann.dump_relationship(Relationships.
|
|
488
|
+
cell_ann.dump_relationship(Relationships.CHILD, word.annotation_id)
|
|
497
489
|
|
|
498
490
|
index = nth_index(html, "<td>", number_of_cells - idx)
|
|
499
491
|
if index:
|
|
500
492
|
html.insert(index + 1, cell_ann.annotation_id)
|
|
501
493
|
|
|
502
|
-
summary_ann =
|
|
494
|
+
summary_ann = CategoryAnnotation(category_name=SummaryType.SUMMARY)
|
|
503
495
|
summary_ann.dump_sub_category(
|
|
504
|
-
TableType.
|
|
505
|
-
CategoryAnnotation(category_name=TableType.
|
|
496
|
+
TableType.NUMBER_OF_ROWS,
|
|
497
|
+
CategoryAnnotation(category_name=TableType.NUMBER_OF_ROWS, category_id=number_of_rows),
|
|
506
498
|
image.image_id,
|
|
507
499
|
)
|
|
508
500
|
summary_ann.dump_sub_category(
|
|
509
|
-
TableType.
|
|
510
|
-
CategoryAnnotation(category_name=TableType.
|
|
501
|
+
TableType.NUMBER_OF_COLUMNS,
|
|
502
|
+
CategoryAnnotation(category_name=TableType.NUMBER_OF_COLUMNS, category_id=number_of_cols),
|
|
511
503
|
image.image_id,
|
|
512
504
|
)
|
|
513
505
|
summary_ann.dump_sub_category(
|
|
514
|
-
TableType.
|
|
515
|
-
CategoryAnnotation(category_name=TableType.
|
|
506
|
+
TableType.MAX_ROW_SPAN,
|
|
507
|
+
CategoryAnnotation(category_name=TableType.MAX_ROW_SPAN, category_id=max_rs),
|
|
516
508
|
image.image_id,
|
|
517
509
|
)
|
|
518
510
|
summary_ann.dump_sub_category(
|
|
519
|
-
TableType.
|
|
520
|
-
CategoryAnnotation(category_name=TableType.
|
|
511
|
+
TableType.MAX_COL_SPAN,
|
|
512
|
+
CategoryAnnotation(category_name=TableType.MAX_COL_SPAN, category_id=max_cs),
|
|
521
513
|
image.image_id,
|
|
522
514
|
)
|
|
523
515
|
image.summary = summary_ann
|
|
524
516
|
|
|
525
517
|
if rows_and_cols or dd_pipe_like:
|
|
526
|
-
image = _add_items(image, LayoutType.
|
|
527
|
-
image = _add_items(image, LayoutType.
|
|
518
|
+
image = _add_items(image, LayoutType.ROW, categories_name_as_key, pubtables_like)
|
|
519
|
+
image = _add_items(image, LayoutType.COLUMN, categories_name_as_key, pubtables_like)
|
|
528
520
|
|
|
529
521
|
if dd_pipe_like:
|
|
530
522
|
image = embedding_in_image(image, html, categories_name_as_key)
|
|
@@ -26,8 +26,8 @@ from lazy_imports import try_import
|
|
|
26
26
|
|
|
27
27
|
from ..datapoint.annotation import ImageAnnotation
|
|
28
28
|
from ..datapoint.image import Image
|
|
29
|
-
from ..utils.
|
|
30
|
-
from ..utils.
|
|
29
|
+
from ..utils.settings import TypeOrStr
|
|
30
|
+
from ..utils.types import JsonDict
|
|
31
31
|
from .maputils import curry
|
|
32
32
|
|
|
33
33
|
with try_import() as import_guard:
|
|
@@ -39,7 +39,7 @@ with try_import() as import_guard:
|
|
|
39
39
|
def image_to_tp_frcnn_training(
|
|
40
40
|
dp: Image,
|
|
41
41
|
add_mask: bool = False,
|
|
42
|
-
category_names: Optional[Union[
|
|
42
|
+
category_names: Optional[Union[TypeOrStr, Sequence[TypeOrStr]]] = None,
|
|
43
43
|
) -> Optional[JsonDict]:
|
|
44
44
|
"""
|
|
45
45
|
Maps an image to a dict to be consumed by Tensorpack Faster-RCNN bounding box detection. Note, that the returned
|
|
@@ -25,7 +25,6 @@ from itertools import chain
|
|
|
25
25
|
from typing import Mapping, Optional
|
|
26
26
|
|
|
27
27
|
from ..datapoint import BoundingBox, CategoryAnnotation, ContainerAnnotation, Image, ImageAnnotation
|
|
28
|
-
from ..utils.detection_types import JsonDict
|
|
29
28
|
from ..utils.fs import load_image_from_file
|
|
30
29
|
from ..utils.settings import (
|
|
31
30
|
BioTag,
|
|
@@ -37,17 +36,18 @@ from ..utils.settings import (
|
|
|
37
36
|
get_type,
|
|
38
37
|
token_class_tag_to_token_class_with_tag,
|
|
39
38
|
)
|
|
39
|
+
from ..utils.types import FunsdDict
|
|
40
40
|
from .maputils import MappingContextManager, curry, maybe_get_fake_score
|
|
41
41
|
|
|
42
42
|
|
|
43
43
|
@curry
|
|
44
44
|
def xfund_to_image(
|
|
45
|
-
dp:
|
|
45
|
+
dp: FunsdDict,
|
|
46
46
|
load_image: bool,
|
|
47
47
|
fake_score: bool,
|
|
48
|
-
categories_dict_name_as_key: Mapping[
|
|
48
|
+
categories_dict_name_as_key: Mapping[ObjectTypes, int],
|
|
49
49
|
token_class_names_mapping: Mapping[str, str],
|
|
50
|
-
ner_token_to_id_mapping: Mapping[ObjectTypes, Mapping[ObjectTypes, Mapping[ObjectTypes,
|
|
50
|
+
ner_token_to_id_mapping: Mapping[ObjectTypes, Mapping[ObjectTypes, Mapping[ObjectTypes, int]]],
|
|
51
51
|
) -> Optional[Image]:
|
|
52
52
|
"""
|
|
53
53
|
Map a datapoint of annotation structure as given as from xfund or funsd dataset in to an Image structure
|
|
@@ -75,9 +75,9 @@ def xfund_to_image(
|
|
|
75
75
|
|
|
76
76
|
_, file_name = os.path.split(full_path)
|
|
77
77
|
external_id = dp.get("uid")
|
|
78
|
-
tag_to_id_mapping = ner_token_to_id_mapping[LayoutType.
|
|
79
|
-
token_class_to_id_mapping = ner_token_to_id_mapping[LayoutType.
|
|
80
|
-
token_tag_to_id_mapping = ner_token_to_id_mapping[LayoutType.
|
|
78
|
+
tag_to_id_mapping = ner_token_to_id_mapping[LayoutType.WORD][WordType.TAG]
|
|
79
|
+
token_class_to_id_mapping = ner_token_to_id_mapping[LayoutType.WORD][WordType.TOKEN_CLASS]
|
|
80
|
+
token_tag_to_id_mapping = ner_token_to_id_mapping[LayoutType.WORD][WordType.TOKEN_TAG]
|
|
81
81
|
|
|
82
82
|
with MappingContextManager(file_name) as mapping_context:
|
|
83
83
|
image = Image(file_name=file_name, location=full_path, external_id=external_id)
|
|
@@ -101,16 +101,16 @@ def xfund_to_image(
|
|
|
101
101
|
bbox = BoundingBox(absolute_coords=True, ulx=box[0], uly=box[1], lrx=box[2], lry=box[3])
|
|
102
102
|
score = maybe_get_fake_score(fake_score)
|
|
103
103
|
entity_ann = ImageAnnotation(
|
|
104
|
-
category_name=LayoutType.
|
|
104
|
+
category_name=LayoutType.TEXT,
|
|
105
105
|
bounding_box=bbox,
|
|
106
|
-
category_id=categories_dict_name_as_key[LayoutType.
|
|
106
|
+
category_id=categories_dict_name_as_key[LayoutType.TEXT],
|
|
107
107
|
score=score,
|
|
108
108
|
)
|
|
109
109
|
category_name = token_class_names_mapping[entity["label"]]
|
|
110
110
|
sub_cat_semantic = CategoryAnnotation(
|
|
111
111
|
category_name=category_name, category_id=token_class_to_id_mapping[get_type(category_name)]
|
|
112
112
|
)
|
|
113
|
-
entity_ann.dump_sub_category(WordType.
|
|
113
|
+
entity_ann.dump_sub_category(WordType.TOKEN_CLASS, sub_cat_semantic)
|
|
114
114
|
image.dump(entity_ann)
|
|
115
115
|
|
|
116
116
|
words = entity.get("words")
|
|
@@ -122,61 +122,61 @@ def xfund_to_image(
|
|
|
122
122
|
score = maybe_get_fake_score(fake_score)
|
|
123
123
|
|
|
124
124
|
ann = ImageAnnotation(
|
|
125
|
-
category_name=LayoutType.
|
|
125
|
+
category_name=LayoutType.WORD,
|
|
126
126
|
bounding_box=bbox,
|
|
127
|
-
category_id=categories_dict_name_as_key[LayoutType.
|
|
127
|
+
category_id=categories_dict_name_as_key[LayoutType.WORD],
|
|
128
128
|
score=score,
|
|
129
129
|
)
|
|
130
130
|
image.dump(ann)
|
|
131
|
-
entity_ann.dump_relationship(Relationships.
|
|
131
|
+
entity_ann.dump_relationship(Relationships.CHILD, ann.annotation_id)
|
|
132
132
|
sub_cat_semantic = CategoryAnnotation(
|
|
133
133
|
category_name=category_name, category_id=token_class_to_id_mapping[get_type(category_name)]
|
|
134
134
|
)
|
|
135
|
-
ann.dump_sub_category(WordType.
|
|
136
|
-
sub_cat_chars = ContainerAnnotation(category_name=WordType.
|
|
137
|
-
ann.dump_sub_category(WordType.
|
|
138
|
-
if sub_cat_semantic.category_name == TokenClasses.
|
|
135
|
+
ann.dump_sub_category(WordType.TOKEN_CLASS, sub_cat_semantic)
|
|
136
|
+
sub_cat_chars = ContainerAnnotation(category_name=WordType.CHARACTERS, value=word["text"])
|
|
137
|
+
ann.dump_sub_category(WordType.CHARACTERS, sub_cat_chars)
|
|
138
|
+
if sub_cat_semantic.category_name == TokenClasses.OTHER:
|
|
139
139
|
sub_cat_tag = CategoryAnnotation(
|
|
140
|
-
category_name=BioTag.
|
|
140
|
+
category_name=BioTag.OUTSIDE, category_id=tag_to_id_mapping[BioTag.OUTSIDE]
|
|
141
141
|
)
|
|
142
|
-
ann.dump_sub_category(WordType.
|
|
142
|
+
ann.dump_sub_category(WordType.TAG, sub_cat_tag)
|
|
143
143
|
# populating ner token to be used for training and evaluation
|
|
144
144
|
sub_cat_ner_tok = CategoryAnnotation(
|
|
145
|
-
category_name=BioTag.
|
|
145
|
+
category_name=BioTag.OUTSIDE, category_id=token_tag_to_id_mapping[BioTag.OUTSIDE]
|
|
146
146
|
)
|
|
147
|
-
ann.dump_sub_category(WordType.
|
|
147
|
+
ann.dump_sub_category(WordType.TOKEN_TAG, sub_cat_ner_tok)
|
|
148
148
|
elif not idx:
|
|
149
149
|
sub_cat_tag = CategoryAnnotation(
|
|
150
|
-
category_name=BioTag.
|
|
150
|
+
category_name=BioTag.BEGIN, category_id=tag_to_id_mapping[BioTag.BEGIN]
|
|
151
151
|
)
|
|
152
|
-
ann.dump_sub_category(WordType.
|
|
152
|
+
ann.dump_sub_category(WordType.TAG, sub_cat_tag)
|
|
153
153
|
sub_cat_ner_tok = CategoryAnnotation(
|
|
154
154
|
category_name=token_class_tag_to_token_class_with_tag(
|
|
155
|
-
get_type(sub_cat_semantic.category_name), BioTag.
|
|
155
|
+
get_type(sub_cat_semantic.category_name), BioTag.BEGIN
|
|
156
156
|
),
|
|
157
157
|
category_id=token_tag_to_id_mapping[
|
|
158
158
|
token_class_tag_to_token_class_with_tag(
|
|
159
|
-
get_type(sub_cat_semantic.category_name), BioTag.
|
|
159
|
+
get_type(sub_cat_semantic.category_name), BioTag.BEGIN
|
|
160
160
|
)
|
|
161
161
|
],
|
|
162
162
|
)
|
|
163
|
-
ann.dump_sub_category(WordType.
|
|
163
|
+
ann.dump_sub_category(WordType.TOKEN_TAG, sub_cat_ner_tok)
|
|
164
164
|
else:
|
|
165
165
|
sub_cat_tag = CategoryAnnotation(
|
|
166
|
-
category_name=BioTag.
|
|
166
|
+
category_name=BioTag.INSIDE, category_id=tag_to_id_mapping[BioTag.INSIDE]
|
|
167
167
|
)
|
|
168
|
-
ann.dump_sub_category(WordType.
|
|
168
|
+
ann.dump_sub_category(WordType.TAG, sub_cat_tag)
|
|
169
169
|
sub_cat_ner_tok = CategoryAnnotation(
|
|
170
170
|
category_name=token_class_tag_to_token_class_with_tag(
|
|
171
|
-
get_type(sub_cat_semantic.category_name), BioTag.
|
|
171
|
+
get_type(sub_cat_semantic.category_name), BioTag.INSIDE
|
|
172
172
|
),
|
|
173
173
|
category_id=token_tag_to_id_mapping[
|
|
174
174
|
token_class_tag_to_token_class_with_tag(
|
|
175
|
-
get_type(sub_cat_semantic.category_name), BioTag.
|
|
175
|
+
get_type(sub_cat_semantic.category_name), BioTag.INSIDE
|
|
176
176
|
)
|
|
177
177
|
],
|
|
178
178
|
)
|
|
179
|
-
ann.dump_sub_category(WordType.
|
|
179
|
+
ann.dump_sub_category(WordType.TOKEN_TAG, sub_cat_ner_tok)
|
|
180
180
|
|
|
181
181
|
entity_id_to_ann_id[entity["id"]].append(ann.annotation_id)
|
|
182
182
|
ann_id_to_entity_id[ann.annotation_id] = entity["id"]
|
|
@@ -184,7 +184,7 @@ def xfund_to_image(
|
|
|
184
184
|
entity_id_to_entity_link_id[entity["id"]].extend(entity["linking"])
|
|
185
185
|
|
|
186
186
|
# now populating semantic links
|
|
187
|
-
word_anns = image.get_annotation(category_names=LayoutType.
|
|
187
|
+
word_anns = image.get_annotation(category_names=LayoutType.WORD)
|
|
188
188
|
for word in word_anns:
|
|
189
189
|
entity_id = ann_id_to_entity_id[word.annotation_id]
|
|
190
190
|
all_linked_entities = list(chain(*entity_id_to_entity_link_id[entity_id]))
|
|
@@ -193,7 +193,7 @@ def xfund_to_image(
|
|
|
193
193
|
ann_ids.extend(entity_id_to_ann_id[linked_entity])
|
|
194
194
|
for ann_id in ann_ids:
|
|
195
195
|
if ann_id != word.annotation_id:
|
|
196
|
-
word.dump_relationship(Relationships.
|
|
196
|
+
word.dump_relationship(Relationships.SEMANTIC_ENTITY_LINK, ann_id)
|
|
197
197
|
|
|
198
198
|
if mapping_context.context_error:
|
|
199
199
|
return None
|