deepdoctection 0.32__py3-none-any.whl → 0.34__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic. Click here for more details.
- deepdoctection/__init__.py +8 -25
- deepdoctection/analyzer/dd.py +84 -71
- deepdoctection/dataflow/common.py +9 -5
- deepdoctection/dataflow/custom.py +5 -5
- deepdoctection/dataflow/custom_serialize.py +75 -18
- deepdoctection/dataflow/parallel_map.py +3 -3
- deepdoctection/dataflow/serialize.py +4 -4
- deepdoctection/dataflow/stats.py +3 -3
- deepdoctection/datapoint/annotation.py +78 -56
- deepdoctection/datapoint/box.py +7 -7
- deepdoctection/datapoint/convert.py +6 -6
- deepdoctection/datapoint/image.py +157 -75
- deepdoctection/datapoint/view.py +175 -151
- deepdoctection/datasets/adapter.py +30 -24
- deepdoctection/datasets/base.py +10 -10
- deepdoctection/datasets/dataflow_builder.py +3 -3
- deepdoctection/datasets/info.py +23 -25
- deepdoctection/datasets/instances/doclaynet.py +48 -49
- deepdoctection/datasets/instances/fintabnet.py +44 -45
- deepdoctection/datasets/instances/funsd.py +23 -23
- deepdoctection/datasets/instances/iiitar13k.py +8 -8
- deepdoctection/datasets/instances/layouttest.py +2 -2
- deepdoctection/datasets/instances/publaynet.py +3 -3
- deepdoctection/datasets/instances/pubtables1m.py +18 -18
- deepdoctection/datasets/instances/pubtabnet.py +30 -29
- deepdoctection/datasets/instances/rvlcdip.py +28 -29
- deepdoctection/datasets/instances/xfund.py +51 -30
- deepdoctection/datasets/save.py +6 -6
- deepdoctection/eval/accmetric.py +32 -33
- deepdoctection/eval/base.py +8 -9
- deepdoctection/eval/cocometric.py +13 -12
- deepdoctection/eval/eval.py +32 -26
- deepdoctection/eval/tedsmetric.py +16 -12
- deepdoctection/eval/tp_eval_callback.py +7 -16
- deepdoctection/extern/base.py +339 -134
- deepdoctection/extern/d2detect.py +69 -89
- deepdoctection/extern/deskew.py +11 -10
- deepdoctection/extern/doctrocr.py +81 -64
- deepdoctection/extern/fastlang.py +23 -16
- deepdoctection/extern/hfdetr.py +53 -38
- deepdoctection/extern/hflayoutlm.py +216 -155
- deepdoctection/extern/hflm.py +35 -30
- deepdoctection/extern/model.py +433 -255
- deepdoctection/extern/pdftext.py +15 -15
- deepdoctection/extern/pt/ptutils.py +4 -2
- deepdoctection/extern/tessocr.py +39 -38
- deepdoctection/extern/texocr.py +14 -16
- deepdoctection/extern/tp/tfutils.py +16 -2
- deepdoctection/extern/tp/tpcompat.py +11 -7
- deepdoctection/extern/tp/tpfrcnn/config/config.py +4 -4
- deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +1 -1
- deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +5 -5
- deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +6 -6
- deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +4 -4
- deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +5 -3
- deepdoctection/extern/tp/tpfrcnn/preproc.py +5 -5
- deepdoctection/extern/tpdetect.py +40 -45
- deepdoctection/mapper/cats.py +36 -40
- deepdoctection/mapper/cocostruct.py +16 -12
- deepdoctection/mapper/d2struct.py +22 -22
- deepdoctection/mapper/hfstruct.py +7 -7
- deepdoctection/mapper/laylmstruct.py +22 -24
- deepdoctection/mapper/maputils.py +9 -10
- deepdoctection/mapper/match.py +33 -2
- deepdoctection/mapper/misc.py +6 -7
- deepdoctection/mapper/pascalstruct.py +4 -4
- deepdoctection/mapper/prodigystruct.py +6 -6
- deepdoctection/mapper/pubstruct.py +84 -92
- deepdoctection/mapper/tpstruct.py +3 -3
- deepdoctection/mapper/xfundstruct.py +33 -33
- deepdoctection/pipe/anngen.py +39 -14
- deepdoctection/pipe/base.py +68 -99
- deepdoctection/pipe/common.py +181 -85
- deepdoctection/pipe/concurrency.py +14 -10
- deepdoctection/pipe/doctectionpipe.py +24 -21
- deepdoctection/pipe/language.py +20 -25
- deepdoctection/pipe/layout.py +18 -16
- deepdoctection/pipe/lm.py +49 -47
- deepdoctection/pipe/order.py +63 -65
- deepdoctection/pipe/refine.py +102 -109
- deepdoctection/pipe/segment.py +157 -162
- deepdoctection/pipe/sub_layout.py +50 -40
- deepdoctection/pipe/text.py +37 -36
- deepdoctection/pipe/transform.py +19 -16
- deepdoctection/train/d2_frcnn_train.py +27 -25
- deepdoctection/train/hf_detr_train.py +22 -18
- deepdoctection/train/hf_layoutlm_train.py +49 -48
- deepdoctection/train/tp_frcnn_train.py +10 -11
- deepdoctection/utils/concurrency.py +1 -1
- deepdoctection/utils/context.py +13 -6
- deepdoctection/utils/develop.py +4 -4
- deepdoctection/utils/env_info.py +52 -14
- deepdoctection/utils/file_utils.py +6 -11
- deepdoctection/utils/fs.py +41 -14
- deepdoctection/utils/identifier.py +2 -2
- deepdoctection/utils/logger.py +15 -15
- deepdoctection/utils/metacfg.py +7 -7
- deepdoctection/utils/pdf_utils.py +39 -14
- deepdoctection/utils/settings.py +188 -182
- deepdoctection/utils/tqdm.py +1 -1
- deepdoctection/utils/transform.py +14 -9
- deepdoctection/utils/types.py +104 -0
- deepdoctection/utils/utils.py +7 -7
- deepdoctection/utils/viz.py +70 -69
- {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/METADATA +7 -4
- deepdoctection-0.34.dist-info/RECORD +146 -0
- {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/WHEEL +1 -1
- deepdoctection/utils/detection_types.py +0 -68
- deepdoctection-0.32.dist-info/RECORD +0 -146
- {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/LICENSE +0 -0
- {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/top_level.txt +0 -0
|
@@ -25,23 +25,23 @@ import sys
|
|
|
25
25
|
from errno import ENOENT
|
|
26
26
|
from io import BytesIO
|
|
27
27
|
from shutil import copyfile
|
|
28
|
-
from typing import Generator,
|
|
28
|
+
from typing import Generator, Optional
|
|
29
29
|
|
|
30
30
|
from numpy import uint8
|
|
31
31
|
from pypdf import PdfReader, PdfWriter, errors
|
|
32
32
|
|
|
33
33
|
from .context import save_tmp_file, timeout_manager
|
|
34
|
-
from .detection_types import ImageType, Pathlike
|
|
35
34
|
from .error import DependencyError, FileExtensionError
|
|
36
35
|
from .file_utils import pdf_to_cairo_available, pdf_to_ppm_available, qpdf_available
|
|
37
36
|
from .logger import LoggingRecord, logger
|
|
37
|
+
from .types import PathLikeOrStr, PixelValues
|
|
38
38
|
from .utils import is_file_extension
|
|
39
39
|
from .viz import viz_handler
|
|
40
40
|
|
|
41
41
|
__all__ = ["decrypt_pdf_document", "get_pdf_file_reader", "get_pdf_file_writer", "PDFStreamer", "pdf_to_np_array"]
|
|
42
42
|
|
|
43
43
|
|
|
44
|
-
def decrypt_pdf_document(path:
|
|
44
|
+
def decrypt_pdf_document(path: PathLikeOrStr) -> bool:
|
|
45
45
|
"""
|
|
46
46
|
Decrypting a pdf. As copying a pdf document removes the password that protects pdf, this method
|
|
47
47
|
generates a copy and decrypts the copy using qpdf. The result is saved as the original
|
|
@@ -73,7 +73,7 @@ def decrypt_pdf_document(path: Pathlike) -> bool:
|
|
|
73
73
|
return False
|
|
74
74
|
|
|
75
75
|
|
|
76
|
-
def get_pdf_file_reader(path:
|
|
76
|
+
def get_pdf_file_reader(path: PathLikeOrStr) -> PdfReader:
|
|
77
77
|
"""
|
|
78
78
|
Creates a file reader object from a pdf document. Will try to decrypt the document if it is
|
|
79
79
|
encrypted. (See `decrypt_pdf_document` to understand what is meant with "decrypt").
|
|
@@ -107,8 +107,7 @@ def get_pdf_file_reader(path: Pathlike) -> PdfReader:
|
|
|
107
107
|
)
|
|
108
108
|
sys.exit()
|
|
109
109
|
|
|
110
|
-
|
|
111
|
-
return file_reader
|
|
110
|
+
return PdfReader(os.fspath(path))
|
|
112
111
|
|
|
113
112
|
|
|
114
113
|
def get_pdf_file_writer() -> PdfWriter:
|
|
@@ -125,15 +124,27 @@ class PDFStreamer:
|
|
|
125
124
|
|
|
126
125
|
**Example:**
|
|
127
126
|
|
|
128
|
-
|
|
127
|
+
# Building a Dataflow with a PDFStreamer
|
|
128
|
+
df = dataflow.DataFromIterable(PDFStreamer(path=path))
|
|
129
129
|
df.reset_state()
|
|
130
130
|
|
|
131
131
|
for page in df:
|
|
132
132
|
... # do whatever you like
|
|
133
133
|
|
|
134
|
+
# Something else you can do:
|
|
135
|
+
streamer = PDFStreamer(path=path)
|
|
136
|
+
pages = len(streamer) # get the number of pages
|
|
137
|
+
random_int = random.sample(range(0, pages), 2) # select some pages
|
|
138
|
+
for ran in random_int:
|
|
139
|
+
pdf_bytes = streamer[ran] # get the page bytes directly
|
|
140
|
+
|
|
141
|
+
streamer.close() # Do not forget to close the streamer, otherwise the file will never be closed and might
|
|
142
|
+
# cause memory leaks if you open many files.
|
|
143
|
+
|
|
144
|
+
|
|
134
145
|
"""
|
|
135
146
|
|
|
136
|
-
def __init__(self, path:
|
|
147
|
+
def __init__(self, path: PathLikeOrStr) -> None:
|
|
137
148
|
"""
|
|
138
149
|
:param path: to a pdf.
|
|
139
150
|
"""
|
|
@@ -143,13 +154,27 @@ class PDFStreamer:
|
|
|
143
154
|
def __len__(self) -> int:
|
|
144
155
|
return len(self.file_reader.pages)
|
|
145
156
|
|
|
146
|
-
def __iter__(self) -> Generator[
|
|
157
|
+
def __iter__(self) -> Generator[tuple[bytes, int], None, None]:
|
|
147
158
|
for k in range(len(self)):
|
|
148
159
|
buffer = BytesIO()
|
|
149
160
|
writer = get_pdf_file_writer()
|
|
150
161
|
writer.add_page(self.file_reader.pages[k])
|
|
151
162
|
writer.write(buffer)
|
|
152
163
|
yield buffer.getvalue(), k
|
|
164
|
+
self.file_reader.close()
|
|
165
|
+
|
|
166
|
+
def __getitem__(self, index: int) -> bytes:
|
|
167
|
+
buffer = BytesIO()
|
|
168
|
+
writer = get_pdf_file_writer()
|
|
169
|
+
writer.add_page(self.file_reader.pages[index])
|
|
170
|
+
writer.write(buffer)
|
|
171
|
+
return buffer.getvalue()
|
|
172
|
+
|
|
173
|
+
def close(self) -> None:
|
|
174
|
+
"""
|
|
175
|
+
Close the file reader
|
|
176
|
+
"""
|
|
177
|
+
self.file_reader.close()
|
|
153
178
|
|
|
154
179
|
|
|
155
180
|
# The following functions are modified versions from the Python poppler wrapper
|
|
@@ -157,9 +182,9 @@ class PDFStreamer:
|
|
|
157
182
|
|
|
158
183
|
|
|
159
184
|
def _input_to_cli_str(
|
|
160
|
-
input_file_name:
|
|
161
|
-
) ->
|
|
162
|
-
cmd_args:
|
|
185
|
+
input_file_name: PathLikeOrStr, output_file_name: PathLikeOrStr, dpi: int, size: Optional[tuple[int, int]] = None
|
|
186
|
+
) -> list[str]:
|
|
187
|
+
cmd_args: list[str] = []
|
|
163
188
|
|
|
164
189
|
if pdf_to_ppm_available():
|
|
165
190
|
command = "pdftoppm"
|
|
@@ -196,7 +221,7 @@ class PopplerError(RuntimeError):
|
|
|
196
221
|
self.args = (status, message)
|
|
197
222
|
|
|
198
223
|
|
|
199
|
-
def _run_poppler(poppler_args:
|
|
224
|
+
def _run_poppler(poppler_args: list[str]) -> None:
|
|
200
225
|
try:
|
|
201
226
|
proc = subprocess.Popen(poppler_args) # pylint: disable=R1732
|
|
202
227
|
except OSError as error:
|
|
@@ -209,7 +234,7 @@ def _run_poppler(poppler_args: List[str]) -> None:
|
|
|
209
234
|
raise PopplerError(status=proc.returncode, message="Syntax Error: PDF cannot be read with Poppler")
|
|
210
235
|
|
|
211
236
|
|
|
212
|
-
def pdf_to_np_array(pdf_bytes: bytes, size: Optional[
|
|
237
|
+
def pdf_to_np_array(pdf_bytes: bytes, size: Optional[tuple[int, int]] = None, dpi: int = 200) -> PixelValues:
|
|
213
238
|
"""
|
|
214
239
|
Convert a single pdf page from its byte representation to a numpy array. This function will save the pdf as to a tmp
|
|
215
240
|
file and then call poppler via `pdftoppm` resp. `pdftocairo` if the former is not available.
|
deepdoctection/utils/settings.py
CHANGED
|
@@ -18,11 +18,12 @@
|
|
|
18
18
|
"""
|
|
19
19
|
Module for funcs and constants that maintain general settings
|
|
20
20
|
"""
|
|
21
|
+
from __future__ import annotations
|
|
21
22
|
|
|
22
23
|
import os
|
|
23
24
|
from enum import Enum
|
|
24
25
|
from pathlib import Path
|
|
25
|
-
from typing import
|
|
26
|
+
from typing import Optional, Union
|
|
26
27
|
|
|
27
28
|
import catalogue # type: ignore
|
|
28
29
|
|
|
@@ -34,7 +35,7 @@ class ObjectTypes(str, Enum):
|
|
|
34
35
|
return f"<{self.__class__.__name__}.{self.name}>"
|
|
35
36
|
|
|
36
37
|
@classmethod
|
|
37
|
-
def from_value(cls, value: str) ->
|
|
38
|
+
def from_value(cls, value: str) -> ObjectTypes:
|
|
38
39
|
"""Getting the enum member from a given string value
|
|
39
40
|
|
|
40
41
|
:param value: string value to get the enum member
|
|
@@ -56,263 +57,268 @@ object_types_registry = catalogue.create("deepdoctection", "settings", entry_poi
|
|
|
56
57
|
class DefaultType(ObjectTypes):
|
|
57
58
|
"""Type for default member"""
|
|
58
59
|
|
|
59
|
-
|
|
60
|
+
DEFAULT_TYPE = "default_type"
|
|
60
61
|
|
|
61
62
|
|
|
62
63
|
@object_types_registry.register("PageType")
|
|
63
64
|
class PageType(ObjectTypes):
|
|
64
65
|
"""Type for document page properties"""
|
|
65
66
|
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
67
|
+
DOCUMENT_TYPE = "document_type"
|
|
68
|
+
LANGUAGE = "language"
|
|
69
|
+
ANGLE = "angle"
|
|
69
70
|
|
|
70
71
|
|
|
71
72
|
@object_types_registry.register("SummaryType")
|
|
72
73
|
class SummaryType(ObjectTypes):
|
|
73
74
|
"""Summary type member"""
|
|
74
75
|
|
|
75
|
-
|
|
76
|
+
SUMMARY = "summary"
|
|
76
77
|
|
|
77
78
|
|
|
78
79
|
@object_types_registry.register("DocumentType")
|
|
79
80
|
class DocumentType(ObjectTypes):
|
|
80
81
|
"""Document types"""
|
|
81
82
|
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
83
|
+
LETTER = "letter"
|
|
84
|
+
FORM = "form"
|
|
85
|
+
EMAIL = "email"
|
|
86
|
+
HANDWRITTEN = "handwritten"
|
|
87
|
+
ADVERTISEMENT = "advertisement"
|
|
88
|
+
SCIENTIFIC_REPORT = "scientific_report"
|
|
89
|
+
SCIENTIFIC_PUBLICATION = "scientific_publication"
|
|
90
|
+
SPECIFICATION = "specification"
|
|
91
|
+
FILE_FOLDER = "file_folder"
|
|
92
|
+
NEWS_ARTICLE = "news_article"
|
|
93
|
+
BUDGET = "budget"
|
|
94
|
+
INVOICE = "invoice"
|
|
95
|
+
PRESENTATION = "presentation"
|
|
96
|
+
QUESTIONNAIRE = "questionnaire"
|
|
97
|
+
RESUME = "resume"
|
|
98
|
+
MEMO = "memo"
|
|
99
|
+
FINANCIAL_REPORT = "financial_report"
|
|
100
|
+
LAWS_AND_REGULATIONS = "laws_and_regulations"
|
|
101
|
+
GOVERNMENT_TENDERS = "government_tenders"
|
|
102
|
+
MANUALS = "manuals"
|
|
103
|
+
PATENTS = "patents"
|
|
104
|
+
MARK = "mark"
|
|
103
105
|
|
|
104
106
|
|
|
105
107
|
@object_types_registry.register("LayoutType")
|
|
106
108
|
class LayoutType(ObjectTypes):
|
|
107
109
|
"""Layout types"""
|
|
108
110
|
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
111
|
+
TABLE = "table"
|
|
112
|
+
TABLE_ROTATED = "table_rotated"
|
|
113
|
+
FIGURE = "figure"
|
|
114
|
+
LIST = "list"
|
|
115
|
+
TEXT = "text"
|
|
116
|
+
TITLE = "title"
|
|
117
|
+
LOGO = "logo"
|
|
118
|
+
SIGNATURE = "signature"
|
|
119
|
+
CAPTION = "caption"
|
|
120
|
+
FOOTNOTE = "footnote"
|
|
121
|
+
FORMULA = "formula"
|
|
122
|
+
PAGE_FOOTER = "page_footer"
|
|
123
|
+
PAGE_HEADER = "page_header"
|
|
124
|
+
SECTION_HEADER = "section_header"
|
|
125
|
+
PAGE = "page"
|
|
126
|
+
CELL = "cell"
|
|
127
|
+
ROW = "row"
|
|
128
|
+
COLUMN = "column"
|
|
129
|
+
WORD = "word"
|
|
130
|
+
LINE = "line"
|
|
131
|
+
BACKGROUND = "background"
|
|
132
|
+
PAGE_NUMBER = "page_number"
|
|
133
|
+
KEY_VALUE_AREA = "key_value_area"
|
|
134
|
+
LIST_ITEM = "list_item"
|
|
130
135
|
|
|
131
136
|
|
|
132
137
|
@object_types_registry.register("TableType")
|
|
133
138
|
class TableType(ObjectTypes):
|
|
134
139
|
"""Types for table properties"""
|
|
135
140
|
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
141
|
+
ITEM = "item"
|
|
142
|
+
NUMBER_OF_ROWS = "number_of_rows"
|
|
143
|
+
NUMBER_OF_COLUMNS = "number_of_columns"
|
|
144
|
+
MAX_ROW_SPAN = "max_row_span"
|
|
145
|
+
MAX_COL_SPAN = "max_col_span"
|
|
146
|
+
HTML = "html"
|
|
142
147
|
|
|
143
148
|
|
|
144
149
|
@object_types_registry.register("CellType")
|
|
145
150
|
class CellType(ObjectTypes):
|
|
146
151
|
"""Types for cell properties"""
|
|
147
152
|
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
153
|
+
HEADER = "header"
|
|
154
|
+
BODY = "body"
|
|
155
|
+
ROW_NUMBER = "row_number"
|
|
156
|
+
ROW_SPAN = "row_span"
|
|
157
|
+
ROW_HEADER = "row_header"
|
|
158
|
+
PROJECTED_ROW_HEADER = "projected_row_header"
|
|
159
|
+
COLUMN_NUMBER = "column_number"
|
|
160
|
+
COLUMN_SPAN = "column_span"
|
|
161
|
+
COLUMN_HEADER = "column_header"
|
|
162
|
+
SPANNING = "spanning"
|
|
158
163
|
|
|
159
164
|
|
|
160
165
|
@object_types_registry.register("WordType")
|
|
161
166
|
class WordType(ObjectTypes):
|
|
162
167
|
"""Types for word properties"""
|
|
163
168
|
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
169
|
+
CHARACTERS = "characters"
|
|
170
|
+
BLOCK = "block"
|
|
171
|
+
TOKEN_CLASS = "token_class"
|
|
172
|
+
TAG = "tag"
|
|
173
|
+
TOKEN_TAG = "token_tag"
|
|
174
|
+
TEXT_LINE = "text_line"
|
|
175
|
+
CHARACTER_TYPE = "character_type"
|
|
176
|
+
PRINTED = "printed"
|
|
177
|
+
HANDWRITTEN = "handwritten"
|
|
173
178
|
|
|
174
179
|
|
|
175
180
|
@object_types_registry.register("TokenClasses")
|
|
176
181
|
class TokenClasses(ObjectTypes):
|
|
177
182
|
"""Types for token classes"""
|
|
178
183
|
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
184
|
+
HEADER = "header"
|
|
185
|
+
QUESTION = "question"
|
|
186
|
+
ANSWER = "answer"
|
|
187
|
+
OTHER = "other"
|
|
183
188
|
|
|
184
189
|
|
|
185
190
|
@object_types_registry.register("BioTag")
|
|
186
191
|
class BioTag(ObjectTypes):
|
|
187
192
|
"""Types for tags"""
|
|
188
193
|
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
+
BEGIN = "B"
|
|
195
|
+
INSIDE = "I"
|
|
196
|
+
OUTSIDE = "O"
|
|
197
|
+
SINGLE = "S"
|
|
198
|
+
END = "E"
|
|
194
199
|
|
|
195
200
|
|
|
196
201
|
@object_types_registry.register("TokenClassWithTag")
|
|
197
202
|
class TokenClassWithTag(ObjectTypes):
|
|
198
203
|
"""Types for token classes with tags, e.g. B-answer"""
|
|
199
204
|
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
205
|
+
B_ANSWER = "B-answer"
|
|
206
|
+
B_HEADER = "B-header"
|
|
207
|
+
B_QUESTION = "B-question"
|
|
208
|
+
E_ANSWER = "E-answer"
|
|
209
|
+
E_HEADER = "E-header"
|
|
210
|
+
E_QUESTION = "E-question"
|
|
211
|
+
I_ANSWER = "I-answer"
|
|
212
|
+
I_HEADER = "I-header"
|
|
213
|
+
I_QUESTION = "I-question"
|
|
214
|
+
S_ANSWER = "S-answer"
|
|
215
|
+
S_HEADER = "S-header"
|
|
216
|
+
S_QUESTION = "S-question"
|
|
212
217
|
|
|
213
218
|
|
|
214
219
|
@object_types_registry.register("Relationships")
|
|
215
220
|
class Relationships(ObjectTypes):
|
|
216
221
|
"""Types for describing relationships between types"""
|
|
217
222
|
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
223
|
+
CHILD = "child"
|
|
224
|
+
READING_ORDER = "reading_order"
|
|
225
|
+
SEMANTIC_ENTITY_LINK = "semantic_entity_link"
|
|
226
|
+
LAYOUT_LINK = "layout_link"
|
|
221
227
|
|
|
222
228
|
|
|
223
229
|
@object_types_registry.register("Languages")
|
|
224
230
|
class Languages(ObjectTypes):
|
|
225
231
|
"""Language types"""
|
|
226
232
|
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
233
|
+
ENGLISH = "eng"
|
|
234
|
+
RUSSIAN = "rus"
|
|
235
|
+
GERMAN = "deu"
|
|
236
|
+
FRENCH = "fre"
|
|
237
|
+
ITALIAN = "ita"
|
|
238
|
+
JAPANESE = "jpn"
|
|
239
|
+
SPANISH = "spa"
|
|
240
|
+
CEBUANO = "ceb"
|
|
241
|
+
TURKISH = "tur"
|
|
242
|
+
PORTUGUESE = "por"
|
|
243
|
+
UKRAINIAN = "ukr"
|
|
244
|
+
ESPERANTO = "epo"
|
|
245
|
+
POLISH = "pol"
|
|
246
|
+
SWEDISH = "swe"
|
|
247
|
+
DUTCH = "dut"
|
|
248
|
+
HEBREW = "heb"
|
|
249
|
+
CHINESE = "chi"
|
|
250
|
+
HUNGARIAN = "hun"
|
|
251
|
+
ARABIC = "ara"
|
|
252
|
+
CATALAN = "cat"
|
|
253
|
+
FINNISH = "fin"
|
|
254
|
+
CZECH = "cze"
|
|
255
|
+
PERSIAN = "per"
|
|
256
|
+
SERBIAN = "srp"
|
|
257
|
+
GREEK = "gre"
|
|
258
|
+
VIETNAMESE = "vie"
|
|
259
|
+
BULGARIAN = "bul"
|
|
260
|
+
KOREAN = "kor"
|
|
261
|
+
NORWEGIAN = "nor"
|
|
262
|
+
MACEDONIAN = "mac"
|
|
263
|
+
ROMANIAN = "rum"
|
|
264
|
+
INDONESIAN = "ind"
|
|
265
|
+
THAI = "tha"
|
|
266
|
+
ARMENIAN = "arm"
|
|
267
|
+
DANISH = "dan"
|
|
268
|
+
TAMIL = "tam"
|
|
269
|
+
HINDI = "hin"
|
|
270
|
+
CROATIAN = "hrv"
|
|
271
|
+
BELARUSIAN = "bel"
|
|
272
|
+
GEORGIAN = "geo"
|
|
273
|
+
TELUGU = "tel"
|
|
274
|
+
KAZAKH = "kaz"
|
|
275
|
+
WARAY = "war"
|
|
276
|
+
LITHUANIAN = "lit"
|
|
277
|
+
SCOTTISH = "glg"
|
|
278
|
+
SLOVAK = "slo"
|
|
279
|
+
BENIN = "ben"
|
|
280
|
+
BASQUE = "baq"
|
|
281
|
+
SLOVENIAN = "slv"
|
|
282
|
+
MALAYALAM = "mal"
|
|
283
|
+
MARATHI = "mar"
|
|
284
|
+
ESTONIAN = "est"
|
|
285
|
+
AZERBAIJANI = "aze"
|
|
286
|
+
ALBANIAN = "alb"
|
|
287
|
+
LATIN = "lat"
|
|
288
|
+
BOSNIAN = "bos"
|
|
289
|
+
NORWEGIAN_NOVOSIBIRSK = "nno"
|
|
290
|
+
URDU = "urd"
|
|
291
|
+
NOT_DEFINED = "nn"
|
|
286
292
|
|
|
287
293
|
|
|
288
294
|
@object_types_registry.register("DatasetType")
|
|
289
295
|
class DatasetType(ObjectTypes):
|
|
290
296
|
"""Dataset types"""
|
|
291
297
|
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
298
|
+
OBJECT_DETECTION = "object_detection"
|
|
299
|
+
SEQUENCE_CLASSIFICATION = "sequence_classification"
|
|
300
|
+
TOKEN_CLASSIFICATION = "token_classification"
|
|
301
|
+
PUBLAYNET = "publaynet"
|
|
302
|
+
DEFAULT = "default"
|
|
297
303
|
|
|
298
304
|
|
|
299
305
|
_TOKEN_AND_TAG_TO_TOKEN_CLASS_WITH_TAG = {
|
|
300
|
-
(TokenClasses.
|
|
301
|
-
(TokenClasses.
|
|
302
|
-
(TokenClasses.
|
|
303
|
-
(TokenClasses.
|
|
304
|
-
(TokenClasses.
|
|
305
|
-
(TokenClasses.
|
|
306
|
-
(TokenClasses.
|
|
307
|
-
(TokenClasses.
|
|
308
|
-
(TokenClasses.
|
|
309
|
-
(TokenClasses.
|
|
310
|
-
(TokenClasses.
|
|
311
|
-
(TokenClasses.
|
|
312
|
-
(TokenClasses.
|
|
313
|
-
(TokenClasses.
|
|
314
|
-
(TokenClasses.
|
|
315
|
-
(TokenClasses.
|
|
306
|
+
(TokenClasses.HEADER, BioTag.BEGIN): TokenClassWithTag.B_HEADER,
|
|
307
|
+
(TokenClasses.HEADER, BioTag.INSIDE): TokenClassWithTag.I_HEADER,
|
|
308
|
+
(TokenClasses.HEADER, BioTag.END): TokenClassWithTag.E_HEADER,
|
|
309
|
+
(TokenClasses.HEADER, BioTag.SINGLE): TokenClassWithTag.S_HEADER,
|
|
310
|
+
(TokenClasses.ANSWER, BioTag.BEGIN): TokenClassWithTag.B_ANSWER,
|
|
311
|
+
(TokenClasses.ANSWER, BioTag.INSIDE): TokenClassWithTag.I_ANSWER,
|
|
312
|
+
(TokenClasses.ANSWER, BioTag.END): TokenClassWithTag.E_ANSWER,
|
|
313
|
+
(TokenClasses.ANSWER, BioTag.SINGLE): TokenClassWithTag.S_ANSWER,
|
|
314
|
+
(TokenClasses.QUESTION, BioTag.BEGIN): TokenClassWithTag.B_QUESTION,
|
|
315
|
+
(TokenClasses.QUESTION, BioTag.INSIDE): TokenClassWithTag.I_QUESTION,
|
|
316
|
+
(TokenClasses.QUESTION, BioTag.END): TokenClassWithTag.E_QUESTION,
|
|
317
|
+
(TokenClasses.QUESTION, BioTag.SINGLE): TokenClassWithTag.S_QUESTION,
|
|
318
|
+
(TokenClasses.OTHER, BioTag.OUTSIDE): BioTag.OUTSIDE,
|
|
319
|
+
(TokenClasses.HEADER, BioTag.OUTSIDE): BioTag.OUTSIDE,
|
|
320
|
+
(TokenClasses.ANSWER, BioTag.OUTSIDE): BioTag.OUTSIDE,
|
|
321
|
+
(TokenClasses.QUESTION, BioTag.OUTSIDE): BioTag.OUTSIDE,
|
|
316
322
|
}
|
|
317
323
|
|
|
318
324
|
|
|
@@ -334,7 +340,7 @@ def token_class_tag_to_token_class_with_tag(token: ObjectTypes, tag: ObjectTypes
|
|
|
334
340
|
|
|
335
341
|
def token_class_with_tag_to_token_class_and_tag(
|
|
336
342
|
token_class_with_tag: ObjectTypes,
|
|
337
|
-
) -> Optional[
|
|
343
|
+
) -> Optional[tuple[ObjectTypes, ObjectTypes]]:
|
|
338
344
|
"""
|
|
339
345
|
This is the reverse mapping from TokenClassWithTag members to TokenClasses and BioTag
|
|
340
346
|
|
|
@@ -358,7 +364,7 @@ def update_all_types_dict() -> None:
|
|
|
358
364
|
_ALL_TYPES_DICT.update({e.value: e for e in obj})
|
|
359
365
|
|
|
360
366
|
|
|
361
|
-
_OLD_TO_NEW_OBJ_TYPE:
|
|
367
|
+
_OLD_TO_NEW_OBJ_TYPE: dict[str, str] = {
|
|
362
368
|
"DOC_CLASS": "document_type",
|
|
363
369
|
"CHARS": "characters",
|
|
364
370
|
"BIO_TAG": "tag",
|
|
@@ -381,10 +387,10 @@ def _get_new_obj_type_str(obj_type: str) -> str:
|
|
|
381
387
|
return _OLD_TO_NEW_OBJ_TYPE.get(obj_type, obj_type)
|
|
382
388
|
|
|
383
389
|
|
|
384
|
-
_BLACK_LIST:
|
|
390
|
+
_BLACK_LIST: list[str] = ["B", "I", "O", "E", "S"]
|
|
385
391
|
|
|
386
392
|
|
|
387
|
-
def _get_black_list() ->
|
|
393
|
+
def _get_black_list() -> list[str]:
|
|
388
394
|
return _BLACK_LIST
|
|
389
395
|
|
|
390
396
|
|