PyPI - deepdoctection - Versions diffs - 0.32__py3-none-any.whl → 0.34__py3-none-any.whl - Mend

deepdoctection 0.32__py3-none-any.whl → 0.34__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of deepdoctection might be problematic. Click here for more details.

Files changed (111) hide show

deepdoctection/__init__.py +8 -25
deepdoctection/analyzer/dd.py +84 -71
deepdoctection/dataflow/common.py +9 -5
deepdoctection/dataflow/custom.py +5 -5
deepdoctection/dataflow/custom_serialize.py +75 -18
deepdoctection/dataflow/parallel_map.py +3 -3
deepdoctection/dataflow/serialize.py +4 -4
deepdoctection/dataflow/stats.py +3 -3
deepdoctection/datapoint/annotation.py +78 -56
deepdoctection/datapoint/box.py +7 -7
deepdoctection/datapoint/convert.py +6 -6
deepdoctection/datapoint/image.py +157 -75
deepdoctection/datapoint/view.py +175 -151
deepdoctection/datasets/adapter.py +30 -24
deepdoctection/datasets/base.py +10 -10
deepdoctection/datasets/dataflow_builder.py +3 -3
deepdoctection/datasets/info.py +23 -25
deepdoctection/datasets/instances/doclaynet.py +48 -49
deepdoctection/datasets/instances/fintabnet.py +44 -45
deepdoctection/datasets/instances/funsd.py +23 -23
deepdoctection/datasets/instances/iiitar13k.py +8 -8
deepdoctection/datasets/instances/layouttest.py +2 -2
deepdoctection/datasets/instances/publaynet.py +3 -3
deepdoctection/datasets/instances/pubtables1m.py +18 -18
deepdoctection/datasets/instances/pubtabnet.py +30 -29
deepdoctection/datasets/instances/rvlcdip.py +28 -29
deepdoctection/datasets/instances/xfund.py +51 -30
deepdoctection/datasets/save.py +6 -6
deepdoctection/eval/accmetric.py +32 -33
deepdoctection/eval/base.py +8 -9
deepdoctection/eval/cocometric.py +13 -12
deepdoctection/eval/eval.py +32 -26
deepdoctection/eval/tedsmetric.py +16 -12
deepdoctection/eval/tp_eval_callback.py +7 -16
deepdoctection/extern/base.py +339 -134
deepdoctection/extern/d2detect.py +69 -89
deepdoctection/extern/deskew.py +11 -10
deepdoctection/extern/doctrocr.py +81 -64
deepdoctection/extern/fastlang.py +23 -16
deepdoctection/extern/hfdetr.py +53 -38
deepdoctection/extern/hflayoutlm.py +216 -155
deepdoctection/extern/hflm.py +35 -30
deepdoctection/extern/model.py +433 -255
deepdoctection/extern/pdftext.py +15 -15
deepdoctection/extern/pt/ptutils.py +4 -2
deepdoctection/extern/tessocr.py +39 -38
deepdoctection/extern/texocr.py +14 -16
deepdoctection/extern/tp/tfutils.py +16 -2
deepdoctection/extern/tp/tpcompat.py +11 -7
deepdoctection/extern/tp/tpfrcnn/config/config.py +4 -4
deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +1 -1
deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +5 -5
deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +6 -6
deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +4 -4
deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +5 -3
deepdoctection/extern/tp/tpfrcnn/preproc.py +5 -5
deepdoctection/extern/tpdetect.py +40 -45
deepdoctection/mapper/cats.py +36 -40
deepdoctection/mapper/cocostruct.py +16 -12
deepdoctection/mapper/d2struct.py +22 -22
deepdoctection/mapper/hfstruct.py +7 -7
deepdoctection/mapper/laylmstruct.py +22 -24
deepdoctection/mapper/maputils.py +9 -10
deepdoctection/mapper/match.py +33 -2
deepdoctection/mapper/misc.py +6 -7
deepdoctection/mapper/pascalstruct.py +4 -4
deepdoctection/mapper/prodigystruct.py +6 -6
deepdoctection/mapper/pubstruct.py +84 -92
deepdoctection/mapper/tpstruct.py +3 -3
deepdoctection/mapper/xfundstruct.py +33 -33
deepdoctection/pipe/anngen.py +39 -14
deepdoctection/pipe/base.py +68 -99
deepdoctection/pipe/common.py +181 -85
deepdoctection/pipe/concurrency.py +14 -10
deepdoctection/pipe/doctectionpipe.py +24 -21
deepdoctection/pipe/language.py +20 -25
deepdoctection/pipe/layout.py +18 -16
deepdoctection/pipe/lm.py +49 -47
deepdoctection/pipe/order.py +63 -65
deepdoctection/pipe/refine.py +102 -109
deepdoctection/pipe/segment.py +157 -162
deepdoctection/pipe/sub_layout.py +50 -40
deepdoctection/pipe/text.py +37 -36
deepdoctection/pipe/transform.py +19 -16
deepdoctection/train/d2_frcnn_train.py +27 -25
deepdoctection/train/hf_detr_train.py +22 -18
deepdoctection/train/hf_layoutlm_train.py +49 -48
deepdoctection/train/tp_frcnn_train.py +10 -11
deepdoctection/utils/concurrency.py +1 -1
deepdoctection/utils/context.py +13 -6
deepdoctection/utils/develop.py +4 -4
deepdoctection/utils/env_info.py +52 -14
deepdoctection/utils/file_utils.py +6 -11
deepdoctection/utils/fs.py +41 -14
deepdoctection/utils/identifier.py +2 -2
deepdoctection/utils/logger.py +15 -15
deepdoctection/utils/metacfg.py +7 -7
deepdoctection/utils/pdf_utils.py +39 -14
deepdoctection/utils/settings.py +188 -182
deepdoctection/utils/tqdm.py +1 -1
deepdoctection/utils/transform.py +14 -9
deepdoctection/utils/types.py +104 -0
deepdoctection/utils/utils.py +7 -7
deepdoctection/utils/viz.py +70 -69
{deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/METADATA +7 -4
deepdoctection-0.34.dist-info/RECORD +146 -0
{deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/WHEEL +1 -1
deepdoctection/utils/detection_types.py +0 -68
deepdoctection-0.32.dist-info/RECORD +0 -146
{deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/LICENSE +0 -0
{deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/top_level.txt +0 -0

deepdoctection/utils/pdf_utils.py CHANGED Viewed

@@ -25,23 +25,23 @@ import sys
 from errno import ENOENT
 from io import BytesIO
 from shutil import copyfile
-from typing import Generator, List, Optional, Tuple
+from typing import Generator, Optional
 from numpy import uint8
 from pypdf import PdfReader, PdfWriter, errors
 from .context import save_tmp_file, timeout_manager
-from .detection_types import ImageType, Pathlike
 from .error import DependencyError, FileExtensionError
 from .file_utils import pdf_to_cairo_available, pdf_to_ppm_available, qpdf_available
 from .logger import LoggingRecord, logger
+from .types import PathLikeOrStr, PixelValues
 from .utils import is_file_extension
 from .viz import viz_handler
 __all__ = ["decrypt_pdf_document", "get_pdf_file_reader", "get_pdf_file_writer", "PDFStreamer", "pdf_to_np_array"]
-def decrypt_pdf_document(path: Pathlike) -> bool:
+def decrypt_pdf_document(path: PathLikeOrStr) -> bool:
     """
     Decrypting a pdf. As copying a pdf document removes the password that protects pdf, this method
     generates a copy and decrypts the copy using qpdf. The result is saved as the original
@@ -73,7 +73,7 @@ def decrypt_pdf_document(path: Pathlike) -> bool:
     return False
-def get_pdf_file_reader(path: Pathlike) -> PdfReader:
+def get_pdf_file_reader(path: PathLikeOrStr) -> PdfReader:
     """
     Creates a file reader object from a pdf document. Will try to decrypt the document if it is
     encrypted. (See `decrypt_pdf_document` to understand what is meant with "decrypt").
@@ -107,8 +107,7 @@ def get_pdf_file_reader(path: Pathlike) -> PdfReader:
                     )
                     sys.exit()
-    file_reader = PdfReader(open(path, "rb"))  # pylint: disable=R1732
-    return file_reader
+    return PdfReader(os.fspath(path))
 def get_pdf_file_writer() -> PdfWriter:
@@ -125,15 +124,27 @@ class PDFStreamer:
     **Example:**
-             df = dataflow.DataFromIterable.PDFStreamer(path=path)
+             # Building a Dataflow with a PDFStreamer
+             df = dataflow.DataFromIterable(PDFStreamer(path=path))
              df.reset_state()
              for page in df:
                 ... # do whatever you like
+             # Something else you can do:
+            streamer = PDFStreamer(path=path)
+            pages = len(streamer)  # get the number of pages
+            random_int = random.sample(range(0, pages), 2) # select some pages
+            for ran in random_int:
+                pdf_bytes = streamer[ran]   # get the page bytes directly
+            streamer.close() # Do not forget to close the streamer, otherwise the file will never be closed and might
+                             # cause memory leaks if you open many files.
     """
-    def __init__(self, path: Pathlike) -> None:
+    def __init__(self, path: PathLikeOrStr) -> None:
         """
         :param path: to a pdf.
         """
@@ -143,13 +154,27 @@ class PDFStreamer:
     def __len__(self) -> int:
         return len(self.file_reader.pages)
-    def __iter__(self) -> Generator[Tuple[bytes, int], None, None]:
+    def __iter__(self) -> Generator[tuple[bytes, int], None, None]:
         for k in range(len(self)):
             buffer = BytesIO()
             writer = get_pdf_file_writer()
             writer.add_page(self.file_reader.pages[k])
             writer.write(buffer)
             yield buffer.getvalue(), k
+        self.file_reader.close()
+    def __getitem__(self, index: int) -> bytes:
+        buffer = BytesIO()
+        writer = get_pdf_file_writer()
+        writer.add_page(self.file_reader.pages[index])
+        writer.write(buffer)
+        return buffer.getvalue()
+    def close(self) -> None:
+        """
+        Close the file reader
+        """
+        self.file_reader.close()
 # The following functions are modified versions from the Python poppler wrapper
@@ -157,9 +182,9 @@ class PDFStreamer:
 def _input_to_cli_str(
-    input_file_name: Pathlike, output_file_name: Pathlike, dpi: int, size: Optional[Tuple[int, int]] = None
-) -> List[str]:
-    cmd_args: List[str] = []
+    input_file_name: PathLikeOrStr, output_file_name: PathLikeOrStr, dpi: int, size: Optional[tuple[int, int]] = None
+) -> list[str]:
+    cmd_args: list[str] = []
     if pdf_to_ppm_available():
         command = "pdftoppm"
@@ -196,7 +221,7 @@ class PopplerError(RuntimeError):
         self.args = (status, message)
-def _run_poppler(poppler_args: List[str]) -> None:
+def _run_poppler(poppler_args: list[str]) -> None:
     try:
         proc = subprocess.Popen(poppler_args)  # pylint: disable=R1732
     except OSError as error:
@@ -209,7 +234,7 @@ def _run_poppler(poppler_args: List[str]) -> None:
             raise PopplerError(status=proc.returncode, message="Syntax Error: PDF cannot be read with Poppler")
-def pdf_to_np_array(pdf_bytes: bytes, size: Optional[Tuple[int, int]] = None, dpi: int = 200) -> ImageType:
+def pdf_to_np_array(pdf_bytes: bytes, size: Optional[tuple[int, int]] = None, dpi: int = 200) -> PixelValues:
     """
     Convert a single pdf page from its byte representation to a numpy array. This function will save the pdf as to a tmp
     file and then call poppler via `pdftoppm` resp. `pdftocairo` if the former is not available.

deepdoctection/utils/settings.py CHANGED Viewed

@@ -18,11 +18,12 @@
 """
 Module for funcs and constants that maintain general settings
 """
+from __future__ import annotations
 import os
 from enum import Enum
 from pathlib import Path
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Optional, Union
 import catalogue  # type: ignore
@@ -34,7 +35,7 @@ class ObjectTypes(str, Enum):
         return f"<{self.__class__.__name__}.{self.name}>"
     @classmethod
-    def from_value(cls, value: str) -> "ObjectTypes":
+    def from_value(cls, value: str) -> ObjectTypes:
         """Getting the enum member from a given string value
         :param value: string value to get the enum member
@@ -56,263 +57,268 @@ object_types_registry = catalogue.create("deepdoctection", "settings", entry_poi
 class DefaultType(ObjectTypes):
     """Type for default member"""
-    default_type = "default_type"
+    DEFAULT_TYPE = "default_type"
 @object_types_registry.register("PageType")
 class PageType(ObjectTypes):
     """Type for document page properties"""
-    document_type = "document_type"
-    language = "language"
-    angle = "angle"
+    DOCUMENT_TYPE = "document_type"
+    LANGUAGE = "language"
+    ANGLE = "angle"
 @object_types_registry.register("SummaryType")
 class SummaryType(ObjectTypes):
     """Summary type member"""
-    summary = "summary"
+    SUMMARY = "summary"
 @object_types_registry.register("DocumentType")
 class DocumentType(ObjectTypes):
     """Document types"""
-    letter = "letter"
-    form = "form"
-    email = "email"
-    handwritten = "handwritten"
-    advertisement = "advertisement"
-    scientific_report = "scientific_report"
-    scientific_publication = "scientific_publication"
-    specification = "specification"
-    file_folder = "file_folder"
-    news_article = "news_article"
-    budget = "budget"
-    invoice = "invoice"
-    presentation = "presentation"
-    questionnaire = "questionnaire"
-    resume = "resume"
-    memo = "memo"
-    financial_report = "financial_report"
-    laws_and_regulations = "laws_and_regulations"
-    government_tenders = "government_tenders"
-    manuals = "manuals"
-    patents = "patents"
+    LETTER = "letter"
+    FORM = "form"
+    EMAIL = "email"
+    HANDWRITTEN = "handwritten"
+    ADVERTISEMENT = "advertisement"
+    SCIENTIFIC_REPORT = "scientific_report"
+    SCIENTIFIC_PUBLICATION = "scientific_publication"
+    SPECIFICATION = "specification"
+    FILE_FOLDER = "file_folder"
+    NEWS_ARTICLE = "news_article"
+    BUDGET = "budget"
+    INVOICE = "invoice"
+    PRESENTATION = "presentation"
+    QUESTIONNAIRE = "questionnaire"
+    RESUME = "resume"
+    MEMO = "memo"
+    FINANCIAL_REPORT = "financial_report"
+    LAWS_AND_REGULATIONS = "laws_and_regulations"
+    GOVERNMENT_TENDERS = "government_tenders"
+    MANUALS = "manuals"
+    PATENTS = "patents"
+    MARK = "mark"
 @object_types_registry.register("LayoutType")
 class LayoutType(ObjectTypes):
     """Layout types"""
-    table = "table"
-    table_rotated = "table_rotated"
-    figure = "figure"
-    list = "list"
-    text = "text"
-    title = "title"  # type: ignore
-    logo = "logo"
-    signature = "signature"
-    caption = "caption"
-    footnote = "footnote"
-    formula = "formula"
-    page_footer = "page_footer"
-    page_header = "page_header"
-    section_header = "section_header"
-    page = "page"
-    cell = "cell"
-    row = "row"
-    column = "column"
-    word = "word"
-    line = "line"
-    background = "background"
+    TABLE = "table"
+    TABLE_ROTATED = "table_rotated"
+    FIGURE = "figure"
+    LIST = "list"
+    TEXT = "text"
+    TITLE = "title"
+    LOGO = "logo"
+    SIGNATURE = "signature"
+    CAPTION = "caption"
+    FOOTNOTE = "footnote"
+    FORMULA = "formula"
+    PAGE_FOOTER = "page_footer"
+    PAGE_HEADER = "page_header"
+    SECTION_HEADER = "section_header"
+    PAGE = "page"
+    CELL = "cell"
+    ROW = "row"
+    COLUMN = "column"
+    WORD = "word"
+    LINE = "line"
+    BACKGROUND = "background"
+    PAGE_NUMBER = "page_number"
+    KEY_VALUE_AREA = "key_value_area"
+    LIST_ITEM = "list_item"
 @object_types_registry.register("TableType")
 class TableType(ObjectTypes):
     """Types for table properties"""
-    item = "item"
-    number_of_rows = "number_of_rows"
-    number_of_columns = "number_of_columns"
-    max_row_span = "max_row_span"
-    max_col_span = "max_col_span"
-    html = "html"
+    ITEM = "item"
+    NUMBER_OF_ROWS = "number_of_rows"
+    NUMBER_OF_COLUMNS = "number_of_columns"
+    MAX_ROW_SPAN = "max_row_span"
+    MAX_COL_SPAN = "max_col_span"
+    HTML = "html"
 @object_types_registry.register("CellType")
 class CellType(ObjectTypes):
     """Types for cell properties"""
-    header = "header"
-    body = "body"
-    row_number = "row_number"
-    row_span = "row_span"
-    row_header = "row_header"
-    projected_row_header = "projected_row_header"
-    column_number = "column_number"
-    column_span = "column_span"
-    column_header = "column_header"
-    spanning = "spanning"
+    HEADER = "header"
+    BODY = "body"
+    ROW_NUMBER = "row_number"
+    ROW_SPAN = "row_span"
+    ROW_HEADER = "row_header"
+    PROJECTED_ROW_HEADER = "projected_row_header"
+    COLUMN_NUMBER = "column_number"
+    COLUMN_SPAN = "column_span"
+    COLUMN_HEADER = "column_header"
+    SPANNING = "spanning"
 @object_types_registry.register("WordType")
 class WordType(ObjectTypes):
     """Types for word properties"""
-    characters = "characters"
-    block = "block"
-    token_class = "token_class"
-    tag = "tag"
-    token_tag = "token_tag"
-    text_line = "text_line"
-    character_type = "character_type"
-    printed = "printed"
-    handwritten = "handwritten"
+    CHARACTERS = "characters"
+    BLOCK = "block"
+    TOKEN_CLASS = "token_class"
+    TAG = "tag"
+    TOKEN_TAG = "token_tag"
+    TEXT_LINE = "text_line"
+    CHARACTER_TYPE = "character_type"
+    PRINTED = "printed"
+    HANDWRITTEN = "handwritten"
 @object_types_registry.register("TokenClasses")
 class TokenClasses(ObjectTypes):
     """Types for token classes"""
-    header = "header"
-    question = "question"
-    answer = "answer"
-    other = "other"
+    HEADER = "header"
+    QUESTION = "question"
+    ANSWER = "answer"
+    OTHER = "other"
 @object_types_registry.register("BioTag")
 class BioTag(ObjectTypes):
     """Types for tags"""
-    begin = "B"
-    inside = "I"
-    outside = "O"
-    single = "S"
-    end = "E"
+    BEGIN = "B"
+    INSIDE = "I"
+    OUTSIDE = "O"
+    SINGLE = "S"
+    END = "E"
 @object_types_registry.register("TokenClassWithTag")
 class TokenClassWithTag(ObjectTypes):
     """Types for token classes with tags, e.g. B-answer"""
-    b_answer = "B-answer"
-    b_header = "B-header"
-    b_question = "B-question"
-    e_answer = "E-answer"
-    e_header = "E-header"
-    e_question = "E-question"
-    i_answer = "I-answer"
-    i_header = "I-header"
-    i_question = "I-question"
-    s_answer = "S-answer"
-    s_header = "S-header"
-    s_question = "S-question"
+    B_ANSWER = "B-answer"
+    B_HEADER = "B-header"
+    B_QUESTION = "B-question"
+    E_ANSWER = "E-answer"
+    E_HEADER = "E-header"
+    E_QUESTION = "E-question"
+    I_ANSWER = "I-answer"
+    I_HEADER = "I-header"
+    I_QUESTION = "I-question"
+    S_ANSWER = "S-answer"
+    S_HEADER = "S-header"
+    S_QUESTION = "S-question"
 @object_types_registry.register("Relationships")
 class Relationships(ObjectTypes):
     """Types for describing relationships between types"""
-    child = "child"
-    reading_order = "reading_order"
-    semantic_entity_link = "semantic_entity_link"
+    CHILD = "child"
+    READING_ORDER = "reading_order"
+    SEMANTIC_ENTITY_LINK = "semantic_entity_link"
+    LAYOUT_LINK = "layout_link"
 @object_types_registry.register("Languages")
 class Languages(ObjectTypes):
     """Language types"""
-    english = "eng"
-    russian = "rus"
-    german = "deu"
-    french = "fre"
-    italian = "ita"
-    japanese = "jpn"
-    spanish = "spa"
-    cebuano = "ceb"
-    turkish = "tur"
-    portuguese = "por"
-    ukrainian = "ukr"
-    esperanto = "epo"
-    polish = "pol"
-    swedish = "swe"
-    dutch = "dut"
-    hebrew = "heb"
-    chinese = "chi"
-    hungarian = "hun"
-    arabic = "ara"
-    catalan = "cat"
-    finnish = "fin"
-    czech = "cze"
-    persian = "per"
-    serbian = "srp"
-    greek = "gre"
-    vietnamese = "vie"
-    bulgarian = "bul"
-    korean = "kor"
-    norwegian = "nor"
-    macedonian = "mac"
-    romanian = "rum"
-    indonesian = "ind"
-    thai = "tha"
-    armenian = "arm"
-    danish = "dan"
-    tamil = "tam"
-    hindi = "hin"
-    croatian = "hrv"
-    belarusian = "bel"
-    georgian = "geo"
-    telugu = "tel"
-    kazakh = "kaz"
-    waray = "war"
-    lithuanian = "lit"
-    scottish = "glg"
-    slovak = "slo"
-    benin = "ben"
-    basque = "baq"
-    slovenian = "slv"
-    malayalam = "mal"
-    marathi = "mar"
-    estonian = "est"
-    azerbaijani = "aze"
-    albanian = "alb"
-    latin = "lat"
-    bosnian = "bos"
-    norwegian_nynorsk = "nno"
-    urdu = "urd"
-    not_defined = "nn"
+    ENGLISH = "eng"
+    RUSSIAN = "rus"
+    GERMAN = "deu"
+    FRENCH = "fre"
+    ITALIAN = "ita"
+    JAPANESE = "jpn"
+    SPANISH = "spa"
+    CEBUANO = "ceb"
+    TURKISH = "tur"
+    PORTUGUESE = "por"
+    UKRAINIAN = "ukr"
+    ESPERANTO = "epo"
+    POLISH = "pol"
+    SWEDISH = "swe"
+    DUTCH = "dut"
+    HEBREW = "heb"
+    CHINESE = "chi"
+    HUNGARIAN = "hun"
+    ARABIC = "ara"
+    CATALAN = "cat"
+    FINNISH = "fin"
+    CZECH = "cze"
+    PERSIAN = "per"
+    SERBIAN = "srp"
+    GREEK = "gre"
+    VIETNAMESE = "vie"
+    BULGARIAN = "bul"
+    KOREAN = "kor"
+    NORWEGIAN = "nor"
+    MACEDONIAN = "mac"
+    ROMANIAN = "rum"
+    INDONESIAN = "ind"
+    THAI = "tha"
+    ARMENIAN = "arm"
+    DANISH = "dan"
+    TAMIL = "tam"
+    HINDI = "hin"
+    CROATIAN = "hrv"
+    BELARUSIAN = "bel"
+    GEORGIAN = "geo"
+    TELUGU = "tel"
+    KAZAKH = "kaz"
+    WARAY = "war"
+    LITHUANIAN = "lit"
+    SCOTTISH = "glg"
+    SLOVAK = "slo"
+    BENIN = "ben"
+    BASQUE = "baq"
+    SLOVENIAN = "slv"
+    MALAYALAM = "mal"
+    MARATHI = "mar"
+    ESTONIAN = "est"
+    AZERBAIJANI = "aze"
+    ALBANIAN = "alb"
+    LATIN = "lat"
+    BOSNIAN = "bos"
+    NORWEGIAN_NOVOSIBIRSK = "nno"
+    URDU = "urd"
+    NOT_DEFINED = "nn"
 @object_types_registry.register("DatasetType")
 class DatasetType(ObjectTypes):
     """Dataset types"""
-    object_detection = "object_detection"
-    sequence_classification = "sequence_classification"
-    token_classification = "token_classification"
-    publaynet = "publaynet"
-    default = "default"
+    OBJECT_DETECTION = "object_detection"
+    SEQUENCE_CLASSIFICATION = "sequence_classification"
+    TOKEN_CLASSIFICATION = "token_classification"
+    PUBLAYNET = "publaynet"
+    DEFAULT = "default"
 _TOKEN_AND_TAG_TO_TOKEN_CLASS_WITH_TAG = {
-    (TokenClasses.header, BioTag.begin): TokenClassWithTag.b_header,
-    (TokenClasses.header, BioTag.inside): TokenClassWithTag.i_header,
-    (TokenClasses.header, BioTag.end): TokenClassWithTag.e_header,
-    (TokenClasses.header, BioTag.single): TokenClassWithTag.s_header,
-    (TokenClasses.answer, BioTag.begin): TokenClassWithTag.b_answer,
-    (TokenClasses.answer, BioTag.inside): TokenClassWithTag.i_answer,
-    (TokenClasses.answer, BioTag.end): TokenClassWithTag.e_answer,
-    (TokenClasses.answer, BioTag.single): TokenClassWithTag.s_answer,
-    (TokenClasses.question, BioTag.begin): TokenClassWithTag.b_question,
-    (TokenClasses.question, BioTag.inside): TokenClassWithTag.i_question,
-    (TokenClasses.question, BioTag.end): TokenClassWithTag.e_question,
-    (TokenClasses.question, BioTag.single): TokenClassWithTag.s_question,
-    (TokenClasses.other, BioTag.outside): BioTag.outside,
-    (TokenClasses.header, BioTag.outside): BioTag.outside,
-    (TokenClasses.answer, BioTag.outside): BioTag.outside,
-    (TokenClasses.question, BioTag.outside): BioTag.outside,
+    (TokenClasses.HEADER, BioTag.BEGIN): TokenClassWithTag.B_HEADER,
+    (TokenClasses.HEADER, BioTag.INSIDE): TokenClassWithTag.I_HEADER,
+    (TokenClasses.HEADER, BioTag.END): TokenClassWithTag.E_HEADER,
+    (TokenClasses.HEADER, BioTag.SINGLE): TokenClassWithTag.S_HEADER,
+    (TokenClasses.ANSWER, BioTag.BEGIN): TokenClassWithTag.B_ANSWER,
+    (TokenClasses.ANSWER, BioTag.INSIDE): TokenClassWithTag.I_ANSWER,
+    (TokenClasses.ANSWER, BioTag.END): TokenClassWithTag.E_ANSWER,
+    (TokenClasses.ANSWER, BioTag.SINGLE): TokenClassWithTag.S_ANSWER,
+    (TokenClasses.QUESTION, BioTag.BEGIN): TokenClassWithTag.B_QUESTION,
+    (TokenClasses.QUESTION, BioTag.INSIDE): TokenClassWithTag.I_QUESTION,
+    (TokenClasses.QUESTION, BioTag.END): TokenClassWithTag.E_QUESTION,
+    (TokenClasses.QUESTION, BioTag.SINGLE): TokenClassWithTag.S_QUESTION,
+    (TokenClasses.OTHER, BioTag.OUTSIDE): BioTag.OUTSIDE,
+    (TokenClasses.HEADER, BioTag.OUTSIDE): BioTag.OUTSIDE,
+    (TokenClasses.ANSWER, BioTag.OUTSIDE): BioTag.OUTSIDE,
+    (TokenClasses.QUESTION, BioTag.OUTSIDE): BioTag.OUTSIDE,
 }
@@ -334,7 +340,7 @@ def token_class_tag_to_token_class_with_tag(token: ObjectTypes, tag: ObjectTypes
 def token_class_with_tag_to_token_class_and_tag(
     token_class_with_tag: ObjectTypes,
-) -> Optional[Tuple[ObjectTypes, ObjectTypes]]:
+) -> Optional[tuple[ObjectTypes, ObjectTypes]]:
     """
     This is the reverse mapping from TokenClassWithTag members to TokenClasses and BioTag
@@ -358,7 +364,7 @@ def update_all_types_dict() -> None:
         _ALL_TYPES_DICT.update({e.value: e for e in obj})
-_OLD_TO_NEW_OBJ_TYPE: Dict[str, str] = {
+_OLD_TO_NEW_OBJ_TYPE: dict[str, str] = {
     "DOC_CLASS": "document_type",
     "CHARS": "characters",
     "BIO_TAG": "tag",
@@ -381,10 +387,10 @@ def _get_new_obj_type_str(obj_type: str) -> str:
     return _OLD_TO_NEW_OBJ_TYPE.get(obj_type, obj_type)
-_BLACK_LIST: List[str] = ["B", "I", "O", "E", "S"]
+_BLACK_LIST: list[str] = ["B", "I", "O", "E", "S"]
-def _get_black_list() -> List[str]:
+def _get_black_list() -> list[str]:
     return _BLACK_LIST

deepdoctection/utils/tqdm.py CHANGED Viewed

@@ -23,7 +23,7 @@ from typing import Dict, Optional, Union
 from tqdm import tqdm
-from .detection_types import TqdmType
+from .types import TqdmType
 __all__ = ["get_tqdm", "get_tqdm_default_kwargs"]

deepdoctection 0.32__py3-none-any.whl → 0.34__py3-none-any.whl

Potentially problematic release.

deepdoctection 0.32__py3-none-any.whl → 0.34__py3-none-any.whl