deepdoctection 0.30__py3-none-any.whl → 0.31__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection has been flagged as potentially problematic. Click here for more details.
- deepdoctection/__init__.py +4 -2
- deepdoctection/analyzer/dd.py +6 -5
- deepdoctection/dataflow/base.py +0 -19
- deepdoctection/dataflow/custom.py +4 -3
- deepdoctection/dataflow/custom_serialize.py +14 -5
- deepdoctection/dataflow/parallel_map.py +12 -11
- deepdoctection/dataflow/serialize.py +5 -4
- deepdoctection/datapoint/annotation.py +33 -12
- deepdoctection/datapoint/box.py +1 -4
- deepdoctection/datapoint/convert.py +3 -1
- deepdoctection/datapoint/image.py +66 -29
- deepdoctection/datapoint/view.py +57 -25
- deepdoctection/datasets/adapter.py +1 -1
- deepdoctection/datasets/base.py +83 -10
- deepdoctection/datasets/dataflow_builder.py +1 -1
- deepdoctection/datasets/info.py +2 -2
- deepdoctection/datasets/instances/layouttest.py +2 -7
- deepdoctection/eval/accmetric.py +1 -1
- deepdoctection/eval/base.py +5 -4
- deepdoctection/eval/eval.py +2 -2
- deepdoctection/eval/tp_eval_callback.py +5 -4
- deepdoctection/extern/base.py +39 -13
- deepdoctection/extern/d2detect.py +164 -64
- deepdoctection/extern/deskew.py +32 -7
- deepdoctection/extern/doctrocr.py +227 -39
- deepdoctection/extern/fastlang.py +45 -7
- deepdoctection/extern/hfdetr.py +90 -33
- deepdoctection/extern/hflayoutlm.py +109 -22
- deepdoctection/extern/pdftext.py +2 -1
- deepdoctection/extern/pt/ptutils.py +3 -2
- deepdoctection/extern/tessocr.py +134 -22
- deepdoctection/extern/texocr.py +2 -0
- deepdoctection/extern/tp/tpcompat.py +4 -4
- deepdoctection/extern/tp/tpfrcnn/preproc.py +2 -7
- deepdoctection/extern/tpdetect.py +50 -23
- deepdoctection/mapper/d2struct.py +1 -1
- deepdoctection/mapper/hfstruct.py +1 -1
- deepdoctection/mapper/laylmstruct.py +1 -1
- deepdoctection/mapper/maputils.py +13 -2
- deepdoctection/mapper/prodigystruct.py +1 -1
- deepdoctection/mapper/pubstruct.py +10 -10
- deepdoctection/mapper/tpstruct.py +1 -1
- deepdoctection/pipe/anngen.py +35 -8
- deepdoctection/pipe/base.py +53 -19
- deepdoctection/pipe/cell.py +29 -8
- deepdoctection/pipe/common.py +12 -4
- deepdoctection/pipe/doctectionpipe.py +2 -2
- deepdoctection/pipe/language.py +3 -2
- deepdoctection/pipe/layout.py +3 -2
- deepdoctection/pipe/lm.py +2 -2
- deepdoctection/pipe/refine.py +18 -10
- deepdoctection/pipe/segment.py +21 -16
- deepdoctection/pipe/text.py +14 -8
- deepdoctection/pipe/transform.py +16 -9
- deepdoctection/train/d2_frcnn_train.py +15 -12
- deepdoctection/train/hf_detr_train.py +8 -6
- deepdoctection/train/hf_layoutlm_train.py +16 -11
- deepdoctection/utils/__init__.py +3 -0
- deepdoctection/utils/concurrency.py +1 -1
- deepdoctection/utils/context.py +2 -2
- deepdoctection/utils/env_info.py +55 -22
- deepdoctection/utils/error.py +84 -0
- deepdoctection/utils/file_utils.py +4 -15
- deepdoctection/utils/fs.py +7 -7
- deepdoctection/utils/pdf_utils.py +5 -4
- deepdoctection/utils/settings.py +5 -1
- deepdoctection/utils/transform.py +1 -1
- deepdoctection/utils/utils.py +0 -6
- deepdoctection/utils/viz.py +44 -2
- {deepdoctection-0.30.dist-info → deepdoctection-0.31.dist-info}/METADATA +33 -58
- {deepdoctection-0.30.dist-info → deepdoctection-0.31.dist-info}/RECORD +74 -73
- {deepdoctection-0.30.dist-info → deepdoctection-0.31.dist-info}/WHEEL +1 -1
- {deepdoctection-0.30.dist-info → deepdoctection-0.31.dist-info}/LICENSE +0 -0
- {deepdoctection-0.30.dist-info → deepdoctection-0.31.dist-info}/top_level.txt +0 -0
deepdoctection/pipe/language.py
CHANGED
|
@@ -25,6 +25,7 @@ from ..datapoint.image import Image
|
|
|
25
25
|
from ..datapoint.view import Page
|
|
26
26
|
from ..extern.base import LanguageDetector, ObjectDetector
|
|
27
27
|
from ..utils.detection_types import JsonDict
|
|
28
|
+
from ..utils.error import ImageError
|
|
28
29
|
from ..utils.settings import PageType, TypeOrStr, get_type
|
|
29
30
|
from .base import PipelineComponent
|
|
30
31
|
from .registry import pipeline_component_registry
|
|
@@ -86,7 +87,7 @@ class LanguageDetectionService(PipelineComponent):
|
|
|
86
87
|
text = page.text_no_line_break
|
|
87
88
|
else:
|
|
88
89
|
if dp.image is None:
|
|
89
|
-
raise
|
|
90
|
+
raise ImageError("image cannot be None")
|
|
90
91
|
detect_result_list = self.text_detector.predict(dp.image)
|
|
91
92
|
# this is a concatenation of all detection result. No reading order
|
|
92
93
|
text = " ".join([result.text for result in detect_result_list if result.text is not None])
|
|
@@ -98,7 +99,7 @@ class LanguageDetectionService(PipelineComponent):
|
|
|
98
99
|
def clone(self) -> PipelineComponent:
|
|
99
100
|
predictor = self.predictor.clone()
|
|
100
101
|
if not isinstance(predictor, LanguageDetector):
|
|
101
|
-
raise
|
|
102
|
+
raise TypeError(f"Predictor must be of type LanguageDetector, but is of type {type(predictor)}")
|
|
102
103
|
return self.__class__(
|
|
103
104
|
predictor,
|
|
104
105
|
copy(self.text_container),
|
deepdoctection/pipe/layout.py
CHANGED
|
@@ -25,6 +25,7 @@ import numpy as np
|
|
|
25
25
|
from ..datapoint.image import Image
|
|
26
26
|
from ..extern.base import ObjectDetector, PdfMiner
|
|
27
27
|
from ..utils.detection_types import JsonDict
|
|
28
|
+
from ..utils.error import ImageError
|
|
28
29
|
from ..utils.transform import PadTransform
|
|
29
30
|
from .base import PredictorPipelineComponent
|
|
30
31
|
from .registry import pipeline_component_registry
|
|
@@ -79,7 +80,7 @@ class ImageLayoutService(PredictorPipelineComponent):
|
|
|
79
80
|
if anns:
|
|
80
81
|
return
|
|
81
82
|
if dp.image is None:
|
|
82
|
-
raise
|
|
83
|
+
raise ImageError("image cannot be None")
|
|
83
84
|
np_image = dp.image
|
|
84
85
|
if self.padder:
|
|
85
86
|
np_image = self.padder.apply_image(np_image)
|
|
@@ -114,5 +115,5 @@ class ImageLayoutService(PredictorPipelineComponent):
|
|
|
114
115
|
if self.padder:
|
|
115
116
|
padder_clone = self.padder.clone()
|
|
116
117
|
if not isinstance(predictor, ObjectDetector):
|
|
117
|
-
raise
|
|
118
|
+
raise TypeError(f"predictor must be of type ObjectDetector, but is of type {type(predictor)}")
|
|
118
119
|
return self.__class__(predictor, self.to_image, self.crop_image, padder_clone, self.skip_if_layout_extracted)
|
deepdoctection/pipe/lm.py
CHANGED
|
@@ -252,7 +252,7 @@ class LMTokenClassifierService(LanguageModelPipelineComponent):
|
|
|
252
252
|
self.language_model.model.__class__.__name__, use_xlm_tokenizer
|
|
253
253
|
)
|
|
254
254
|
if not isinstance(self.tokenizer, type(tokenizer_reference)):
|
|
255
|
-
raise
|
|
255
|
+
raise TypeError(
|
|
256
256
|
f"You want to use {type(self.tokenizer)} but you should use {type(tokenizer_reference)} "
|
|
257
257
|
f"in this framework"
|
|
258
258
|
)
|
|
@@ -366,7 +366,7 @@ class LMSequenceClassifierService(LanguageModelPipelineComponent):
|
|
|
366
366
|
self.language_model.model.__class__.__name__, use_xlm_tokenizer
|
|
367
367
|
)
|
|
368
368
|
if not isinstance(self.tokenizer, type(tokenizer_reference)):
|
|
369
|
-
raise
|
|
369
|
+
raise TypeError(
|
|
370
370
|
f"You want to use {type(self.tokenizer)} but you should use {type(tokenizer_reference)} "
|
|
371
371
|
f"in this framework"
|
|
372
372
|
)
|
deepdoctection/pipe/refine.py
CHANGED
|
@@ -33,6 +33,7 @@ from ..datapoint.image import Image
|
|
|
33
33
|
from ..extern.base import DetectionResult
|
|
34
34
|
from ..mapper.maputils import MappingContextManager
|
|
35
35
|
from ..utils.detection_types import JsonDict
|
|
36
|
+
from ..utils.error import AnnotationError, ImageError
|
|
36
37
|
from ..utils.settings import CellType, LayoutType, Relationships, TableType, get_type
|
|
37
38
|
from .base import PipelineComponent
|
|
38
39
|
from .registry import pipeline_component_registry
|
|
@@ -302,7 +303,7 @@ def generate_html_string(table: ImageAnnotation) -> List[str]:
|
|
|
302
303
|
:return: HTML representation of the table
|
|
303
304
|
"""
|
|
304
305
|
if table.image is None:
|
|
305
|
-
raise
|
|
306
|
+
raise ImageError("table.image cannot be None")
|
|
306
307
|
table_image = table.image
|
|
307
308
|
cells = table_image.get_annotation(
|
|
308
309
|
category_names=[
|
|
@@ -412,7 +413,7 @@ class TableSegmentationRefinementService(PipelineComponent):
|
|
|
412
413
|
tables = dp.get_annotation(category_names=self._table_name)
|
|
413
414
|
for table in tables:
|
|
414
415
|
if table.image is None:
|
|
415
|
-
raise
|
|
416
|
+
raise ImageError("table.image cannot be None")
|
|
416
417
|
tiles_to_cells_list = tiles_to_cells(dp, table)
|
|
417
418
|
connected_components, tile_to_cell_dict = connected_component_tiles(tiles_to_cells_list)
|
|
418
419
|
rectangle_tiling = generate_rectangle_tiling(connected_components)
|
|
@@ -464,14 +465,21 @@ class TableSegmentationRefinementService(PipelineComponent):
|
|
|
464
465
|
max_col_span = max(int(cell.get_sub_category(CellType.column_span).category_id) for cell in cells)
|
|
465
466
|
# TODO: the summaries should be sub categories of the underlying ann
|
|
466
467
|
if table.image.summary is not None:
|
|
467
|
-
if
|
|
468
|
-
table.
|
|
469
|
-
|
|
470
|
-
table.
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
table.
|
|
468
|
+
if (
|
|
469
|
+
TableType.number_of_rows in table.image.summary.sub_categories
|
|
470
|
+
and TableType.number_of_columns in table.image.summary.sub_categories
|
|
471
|
+
and TableType.max_row_span in table.image.summary.sub_categories
|
|
472
|
+
and TableType.max_col_span in table.image.summary.sub_categories
|
|
473
|
+
):
|
|
474
|
+
table.image.summary.remove_sub_category(TableType.number_of_rows)
|
|
475
|
+
table.image.summary.remove_sub_category(TableType.number_of_columns)
|
|
476
|
+
table.image.summary.remove_sub_category(TableType.max_row_span)
|
|
477
|
+
table.image.summary.remove_sub_category(TableType.max_col_span)
|
|
478
|
+
else:
|
|
479
|
+
raise AnnotationError(
|
|
480
|
+
"Table summary does not contain sub categories TableType.number_of_rows, "
|
|
481
|
+
"TableType.number_of_columns, TableType.max_row_span, TableType.max_col_span"
|
|
482
|
+
)
|
|
475
483
|
|
|
476
484
|
self.dp_manager.set_summary_annotation(
|
|
477
485
|
TableType.number_of_rows, TableType.number_of_rows, number_of_rows, annotation_id=table.annotation_id
|
deepdoctection/pipe/segment.py
CHANGED
|
@@ -33,6 +33,7 @@ from ..extern.base import DetectionResult
|
|
|
33
33
|
from ..mapper.maputils import MappingContextManager
|
|
34
34
|
from ..mapper.match import match_anns_by_intersection
|
|
35
35
|
from ..utils.detection_types import JsonDict
|
|
36
|
+
from ..utils.error import ImageError
|
|
36
37
|
from ..utils.settings import CellType, LayoutType, ObjectTypes, Relationships, TableType
|
|
37
38
|
from .base import PipelineComponent
|
|
38
39
|
from .refine import generate_html_string
|
|
@@ -136,12 +137,12 @@ def stretch_item_per_table(
|
|
|
136
137
|
|
|
137
138
|
rows = dp.get_annotation(category_names=row_name, annotation_ids=item_ann_ids)
|
|
138
139
|
if table.image is None:
|
|
139
|
-
raise
|
|
140
|
+
raise ImageError("table.image cannot be None")
|
|
140
141
|
table_embedding_box = table.get_bounding_box(dp.image_id)
|
|
141
142
|
|
|
142
143
|
for row in rows:
|
|
143
144
|
if row.image is None:
|
|
144
|
-
raise
|
|
145
|
+
raise ImageError("row.image cannot be None")
|
|
145
146
|
row_embedding_box = row.get_bounding_box(dp.image_id)
|
|
146
147
|
row_embedding_box.ulx = table_embedding_box.ulx + 1.0
|
|
147
148
|
row_embedding_box.lrx = table_embedding_box.lrx - 1.0
|
|
@@ -166,7 +167,7 @@ def stretch_item_per_table(
|
|
|
166
167
|
|
|
167
168
|
for col in cols:
|
|
168
169
|
if col.image is None:
|
|
169
|
-
raise
|
|
170
|
+
raise ImageError("row.image cannot be None")
|
|
170
171
|
col_embedding_box = col.get_bounding_box(dp.image_id)
|
|
171
172
|
col_embedding_box.uly = table_embedding_box.uly + 1.0
|
|
172
173
|
col_embedding_box.lry = table_embedding_box.lry - 1.0
|
|
@@ -194,7 +195,7 @@ def _tile_by_stretching_rows_left_and_rightwise(
|
|
|
194
195
|
dp: Image, items: List[ImageAnnotation], table: ImageAnnotation, item_name: str
|
|
195
196
|
) -> None:
|
|
196
197
|
if table.image is None:
|
|
197
|
-
raise
|
|
198
|
+
raise ImageError("table.image cannot be None")
|
|
198
199
|
table_embedding_box = table.get_bounding_box(dp.image_id)
|
|
199
200
|
|
|
200
201
|
tmp_item_xy = table_embedding_box.uly + 1.0 if item_name == LayoutType.row else table_embedding_box.ulx + 1.0
|
|
@@ -206,7 +207,7 @@ def _tile_by_stretching_rows_left_and_rightwise(
|
|
|
206
207
|
image_annotation={"category_name": item.category_name, "annotation_id": item.annotation_id},
|
|
207
208
|
):
|
|
208
209
|
if item.image is None:
|
|
209
|
-
raise
|
|
210
|
+
raise ImageError("item.image cannot be None")
|
|
210
211
|
item_embedding_box = item.get_bounding_box(dp.image_id)
|
|
211
212
|
if idx != len(items) - 1:
|
|
212
213
|
next_item_embedding_box = items[idx + 1].get_bounding_box(dp.image_id)
|
|
@@ -258,7 +259,7 @@ def _tile_by_stretching_rows_leftwise_column_downwise(
|
|
|
258
259
|
dp: Image, items: List[ImageAnnotation], table: ImageAnnotation, item_name: str
|
|
259
260
|
) -> None:
|
|
260
261
|
if table.image is None:
|
|
261
|
-
raise
|
|
262
|
+
raise ImageError("table.image cannot be None")
|
|
262
263
|
table_embedding_box = table.get_bounding_box(dp.image_id)
|
|
263
264
|
|
|
264
265
|
tmp_item_xy = table_embedding_box.uly + 1.0 if item_name == LayoutType.row else table_embedding_box.ulx + 1.0
|
|
@@ -270,7 +271,7 @@ def _tile_by_stretching_rows_leftwise_column_downwise(
|
|
|
270
271
|
image_annotation={"category_name": item.category_name, "annotation_id": item.annotation_id},
|
|
271
272
|
):
|
|
272
273
|
if item.image is None:
|
|
273
|
-
raise
|
|
274
|
+
raise ImageError("item.image cannot be None")
|
|
274
275
|
item_embedding_box = item.get_bounding_box(dp.image_id)
|
|
275
276
|
new_embedding_box = BoundingBox(
|
|
276
277
|
ulx=item_embedding_box.ulx if item_name == LayoutType.row else tmp_item_xy,
|
|
@@ -339,9 +340,9 @@ def tile_tables_with_items_per_table(
|
|
|
339
340
|
items = dp.get_annotation(category_names=item_name, annotation_ids=item_ann_ids)
|
|
340
341
|
|
|
341
342
|
items.sort(
|
|
342
|
-
key=lambda x:
|
|
343
|
-
|
|
344
|
-
|
|
343
|
+
key=lambda x: (
|
|
344
|
+
x.get_bounding_box(dp.image_id).cx if item_name == LayoutType.column else x.get_bounding_box(dp.image_id).cy
|
|
345
|
+
)
|
|
345
346
|
)
|
|
346
347
|
|
|
347
348
|
if stretch_rule == "left":
|
|
@@ -737,9 +738,11 @@ class TableSegmentationService(PipelineComponent):
|
|
|
737
738
|
|
|
738
739
|
# we will assume that either all or no image attribute has been generated
|
|
739
740
|
items.sort(
|
|
740
|
-
key=lambda x:
|
|
741
|
-
|
|
742
|
-
|
|
741
|
+
key=lambda x: (
|
|
742
|
+
x.get_bounding_box(dp.image_id).cx # pylint: disable=W0640
|
|
743
|
+
if item_name == LayoutType.column # pylint: disable=W0640
|
|
744
|
+
else x.get_bounding_box(dp.image_id).cy # pylint: disable=W0640
|
|
745
|
+
)
|
|
743
746
|
)
|
|
744
747
|
|
|
745
748
|
for item_number, item in enumerate(items, 1):
|
|
@@ -939,9 +942,11 @@ class PubtablesSegmentationService(PipelineComponent):
|
|
|
939
942
|
|
|
940
943
|
# we will assume that either all or no image attribute has been generated
|
|
941
944
|
items.sort(
|
|
942
|
-
key=lambda x:
|
|
943
|
-
|
|
944
|
-
|
|
945
|
+
key=lambda x: (
|
|
946
|
+
x.get_bounding_box(dp.image_id).cx
|
|
947
|
+
if item_name == LayoutType.column # pylint: disable=W0640
|
|
948
|
+
else x.get_bounding_box(dp.image_id).cy
|
|
949
|
+
)
|
|
945
950
|
)
|
|
946
951
|
|
|
947
952
|
for item_number, item in enumerate(items, 1):
|
deepdoctection/pipe/text.py
CHANGED
|
@@ -26,6 +26,7 @@ from ..datapoint.image import Image
|
|
|
26
26
|
from ..extern.base import ObjectDetector, PdfMiner, TextRecognizer
|
|
27
27
|
from ..extern.tessocr import TesseractOcrDetector
|
|
28
28
|
from ..utils.detection_types import ImageType, JsonDict
|
|
29
|
+
from ..utils.error import ImageError
|
|
29
30
|
from ..utils.settings import PageType, TypeOrStr, WordType, get_type
|
|
30
31
|
from .base import PredictorPipelineComponent
|
|
31
32
|
from .registry import pipeline_component_registry
|
|
@@ -89,7 +90,10 @@ class TextExtractionService(PredictorPipelineComponent):
|
|
|
89
90
|
super().__init__(self._get_name(text_extract_detector.name), text_extract_detector)
|
|
90
91
|
if self.extract_from_category:
|
|
91
92
|
if not isinstance(self.predictor, (ObjectDetector, TextRecognizer)):
|
|
92
|
-
raise TypeError(
|
|
93
|
+
raise TypeError(
|
|
94
|
+
f"Predicting from a cropped image requires to pass an ObjectDetector or "
|
|
95
|
+
f"TextRecognizer. Got {type(self.predictor)}"
|
|
96
|
+
)
|
|
93
97
|
if run_time_ocr_language_selection:
|
|
94
98
|
assert isinstance(
|
|
95
99
|
self.predictor, TesseractOcrDetector
|
|
@@ -171,13 +175,13 @@ class TextExtractionService(PredictorPipelineComponent):
|
|
|
171
175
|
|
|
172
176
|
if isinstance(text_roi, ImageAnnotation):
|
|
173
177
|
if text_roi.image is None:
|
|
174
|
-
raise
|
|
178
|
+
raise ImageError("text_roi.image cannot be None")
|
|
175
179
|
if text_roi.image.image is None:
|
|
176
|
-
raise
|
|
180
|
+
raise ImageError("text_roi.image.image cannot be None")
|
|
177
181
|
return text_roi.image.image
|
|
178
182
|
if isinstance(self.predictor, ObjectDetector):
|
|
179
183
|
if not isinstance(text_roi, Image):
|
|
180
|
-
raise
|
|
184
|
+
raise ImageError("text_roi must be an image")
|
|
181
185
|
return text_roi.image
|
|
182
186
|
if isinstance(text_roi, list):
|
|
183
187
|
assert all(roi.image is not None for roi in text_roi)
|
|
@@ -201,9 +205,11 @@ class TextExtractionService(PredictorPipelineComponent):
|
|
|
201
205
|
[
|
|
202
206
|
(
|
|
203
207
|
"image_annotations",
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
208
|
+
(
|
|
209
|
+
self.predictor.possible_categories()
|
|
210
|
+
if isinstance(self.predictor, (ObjectDetector, PdfMiner))
|
|
211
|
+
else []
|
|
212
|
+
),
|
|
207
213
|
),
|
|
208
214
|
("sub_categories", sub_cat_dict),
|
|
209
215
|
("relationships", {}),
|
|
@@ -218,5 +224,5 @@ class TextExtractionService(PredictorPipelineComponent):
|
|
|
218
224
|
def clone(self) -> "PredictorPipelineComponent":
|
|
219
225
|
predictor = self.predictor.clone()
|
|
220
226
|
if not isinstance(predictor, (ObjectDetector, PdfMiner, TextRecognizer)):
|
|
221
|
-
raise
|
|
227
|
+
raise ImageError(f"predictor must be of type ObjectDetector or PdfMiner, but is of type {type(predictor)}")
|
|
222
228
|
return self.__class__(predictor, deepcopy(self.extract_from_category), self.run_time_ocr_language_selection)
|
deepdoctection/pipe/transform.py
CHANGED
|
@@ -23,7 +23,6 @@ on images (e.g. deskew, de-noising or more general GAN like operations.
|
|
|
23
23
|
from ..datapoint.image import Image
|
|
24
24
|
from ..extern.base import ImageTransformer
|
|
25
25
|
from ..utils.detection_types import JsonDict
|
|
26
|
-
from ..utils.logger import LoggingRecord, logger
|
|
27
26
|
from .base import ImageTransformPipelineComponent
|
|
28
27
|
from .registry import pipeline_component_registry
|
|
29
28
|
|
|
@@ -49,16 +48,24 @@ class SimpleTransformService(ImageTransformPipelineComponent):
|
|
|
49
48
|
|
|
50
49
|
def serve(self, dp: Image) -> None:
|
|
51
50
|
if dp.annotations:
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
)
|
|
51
|
+
raise RuntimeError(
|
|
52
|
+
"SimpleTransformService receives datapoints with ÌmageAnnotations. This violates the "
|
|
53
|
+
"pipeline building API but this can currently be catched only at runtime. "
|
|
54
|
+
"Please make sure that this component is the first one in the pipeline."
|
|
57
55
|
)
|
|
56
|
+
|
|
58
57
|
if dp.image is not None:
|
|
59
|
-
|
|
58
|
+
detection_result = self.transform_predictor.predict(dp.image)
|
|
59
|
+
transformed_image = self.transform_predictor.transform(dp.image, detection_result)
|
|
60
60
|
self.dp_manager.datapoint.clear_image(True)
|
|
61
|
-
self.dp_manager.datapoint.image =
|
|
61
|
+
self.dp_manager.datapoint.image = transformed_image
|
|
62
|
+
self.dp_manager.set_summary_annotation(
|
|
63
|
+
summary_key=self.transform_predictor.possible_category(),
|
|
64
|
+
summary_name=self.transform_predictor.possible_category(),
|
|
65
|
+
summary_number=None,
|
|
66
|
+
summary_value=getattr(detection_result, self.transform_predictor.possible_category().value, None),
|
|
67
|
+
summary_score=detection_result.score,
|
|
68
|
+
)
|
|
62
69
|
|
|
63
70
|
def clone(self) -> "SimpleTransformService":
|
|
64
71
|
return self.__class__(self.transform_predictor)
|
|
@@ -69,7 +76,7 @@ class SimpleTransformService(ImageTransformPipelineComponent):
|
|
|
69
76
|
("image_annotations", []),
|
|
70
77
|
("sub_categories", {}),
|
|
71
78
|
("relationships", {}),
|
|
72
|
-
("summaries", []),
|
|
79
|
+
("summaries", [self.transform_predictor.possible_category()]),
|
|
73
80
|
]
|
|
74
81
|
)
|
|
75
82
|
|
|
@@ -43,6 +43,7 @@ from ..extern.pt.ptutils import get_num_gpu
|
|
|
43
43
|
from ..mapper.d2struct import image_to_d2_frcnn_training
|
|
44
44
|
from ..pipe.base import PredictorPipelineComponent
|
|
45
45
|
from ..pipe.registry import pipeline_component_registry
|
|
46
|
+
from ..utils.error import DependencyError
|
|
46
47
|
from ..utils.file_utils import get_wandb_requirement, wandb_available
|
|
47
48
|
from ..utils.logger import LoggingRecord, logger
|
|
48
49
|
from ..utils.utils import string_to_dict
|
|
@@ -153,16 +154,18 @@ class D2Trainer(DefaultTrainer):
|
|
|
153
154
|
ret = [
|
|
154
155
|
hooks.IterationTimer(),
|
|
155
156
|
hooks.LRScheduler(),
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
157
|
+
(
|
|
158
|
+
hooks.PreciseBN(
|
|
159
|
+
# Run at the same freq as (but before) evaluation.
|
|
160
|
+
cfg.TEST.EVAL_PERIOD,
|
|
161
|
+
self.model, # pylint: disable=E1101
|
|
162
|
+
# Build a new data loader to not affect training
|
|
163
|
+
self.build_train_loader(cfg),
|
|
164
|
+
cfg.TEST.PRECISE_BN.NUM_ITER,
|
|
165
|
+
)
|
|
166
|
+
if cfg.TEST.PRECISE_BN.ENABLED and get_bn_modules(self.model) # pylint: disable=E1101
|
|
167
|
+
else None
|
|
168
|
+
),
|
|
166
169
|
]
|
|
167
170
|
|
|
168
171
|
# Do PreciseBN before checkpointer, because it updates the model and need to
|
|
@@ -201,7 +204,7 @@ class D2Trainer(DefaultTrainer):
|
|
|
201
204
|
if self.cfg.WANDB.USE_WANDB:
|
|
202
205
|
_, _wandb_available, err_msg = get_wandb_requirement()
|
|
203
206
|
if not _wandb_available:
|
|
204
|
-
raise
|
|
207
|
+
raise DependencyError(err_msg)
|
|
205
208
|
if self.cfg.WANDB.PROJECT is None:
|
|
206
209
|
raise ValueError("When using W&B, you must specify a project, i.e. WANDB.PROJECT")
|
|
207
210
|
writers_list.append(WandbWriter(self.cfg.WANDB.PROJECT, self.cfg.WANDB.REPO, self.cfg))
|
|
@@ -269,7 +272,7 @@ class D2Trainer(DefaultTrainer):
|
|
|
269
272
|
|
|
270
273
|
@classmethod
|
|
271
274
|
def build_evaluator(cls, cfg, dataset_name): # type: ignore
|
|
272
|
-
raise NotImplementedError
|
|
275
|
+
raise NotImplementedError()
|
|
273
276
|
|
|
274
277
|
|
|
275
278
|
def train_d2_faster_rcnn(
|
|
@@ -97,9 +97,9 @@ class DetrDerivedTrainer(Trainer):
|
|
|
97
97
|
|
|
98
98
|
def evaluate(
|
|
99
99
|
self,
|
|
100
|
-
eval_dataset: Optional[Dataset[Any]] = None,
|
|
101
|
-
ignore_keys: Optional[List[str]] = None,
|
|
102
|
-
metric_key_prefix: str = "eval",
|
|
100
|
+
eval_dataset: Optional[Dataset[Any]] = None, # pylint: disable=W0613
|
|
101
|
+
ignore_keys: Optional[List[str]] = None, # pylint: disable=W0613
|
|
102
|
+
metric_key_prefix: str = "eval", # pylint: disable=W0613
|
|
103
103
|
) -> Dict[str, float]:
|
|
104
104
|
"""
|
|
105
105
|
Overwritten method from `Trainer`. Arguments will not be used.
|
|
@@ -193,9 +193,11 @@ def train_hf_detr(
|
|
|
193
193
|
"remove_unused_columns": False,
|
|
194
194
|
"per_device_train_batch_size": 2,
|
|
195
195
|
"max_steps": number_samples,
|
|
196
|
-
"evaluation_strategy":
|
|
197
|
-
|
|
198
|
-
|
|
196
|
+
"evaluation_strategy": (
|
|
197
|
+
"steps"
|
|
198
|
+
if (dataset_val is not None and metric is not None and pipeline_component_name is not None)
|
|
199
|
+
else "no"
|
|
200
|
+
),
|
|
199
201
|
"eval_steps": 5000,
|
|
200
202
|
}
|
|
201
203
|
|
|
@@ -63,6 +63,7 @@ from ..pipe.base import LanguageModelPipelineComponent
|
|
|
63
63
|
from ..pipe.lm import get_tokenizer_from_architecture
|
|
64
64
|
from ..pipe.registry import pipeline_component_registry
|
|
65
65
|
from ..utils.env_info import get_device
|
|
66
|
+
from ..utils.error import DependencyError
|
|
66
67
|
from ..utils.file_utils import wandb_available
|
|
67
68
|
from ..utils.logger import LoggingRecord, logger
|
|
68
69
|
from ..utils.settings import DatasetType, LayoutType, ObjectTypes, WordType
|
|
@@ -180,15 +181,17 @@ class LayoutLMTrainer(Trainer):
|
|
|
180
181
|
|
|
181
182
|
def evaluate(
|
|
182
183
|
self,
|
|
183
|
-
eval_dataset: Optional[Dataset[Any]] = None,
|
|
184
|
-
ignore_keys: Optional[List[str]] = None,
|
|
185
|
-
metric_key_prefix: str = "eval",
|
|
184
|
+
eval_dataset: Optional[Dataset[Any]] = None, # pylint: disable=W0613
|
|
185
|
+
ignore_keys: Optional[List[str]] = None, # pylint: disable=W0613
|
|
186
|
+
metric_key_prefix: str = "eval", # pylint: disable=W0613
|
|
186
187
|
) -> Dict[str, float]:
|
|
187
188
|
"""
|
|
188
189
|
Overwritten method from `Trainer`. Arguments will not be used.
|
|
189
190
|
"""
|
|
190
|
-
|
|
191
|
-
|
|
191
|
+
if self.evaluator is None:
|
|
192
|
+
raise ValueError("Evaluator not set up. Please use `setup_evaluator` before running evaluation")
|
|
193
|
+
if self.evaluator.pipe_component is None:
|
|
194
|
+
raise ValueError("Pipeline component not set up. Please use `setup_evaluator` before running evaluation")
|
|
192
195
|
|
|
193
196
|
# memory metrics - must set up as early as possible
|
|
194
197
|
self._memory_tracker.start()
|
|
@@ -222,7 +225,7 @@ def _get_model_class_and_tokenizer(
|
|
|
222
225
|
raise KeyError("model_type and architectures not available in configs")
|
|
223
226
|
|
|
224
227
|
if not model_cls:
|
|
225
|
-
raise
|
|
228
|
+
raise UserWarning("model not eligible to run with this framework")
|
|
226
229
|
|
|
227
230
|
return config_cls, model_cls, model_wrapper_cls, tokenizer_fast
|
|
228
231
|
|
|
@@ -347,7 +350,7 @@ def train_hf_layoutlm(
|
|
|
347
350
|
name_as_key=True,
|
|
348
351
|
)[LayoutType.word][WordType.token_class]
|
|
349
352
|
else:
|
|
350
|
-
raise
|
|
353
|
+
raise UserWarning("Dataset type not supported for training")
|
|
351
354
|
|
|
352
355
|
config_cls, model_cls, model_wrapper_cls, tokenizer_fast = _get_model_class_and_tokenizer(
|
|
353
356
|
path_config_json, dataset_type, use_xlm_tokenizer
|
|
@@ -374,9 +377,11 @@ def train_hf_layoutlm(
|
|
|
374
377
|
"remove_unused_columns": False,
|
|
375
378
|
"per_device_train_batch_size": 8,
|
|
376
379
|
"max_steps": number_samples,
|
|
377
|
-
"evaluation_strategy":
|
|
378
|
-
|
|
379
|
-
|
|
380
|
+
"evaluation_strategy": (
|
|
381
|
+
"steps"
|
|
382
|
+
if (dataset_val is not None and metric is not None and pipeline_component_name is not None)
|
|
383
|
+
else "no"
|
|
384
|
+
),
|
|
380
385
|
"eval_steps": 100,
|
|
381
386
|
"use_wandb": False,
|
|
382
387
|
"wandb_project": None,
|
|
@@ -416,7 +421,7 @@ def train_hf_layoutlm(
|
|
|
416
421
|
run = None
|
|
417
422
|
if use_wandb:
|
|
418
423
|
if not wandb_available():
|
|
419
|
-
raise
|
|
424
|
+
raise DependencyError("WandB must be installed separately")
|
|
420
425
|
run = wandb.init(project=wandb_project, config=conf_dict) # type: ignore
|
|
421
426
|
run._label(repo=wandb_repo) # type: ignore # pylint: disable=W0212
|
|
422
427
|
else:
|
deepdoctection/utils/__init__.py
CHANGED
|
@@ -6,7 +6,10 @@ Init file for utils package
|
|
|
6
6
|
"""
|
|
7
7
|
from typing import Optional, Tuple, Union, no_type_check
|
|
8
8
|
|
|
9
|
+
from .concurrency import *
|
|
9
10
|
from .context import *
|
|
11
|
+
from .env_info import *
|
|
12
|
+
from .error import *
|
|
10
13
|
from .file_utils import *
|
|
11
14
|
from .fs import *
|
|
12
15
|
from .identifier import *
|
|
@@ -109,7 +109,7 @@ def enable_death_signal(_warn: bool = True) -> None:
|
|
|
109
109
|
prctl, "set_pdeathsig"
|
|
110
110
|
), "prctl.set_pdeathsig does not exist! Note that you need to install 'python-prctl' instead of 'prctl'."
|
|
111
111
|
# is SIGHUP a good choice?
|
|
112
|
-
prctl.set_pdeathsig(signal.SIGHUP)
|
|
112
|
+
prctl.set_pdeathsig(signal.SIGHUP) # pylint: disable=E1101
|
|
113
113
|
|
|
114
114
|
|
|
115
115
|
# taken from https://github.com/tensorpack/dataflow/blob/master/dataflow/utils/concurrency.py
|
deepdoctection/utils/context.py
CHANGED
|
@@ -61,7 +61,7 @@ def timeout_manager(proc, seconds: Optional[int] = None) -> Iterator[str]: # ty
|
|
|
61
61
|
proc.terminate()
|
|
62
62
|
proc.kill()
|
|
63
63
|
proc.returncode = -1
|
|
64
|
-
raise RuntimeError("
|
|
64
|
+
raise RuntimeError(f"timeout for process id: {proc.pid}") # pylint: disable=W0707
|
|
65
65
|
finally:
|
|
66
66
|
if proc.stdin is not None:
|
|
67
67
|
proc.stdin.close()
|
|
@@ -88,7 +88,7 @@ def save_tmp_file(image: Union[str, ImageType, bytes], prefix: str) -> Iterator[
|
|
|
88
88
|
yield file.name, path.realpath(path.normpath(path.normcase(image)))
|
|
89
89
|
return
|
|
90
90
|
if isinstance(image, (np.ndarray, np.generic)):
|
|
91
|
-
input_file_name = file.name + ".PNG"
|
|
91
|
+
input_file_name = file.name + "_input.PNG"
|
|
92
92
|
viz_handler.write_image(input_file_name, image)
|
|
93
93
|
yield file.name, input_file_name
|
|
94
94
|
if isinstance(image, bytes):
|