deepdoctection 0.38__py3-none-any.whl → 0.39.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of deepdoctection might be problematic.
- deepdoctection/__init__.py +5 -1
- deepdoctection/analyzer/dd.py +6 -5
- deepdoctection/analyzer/factory.py +7 -2
- deepdoctection/datapoint/convert.py +14 -8
- deepdoctection/datapoint/image.py +1 -1
- deepdoctection/datapoint/view.py +34 -24
- deepdoctection/extern/model.py +6 -97
- deepdoctection/mapper/cats.py +21 -10
- deepdoctection/mapper/match.py +0 -22
- deepdoctection/mapper/misc.py +12 -2
- deepdoctection/mapper/pubstruct.py +1 -1
- deepdoctection/pipe/base.py +38 -5
- deepdoctection/pipe/common.py +3 -3
- deepdoctection/pipe/doctectionpipe.py +20 -3
- deepdoctection/pipe/lm.py +20 -5
- deepdoctection/pipe/segment.py +4 -8
- deepdoctection/train/hf_detr_train.py +1 -1
- deepdoctection/train/hf_layoutlm_train.py +3 -1
- deepdoctection/utils/pdf_utils.py +17 -9
- deepdoctection/utils/settings.py +1 -1
- {deepdoctection-0.38.dist-info → deepdoctection-0.39.1.dist-info}/METADATA +8 -8
- {deepdoctection-0.38.dist-info → deepdoctection-0.39.1.dist-info}/RECORD +25 -25
- {deepdoctection-0.38.dist-info → deepdoctection-0.39.1.dist-info}/LICENSE +0 -0
- {deepdoctection-0.38.dist-info → deepdoctection-0.39.1.dist-info}/WHEEL +0 -0
- {deepdoctection-0.38.dist-info → deepdoctection-0.39.1.dist-info}/top_level.txt +0 -0
deepdoctection/__init__.py
CHANGED
@@ -6,6 +6,7 @@ Init file for deepdoctection package. This file is used to import all submodules
 """
 
 import importlib.util
+import os
 
 # Before doing anything else, check if the .env file exists and load it
 if importlib.util.find_spec("dotenv") is not None:
@@ -24,7 +25,7 @@ from .utils.logger import LoggingRecord, logger
 
 # pylint: enable=wrong-import-position
 
-__version__ = "0.38"
+__version__ = "0.39.1"
 
 _IMPORT_STRUCTURE = {
     "analyzer": ["config_sanity_checks", "get_dd_analyzer", "ServiceFactory"],
@@ -423,6 +424,9 @@ _IMPORT_STRUCTURE = {
 env_info = collect_env_info()
 logger.debug(LoggingRecord(msg=env_info))
 auto_select_pdf_render_framework()
+os.environ["DPI"] = "300"
+os.environ["IMAGE_WIDTH"] = ""
+os.environ["IMAGE_HEIGHT"] = ""
 
 # Direct imports for type-checking
 if TYPE_CHECKING:

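The three environment variables set at import time become process-wide defaults for PDF rendering: `Image` reads `DPI` when it is constructed from pdf bytes, and `DoctectionPipe` falls back to `IMAGE_WIDTH`/`IMAGE_HEIGHT` when `DPI` is empty (see the hunks further below). A minimal sketch of overriding them after the import; the values are illustrative, not package defaults:

```python
import os

import deepdoctection as dd  # the import sets DPI="300" and empty IMAGE_WIDTH / IMAGE_HEIGHT

# Render PDF pages at a lower resolution (illustrative value)
os.environ["DPI"] = "200"

# Or, for the DoctectionPipe dataflow only, switch to a fixed pixel size
# (Poppler renderer required; assumed workflow, not quoted from the docs):
# os.environ["DPI"] = ""
# os.environ["IMAGE_WIDTH"] = "1654"
# os.environ["IMAGE_HEIGHT"] = "2339"
```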
deepdoctection/analyzer/dd.py
CHANGED
@@ -32,7 +32,6 @@ from ..extern.pt.ptutils import get_torch_device
 from ..extern.tp.tfutils import disable_tp_layer_logging, get_tf_device
 from ..pipe.doctectionpipe import DoctectionPipe
 from ..utils.env_info import ENV_VARS_TRUE
-from ..utils.error import DependencyError
 from ..utils.file_utils import tensorpack_available
 from ..utils.fs import get_configs_dir_path, get_package_path, maybe_copy_config_to_cache
 from ..utils.logger import LoggingRecord, logger
@@ -118,13 +117,15 @@ def get_dd_analyzer(
     :return: A DoctectionPipe instance with given configs
     """
     config_overwrite = [] if config_overwrite is None else config_overwrite
-… (2 removed lines not rendered in the diff source)
+    if os.environ.get("DD_USE_TF", "0") in ENV_VARS_TRUE:
+        lib = "TF"
         device = get_tf_device()
-    elif …
+    elif os.environ.get("DD_USE_TORCH", "0") in ENV_VARS_TRUE:
+        lib = "PT"
         device = get_torch_device()
     else:
-…
+        lib = None
+        device = None
     dd_one_config_path = maybe_copy_config_to_cache(
         get_package_path(), get_configs_dir_path() / "dd", _DD_ONE, reset_config_file
     )

deepdoctection/analyzer/factory.py
CHANGED

@@ -48,6 +48,7 @@ from ..pipe.segment import PubtablesSegmentationService, TableSegmentationServic
 from ..pipe.sub_layout import DetectResultGenerator, SubImageLayoutService
 from ..pipe.text import TextExtractionService
 from ..pipe.transform import SimpleTransformService
+from ..utils.error import DependencyError
 from ..utils.file_utils import detectron2_available
 from ..utils.fs import get_configs_dir_path
 from ..utils.metacfg import AttrDict
@@ -62,8 +63,6 @@ __all__ = [
     "ServiceFactory",
 ]
 
-# from ._config import cfg
-
 
 class ServiceFactory:
     """
@@ -94,6 +93,8 @@ class ServiceFactory:
        :param config: configuration object
        :param mode: either `LAYOUT`,`CELL` or `ITEM`
        """
+        if config.LIB is None:
+            raise DependencyError("At least one of the env variables DD_USE_TF or DD_USE_TORCH must be set.")
        weights = (
            getattr(config.TF, mode).WEIGHTS
            if config.LIB == "TF"
@@ -310,6 +311,8 @@ class ServiceFactory:
            config_overwrite=[f"LANGUAGES={config.LANGUAGE}"] if config.LANGUAGE is not None else None,
        )
        if config.OCR.USE_DOCTR:
+            if config.LIB is None:
+                raise DependencyError("At least one of the env variables DD_USE_TF or DD_USE_TORCH must be set.")
            weights = (
                config.OCR.WEIGHTS.DOCTR_RECOGNITION.TF
                if config.LIB == "TF"
@@ -353,6 +356,8 @@ class ServiceFactory:
        :param config: configuration object
        :return: DoctrTextlineDetector
        """
+        if config.LIB is None:
+            raise DependencyError("At least one of the env variables DD_USE_TF or DD_USE_TORCH must be set.")
        weights = config.OCR.WEIGHTS.DOCTR_WORD.TF if config.LIB == "TF" else config.OCR.WEIGHTS.DOCTR_WORD.PT
        weights_path = ModelDownloadManager.maybe_download_weights_and_configs(weights)
        profile = ModelCatalog.get_profile(weights)

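The `DependencyError` guards make the framework selection explicit: `config.LIB` is only set when one of `DD_USE_TF` or `DD_USE_TORCH` is enabled (see `dd.py` above). A minimal sketch of selecting the PyTorch stack before building the analyzer; the exact truthy values accepted via `ENV_VARS_TRUE` are assumed, not quoted from the docs:

```python
import os

# Must be set before get_dd_analyzer() resolves the library, otherwise the
# model-building factory methods raise DependencyError.
os.environ["DD_USE_TORCH"] = "1"

import deepdoctection as dd

analyzer = dd.get_dd_analyzer()
```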
deepdoctection/datapoint/convert.py
CHANGED

@@ -154,7 +154,9 @@ def convert_pdf_bytes_to_np_array(pdf_bytes: bytes, dpi: Optional[int] = None) -
     return np_array.astype(uint8)
 
 
-def convert_pdf_bytes_to_np_array_v2(…
+def convert_pdf_bytes_to_np_array_v2(
+    pdf_bytes: bytes, dpi: Optional[int] = None, width: Optional[int] = None, height: Optional[int] = None
+) -> PixelValues:
     """
     Converts a pdf passed as bytes into a numpy array. We use poppler or pdfmium to convert the pdf to an image.
     If both is available you can steer the selection of the render engine with environment variables:
@@ -165,17 +167,21 @@ def convert_pdf_bytes_to_np_array_v2(pdf_bytes: bytes, dpi: Optional[int] = 200)
     :param pdf_bytes: A pdf as bytes object. A byte representation can from a pdf file can be generated e.g. with
                       `utils.fs.load_bytes_from_pdf_file`
     :param dpi: The dpi value of the resulting output image. For high resolution set dpi=300.
+    :param width: The width of the resulting output image. This option does only work when using Poppler as
+                  PDF renderer
+    :param height: The height of the resulting output image. This option does only work when using Poppler as
+                   PDF renderer
     :return: Image as numpy array.
     """
 
-    with BytesIO(pdf_bytes) as pdf_file:
-        pdf = PdfReader(pdf_file).pages[0]
-        shape = pdf.mediabox  # pylint: disable=E1101
-        height = shape[3] - shape[1]
-        width = shape[2] - shape[0]
-
     if dpi is None:
-…
+        if width is None or height is None:
+            with BytesIO(pdf_bytes) as pdf_file:
+                pdf = PdfReader(pdf_file).pages[0]
+                shape = pdf.mediabox  # pylint: disable=E1101
+                height = shape[3] - shape[1]
+                width = shape[2] - shape[0]
+        return pdf_to_np_array(pdf_bytes, size=(int(width), int(height)))  # type: ignore
     return pdf_to_np_array(pdf_bytes, dpi=dpi)

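With the reworked signature the page can be rendered either by `dpi` or by an explicit `width`/`height` pair; per the docstring above, the size pair only takes effect with the Poppler renderer. A small usage sketch under those assumptions; the file name is hypothetical:

```python
from deepdoctection.datapoint.convert import convert_pdf_bytes_to_np_array_v2
from deepdoctection.utils.fs import load_bytes_from_pdf_file

pdf_bytes = load_bytes_from_pdf_file("sample.pdf")  # hypothetical single-page pdf

# Render by resolution ...
np_image = convert_pdf_bytes_to_np_array_v2(pdf_bytes, dpi=300)

# ... or to a fixed pixel size (Poppler only). If dpi, width and height are all
# None, the page's mediabox dimensions are used as the target size.
np_image = convert_pdf_bytes_to_np_array_v2(pdf_bytes, width=1654, height=2339)
```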
deepdoctection/datapoint/image.py
CHANGED

@@ -153,7 +153,7 @@ class Image:
             self.set_width_height(self._image.shape[1], self._image.shape[0])
             self._self_embedding()
         elif isinstance(image, bytes):
-            self._image = convert_pdf_bytes_to_np_array_v2(image, dpi=environ…
+            self._image = convert_pdf_bytes_to_np_array_v2(image, dpi=int(environ["DPI"]))
             self.set_width_height(self._image.shape[1], self._image.shape[0])
             self._self_embedding()
         else:

deepdoctection/datapoint/view.py
CHANGED
@@ -228,23 +228,33 @@ class Layout(ImageAnnotationBaseView):
 
         """
         words = self.get_ordered_words()
-… (17 removed lines not rendered in the diff source)
+        if words:
+            characters, ann_ids, token_classes, token_tags, token_classes_ids, token_tag_ids = zip(
+                *[
+                    (
+                        word.characters,
+                        word.annotation_id,
+                        word.token_class,
+                        word.token_tag,
+                        word.get_sub_category(WordType.TOKEN_CLASS).category_id
+                        if WordType.TOKEN_CLASS in word.sub_categories
+                        else None,
+                        word.get_sub_category(WordType.TOKEN_TAG).category_id
+                        if WordType.TOKEN_TAG in word.sub_categories
+                        else None,
+                    )
+                    for word in words
+                ]
+            )
+        else:
+            characters, ann_ids, token_classes, token_tags, token_classes_ids, token_tag_ids = (
+                [],  # type: ignore
+                [],  # type: ignore
+                [],  # type: ignore
+                [],  # type: ignore
+                [],  # type: ignore
+                [],  # type: ignore
+            )
         return {
             "text": " ".join(characters),
             "words": characters,
@@ -327,7 +337,7 @@ class Table(Layout):
         :return: A list of `Cell` objects that are row headers.
         """
         all_relation_ids = self.get_relationship(Relationships.CHILD)
-        all_cells: list[Cell] = self.base_page.get_annotation(
+        all_cells: list[Cell] = self.base_page.get_annotation(  # type: ignore
             category_names=[LayoutType.CELL, CellType.SPANNING], annotation_ids=all_relation_ids
         )
         row_header_cells = list(filter(lambda cell: CellType.ROW_HEADER in cell.sub_categories, all_cells))
@@ -363,18 +373,18 @@ class Table(Layout):
             category_names=[LayoutType.CELL, CellType.SPANNING], annotation_ids=all_relation_ids
         )
         row_cells = list(
-            filter(
-                lambda c: row_number in (c.row_number, c.row_number + c.row_span), all_cells  # type: ignore
-            )
+            filter(lambda c: row_number in (c.row_number, c.row_number + c.row_span), all_cells)  # type: ignore
         )
-        row_cells.sort(key=lambda c: c.column_number)
+        row_cells.sort(key=lambda c: c.column_number)  # type: ignore
         column_header_cells = self.column_header_cells
 
         kv_dict: Mapping[str, str] = {}
         for cell in row_cells:
             for header in column_header_cells:
-                if (
-…
+                if (
+                    cell.column_number == header.column_number  # type: ignore
+                    and cell.annotation_id != header.annotation_id  # type: ignore
+                ):
                     kv_dict[(header.column_number, header.text)] = cell.text  # type: ignore
                     break
         return kv_dict

deepdoctection/extern/model.py
CHANGED
@@ -24,7 +24,7 @@ from dataclasses import asdict, dataclass, field
 from typing import Any, Mapping, Optional, Union
 
 import jsonlines
-from huggingface_hub import …
+from huggingface_hub import hf_hub_download
 from tabulate import tabulate
 from termcolor import colored
 
@@ -136,51 +136,6 @@ class ModelCatalog:
             dl_library="TF",
             model_wrapper="TPFrcnnDetector",
         ),
-        "item/model-1620000.data-00000-of-00001": ModelProfile(
-            name="item/model-1620000.data-00000-of-00001",
-            description="Tensorpack row/column detection model trained on Pubtabnet",
-            config="dd/tp/conf_frcnn_rows.yaml",
-            size=[823546048, 25787],
-            tp_model=True,
-            hf_repo_id="deepdoctection/tp_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_rc",
-            hf_model_name="model-1620000",
-            hf_config_file=["conf_frcnn_rows.yaml"],
-            categories={1: LayoutType.ROW, 2: LayoutType.COLUMN},
-            dl_library="TF",
-            model_wrapper="TPFrcnnDetector",
-        ),
-        "layout/model-800000.data-00000-of-00001": ModelProfile(
-            name="layout/model-800000.data-00000-of-00001",
-            description="Tensorpack layout detection model trained on Publaynet",
-            config="dd/tp/conf_frcnn_layout.yaml",
-            size=[823656748, 25796],
-            tp_model=True,
-            hf_repo_id="deepdoctection/tp_casc_rcnn_X_32xd4_50_FPN_GN_2FC_publaynet",
-            hf_model_name="model-800000",
-            hf_config_file=["conf_frcnn_layout.yaml"],
-            dl_library="TF",
-            categories={
-                1: LayoutType.TEXT,
-                2: LayoutType.TITLE,
-                3: LayoutType.LIST,
-                4: LayoutType.TABLE,
-                5: LayoutType.FIGURE,
-            },
-            model_wrapper="TPFrcnnDetector",
-        ),
-        "cell/model-1800000.data-00000-of-00001": ModelProfile(
-            name="cell/model-1800000.data-00000-of-00001",
-            description="Tensorpack cell detection model trained on Pubtabnet",
-            config="dd/tp/conf_frcnn_cell.yaml",
-            size=[823509160, 25905],
-            tp_model=True,
-            hf_repo_id="deepdoctection/tp_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_c",
-            hf_model_name="model-1800000",
-            hf_config_file=["conf_frcnn_cell.yaml"],
-            categories={1: LayoutType.CELL},
-            dl_library="TF",
-            model_wrapper="TPFrcnnDetector",
-        ),
         "layout/d2_model_0829999_layout_inf_only.pt": ModelProfile(
             name="layout/d2_model_0829999_layout_inf_only.pt",
             description="Detectron2 layout detection model trained on Publaynet",
@@ -200,25 +155,6 @@ class ModelCatalog:
             dl_library="PT",
             model_wrapper="D2FrcnnDetector",
         ),
-        "layout/d2_model_0829999_layout.pth": ModelProfile(
-            name="layout/d2_model_0829999_layout.pth",
-            description="Detectron2 layout detection model trained on Publaynet. Checkpoint for resuming training",
-            config="dd/d2/layout/CASCADE_RCNN_R_50_FPN_GN.yaml",
-            size=[548377327],
-            tp_model=False,
-            hf_repo_id="deepdoctection/d2_casc_rcnn_X_32xd4_50_FPN_GN_2FC_publaynet_inference_only",
-            hf_model_name="d2_model_0829999_layout.pth",
-            hf_config_file=["Base-RCNN-FPN.yaml", "CASCADE_RCNN_R_50_FPN_GN.yaml"],
-            categories={
-                1: LayoutType.TEXT,
-                2: LayoutType.TITLE,
-                3: LayoutType.LIST,
-                4: LayoutType.TABLE,
-                5: LayoutType.FIGURE,
-            },
-            dl_library="PT",
-            model_wrapper="D2FrcnnDetector",
-        ),
         "layout/d2_model_0829999_layout_inf_only.ts": ModelProfile(
             name="layout/d2_model_0829999_layout_inf_only.ts",
             description="Detectron2 layout detection model trained on Publaynet. Torchscript export",
@@ -264,32 +200,6 @@ class ModelCatalog:
             dl_library="PT",
             model_wrapper="D2FrcnnTracingDetector",
         ),
-        "cell/d2_model_1849999_cell.pth": ModelProfile(
-            name="cell/d2_model_1849999_cell.pth",
-            description="Detectron2 cell detection inference only model trained on Pubtabnet",
-            config="dd/d2/cell/CASCADE_RCNN_R_50_FPN_GN.yaml",
-            size=[548279023],
-            tp_model=False,
-            hf_repo_id="deepdoctection/d2_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_c_inference_only",
-            hf_model_name="cell/d2_model_1849999_cell.pth",
-            hf_config_file=["Base-RCNN-FPN.yaml", "CASCADE_RCNN_R_50_FPN_GN.yaml"],
-            categories={1: LayoutType.CELL},
-            dl_library="PT",
-            model_wrapper="D2FrcnnDetector",
-        ),
-        "item/d2_model_1639999_item.pth": ModelProfile(
-            name="item/d2_model_1639999_item.pth",
-            description="Detectron2 item detection model trained on Pubtabnet",
-            config="dd/d2/item/CASCADE_RCNN_R_50_FPN_GN.yaml",
-            size=[548303599],
-            tp_model=False,
-            hf_repo_id="deepdoctection/d2_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_rc_inference_only",
-            hf_model_name="d2_model_1639999_item.pth",
-            hf_config_file=["Base-RCNN-FPN.yaml", "CASCADE_RCNN_R_50_FPN_GN.yaml"],
-            categories={1: LayoutType.ROW, 2: LayoutType.COLUMN},
-            dl_library="PT",
-            model_wrapper="D2FrcnnDetector",
-        ),
         "item/d2_model_1639999_item_inf_only.pt": ModelProfile(
             name="item/d2_model_1639999_item_inf_only.pt",
             description="Detectron2 item detection model inference only trained on Pubtabnet",
@@ -1232,20 +1142,19 @@ class ModelDownloadManager:
     def _load_from_hf_hub(
         repo_id: str, file_name: str, cache_directory: PathLikeOrStr, force_download: bool = False
     ) -> int:
-        url = hf_hub_url(repo_id=repo_id, filename=file_name)
         token = os.environ.get("HF_CREDENTIALS", None)
-        f_path = …
-…
-…
+        f_path = hf_hub_download(
+            repo_id,
+            file_name,
+            local_dir=cache_directory,  # type: ignore
             force_filename=file_name,
             force_download=force_download,
             token=token,
-            legacy_cache_layout=True,
         )
         if f_path:
             stat_info = os.stat(f_path)
             size = stat_info.st_size
 
-            assert size > 0, f"Downloaded an empty file from {…
+            assert size > 0, f"Downloaded an empty file from {f_path}!"
             return size
         raise TypeError("Returned value from cached_download cannot be Null")

deepdoctection/mapper/cats.py
CHANGED
@@ -73,18 +73,21 @@ def re_assign_cat_ids(
     Annotations that are not in the dictionary provided will be removed.
 
     :param dp: Image
-    :param categories_dict_name_as_key: e.g. `{LayoutType.word: …
+    :param categories_dict_name_as_key: e.g. `{LayoutType.word: 1}`
     :param cat_to_sub_cat_mapping: e.g. `{<LayoutType.word>:
                                            {<WordType.token_class>:
-                                             {<FundsFirstPage.…
-                                              <FundsFirstPage.…
-                                              <FundsFirstPage.…
-                                              <FundsFirstPage.…
-                                              <TokenClasses.…
-                                             <WordType.…
-                                             {<BioTag.…
-                                              <BioTag.…
-                                              <BioTag.…
+                                             {<FundsFirstPage.REPORT_DATE>: 1,
+                                              <FundsFirstPage.REPORT_TYPE>: 2,
+                                              <FundsFirstPage.UMBRELLA>: 3,
+                                              <FundsFirstPage.FUND_NAME>: 4,
+                                              <TokenClasses.OTHER>: 5},
+                                             <WordType.TAG>:
+                                              {<BioTag.INSIDE>: 1,
+                                               <BioTag.OUTSIDE>: 2,
+                                               <BioTag.BEGIN>: 3}}}`
+                                    To re-assign the category ids of an image summary, use the key 'default_type' for the default category, e.g.
+                                    `{DefaultType.DEFAULT_TYPE: {<PageType.DOCUMENT_TYPE>: {<DocumentType.INVOICE>:1,
+                                                                 <DocumentType.BANK_STATEMENT>:2}}}`
     :return: Image
     """
 
@@ -104,6 +107,14 @@ def re_assign_cat_ids(
             sub_category = ann.get_sub_category(key)
             sub_category.category_id = sub_cat_values_dict.get(sub_category.category_name, DEFAULT_CATEGORY_ID)
 
+    if cat_to_sub_cat_mapping:
+        if "default_type" in cat_to_sub_cat_mapping:
+            sub_cat_keys_to_sub_cat_values = cat_to_sub_cat_mapping[get_type("default_type")]
+            for key in sub_cat_keys_to_sub_cat_values:
+                sub_cat_values_dict = sub_cat_keys_to_sub_cat_values[key]
+                sub_category = dp.summary.get_sub_category(key)
+                sub_category.category_id = sub_cat_values_dict.get(sub_category.category_name, DEFAULT_CATEGORY_ID)
+
     dp.remove(annotation_ids=ann_ids_to_remove)
 
     return dp

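The new branch also re-indexes sub-categories of the image summary when the mapping is keyed with `default_type`. A sketch of such a mapping, following the docstring above; the concrete ids are illustrative:

```python
from deepdoctection.utils.settings import DefaultType, DocumentType, PageType

# Re-assign category ids of the page summary (e.g. the predicted document type)
cat_to_sub_cat_mapping = {
    DefaultType.DEFAULT_TYPE: {
        PageType.DOCUMENT_TYPE: {
            DocumentType.INVOICE: 1,
            DocumentType.BANK_STATEMENT: 2,
        }
    }
}
```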
deepdoctection/mapper/match.py
CHANGED
@@ -101,17 +101,6 @@ def match_anns_by_intersection(
         ]
     )
 
-    # second try, if ann has empty image
-    n_dim = child_ann_boxes.ndim
-    if n_dim != 2:
-        child_ann_boxes = np.array(
-            [
-                ann.bounding_box.transform(dp.width, dp.height, absolute_coords=True).to_list(mode="xyxy")
-                for ann in child_anns
-                if ann.bounding_box is not None
-            ]
-        )
-
     parent_anns = dp.get_annotation(annotation_ids=parent_ann_ids, category_names=parent_ann_category_names)
     parent_ann_boxes = np.array(
         [
@@ -120,17 +109,6 @@ def match_anns_by_intersection(
         ]
     )
 
-    # same for parent
-    n_dim = parent_ann_boxes.ndim
-    if n_dim != 2:
-        parent_ann_boxes = np.array(
-            [
-                ann.bounding_box.transform(dp.width, dp.height, absolute_coords=True).to_list(mode="xyxy")
-                for ann in parent_anns
-                if ann.bounding_box is not None
-            ]
-        )
-
     if matching_rule in ["iou"] and parent_anns and child_anns:
         iou_matrix = iou(child_ann_boxes, parent_ann_boxes)
         output = iou_matrix > threshold

deepdoctection/mapper/misc.py
CHANGED
@@ -38,12 +38,20 @@ with try_import() as import_guard:
     from lxml import etree  # pylint: disable=W0611
 
 
-def to_image(…
+def to_image(
+    dp: Union[str, Mapping[str, Union[str, bytes]]],
+    dpi: Optional[int] = None,
+    width: Optional[int] = None,
+    height: Optional[int] = None,
+) -> Optional[Image]:
     """
     Mapping an input from `dataflow.SerializerFiles` or similar to an Image
 
     :param dp: Image
     :param dpi: dot per inch definition for pdf resolution when converting to numpy array
+    :param width: target width of the image. This option does only work when using Poppler as PDF renderer
+    :param height: target width of the image. This option does only work when using Poppler as PDF renderer
+    :param height: target height of the image
     :return: Image
     """
 
@@ -77,7 +85,9 @@ def to_image(dp: Union[str, Mapping[str, Union[str, bytes]]], dpi: Optional[int]
         dp_image.pdf_bytes = dp.get("pdf_bytes")
         if dp_image.pdf_bytes is not None:
             if isinstance(dp_image.pdf_bytes, bytes):
-                dp_image.image = convert_pdf_bytes_to_np_array_v2(…
+                dp_image.image = convert_pdf_bytes_to_np_array_v2(
+                    dp_image.pdf_bytes, dpi=dpi, width=width, height=height
+                )
         elif image_bytes is not None:
             dp_image.image = convert_bytes_to_np_array(image_bytes)
         else:

deepdoctection/mapper/pubstruct.py
CHANGED

@@ -393,7 +393,7 @@ def pub_to_image_uncur(  # pylint: disable=R0914
     np_image = load_image_from_file(dp["filename"])
     if is_file_extension(dp["filename"], ".pdf"):
         pdf_bytes = load_bytes_from_pdf_file(dp["filename"])
-        np_image = convert_pdf_bytes_to_np_array_v2(pdf_bytes)
+        np_image = convert_pdf_bytes_to_np_array_v2(pdf_bytes, dpi=200)
         dp = _convert_boxes(dp, np_image.shape[0])
 
     if load_image and np_image is not None:

deepdoctection/pipe/base.py
CHANGED
@@ -24,7 +24,7 @@ from __future__ import annotations
 from abc import ABC, abstractmethod
 from collections import defaultdict
 from dataclasses import dataclass, field
-from typing import Any, Mapping, Optional, Union
+from typing import Any, Mapping, Optional, Union, Callable
 from uuid import uuid1
 
 from ..dataflow import DataFlow, MapData
@@ -33,6 +33,7 @@ from ..mapper.misc import curry
 from ..utils.context import timed_operation
 from ..utils.identifier import get_uuid_from_str
 from ..utils.settings import ObjectTypes
+from ..utils.types import DP
 from .anngen import DatapointManager
 
 
@@ -76,6 +77,30 @@ class PipelineComponent(ABC):
         self.service_id = self.get_service_id()
         self.dp_manager = DatapointManager(self.service_id, model_id)
         self.timer_on = False
+        self.filter_func: Callable[[DP], bool] = lambda dp: False
+
+    def set_inbound_filter(self, filter_func: Callable[[DP], bool]) -> None:
+        """
+        Set a filter function to decide, if an image of the inbound dataflow should be passed to self.serve.
+        The filter function should return a boolean value. If the function returns True, the image will not be processed
+        by this pipeline component.
+
+        **Example:**
+
+        ```python
+        def do_not_process_tables(dp: Image) -> bool:
+            if "table" not in dp.get_categories_from_current_state():
+                return True
+            return False
+
+        layout_component = ImageLayoutService(...)
+        layout_component.set_inbound_filter(do_not_process_tables)
+        ```
+
+
+        :param filter_func: A function that takes an image datapoint and returns a boolean value
+        """
+        self.filter_func = filter_func  # type: ignore
 
     @abstractmethod
     def serve(self, dp: Image) -> None:
@@ -92,6 +117,12 @@ class PipelineComponent(ABC):
         """
         raise NotImplementedError()
 
+    def _pass_datapoint(self, dp: Image) -> None:
+        self.dp_manager.datapoint = dp
+        if not self.filter_func(dp):
+            self.serve(dp)
+
+
     def pass_datapoint(self, dp: Image) -> Image:
         """
         Acceptance, handover to dp_manager, transformation and forwarding of dp. To measure the time, use
@@ -103,11 +134,9 @@ class PipelineComponent(ABC):
         """
         if self.timer_on:
             with timed_operation(self.__class__.__name__):
-                self.…
-                self.serve(dp)
+                self._pass_datapoint(dp)
         else:
-            self.…
-            self.serve(dp)
+            self._pass_datapoint(dp)
         return self.dp_manager.datapoint
 
     def predict_dataflow(self, df: DataFlow) -> DataFlow:
@@ -205,6 +234,7 @@ class Pipeline(ABC):
 
     **Example:**
 
+        ```python
         layout = LayoutPipeComponent(layout_detector ...)
         text = TextExtractPipeComponent(text_detector ...)
         simple_pipe = MyPipeline(pipeline_component = [layout, text])
@@ -212,6 +242,7 @@ class Pipeline(ABC):
 
         for page in doc_dataflow:
             print(page)
+        ```
 
     In doing so, page contains all document structures determined via the pipeline (either directly from the Image core
     model or already processed further).
@@ -225,10 +256,12 @@ class Pipeline(ABC):
 
     **Example:**
 
+        ```python
         pipe = MyPipeline(pipeline_component = [layout, text])
        pipe.set_session_id = True
 
         df = pipe.analyze(input = "path/to/dir")  # session_id is generated automatically
+        ```
     """
 
     def __init__(self, pipeline_component_list: list[PipelineComponent]) -> None:

deepdoctection/pipe/common.py
CHANGED
@@ -349,8 +349,8 @@ class AnnotationNmsService(PipelineComponent):
     def __init__(
         self,
         nms_pairs: Sequence[Sequence[TypeOrStr]],
-        thresholds: Union[float, …
-        priority: Optional[…
+        thresholds: Union[float, Sequence[float]],
+        priority: Optional[Sequence[Union[Optional[TypeOrStr]]]] = None,
     ):
         """
         :param nms_pairs: Groups of categories, either as string or by `ObjectType`.
@@ -362,7 +362,7 @@ class AnnotationNmsService(PipelineComponent):
             self.threshold = [thresholds for _ in self.nms_pairs]
         else:
             assert len(self.nms_pairs) == len(thresholds), "Sequences of nms_pairs and thresholds must have same length"
-            self.threshold = thresholds
+            self.threshold = thresholds  # type: ignore
         if priority:
             assert len(self.nms_pairs) == len(priority), "Sequences of nms_pairs and priority must have same length"

deepdoctection/pipe/doctectionpipe.py
CHANGED

@@ -109,8 +109,13 @@ def _proto_process(
 
 
 @curry
-def _to_image(…
-…
+def _to_image(
+    dp: Union[str, Mapping[str, Union[str, bytes]]],
+    dpi: Optional[int] = None,
+    width: Optional[int] = None,
+    height: Optional[int] = None,
+) -> Optional[Image]:
+    return to_image(dp, dpi, width, height)
 
 
 def _doc_to_dataflow(path: PathLikeOrStr, max_datapoints: Optional[int] = None) -> DataFlow:
@@ -188,7 +193,19 @@ class DoctectionPipe(Pipeline):
 
         df = MapData(df, _proto_process(path, doc_path))
         if dataset_dataflow is None:
-…
+            if dpi := os.environ["DPI"]:
+                df = MapData(df, _to_image(dpi=int(dpi)))  # pylint: disable=E1120
+            else:
+                width, height = kwargs.get("width", ""), kwargs.get("height", "")
+                if not width or not height:
+                    width = os.environ["IMAGE_WIDTH"]
+                    height = os.environ["IMAGE_HEIGHT"]
+                if not width or not height:
+                    raise ValueError(
+                        "DPI, IMAGE_WIDTH and IMAGE_HEIGHT are all None, but "
+                        "either DPI or IMAGE_WIDTH and IMAGE_HEIGHT must be set"
+                    )
+                df = MapData(df, _to_image(width=int(width), height=int(height)))  # pylint: disable=E1120
         return df
 
     @staticmethod

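The dataflow above now chooses the rendering parameters at runtime: a non-empty `DPI` environment value wins; otherwise `width`/`height` are taken from the keyword arguments or from `IMAGE_WIDTH`/`IMAGE_HEIGHT`, and a `ValueError` is raised if neither route yields values. A hedged sketch of driving this from the analyzer; whether `analyze` forwards `width`/`height` as shown is an assumption based on the `kwargs` lookup above:

```python
import os

import deepdoctection as dd

analyzer = dd.get_dd_analyzer()

# Default path: pages are rendered with the DPI environment value (300 after import).
df = analyzer.analyze(path="path/to/doc.pdf")

# Assumed alternative: clear DPI and request a fixed pixel size instead.
# os.environ["DPI"] = ""
# df = analyzer.analyze(path="path/to/doc.pdf", width=1654, height=2339)

df.reset_state()
for page in df:
    print(page)
```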
deepdoctection/pipe/lm.py
CHANGED
@@ -24,6 +24,7 @@ from copy import copy
 from typing import TYPE_CHECKING, Any, Callable, Literal, Optional, Sequence, Union
 
 from ..datapoint.image import Image
+from ..extern.base import SequenceClassResult
 from ..mapper.laylmstruct import image_to_layoutlm_features, image_to_lm_features
 from ..utils.settings import BioTag, LayoutType, ObjectTypes, PageType, TokenClasses, WordType
 from .base import MetaAnnotation, PipelineComponent
@@ -264,6 +265,7 @@ class LMSequenceClassifierService(PipelineComponent):
         padding: Literal["max_length", "do_not_pad", "longest"] = "max_length",
         truncation: bool = True,
         return_overflowing_tokens: bool = False,
+        use_other_as_default_category: bool = False
     ) -> None:
         """
         :param tokenizer: Tokenizer, typing allows currently anything. This will be changed in the future
@@ -279,11 +281,16 @@ class LMSequenceClassifierService(PipelineComponent):
         :param return_overflowing_tokens: If a sequence (due to a truncation strategy) overflows the overflowing tokens
                                           can be returned as an additional batch element. Not that in this case, the number of input
                                           batch samples will be smaller than the output batch samples.
+        :param use_other_as_default_category: When predicting document classes, it might be possible that some pages
+                                              do not get sent to the model because they are empty. If set to `True` it
+                                              will assign images with no features the category `TokenClasses.OTHER`.
+
         """
         self.language_model = language_model
         self.padding = padding
         self.truncation = truncation
         self.return_overflowing_tokens = return_overflowing_tokens
+        self.use_other_as_default_category = use_other_as_default_category
         self.tokenizer = tokenizer
         self.mapping_to_lm_input_func = self.image_to_features_func(self.language_model.image_to_features_mapping())
         super().__init__(self._get_name(), self.language_model.model_id)
@@ -299,12 +306,20 @@ class LMSequenceClassifierService(PipelineComponent):
 
     def serve(self, dp: Image) -> None:
         lm_input = self.mapping_to_lm_input_func(**self.required_kwargs)(dp)
+        lm_output = None
         if lm_input is None:
-… (5 removed lines not rendered in the diff source)
+            if self.use_other_as_default_category:
+                class_id = self.language_model.categories.get_categories(as_dict=True,
+                                                                         name_as_key=True).get(TokenClasses.OTHER, 1)
+                lm_output = SequenceClassResult(class_name=TokenClasses.OTHER,
+                                                class_id = class_id,
+                                                score=-1.)
+        else:
+            lm_output = self.language_model.predict(**lm_input)
+        if lm_output:
+            self.dp_manager.set_summary_annotation(
+                PageType.DOCUMENT_TYPE, lm_output.class_name, lm_output.class_id, None, lm_output.score
+            )
 
     def clone(self) -> LMSequenceClassifierService:
         return self.__class__(

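With `use_other_as_default_category=True`, pages for which no features can be built (for instance empty pages) are no longer left without a prediction; they receive `TokenClasses.OTHER` as document type with a score of `-1`. The LayoutLM training script below enables this for sequence classification. A construction sketch; the tokenizer and model objects are placeholders created elsewhere:

```python
from deepdoctection.pipe.lm import LMSequenceClassifierService

tokenizer_fast = ...        # a fast Hugging Face tokenizer, created elsewhere
sequence_classifier = ...   # a deepdoctection sequence classification model wrapper

component = LMSequenceClassifierService(
    tokenizer_fast,
    sequence_classifier,
    use_other_as_default_category=True,
)
```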
deepdoctection/pipe/segment.py
CHANGED
@@ -1191,17 +1191,13 @@ class PubtablesSegmentationService(PipelineComponent):
                 if key[idx] == item_number:
                     cell_ann = dp.get_annotation(annotation_ids=value)[0]
                     self.dp_manager.set_category_annotation(
-                        item_header_cell_name,
-                        None,
-                        item_header_cell_name,
-                        cell_ann.annotation_id
+                        item_header_cell_name, None, item_header_cell_name, cell_ann.annotation_id
                     )
                 else:
                     cell_ann = dp.get_annotation(annotation_ids=value)[0]
-                    self.dp_manager.set_category_annotation(…
-…
-…
-                        cell_ann.annotation_id)
+                    self.dp_manager.set_category_annotation(
+                        item_header_cell_name, None, CellType.BODY, cell_ann.annotation_id
+                    )
 
         # TODO: the summaries should be sub categories of the underlying ann
         self.dp_manager.set_summary_annotation(

deepdoctection/train/hf_detr_train.py
CHANGED

@@ -73,7 +73,7 @@ class DetrDerivedTrainer(Trainer):
         model: Union[PreTrainedModel, nn.Module],
         args: TrainingArguments,
         data_collator: DetrDataCollator,
-        train_dataset: …
+        train_dataset: DatasetAdapter,
     ):
         self.evaluator: Optional[Evaluator] = None
         self.build_eval_kwargs: Optional[dict[str, Any]] = None

deepdoctection/train/hf_layoutlm_train.py
CHANGED

@@ -499,7 +499,9 @@ def train_hf_layoutlm(
     )
     pipeline_component_cls = pipeline_component_registry.get(pipeline_component_name)
     if dataset_type == DatasetType.SEQUENCE_CLASSIFICATION:
-        pipeline_component = pipeline_component_cls(tokenizer_fast, …
+        pipeline_component = pipeline_component_cls(tokenizer_fast,
+                                                    dd_model,
+                                                    use_other_as_default_category=True)
     else:
         pipeline_component = pipeline_component_cls(
             tokenizer_fast,

deepdoctection/utils/pdf_utils.py
CHANGED

@@ -181,8 +181,6 @@ class PDFStreamer:
 
         streamer.close() # Do not forget to close the streamer, otherwise the file will never be closed and might
                          # cause memory leaks if you open many files.
-
-
     """
 
     def __init__(self, path_or_bytes: Union[PathLikeOrStr, bytes]) -> None:
@@ -223,7 +221,10 @@ class PDFStreamer:
 
 
 def _input_to_cli_str(
-    input_file_name: PathLikeOrStr, …
+    input_file_name: PathLikeOrStr,
+    output_file_name: PathLikeOrStr,
+    dpi: Optional[int] = None,
+    size: Optional[tuple[int, int]] = None,
 ) -> list[str]:
     cmd_args: list[str] = []
 
@@ -237,7 +238,10 @@ def _input_to_cli_str(
     if platform.system() == "Windows":
         command = command + ".exe"
     cmd_args.append(command)
-…
+
+    if dpi:
+        cmd_args.extend(["-r", str(dpi)])
+    cmd_args.append(str(input_file_name))
     cmd_args.append("-png")
     cmd_args.append(str(output_file_name))
 
@@ -275,7 +279,9 @@ def _run_poppler(poppler_args: list[str]) -> None:
         raise PopplerError(status=proc.returncode, message="Syntax Error: PDF cannot be read with Poppler")
 
 
-def pdf_to_np_array_poppler(…
+def pdf_to_np_array_poppler(
+    pdf_bytes: bytes, size: Optional[tuple[int, int]] = None, dpi: Optional[int] = None
+) -> PixelValues:
     """
     Convert a single pdf page from its byte representation to a numpy array. This function will save the pdf as to a tmp
     file and then call poppler via `pdftoppm` resp. `pdftocairo` if the former is not available.
@@ -285,7 +291,8 @@ def pdf_to_np_array_poppler(pdf_bytes: bytes, size: Optional[tuple[int, int]] =
     :param dpi: Image quality in DPI/dots-per-inch (default 200)
     :return: numpy array
     """
-…
+    if dpi is None and size is None:
+        raise ValueError("Either dpi or size must be provided.")
     with save_tmp_file(pdf_bytes, "pdf_") as (tmp_name, input_file_name):
         _run_poppler(_input_to_cli_str(input_file_name, tmp_name, dpi, size))
         image = viz_handler.read_image(tmp_name + "-1.png")
@@ -293,7 +300,7 @@ def pdf_to_np_array_poppler(pdf_bytes: bytes, size: Optional[tuple[int, int]] =
     return image.astype(uint8)
 
 
-def pdf_to_np_array_pdfmium(pdf_bytes: bytes, dpi: int = …
+def pdf_to_np_array_pdfmium(pdf_bytes: bytes, dpi: Optional[int] = None) -> PixelValues:
     """
     Convert a single pdf page from its byte representation to a numpy array using pdfium.
 
@@ -301,12 +308,13 @@ def pdf_to_np_array_pdfmium(pdf_bytes: bytes, dpi: int = 200) -> PixelValues:
     :param dpi: Image quality in DPI/dots-per-inch (default 200)
     :return: numpy array
     """
-…
+    if dpi is None:
+        raise ValueError("dpi must be provided.")
     page = pypdfium2.PdfDocument(pdf_bytes)[0]
     return page.render(scale=dpi * 1 / 72).to_numpy().astype(uint8)
 
 
-def pdf_to_np_array(pdf_bytes: bytes, size: Optional[tuple[int, int]] = None, dpi: int = …
+def pdf_to_np_array(pdf_bytes: bytes, size: Optional[tuple[int, int]] = None, dpi: Optional[int] = None) -> PixelValues:
     """
     Convert a single pdf page from its byte representation to a numpy array. This function will either use Poppler or
     pdfium to render the pdf.

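After this change none of the render helpers assumes the old 200 DPI default any more: the Poppler backend requires either `dpi` or `size`, and the pdfium backend requires `dpi`. A small sketch of the top-level entry point under those rules; the file name is hypothetical and `size` only applies when Poppler is the selected renderer:

```python
from deepdoctection.utils.pdf_utils import pdf_to_np_array

with open("sample.pdf", "rb") as f:  # hypothetical single-page pdf
    pdf_bytes = f.read()

image_hi_res = pdf_to_np_array(pdf_bytes, dpi=300)            # render by resolution
image_fixed = pdf_to_np_array(pdf_bytes, size=(1654, 2339))   # Poppler only: render to a fixed size
```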
deepdoctection/utils/settings.py
CHANGED
@@ -101,7 +101,6 @@ class DocumentType(ObjectTypes):
     GOVERNMENT_TENDERS = "government_tenders"
     MANUALS = "manuals"
     PATENTS = "patents"
-    MARK = "mark"
 
 
 @object_types_registry.register("LayoutType")
@@ -132,6 +131,7 @@ class LayoutType(ObjectTypes):
     PAGE_NUMBER = "page_number"
     KEY_VALUE_AREA = "key_value_area"
     LIST_ITEM = "list_item"
+    MARK = "mark"
 
 
 @object_types_registry.register("TableType")

{deepdoctection-0.38.dist-info → deepdoctection-0.39.1.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: deepdoctection
-Version: 0.38
+Version: 0.39.1
 Summary: Repository for Document AI
 Home-page: https://github.com/deepdoctection/deepdoctection
 Author: Dr. Janis Meyer
@@ -17,7 +17,7 @@ Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: catalogue==2.0.10
-Requires-Dist: huggingface_hub
+Requires-Dist: huggingface_hub>=0.26.0
 Requires-Dist: importlib-metadata>=5.0.0
 Requires-Dist: jsonlines==3.1.0
 Requires-Dist: lazy-imports==0.3.1
@@ -36,7 +36,7 @@ Requires-Dist: tabulate>=0.7.7
 Requires-Dist: tqdm==4.64.0
 Provides-Extra: tf
 Requires-Dist: catalogue==2.0.10; extra == "tf"
-Requires-Dist: huggingface_hub…
+Requires-Dist: huggingface_hub>=0.26.0; extra == "tf"
 Requires-Dist: importlib-metadata>=5.0.0; extra == "tf"
 Requires-Dist: jsonlines==3.1.0; extra == "tf"
 Requires-Dist: lazy-imports==0.3.1; extra == "tf"
@@ -61,14 +61,14 @@ Requires-Dist: python-doctr==0.8.1; extra == "tf"
 Requires-Dist: pycocotools>=2.0.2; extra == "tf"
 Requires-Dist: boto3==1.34.102; extra == "tf"
 Requires-Dist: pdfplumber>=0.11.0; extra == "tf"
-Requires-Dist: fasttext…
+Requires-Dist: fasttext-wheel; extra == "tf"
 Requires-Dist: jdeskew>=0.2.2; extra == "tf"
 Requires-Dist: apted==1.0.3; extra == "tf"
 Requires-Dist: distance==0.1.3; extra == "tf"
 Requires-Dist: lxml>=4.9.1; extra == "tf"
 Provides-Extra: pt
 Requires-Dist: catalogue==2.0.10; extra == "pt"
-Requires-Dist: huggingface_hub…
+Requires-Dist: huggingface_hub>=0.26.0; extra == "pt"
 Requires-Dist: importlib-metadata>=5.0.0; extra == "pt"
 Requires-Dist: jsonlines==3.1.0; extra == "pt"
 Requires-Dist: lazy-imports==0.3.1; extra == "pt"
@@ -86,12 +86,12 @@ Requires-Dist: termcolor>=1.1; extra == "pt"
 Requires-Dist: tabulate>=0.7.7; extra == "pt"
 Requires-Dist: tqdm==4.64.0; extra == "pt"
 Requires-Dist: timm>=0.9.16; extra == "pt"
-Requires-Dist: transformers>=4.…
+Requires-Dist: transformers>=4.48.0; extra == "pt"
 Requires-Dist: accelerate>=0.29.1; extra == "pt"
 Requires-Dist: python-doctr==0.8.1; extra == "pt"
 Requires-Dist: boto3==1.34.102; extra == "pt"
 Requires-Dist: pdfplumber>=0.11.0; extra == "pt"
-Requires-Dist: fasttext…
+Requires-Dist: fasttext-wheel; extra == "pt"
 Requires-Dist: jdeskew>=0.2.2; extra == "pt"
 Requires-Dist: apted==1.0.3; extra == "pt"
 Requires-Dist: distance==0.1.3; extra == "pt"
@@ -99,7 +99,7 @@ Requires-Dist: lxml>=4.9.1; extra == "pt"
 Provides-Extra: docs
 Requires-Dist: tensorpack==0.11; extra == "docs"
 Requires-Dist: boto3==1.34.102; extra == "docs"
-Requires-Dist: transformers>=4.…
+Requires-Dist: transformers>=4.48.0; extra == "docs"
 Requires-Dist: accelerate>=0.29.1; extra == "docs"
 Requires-Dist: pdfplumber>=0.11.0; extra == "docs"
 Requires-Dist: lxml>=4.9.1; extra == "docs"

{deepdoctection-0.38.dist-info → deepdoctection-0.39.1.dist-info}/RECORD
CHANGED

@@ -1,9 +1,9 @@
-deepdoctection/__init__.py,sha256=…
+deepdoctection/__init__.py,sha256=uDowNayqaYZGYaqnGzPSz6pVuHQhtDVRAN_bvPq85Ko,12754
 deepdoctection/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deepdoctection/analyzer/__init__.py,sha256=icClxrd20XutD6LxLgEPIWceSs4j_QfI3szCE-9BL2w,729
 deepdoctection/analyzer/_config.py,sha256=OZMOPlyFv4gcyabPG6KO08EYx-0tUH82Ehs9YDv2B1Q,5027
-deepdoctection/analyzer/dd.py,sha256=…
-deepdoctection/analyzer/factory.py,sha256=…
+deepdoctection/analyzer/dd.py,sha256=bfR7e1JV7BwUNDRLu0jYZU7qQXnyA_vbRAJl2Ylrq5o,5905
+deepdoctection/analyzer/factory.py,sha256=Kf3Ztv5FEcF5yJf6i4I557aOIUHybuxIP0moHryguTQ,32344
 deepdoctection/configs/__init__.py,sha256=TX_P6tqDOF1LK1mi9ruAl7x0mtv1Asm8cYWCz3Pe2dk,646
 deepdoctection/configs/conf_dd_one.yaml,sha256=qnrDAST1PHBtdIKE_hdkZexW22FqVvNTI-PEo9wvinM,3025
 deepdoctection/configs/conf_tesseract.yaml,sha256=oF6szDyoi15FHvq7yFUNIEjfA_jNLhGxoowiRsz_zY4,35
@@ -18,9 +18,9 @@ deepdoctection/dataflow/stats.py,sha256=Bsr6v7lcesKXUYtO9wjqlzx_Yq_uyIF3Lel-tQ0i
 deepdoctection/datapoint/__init__.py,sha256=3K406GbOPhoEp8koVaSbMocmSsmWifnSZ1SPb7C1lOY,1643
 deepdoctection/datapoint/annotation.py,sha256=FEgz4COxVDfjic0gG7kS6iHnWLBIgFnquQ63Cbj2a4Y,22531
 deepdoctection/datapoint/box.py,sha256=UAdSnLexvFyg4KK1u9kXdJxhaWTwRxTU-cnQcvl37Q8,23410
-deepdoctection/datapoint/convert.py,sha256=…
-deepdoctection/datapoint/image.py,sha256=…
-deepdoctection/datapoint/view.py,sha256=…
+deepdoctection/datapoint/convert.py,sha256=gJbHY2V8nlMht1N5VdxTmWSsOeydpFPTJsaJHp6XGgE,7516
+deepdoctection/datapoint/image.py,sha256=S6yfsIRQgMCl6HYAcHYJSBcbfdYKKtebtkEkkkrXsMQ,33619
+deepdoctection/datapoint/view.py,sha256=srMyPQGsK4OSiorxkyG6UAIgpViM6Ks1CI3b5k97cjY,49452
 deepdoctection/datasets/__init__.py,sha256=-A3aR90aDsHPmVM35JavfnQ2itYSCn3ujl4krRni1QU,1076
 deepdoctection/datasets/adapter.py,sha256=Ly_vbOAgVI73V41FUccnSX1ECTOyesW_qsuvQuvOZbw,7796
 deepdoctection/datasets/base.py,sha256=DT4i-d74sIEiUNC6UspIHNJuHSK0t1dBv7qwadg4rLw,22341
@@ -58,7 +58,7 @@ deepdoctection/extern/fastlang.py,sha256=F4gK-SEwcCujjxH327ZDzMGWToJ49xS_dCKcePQ
 deepdoctection/extern/hfdetr.py,sha256=JzHrrTyzS9qh6T2TsvKboAGZkIhno2txmSoLQ5Vd-lo,12077
 deepdoctection/extern/hflayoutlm.py,sha256=tFaf90FRbZzhSycdp8rGkeiPywQa6UcTEEwbayIXkr0,57023
 deepdoctection/extern/hflm.py,sha256=kwS6kcSlY_2m9u0RzBLTRq-UMM7c1PhyUaDTvSdejus,9217
-deepdoctection/extern/model.py,sha256=…
+deepdoctection/extern/model.py,sha256=lbVwDa3vD6VwCD_dsozcI8b4xDZs4KJ1628SxaDdtHQ,55378
 deepdoctection/extern/pdftext.py,sha256=KS_t27SUiYn_IOS_J2lF9lSSo22vLagxmxvYCY3CqXA,7228
 deepdoctection/extern/tessocr.py,sha256=tG7etMvZ-jHFdq-jJAHYMJii3ujDjMfAFYUsjBp3nKI,17444
 deepdoctection/extern/texocr.py,sha256=yMt5ZzKtsjd7ogrcNXba7zccGGGF9LXK194EtER6YNQ,5804
@@ -88,39 +88,39 @@ deepdoctection/extern/tp/tpfrcnn/utils/__init__.py,sha256=kiPlXxHlTGN9eI7YE9Bgwt
 deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py,sha256=aBLqPg_ApaiimtBRaOsLKTZZFIBh87vVtqjLPMaX9fQ,2379
 deepdoctection/extern/tp/tpfrcnn/utils/np_box_ops.py,sha256=O-q1GQiOEd1lN1MQDsJvHwD2OmBO-qHNeqJ1Qnec93g,3539
 deepdoctection/mapper/__init__.py,sha256=Xqb34aCjslZDQnqQgCSvnloL5DbdT9eHhn-StpVPbzE,1130
-deepdoctection/mapper/cats.py,sha256=…
+deepdoctection/mapper/cats.py,sha256=s73JzONV2UQ71szfljurk7H1-UjDBWsW4oNLs5xePUk,16474
 deepdoctection/mapper/cocostruct.py,sha256=GcbUpPFUg67pcOHQluWBFOFcGaYnlZcTmwBDERBVgCA,5978
 deepdoctection/mapper/d2struct.py,sha256=Dx-YnycsIQH4a5-9Gn_yMhiQ-gOFgMueNeH3rhXjuCU,8555
 deepdoctection/mapper/hfstruct.py,sha256=2PjGKsYturVJBimLT1CahYh09KSRAFEHz_QNtC162kQ,5551
 deepdoctection/mapper/laylmstruct.py,sha256=abMZkYU2W0e_VcCm_c0ZXNFuv-lfMFWcTedcZS5EYvE,42935
 deepdoctection/mapper/maputils.py,sha256=eI6ZcDg9W5uB6xQNBZpMIdEd86HlCxTtkJuyROdTqiw,8146
-deepdoctection/mapper/match.py,sha256=…
-deepdoctection/mapper/misc.py,sha256=…
+deepdoctection/mapper/match.py,sha256=Ed9FsuVPNp_faaW5PKnvUHZoEXcRcrO-muduTMzjp1s,8937
+deepdoctection/mapper/misc.py,sha256=vX-fV420Te00eD-cqTiWBV2twHqdBcBV2_7rAFRgPRg,7164
 deepdoctection/mapper/pascalstruct.py,sha256=TzVU1p0oiw0nOuxTFFbEB9vXJxH1v6VUvTJ7MD0manU,3828
 deepdoctection/mapper/prodigystruct.py,sha256=Re4Sd_zAp6qOvbXZLmMJeG0IGEfMQxebuyDeZgMcTa8,6827
-deepdoctection/mapper/pubstruct.py,sha256=…
+deepdoctection/mapper/pubstruct.py,sha256=PAJ2N1HSPNS6F2ZrIwlD7PiBhIM-rJscK_Ti8OR_IGs,23370
 deepdoctection/mapper/tpstruct.py,sha256=YNABRibvcISD5Lavg3jouoE4FMdqXEJoM-hNoB_rnww,4481
 deepdoctection/mapper/xfundstruct.py,sha256=_3r3c0K82fnF2h1HxA85h-9ETYrHwcERa6MNc6Ko6Z8,8807
 deepdoctection/pipe/__init__.py,sha256=ywTVoetftdL6plXg2YlBzMfmqBZupq7yXblSVyvvkcQ,1127
 deepdoctection/pipe/anngen.py,sha256=3319l4aaXzcY4w6ItVBNPX8LGS5fHFDVtyVY9KMefac,16393
-deepdoctection/pipe/base.py,sha256=…
-deepdoctection/pipe/common.py,sha256=…
+deepdoctection/pipe/base.py,sha256=F4NusbZ-xYc6wuO-XAngmC8uzahT2ubsu2g9NO8PpVw,15390
+deepdoctection/pipe/common.py,sha256=vlWzvwn8wl7baPbK-917HUWujEGJEkHur_-ilkweKjk,17751
 deepdoctection/pipe/concurrency.py,sha256=AAKRsVgaBEYNluntbDa46SBF1JZ_XqnWLDSWrNvAzEo,9657
-deepdoctection/pipe/doctectionpipe.py,sha256=…
+deepdoctection/pipe/doctectionpipe.py,sha256=bGW3ugky-fb-nEe-3bvO6Oc_4_6w82cQboGM_6p2eIo,12530
 deepdoctection/pipe/language.py,sha256=5zI0UQC6Fh12_r2pfVL42HoCGz2hpHrOhpXAn5m-rYw,5451
 deepdoctection/pipe/layout.py,sha256=xIhnJpyUSbvLbhTXyAKXY1hmG9352jihGYFSclTH_1g,5567
-deepdoctection/pipe/lm.py,sha256=…
+deepdoctection/pipe/lm.py,sha256=Ygj6MmBvBZ7l4RGCwBuhmMsOM0Ep3LWteNg7bzh-UmI,17703
 deepdoctection/pipe/order.py,sha256=PnJZiCnxFluJiECXLTZT0c1Rr66vIRBFraa_G41UA2k,40121
 deepdoctection/pipe/refine.py,sha256=dTfI396xydPdbzpfo4yqFcuxl3UAB1y-WbSQn1o76ec,22367
 deepdoctection/pipe/registry.py,sha256=aFx-Tn0xhVA5l5H18duNW5QoTNKQltybsEUEzsMgUfg,902
-deepdoctection/pipe/segment.py,sha256=…
+deepdoctection/pipe/segment.py,sha256=CR83HQMW0hrRG8W6pFuB0YibxQMWpqI7_LaUIcJcQwo,59116
 deepdoctection/pipe/sub_layout.py,sha256=N1RcID-boORcwsW_j0l64HpUu3rff0ge5qEanudLYgk,13838
 deepdoctection/pipe/text.py,sha256=h9q6d3HFOs7LOg-iwdLUPiQxrPqgunBVNmtYMBrfRQE,11180
 deepdoctection/pipe/transform.py,sha256=9Om7X7hJeL4jgUwHM1CHa4sb5v7Qo1PtVG0ls_3nI7w,3798
 deepdoctection/train/__init__.py,sha256=YFTRAZF1F7cEAKTdAIi1BLyYb6rSRcwq09Ui5Lu8d6E,1071
 deepdoctection/train/d2_frcnn_train.py,sha256=sFc_G-mEpaM8d1CCE0_6Gl4nBh11X2RYRBA3p_ylFJQ,16000
-deepdoctection/train/hf_detr_train.py,sha256=…
-deepdoctection/train/hf_layoutlm_train.py,sha256=…
+deepdoctection/train/hf_detr_train.py,sha256=eHSdI11U8oGy93noZxAISfukhRBElj4dBerJ4Xcercw,10785
+deepdoctection/train/hf_layoutlm_train.py,sha256=irSg-IpbVoSlaw1-vZCej2mCZcctONtXr5Z2NQAc_a4,22680
 deepdoctection/train/tp_frcnn_train.py,sha256=pEpXokSVGveqo82pRnhnAmHPmjQ_8wQWpqM4ZyNHJgs,13049
 deepdoctection/utils/__init__.py,sha256=brBceRWeov9WXMiJTjyJOF2rHMP8trGGRRjhMdZ61nI,2371
 deepdoctection/utils/concurrency.py,sha256=nIhpkSncmv0LBB8PtcOLY-BsRGlfcDpz7foVdgzZd20,4598
@@ -134,15 +134,15 @@ deepdoctection/utils/identifier.py,sha256=QkNaGGqPynHwDPnd3_m8iur4Cv64rcQa7qolCE
 deepdoctection/utils/logger.py,sha256=J0OVKiXP_2A82MWbbJoOeMEJ-75aZu5npgaS_yI6mVA,10003
 deepdoctection/utils/metacfg.py,sha256=hD76KQ_RnD_5B02qLI2Zxf3WfnsnXhEI_KUTKpw91RI,5711
 deepdoctection/utils/mocks.py,sha256=IkN3-IzAl4eX0ibgKIHg8IY7ykVw6BnpF6XnxKnKaZI,2389
-deepdoctection/utils/pdf_utils.py,sha256=…
-deepdoctection/utils/settings.py,sha256=…
+deepdoctection/utils/pdf_utils.py,sha256=Fi0eZ2GbnO7N61Rd8b8YRKRff4dalHAzkcn3zpGPoic,13119
+deepdoctection/utils/settings.py,sha256=hDD6yDX_4pQXwR5ILVwJIj6hb7NXA0-ifnC25ldcUjA,12464
 deepdoctection/utils/tqdm.py,sha256=cBUtR0L1x0KMeYrLP2rrzyzCamCjpQAKroHXLv81_pk,1820
 deepdoctection/utils/transform.py,sha256=3kCgsEeRkG1efCdkfvj7tUFMs-e2jbjbflq826F2GPU,8502
 deepdoctection/utils/types.py,sha256=_3dmPdCIZNLbgU5QP5k_c5phDf18xLe1kYL6t2nM45s,2953
 deepdoctection/utils/utils.py,sha256=csVs_VvCq4QBETPoE2JdTTL4MFYnD4xh-Js5vRb612g,6492
 deepdoctection/utils/viz.py,sha256=Jf8ePNYWlpuyaS6SeTYQ4OyA3eNhtgjvAQZnGNdgHC0,27051
-deepdoctection-0.38.dist-info/LICENSE,sha256=…
-deepdoctection-0.38.dist-info/METADATA,sha256=…
-deepdoctection-0.38.dist-info/WHEEL,sha256=…
-deepdoctection-0.38.dist-info/top_level.txt,sha256=…
-deepdoctection-0.38.dist-info/RECORD,,
+deepdoctection-0.39.1.dist-info/LICENSE,sha256=GQ0rUvuGdrMNEI3iHK5UQx6dIMU1QwAuyXsxUHn5MEQ,11351
+deepdoctection-0.39.1.dist-info/METADATA,sha256=NBN2dqFMUiXkcJ28xJDwyN6eNP-MmFw64F7dm3kUWTA,19741
+deepdoctection-0.39.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+deepdoctection-0.39.1.dist-info/top_level.txt,sha256=hs2DdoOL9h4mnHhmO82BT4pz4QATIoOZ20PZmlnxFI8,15
+deepdoctection-0.39.1.dist-info/RECORD,,

{deepdoctection-0.38.dist-info → deepdoctection-0.39.1.dist-info}/LICENSE
File without changes

{deepdoctection-0.38.dist-info → deepdoctection-0.39.1.dist-info}/WHEEL
File without changes

{deepdoctection-0.38.dist-info → deepdoctection-0.39.1.dist-info}/top_level.txt
File without changes