docling 1.19.0__py3-none-any.whl → 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. docling/backend/abstract_backend.py +32 -37
  2. docling/backend/docling_parse_backend.py +16 -12
  3. docling/backend/docling_parse_v2_backend.py +240 -0
  4. docling/backend/html_backend.py +425 -0
  5. docling/backend/mspowerpoint_backend.py +375 -0
  6. docling/backend/msword_backend.py +509 -0
  7. docling/backend/pdf_backend.py +78 -0
  8. docling/backend/pypdfium2_backend.py +15 -10
  9. docling/cli/main.py +61 -60
  10. docling/datamodel/base_models.py +73 -193
  11. docling/datamodel/document.py +379 -324
  12. docling/datamodel/pipeline_options.py +16 -0
  13. docling/datamodel/settings.py +1 -0
  14. docling/document_converter.py +215 -252
  15. docling/models/base_model.py +25 -0
  16. docling/models/base_ocr_model.py +19 -6
  17. docling/models/ds_glm_model.py +220 -22
  18. docling/models/easyocr_model.py +45 -40
  19. docling/models/layout_model.py +130 -114
  20. docling/models/page_assemble_model.py +119 -95
  21. docling/models/page_preprocessing_model.py +61 -0
  22. docling/models/table_structure_model.py +122 -111
  23. docling/models/tesseract_ocr_cli_model.py +65 -58
  24. docling/models/tesseract_ocr_model.py +58 -50
  25. docling/pipeline/base_pipeline.py +190 -0
  26. docling/pipeline/simple_pipeline.py +59 -0
  27. docling/pipeline/standard_pdf_pipeline.py +198 -0
  28. docling/utils/export.py +4 -3
  29. docling/utils/layout_utils.py +17 -11
  30. docling-2.1.0.dist-info/METADATA +149 -0
  31. docling-2.1.0.dist-info/RECORD +42 -0
  32. docling/pipeline/base_model_pipeline.py +0 -18
  33. docling/pipeline/standard_model_pipeline.py +0 -66
  34. docling-1.19.0.dist-info/METADATA +0 -380
  35. docling-1.19.0.dist-info/RECORD +0 -34
  36. {docling-1.19.0.dist-info → docling-2.1.0.dist-info}/LICENSE +0 -0
  37. {docling-1.19.0.dist-info → docling-2.1.0.dist-info}/WHEEL +0 -0
  38. {docling-1.19.0.dist-info → docling-2.1.0.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,190 @@
1
+ import functools
2
+ import logging
3
+ import time
4
+ import traceback
5
+ from abc import ABC, abstractmethod
6
+ from typing import Callable, Iterable, List
7
+
8
+ from docling_core.types.doc import DoclingDocument, NodeItem
9
+
10
+ from docling.backend.abstract_backend import AbstractDocumentBackend
11
+ from docling.backend.pdf_backend import PdfDocumentBackend
12
+ from docling.datamodel.base_models import (
13
+ ConversionStatus,
14
+ DoclingComponentType,
15
+ ErrorItem,
16
+ Page,
17
+ )
18
+ from docling.datamodel.document import ConversionResult, InputDocument
19
+ from docling.datamodel.pipeline_options import PipelineOptions
20
+ from docling.datamodel.settings import settings
21
+ from docling.models.base_model import BaseEnrichmentModel
22
+ from docling.utils.utils import chunkify
23
+
24
+ _log = logging.getLogger(__name__)
25
+
26
+
27
class BasePipeline(ABC):
    """Abstract base class for document conversion pipelines.

    ``execute`` drives four stages in order: build, assemble, enrich and
    status determination. Subclasses must implement ``_build_document``,
    ``_determine_status``, ``get_default_options`` and
    ``is_backend_supported``; ``_assemble_document`` and ``_enrich_document``
    have default implementations.
    """

    def __init__(self, pipeline_options: PipelineOptions):
        """Store the options and start with empty build and enrichment stages."""
        self.pipeline_options = pipeline_options
        # Build stages; left empty here and populated by subclasses.
        self.build_pipe: List[Callable] = []
        # Models run over document elements by ``_enrich_document``.
        self.enrichment_pipe: List[BaseEnrichmentModel] = []

    def execute(self, in_doc: InputDocument, raises_on_error: bool) -> ConversionResult:
        """Run all pipeline stages on ``in_doc`` and return the result.

        Args:
            in_doc: The input document to convert.
            raises_on_error: When true, any exception raised by a stage is
                propagated to the caller. When false, the exception is logged
                and the result is returned with status ``FAILURE``.

        Returns:
            The ``ConversionResult`` with its status set.
        """
        conv_res = ConversionResult(input=in_doc)

        _log.info(f"Processing document {in_doc.file.name}")
        try:
            # These steps are building and assembling the structure of the
            # output DoclingDocument
            conv_res = self._build_document(in_doc, conv_res)
            conv_res = self._assemble_document(in_doc, conv_res)
            # From this stage, all operations should rely only on conv_res.document
            conv_res = self._enrich_document(in_doc, conv_res)
            conv_res.status = self._determine_status(in_doc, conv_res)
        except Exception:
            conv_res.status = ConversionStatus.FAILURE
            if raises_on_error:
                # Bare raise re-raises the active exception unchanged.
                raise
            # The error is intentionally not propagated; log it so the
            # failure is not silent.
            _log.warning(
                f"Conversion of document {in_doc.file.name} failed.",
                exc_info=True,
            )

        return conv_res

    @abstractmethod
    def _build_document(
        self, in_doc: InputDocument, conv_res: ConversionResult
    ) -> ConversionResult:
        """Build the document structure into ``conv_res`` and return it."""
        pass

    def _assemble_document(
        self, in_doc: InputDocument, conv_res: ConversionResult
    ) -> ConversionResult:
        """Assemble the built parts; the default returns ``conv_res`` unchanged."""
        return conv_res

    def _enrich_document(
        self, in_doc: InputDocument, conv_res: ConversionResult
    ) -> ConversionResult:
        """Run every enrichment model over the elements it can process.

        Elements of ``conv_res.document`` are filtered per model with
        ``model.is_processable`` and fed to the model in batches of
        ``settings.perf.elements_batch_size``.
        """

        def _filter_elements(
            doc: DoclingDocument, model: BaseEnrichmentModel
        ) -> Iterable[NodeItem]:
            # Lazily yield only the elements this model accepts.
            for element, _level in doc.iterate_items():
                if model.is_processable(doc=doc, element=element):
                    yield element

        for model in self.enrichment_pipe:
            for element_batch in chunkify(
                _filter_elements(conv_res.document, model),
                settings.perf.elements_batch_size,
            ):
                # TODO: currently we assume the element itself is modified, because
                # we don't have an interface to save the element back to the document
                for _element in model(
                    doc=conv_res.document, element_batch=element_batch
                ):  # Must exhaust!
                    pass

        return conv_res

    @abstractmethod
    def _determine_status(
        self, in_doc: InputDocument, conv_res: ConversionResult
    ) -> ConversionStatus:
        """Return the final conversion status for ``conv_res``."""
        pass

    @classmethod
    @abstractmethod
    def get_default_options(cls) -> PipelineOptions:
        """Return the default options object for this pipeline class."""
        pass

    @classmethod
    @abstractmethod
    def is_backend_supported(cls, backend: AbstractDocumentBackend) -> bool:
        """Tell whether this pipeline can work with ``backend``."""
        pass
103
+
104
+ # def _apply_on_elements(self, element_batch: Iterable[NodeItem]) -> Iterable[Any]:
105
+ # for model in self.build_pipe:
106
+ # element_batch = model(element_batch)
107
+ #
108
+ # yield from element_batch
109
+
110
+
111
class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
    """Pipeline base that builds a document page by page from a PDF backend.

    Subclasses provide ``initialize_page`` and fill ``self.build_pipe`` with
    the stages applied to each batch of pages.
    """

    def _apply_on_pages(self, page_batch: Iterable[Page]) -> Iterable[Page]:
        """Chain every stage in ``build_pipe`` over ``page_batch`` and yield the pages."""
        for model in self.build_pipe:
            page_batch = model(page_batch)

        yield from page_batch

    def _build_document(
        self, in_doc: InputDocument, conv_res: ConversionResult
    ) -> ConversionResult:
        """Create one ``Page`` per input page and run the build stages in batches.

        Raises:
            RuntimeError: If the backend of ``in_doc`` is not a
                ``PdfDocumentBackend``.
            Exception: Any error raised while processing a batch is logged
                with its traceback and re-raised.
        """
        if not isinstance(in_doc._backend, PdfDocumentBackend):
            raise RuntimeError(
                f"The selected backend {type(in_doc._backend).__name__} for {in_doc.file} is not a PDF backend. "
                f"Can not convert this with a PDF pipeline. "
                f"Please check your format configuration on DocumentConverter."
            )

        conv_res.pages.extend(Page(page_no=i) for i in range(in_doc.page_count))

        try:
            # Iterate batches of pages (page_batch_size) in the doc
            for page_batch in chunkify(conv_res.pages, settings.perf.page_batch_size):
                # Monotonic clock: elapsed time is unaffected by system clock changes.
                start_pb_time = time.monotonic()

                # 1. Initialise the page resources
                init_pages = map(
                    functools.partial(self.initialize_page, in_doc), page_batch
                )

                # 2. Run pipeline stages
                pipeline_pages = self._apply_on_pages(init_pages)

                for _page in pipeline_pages:  # Must exhaust!
                    pass

                end_pb_time = time.monotonic() - start_pb_time
                _log.debug(f"Finished converting page batch time={end_pb_time:.3f}")

        except Exception as e:
            conv_res.status = ConversionStatus.FAILURE
            # Three-argument form works on all Python 3 versions; the returned
            # lines already end in newlines, so join without a separator.
            trace = "".join(traceback.format_exception(type(e), e, e.__traceback__))
            _log.warning(
                f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
                f"{trace}"
            )
            raise

        finally:
            # Always unload the PDF backend, even in case of failure
            if in_doc._backend:
                in_doc._backend.unload()

        return conv_res

    def _determine_status(
        self, in_doc: InputDocument, conv_res: ConversionResult
    ) -> ConversionStatus:
        """Return ``SUCCESS``, or ``PARTIAL_SUCCESS`` if any page failed to parse.

        A page counts as failed when it has no backend or its backend reports
        itself invalid; an ``ErrorItem`` is appended to ``conv_res.errors``
        for each such page.
        """
        status = ConversionStatus.SUCCESS
        for page in conv_res.pages:
            if page._backend is None or not page._backend.is_valid():
                conv_res.errors.append(
                    ErrorItem(
                        component_type=DoclingComponentType.DOCUMENT_BACKEND,
                        module_name=type(page._backend).__name__,
                        error_message=f"Page {page.page_no} failed to parse.",
                    )
                )
                status = ConversionStatus.PARTIAL_SUCCESS

        return status

    # Initialise and load resources for a page
    @abstractmethod
    def initialize_page(self, doc: InputDocument, page: Page) -> Page:
        """Load the resources of ``page`` from ``doc`` and return the page."""
        pass
@@ -0,0 +1,59 @@
1
+ import logging
2
+
3
+ from docling.backend.abstract_backend import (
4
+ AbstractDocumentBackend,
5
+ DeclarativeDocumentBackend,
6
+ )
7
+ from docling.datamodel.base_models import ConversionStatus
8
+ from docling.datamodel.document import ConversionResult, InputDocument
9
+ from docling.datamodel.pipeline_options import PipelineOptions
10
+ from docling.pipeline.base_pipeline import BasePipeline
11
+
12
+ _log = logging.getLogger(__name__)
13
+
14
+
15
class SimplePipeline(BasePipeline):
    """Pipeline for backends that produce a DoclingDocument directly.

    No page-level stages are run here: the document is taken as-is from a
    ``DeclarativeDocumentBackend``.
    """

    def __init__(self, pipeline_options: PipelineOptions):
        """Forward the options to the base pipeline."""
        super().__init__(pipeline_options)

    def _build_document(
        self, in_doc: InputDocument, conv_res: ConversionResult
    ) -> ConversionResult:
        """Store the backend's converted document on ``conv_res``.

        Raises:
            RuntimeError: If the backend of ``in_doc`` is not a
                ``DeclarativeDocumentBackend``.
        """
        backend = in_doc._backend

        if isinstance(backend, DeclarativeDocumentBackend):
            # A declarative backend outputs the DoclingDocument straight,
            # so there is no page-level structure to build up.
            conv_res.document = backend.convert()
            return conv_res

        raise RuntimeError(
            f"The selected backend {type(backend).__name__} for {in_doc.file} is not a declarative backend. "
            f"Can not convert this with simple pipeline. "
            f"Please check your format configuration on DocumentConverter."
        )

    def _determine_status(
        self, in_doc: InputDocument, conv_res: ConversionResult
    ) -> ConversionStatus:
        """Return ``SUCCESS``.

        This is only reached when the earlier stages did not raise, and there
        is nothing further to evaluate.
        """
        return ConversionStatus.SUCCESS

    @classmethod
    def get_default_options(cls) -> PipelineOptions:
        """Return a default-constructed ``PipelineOptions``."""
        return PipelineOptions()

    @classmethod
    def is_backend_supported(cls, backend: AbstractDocumentBackend):
        """Tell whether ``backend`` is a ``DeclarativeDocumentBackend``."""
        return isinstance(backend, DeclarativeDocumentBackend)
@@ -0,0 +1,198 @@
1
+ import logging
2
+ from pathlib import Path
3
+ from typing import Optional
4
+
5
+ from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
6
+
7
+ from docling.backend.abstract_backend import AbstractDocumentBackend
8
+ from docling.backend.pdf_backend import PdfDocumentBackend
9
+ from docling.datamodel.base_models import AssembledUnit, Page
10
+ from docling.datamodel.document import ConversionResult, InputDocument
11
+ from docling.datamodel.pipeline_options import (
12
+ EasyOcrOptions,
13
+ PdfPipelineOptions,
14
+ TesseractCliOcrOptions,
15
+ TesseractOcrOptions,
16
+ )
17
+ from docling.models.base_ocr_model import BaseOcrModel
18
+ from docling.models.ds_glm_model import GlmModel, GlmOptions
19
+ from docling.models.easyocr_model import EasyOcrModel
20
+ from docling.models.layout_model import LayoutModel
21
+ from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
22
+ from docling.models.page_preprocessing_model import (
23
+ PagePreprocessingModel,
24
+ PagePreprocessingOptions,
25
+ )
26
+ from docling.models.table_structure_model import TableStructureModel
27
+ from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
28
+ from docling.models.tesseract_ocr_model import TesseractOcrModel
29
+ from docling.pipeline.base_pipeline import PaginatedPipeline
30
+
31
+ _log = logging.getLogger(__name__)
32
+
33
+
34
class StandardPdfPipeline(PaginatedPipeline):
    """Page-based PDF pipeline: preprocessing, OCR, layout, tables, assembly.

    The build stages are created in ``__init__``; ``_assemble_document``
    merges the per-page results, runs the GLM model to produce the document,
    and optionally attaches page and element images.
    """

    # Sub-paths of the model artifacts, relative to ``artifacts_path``.
    _layout_model_path = "model_artifacts/layout/beehive_v0.0.5_pt"
    _table_model_path = "model_artifacts/tableformer"

    def __init__(self, pipeline_options: PdfPipelineOptions):
        """Resolve the artifacts location and construct all build stages.

        Raises:
            RuntimeError: If no OCR model matches the configured OCR options.
        """
        super().__init__(pipeline_options)
        self.pipeline_options: PdfPipelineOptions

        # Download the artifacts only when no explicit path was configured.
        self.artifacts_path = (
            self.download_models_hf()
            if pipeline_options.artifacts_path is None
            else Path(pipeline_options.artifacts_path)
        )

        opts = self.pipeline_options
        keep_images = (
            opts.generate_page_images
            or opts.generate_picture_images
            or opts.generate_table_images
        )

        self.glm_model = GlmModel(options=GlmOptions())

        ocr_model = self.get_ocr_model()
        if ocr_model is None:
            raise RuntimeError(
                f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}."
            )

        # Stages are constructed in the same order in which they run.
        preprocessing_stage = PagePreprocessingModel(
            options=PagePreprocessingOptions(
                images_scale=pipeline_options.images_scale
            )
        )
        layout_stage = LayoutModel(
            artifacts_path=self.artifacts_path
            / StandardPdfPipeline._layout_model_path
        )
        table_stage = TableStructureModel(
            enabled=pipeline_options.do_table_structure,
            artifacts_path=self.artifacts_path
            / StandardPdfPipeline._table_model_path,
            options=pipeline_options.table_structure_options,
        )
        assemble_stage = PageAssembleModel(
            options=PageAssembleOptions(keep_images=keep_images)
        )

        self.build_pipe = [
            preprocessing_stage,
            ocr_model,
            layout_stage,
            table_stage,
            assemble_stage,
        ]

        self.enrichment_pipe = [
            # Other models working on `NodeItem` elements in the DoclingDocument
        ]

    @staticmethod
    def download_models_hf(
        local_dir: Optional[Path] = None, force: bool = False
    ) -> Path:
        """Fetch the ``ds4sd/docling-models`` snapshot and return its local path."""
        from huggingface_hub import snapshot_download

        return Path(
            snapshot_download(
                repo_id="ds4sd/docling-models",
                force_download=force,
                local_dir=local_dir,
                revision="v2.0.1",
            )
        )

    def get_ocr_model(self) -> Optional[BaseOcrModel]:
        """Return the OCR model matching the configured options, or ``None``."""
        ocr_options = self.pipeline_options.ocr_options

        # Checked in order; the first matching options type wins.
        candidates = (
            (EasyOcrOptions, EasyOcrModel),
            (TesseractCliOcrOptions, TesseractOcrCliModel),
            (TesseractOcrOptions, TesseractOcrModel),
        )
        for options_type, model_type in candidates:
            if isinstance(ocr_options, options_type):
                return model_type(
                    enabled=self.pipeline_options.do_ocr,
                    options=ocr_options,
                )
        return None

    def initialize_page(self, doc: InputDocument, page: Page) -> Page:
        """Attach the page backend to ``page`` and record its size when valid."""
        page_backend = doc._backend.load_page(page.page_no)  # type: ignore
        page._backend = page_backend
        if page_backend is not None and page_backend.is_valid():
            page.size = page_backend.get_size()

        return page

    def _assemble_document(
        self, in_doc: InputDocument, conv_res: ConversionResult
    ) -> ConversionResult:
        """Merge page results, produce the document and attach requested images."""
        merged_elements = []
        merged_headers = []
        merged_body = []

        for src_page in conv_res.pages:
            assembled = src_page.assembled
            if assembled is None:
                continue
            merged_body.extend(assembled.body)
            merged_headers.extend(assembled.headers)
            merged_elements.extend(assembled.elements)

        conv_res.assembled = AssembledUnit(
            elements=merged_elements, headers=merged_headers, body=merged_body
        )

        conv_res.document = self.glm_model(conv_res)

        options = self.pipeline_options

        # Generate page images in the output
        if options.generate_page_images:
            for src_page in conv_res.pages:
                assert src_page.image is not None
                # Document pages are keyed one above the pipeline's page_no.
                doc_page_no = src_page.page_no + 1
                conv_res.document.pages[doc_page_no].image = ImageRef.from_pil(
                    src_page.image, dpi=int(72 * options.images_scale)
                )

        # Generate images of the requested element types
        want_pictures = options.generate_picture_images
        want_tables = options.generate_table_images
        if not (want_pictures or want_tables):
            return conv_res

        scale = options.images_scale
        for element, _level in conv_res.document.iterate_items():
            if not isinstance(element, DocItem) or len(element.prov) == 0:
                continue

            is_requested = (isinstance(element, PictureItem) and want_pictures) or (
                isinstance(element, TableItem) and want_tables
            )
            if not is_requested:
                continue

            first_prov = element.prov[0]
            src_page = conv_res.pages[first_prov.page_no - 1]
            assert src_page.size is not None
            assert src_page.image is not None

            # Scale the box to image resolution and convert it to a top-left origin.
            scaled_bbox = first_prov.bbox.scaled(scale=scale)
            crop_bbox = scaled_bbox.to_top_left_origin(
                page_height=src_page.size.height * scale
            )

            cropped_im = src_page.image.crop(crop_bbox.as_tuple())
            element.image = ImageRef.from_pil(cropped_im, dpi=int(72 * scale))

        return conv_res

    @classmethod
    def get_default_options(cls) -> PdfPipelineOptions:
        """Return a default-constructed ``PdfPipelineOptions``."""
        return PdfPipelineOptions()

    @classmethod
    def is_backend_supported(cls, backend: AbstractDocumentBackend):
        """Tell whether ``backend`` is a ``PdfDocumentBackend``."""
        return isinstance(backend, PdfDocumentBackend)
docling/utils/export.py CHANGED
@@ -1,9 +1,10 @@
1
1
  import logging
2
2
  from typing import Any, Dict, Iterable, List, Tuple, Union
3
3
 
4
- from docling_core.types.doc.base import BaseCell, BaseText, Ref, Table, TableCell
4
+ from docling_core.types.doc import BoundingBox, CoordOrigin
5
+ from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table
5
6
 
6
- from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell
7
+ from docling.datamodel.base_models import OcrCell
7
8
  from docling.datamodel.document import ConversionResult, Page
8
9
 
9
10
  _log = logging.getLogger(__name__)
@@ -40,7 +41,7 @@ def generate_multimodal_pages(
40
41
  end_ix = 0
41
42
  doc_items: List[Tuple[int, Union[BaseCell, BaseText]]] = []
42
43
 
43
- doc = doc_result.output
44
+ doc = doc_result.legacy_document
44
45
 
45
46
  def _process_page_segments(doc_items: list[Tuple[int, BaseCell]], page: Page):
46
47
  segments = []
@@ -2,6 +2,7 @@ import copy
2
2
  import logging
3
3
 
4
4
  import networkx as nx
5
+ from docling_core.types.doc import DocItemLabel
5
6
 
6
7
  logger = logging.getLogger("layout_utils")
7
8
 
@@ -370,7 +371,7 @@ def adapt_bboxes(raw_cells, clusters, orphan_cell_indices):
370
371
  "Treating cluster " + str(ix) + ", type " + str(new_cluster["type"])
371
372
  )
372
373
  logger.debug(" with cells: " + str(new_cluster["cell_ids"]))
373
- if len(cluster["cell_ids"]) == 0 and cluster["type"] != "Picture":
374
+ if len(cluster["cell_ids"]) == 0 and cluster["type"] != DocItemLabel.PICTURE:
374
375
  logger.debug(" Empty non-picture, removed")
375
376
  continue ## Skip this former cluster, now without cells.
376
377
  new_bbox = adapt_bbox(raw_cells, new_cluster, orphan_cell_indices)
@@ -380,14 +381,14 @@ def adapt_bboxes(raw_cells, clusters, orphan_cell_indices):
380
381
 
381
382
 
382
383
  def adapt_bbox(raw_cells, cluster, orphan_cell_indices):
383
- if not (cluster["type"] in ["Table", "Picture"]):
384
+ if not (cluster["type"] in [DocItemLabel.TABLE, DocItemLabel.PICTURE]):
384
385
  ## A text-like cluster. The bbox only needs to be around the text cells:
385
386
  logger.debug(" Initial bbox: " + str(cluster["bbox"]))
386
387
  new_bbox = surrounding_list(
387
388
  [raw_cells[cid]["bbox"] for cid in cluster["cell_ids"]]
388
389
  )
389
390
  logger.debug(" New bounding box:" + str(new_bbox))
390
- if cluster["type"] == "Picture":
391
+ if cluster["type"] == DocItemLabel.PICTURE:
391
392
  ## We only make the bbox completely comprise included text cells:
392
393
  logger.debug(" Picture")
393
394
  if len(cluster["cell_ids"]) != 0:
@@ -587,7 +588,7 @@ def set_orphan_as_text(
587
588
  max_id = -1
588
589
  figures = []
589
590
  for cluster in cluster_predictions:
590
- if cluster["type"] == "Picture":
591
+ if cluster["type"] == DocItemLabel.PICTURE:
591
592
  figures.append(cluster)
592
593
 
593
594
  if cluster["id"] > max_id:
@@ -638,13 +639,13 @@ def set_orphan_as_text(
638
639
  # if fig_flag == False and raw_cells[orph_id]["text"] not in line_orphans:
639
640
  if fig_flag == False and lines_detector == False:
640
641
  # get class from low confidence detections if not set as text:
641
- class_type = "Text"
642
+ class_type = DocItemLabel.TEXT
642
643
 
643
644
  for cluster in cluster_predictions_low:
644
645
  intersection = compute_intersection(
645
646
  orph_cell["bbox"], cluster["bbox"]
646
647
  )
647
- class_type = "Text"
648
+ class_type = DocItemLabel.TEXT
648
649
  if (
649
650
  cluster["confidence"] > 0.1
650
651
  and bb_iou(cluster["bbox"], orph_cell["bbox"]) > 0.4
@@ -718,7 +719,9 @@ def merge_cells(cluster_predictions):
718
719
  if cluster["id"] == node:
719
720
  lines.append(cluster)
720
721
  cluster_predictions.remove(cluster)
721
- new_merged_cluster = build_cluster_from_lines(lines, "Text", max_id)
722
+ new_merged_cluster = build_cluster_from_lines(
723
+ lines, DocItemLabel.TEXT, max_id
724
+ )
722
725
  cluster_predictions.append(new_merged_cluster)
723
726
  return cluster_predictions
724
727
 
@@ -753,9 +756,9 @@ def clean_up_clusters(
753
756
  # remove clusters that might appear inside tables, or images (such as pdf cells in graphs)
754
757
  elif img_table == True:
755
758
  if (
756
- cluster_1["type"] == "Text"
757
- and cluster_2["type"] == "Picture"
758
- or cluster_2["type"] == "Table"
759
+ cluster_1["type"] == DocItemLabel.TEXT
760
+ and cluster_2["type"] == DocItemLabel.PICTURE
761
+ or cluster_2["type"] == DocItemLabel.TABLE
759
762
  ):
760
763
  if bb_iou(cluster_1["bbox"], cluster_2["bbox"]) > 0.5:
761
764
  DuplicateDeletedClusterIDs.append(cluster_1["id"])
@@ -771,7 +774,10 @@ def clean_up_clusters(
771
774
  DuplicateDeletedClusterIDs.append(cluster_1["id"])
772
775
  # remove tables that have one pdf cell
773
776
  if one_cell_table == True:
774
- if cluster_1["type"] == "Table" and len(cluster_1["cell_ids"]) < 2:
777
+ if (
778
+ cluster_1["type"] == DocItemLabel.TABLE
779
+ and len(cluster_1["cell_ids"]) < 2
780
+ ):
775
781
  DuplicateDeletedClusterIDs.append(cluster_1["id"])
776
782
 
777
783
  DuplicateDeletedClusterIDs = list(set(DuplicateDeletedClusterIDs))