docling 2.2.1__py3-none-any.whl → 2.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,8 +8,11 @@ import pandas as pd
8
8
  from docling_core.types.doc import BoundingBox, CoordOrigin
9
9
 
10
10
  from docling.datamodel.base_models import OcrCell, Page
11
+ from docling.datamodel.document import ConversionResult
11
12
  from docling.datamodel.pipeline_options import TesseractCliOcrOptions
13
+ from docling.datamodel.settings import settings
12
14
  from docling.models.base_ocr_model import BaseOcrModel
15
+ from docling.utils.profiling import TimeRecorder
13
16
 
14
17
  _log = logging.getLogger(__name__)
15
18
 
@@ -102,7 +105,9 @@ class TesseractOcrCliModel(BaseOcrModel):
102
105
 
103
106
  return df_filtered
104
107
 
105
- def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
108
+ def __call__(
109
+ self, conv_res: ConversionResult, page_batch: Iterable[Page]
110
+ ) -> Iterable[Page]:
106
111
 
107
112
  if not self.enabled:
108
113
  yield from page_batch
@@ -113,62 +118,67 @@ class TesseractOcrCliModel(BaseOcrModel):
113
118
  if not page._backend.is_valid():
114
119
  yield page
115
120
  else:
116
- ocr_rects = self.get_ocr_rects(page)
117
-
118
- all_ocr_cells = []
119
- for ocr_rect in ocr_rects:
120
- # Skip zero area boxes
121
- if ocr_rect.area() == 0:
122
- continue
123
- high_res_image = page._backend.get_page_image(
124
- scale=self.scale, cropbox=ocr_rect
125
- )
121
+ with TimeRecorder(conv_res, "ocr"):
126
122
 
127
- with tempfile.NamedTemporaryFile(
128
- suffix=".png", mode="w"
129
- ) as image_file:
130
- fname = image_file.name
131
- high_res_image.save(fname)
132
-
133
- df = self._run_tesseract(fname)
134
-
135
- # _log.info(df)
136
-
137
- # Print relevant columns (bounding box and text)
138
- for ix, row in df.iterrows():
139
- text = row["text"]
140
- conf = row["conf"]
141
-
142
- l = float(row["left"])
143
- b = float(row["top"])
144
- w = float(row["width"])
145
- h = float(row["height"])
146
-
147
- t = b + h
148
- r = l + w
149
-
150
- cell = OcrCell(
151
- id=ix,
152
- text=text,
153
- confidence=conf / 100.0,
154
- bbox=BoundingBox.from_tuple(
155
- coord=(
156
- (l / self.scale) + ocr_rect.l,
157
- (b / self.scale) + ocr_rect.t,
158
- (r / self.scale) + ocr_rect.l,
159
- (t / self.scale) + ocr_rect.t,
160
- ),
161
- origin=CoordOrigin.TOPLEFT,
162
- ),
123
+ ocr_rects = self.get_ocr_rects(page)
124
+
125
+ all_ocr_cells = []
126
+ for ocr_rect in ocr_rects:
127
+ # Skip zero area boxes
128
+ if ocr_rect.area() == 0:
129
+ continue
130
+ high_res_image = page._backend.get_page_image(
131
+ scale=self.scale, cropbox=ocr_rect
163
132
  )
164
- all_ocr_cells.append(cell)
165
133
 
166
- ## Remove OCR cells which overlap with programmatic cells.
167
- filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
134
+ with tempfile.NamedTemporaryFile(
135
+ suffix=".png", mode="w"
136
+ ) as image_file:
137
+ fname = image_file.name
138
+ high_res_image.save(fname)
139
+
140
+ df = self._run_tesseract(fname)
141
+
142
+ # _log.info(df)
143
+
144
+ # Print relevant columns (bounding box and text)
145
+ for ix, row in df.iterrows():
146
+ text = row["text"]
147
+ conf = row["conf"]
148
+
149
+ l = float(row["left"])
150
+ b = float(row["top"])
151
+ w = float(row["width"])
152
+ h = float(row["height"])
153
+
154
+ t = b + h
155
+ r = l + w
156
+
157
+ cell = OcrCell(
158
+ id=ix,
159
+ text=text,
160
+ confidence=conf / 100.0,
161
+ bbox=BoundingBox.from_tuple(
162
+ coord=(
163
+ (l / self.scale) + ocr_rect.l,
164
+ (b / self.scale) + ocr_rect.t,
165
+ (r / self.scale) + ocr_rect.l,
166
+ (t / self.scale) + ocr_rect.t,
167
+ ),
168
+ origin=CoordOrigin.TOPLEFT,
169
+ ),
170
+ )
171
+ all_ocr_cells.append(cell)
172
+
173
+ ## Remove OCR cells which overlap with programmatic cells.
174
+ filtered_ocr_cells = self.filter_ocr_cells(
175
+ all_ocr_cells, page.cells
176
+ )
168
177
 
169
- page.cells.extend(filtered_ocr_cells)
178
+ page.cells.extend(filtered_ocr_cells)
170
179
 
171
180
  # DEBUG code:
172
- # self.draw_ocr_rects_and_cells(page, ocr_rects)
181
+ if settings.debug.visualize_ocr:
182
+ self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
173
183
 
174
184
  yield page
@@ -4,8 +4,11 @@ from typing import Iterable
4
4
  from docling_core.types.doc import BoundingBox, CoordOrigin
5
5
 
6
6
  from docling.datamodel.base_models import OcrCell, Page
7
+ from docling.datamodel.document import ConversionResult
7
8
  from docling.datamodel.pipeline_options import TesseractOcrOptions
9
+ from docling.datamodel.settings import settings
8
10
  from docling.models.base_ocr_model import BaseOcrModel
11
+ from docling.utils.profiling import TimeRecorder
9
12
 
10
13
  _log = logging.getLogger(__name__)
11
14
 
@@ -61,7 +64,9 @@ class TesseractOcrModel(BaseOcrModel):
61
64
  # Finalize the tesseractAPI
62
65
  self.reader.End()
63
66
 
64
- def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
67
+ def __call__(
68
+ self, conv_res: ConversionResult, page_batch: Iterable[Page]
69
+ ) -> Iterable[Page]:
65
70
 
66
71
  if not self.enabled:
67
72
  yield from page_batch
@@ -72,59 +77,66 @@ class TesseractOcrModel(BaseOcrModel):
72
77
  if not page._backend.is_valid():
73
78
  yield page
74
79
  else:
75
- assert self.reader is not None
80
+ with TimeRecorder(conv_res, "ocr"):
76
81
 
77
- ocr_rects = self.get_ocr_rects(page)
82
+ assert self.reader is not None
78
83
 
79
- all_ocr_cells = []
80
- for ocr_rect in ocr_rects:
81
- # Skip zero area boxes
82
- if ocr_rect.area() == 0:
83
- continue
84
- high_res_image = page._backend.get_page_image(
85
- scale=self.scale, cropbox=ocr_rect
86
- )
84
+ ocr_rects = self.get_ocr_rects(page)
87
85
 
88
- # Retrieve text snippets with their bounding boxes
89
- self.reader.SetImage(high_res_image)
90
- boxes = self.reader.GetComponentImages(
91
- self.reader_RIL.TEXTLINE, True
92
- )
86
+ all_ocr_cells = []
87
+ for ocr_rect in ocr_rects:
88
+ # Skip zero area boxes
89
+ if ocr_rect.area() == 0:
90
+ continue
91
+ high_res_image = page._backend.get_page_image(
92
+ scale=self.scale, cropbox=ocr_rect
93
+ )
93
94
 
94
- cells = []
95
- for ix, (im, box, _, _) in enumerate(boxes):
96
- # Set the area of interest. Tesseract uses Bottom-Left for the origin
97
- self.reader.SetRectangle(box["x"], box["y"], box["w"], box["h"])
98
-
99
- # Extract text within the bounding box
100
- text = self.reader.GetUTF8Text().strip()
101
- confidence = self.reader.MeanTextConf()
102
- left = box["x"] / self.scale
103
- bottom = box["y"] / self.scale
104
- right = (box["x"] + box["w"]) / self.scale
105
- top = (box["y"] + box["h"]) / self.scale
106
-
107
- cells.append(
108
- OcrCell(
109
- id=ix,
110
- text=text,
111
- confidence=confidence,
112
- bbox=BoundingBox.from_tuple(
113
- coord=(left, top, right, bottom),
114
- origin=CoordOrigin.TOPLEFT,
115
- ),
116
- )
95
+ # Retrieve text snippets with their bounding boxes
96
+ self.reader.SetImage(high_res_image)
97
+ boxes = self.reader.GetComponentImages(
98
+ self.reader_RIL.TEXTLINE, True
117
99
  )
118
100
 
119
- # del high_res_image
120
- all_ocr_cells.extend(cells)
101
+ cells = []
102
+ for ix, (im, box, _, _) in enumerate(boxes):
103
+ # Set the area of interest. Tesseract uses Bottom-Left for the origin
104
+ self.reader.SetRectangle(
105
+ box["x"], box["y"], box["w"], box["h"]
106
+ )
121
107
 
122
- ## Remove OCR cells which overlap with programmatic cells.
123
- filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
108
+ # Extract text within the bounding box
109
+ text = self.reader.GetUTF8Text().strip()
110
+ confidence = self.reader.MeanTextConf()
111
+ left = box["x"] / self.scale
112
+ bottom = box["y"] / self.scale
113
+ right = (box["x"] + box["w"]) / self.scale
114
+ top = (box["y"] + box["h"]) / self.scale
115
+
116
+ cells.append(
117
+ OcrCell(
118
+ id=ix,
119
+ text=text,
120
+ confidence=confidence,
121
+ bbox=BoundingBox.from_tuple(
122
+ coord=(left, top, right, bottom),
123
+ origin=CoordOrigin.TOPLEFT,
124
+ ),
125
+ )
126
+ )
127
+
128
+ # del high_res_image
129
+ all_ocr_cells.extend(cells)
130
+
131
+ ## Remove OCR cells which overlap with programmatic cells.
132
+ filtered_ocr_cells = self.filter_ocr_cells(
133
+ all_ocr_cells, page.cells
134
+ )
124
135
 
125
- page.cells.extend(filtered_ocr_cells)
136
+ page.cells.extend(filtered_ocr_cells)
126
137
 
127
138
  # DEBUG code:
128
- # self.draw_ocr_rects_and_cells(page, ocr_rects)
139
+ if settings.debug.visualize_ocr:
140
+ self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
129
141
 
130
142
  yield page
@@ -19,6 +19,7 @@ from docling.datamodel.document import ConversionResult, InputDocument
19
19
  from docling.datamodel.pipeline_options import PipelineOptions
20
20
  from docling.datamodel.settings import settings
21
21
  from docling.models.base_model import BaseEnrichmentModel
22
+ from docling.utils.profiling import ProfilingScope, TimeRecorder
22
23
  from docling.utils.utils import chunkify
23
24
 
24
25
  _log = logging.getLogger(__name__)
@@ -35,13 +36,16 @@ class BasePipeline(ABC):
35
36
 
36
37
  _log.info(f"Processing document {in_doc.file.name}")
37
38
  try:
38
- # These steps are building and assembling the structure of the
39
- # output DoclingDocument
40
- conv_res = self._build_document(in_doc, conv_res)
41
- conv_res = self._assemble_document(in_doc, conv_res)
42
- # From this stage, all operations should rely only on conv_res.output
43
- conv_res = self._enrich_document(in_doc, conv_res)
44
- conv_res.status = self._determine_status(in_doc, conv_res)
39
+ with TimeRecorder(
40
+ conv_res, "pipeline_total", scope=ProfilingScope.DOCUMENT
41
+ ):
42
+ # These steps are building and assembling the structure of the
43
+ # output DoclingDocument
44
+ conv_res = self._build_document(conv_res)
45
+ conv_res = self._assemble_document(conv_res)
46
+ # From this stage, all operations should rely only on conv_res.output
47
+ conv_res = self._enrich_document(conv_res)
48
+ conv_res.status = self._determine_status(conv_res)
45
49
  except Exception as e:
46
50
  conv_res.status = ConversionStatus.FAILURE
47
51
  if raises_on_error:
@@ -50,19 +54,13 @@ class BasePipeline(ABC):
50
54
  return conv_res
51
55
 
52
56
  @abstractmethod
53
- def _build_document(
54
- self, in_doc: InputDocument, conv_res: ConversionResult
55
- ) -> ConversionResult:
57
+ def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
56
58
  pass
57
59
 
58
- def _assemble_document(
59
- self, in_doc: InputDocument, conv_res: ConversionResult
60
- ) -> ConversionResult:
60
+ def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
61
61
  return conv_res
62
62
 
63
- def _enrich_document(
64
- self, in_doc: InputDocument, conv_res: ConversionResult
65
- ) -> ConversionResult:
63
+ def _enrich_document(self, conv_res: ConversionResult) -> ConversionResult:
66
64
 
67
65
  def _filter_elements(
68
66
  doc: DoclingDocument, model: BaseEnrichmentModel
@@ -71,24 +69,23 @@ class BasePipeline(ABC):
71
69
  if model.is_processable(doc=doc, element=element):
72
70
  yield element
73
71
 
74
- for model in self.enrichment_pipe:
75
- for element_batch in chunkify(
76
- _filter_elements(conv_res.document, model),
77
- settings.perf.elements_batch_size,
78
- ):
79
- # TODO: currently we assume the element itself is modified, because
80
- # we don't have an interface to save the element back to the document
81
- for element in model(
82
- doc=conv_res.document, element_batch=element_batch
83
- ): # Must exhaust!
84
- pass
72
+ with TimeRecorder(conv_res, "doc_enrich", scope=ProfilingScope.DOCUMENT):
73
+ for model in self.enrichment_pipe:
74
+ for element_batch in chunkify(
75
+ _filter_elements(conv_res.document, model),
76
+ settings.perf.elements_batch_size,
77
+ ):
78
+ # TODO: currently we assume the element itself is modified, because
79
+ # we don't have an interface to save the element back to the document
80
+ for element in model(
81
+ doc=conv_res.document, element_batch=element_batch
82
+ ): # Must exhaust!
83
+ pass
85
84
 
86
85
  return conv_res
87
86
 
88
87
  @abstractmethod
89
- def _determine_status(
90
- self, in_doc: InputDocument, conv_res: ConversionResult
91
- ) -> ConversionStatus:
88
+ def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
92
89
  pass
93
90
 
94
91
  @classmethod
@@ -110,66 +107,68 @@ class BasePipeline(ABC):
110
107
 
111
108
  class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
112
109
 
113
- def _apply_on_pages(self, page_batch: Iterable[Page]) -> Iterable[Page]:
110
+ def _apply_on_pages(
111
+ self, conv_res: ConversionResult, page_batch: Iterable[Page]
112
+ ) -> Iterable[Page]:
114
113
  for model in self.build_pipe:
115
- page_batch = model(page_batch)
114
+ page_batch = model(conv_res, page_batch)
116
115
 
117
116
  yield from page_batch
118
117
 
119
- def _build_document(
120
- self, in_doc: InputDocument, conv_res: ConversionResult
121
- ) -> ConversionResult:
118
+ def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
122
119
 
123
- if not isinstance(in_doc._backend, PdfDocumentBackend):
120
+ if not isinstance(conv_res.input._backend, PdfDocumentBackend):
124
121
  raise RuntimeError(
125
- f"The selected backend {type(in_doc._backend).__name__} for {in_doc.file} is not a PDF backend. "
122
+ f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a PDF backend. "
126
123
  f"Can not convert this with a PDF pipeline. "
127
124
  f"Please check your format configuration on DocumentConverter."
128
125
  )
129
126
  # conv_res.status = ConversionStatus.FAILURE
130
127
  # return conv_res
131
128
 
132
- for i in range(0, in_doc.page_count):
133
- conv_res.pages.append(Page(page_no=i))
129
+ with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
134
130
 
135
- try:
136
- # Iterate batches of pages (page_batch_size) in the doc
137
- for page_batch in chunkify(conv_res.pages, settings.perf.page_batch_size):
138
- start_pb_time = time.time()
131
+ for i in range(0, conv_res.input.page_count):
132
+ conv_res.pages.append(Page(page_no=i))
139
133
 
140
- # 1. Initialise the page resources
141
- init_pages = map(
142
- functools.partial(self.initialize_page, in_doc), page_batch
143
- )
134
+ try:
135
+ # Iterate batches of pages (page_batch_size) in the doc
136
+ for page_batch in chunkify(
137
+ conv_res.pages, settings.perf.page_batch_size
138
+ ):
139
+ start_pb_time = time.time()
144
140
 
145
- # 2. Run pipeline stages
146
- pipeline_pages = self._apply_on_pages(init_pages)
141
+ # 1. Initialise the page resources
142
+ init_pages = map(
143
+ functools.partial(self.initialize_page, conv_res), page_batch
144
+ )
147
145
 
148
- for p in pipeline_pages: # Must exhaust!
149
- pass
146
+ # 2. Run pipeline stages
147
+ pipeline_pages = self._apply_on_pages(conv_res, init_pages)
150
148
 
151
- end_pb_time = time.time() - start_pb_time
152
- _log.debug(f"Finished converting page batch time={end_pb_time:.3f}")
149
+ for p in pipeline_pages: # Must exhaust!
150
+ pass
153
151
 
154
- except Exception as e:
155
- conv_res.status = ConversionStatus.FAILURE
156
- trace = "\n".join(traceback.format_exception(e))
157
- _log.warning(
158
- f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
159
- f"{trace}"
160
- )
161
- raise e
152
+ end_pb_time = time.time() - start_pb_time
153
+ _log.debug(f"Finished converting page batch time={end_pb_time:.3f}")
154
+
155
+ except Exception as e:
156
+ conv_res.status = ConversionStatus.FAILURE
157
+ trace = "\n".join(traceback.format_exception(e))
158
+ _log.warning(
159
+ f"Encountered an error during conversion of document {conv_res.input.document_hash}:\n"
160
+ f"{trace}"
161
+ )
162
+ raise e
162
163
 
163
- finally:
164
- # Always unload the PDF backend, even in case of failure
165
- if in_doc._backend:
166
- in_doc._backend.unload()
164
+ finally:
165
+ # Always unload the PDF backend, even in case of failure
166
+ if conv_res.input._backend:
167
+ conv_res.input._backend.unload()
167
168
 
168
169
  return conv_res
169
170
 
170
- def _determine_status(
171
- self, in_doc: InputDocument, conv_res: ConversionResult
172
- ) -> ConversionStatus:
171
+ def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
173
172
  status = ConversionStatus.SUCCESS
174
173
  for page in conv_res.pages:
175
174
  if page._backend is None or not page._backend.is_valid():
@@ -186,5 +185,5 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
186
185
 
187
186
  # Initialise and load resources for a page
188
187
  @abstractmethod
189
- def initialize_page(self, doc: InputDocument, page: Page) -> Page:
188
+ def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
190
189
  pass
@@ -5,9 +5,10 @@ from docling.backend.abstract_backend import (
5
5
  DeclarativeDocumentBackend,
6
6
  )
7
7
  from docling.datamodel.base_models import ConversionStatus
8
- from docling.datamodel.document import ConversionResult, InputDocument
8
+ from docling.datamodel.document import ConversionResult
9
9
  from docling.datamodel.pipeline_options import PipelineOptions
10
10
  from docling.pipeline.base_pipeline import BasePipeline
11
+ from docling.utils.profiling import ProfilingScope, TimeRecorder
11
12
 
12
13
  _log = logging.getLogger(__name__)
13
14
 
@@ -22,13 +23,11 @@ class SimplePipeline(BasePipeline):
22
23
  def __init__(self, pipeline_options: PipelineOptions):
23
24
  super().__init__(pipeline_options)
24
25
 
25
- def _build_document(
26
- self, in_doc: InputDocument, conv_res: ConversionResult
27
- ) -> ConversionResult:
26
+ def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
28
27
 
29
- if not isinstance(in_doc._backend, DeclarativeDocumentBackend):
28
+ if not isinstance(conv_res.input._backend, DeclarativeDocumentBackend):
30
29
  raise RuntimeError(
31
- f"The selected backend {type(in_doc._backend).__name__} for {in_doc.file} is not a declarative backend. "
30
+ f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a declarative backend. "
32
31
  f"Can not convert this with simple pipeline. "
33
32
  f"Please check your format configuration on DocumentConverter."
34
33
  )
@@ -38,13 +37,11 @@ class SimplePipeline(BasePipeline):
38
37
  # Instead of running a page-level pipeline to build up the document structure,
39
38
  # the backend is expected to be of type DeclarativeDocumentBackend, which can output
40
39
  # a DoclingDocument straight.
41
-
42
- conv_res.document = in_doc._backend.convert()
40
+ with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
41
+ conv_res.document = conv_res.input._backend.convert()
43
42
  return conv_res
44
43
 
45
- def _determine_status(
46
- self, in_doc: InputDocument, conv_res: ConversionResult
47
- ) -> ConversionStatus:
44
+ def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
48
45
  # This is called only if the previous steps didn't raise.
49
46
  # Since we don't have anything else to evaluate, we can
50
47
  # safely return SUCCESS.