docling 2.33.0__tar.gz → 2.35.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. {docling-2.33.0 → docling-2.35.0}/PKG-INFO +2 -2
  2. {docling-2.33.0 → docling-2.35.0}/docling/backend/docling_parse_backend.py +1 -1
  3. {docling-2.33.0 → docling-2.35.0}/docling/backend/docling_parse_v2_backend.py +1 -1
  4. {docling-2.33.0 → docling-2.35.0}/docling/backend/docling_parse_v4_backend.py +1 -1
  5. {docling-2.33.0 → docling-2.35.0}/docling/cli/main.py +36 -3
  6. {docling-2.33.0 → docling-2.35.0}/docling/datamodel/base_models.py +99 -2
  7. {docling-2.33.0 → docling-2.35.0}/docling/datamodel/document.py +10 -3
  8. {docling-2.33.0 → docling-2.35.0}/docling/models/layout_model.py +19 -0
  9. {docling-2.33.0 → docling-2.35.0}/docling/models/page_assemble_model.py +1 -0
  10. {docling-2.33.0 → docling-2.35.0}/docling/models/page_preprocessing_model.py +54 -0
  11. {docling-2.33.0 → docling-2.35.0}/docling/models/tesseract_ocr_cli_model.py +85 -41
  12. {docling-2.33.0 → docling-2.35.0}/docling/models/tesseract_ocr_model.py +52 -30
  13. {docling-2.33.0 → docling-2.35.0}/docling/pipeline/standard_pdf_pipeline.py +75 -38
  14. {docling-2.33.0 → docling-2.35.0}/docling/utils/layout_postprocessor.py +10 -22
  15. docling-2.35.0/docling/utils/ocr_utils.py +69 -0
  16. docling-2.35.0/docling/utils/orientation.py +71 -0
  17. {docling-2.33.0 → docling-2.35.0}/pyproject.toml +2 -2
  18. docling-2.33.0/docling/utils/ocr_utils.py +0 -9
  19. {docling-2.33.0 → docling-2.35.0}/LICENSE +0 -0
  20. {docling-2.33.0 → docling-2.35.0}/README.md +0 -0
  21. {docling-2.33.0 → docling-2.35.0}/docling/__init__.py +0 -0
  22. {docling-2.33.0 → docling-2.35.0}/docling/backend/__init__.py +0 -0
  23. {docling-2.33.0 → docling-2.35.0}/docling/backend/abstract_backend.py +0 -0
  24. {docling-2.33.0 → docling-2.35.0}/docling/backend/asciidoc_backend.py +0 -0
  25. {docling-2.33.0 → docling-2.35.0}/docling/backend/csv_backend.py +0 -0
  26. {docling-2.33.0 → docling-2.35.0}/docling/backend/docx/__init__.py +0 -0
  27. {docling-2.33.0 → docling-2.35.0}/docling/backend/docx/latex/__init__.py +0 -0
  28. {docling-2.33.0 → docling-2.35.0}/docling/backend/docx/latex/latex_dict.py +0 -0
  29. {docling-2.33.0 → docling-2.35.0}/docling/backend/docx/latex/omml.py +0 -0
  30. {docling-2.33.0 → docling-2.35.0}/docling/backend/html_backend.py +0 -0
  31. {docling-2.33.0 → docling-2.35.0}/docling/backend/json/__init__.py +0 -0
  32. {docling-2.33.0 → docling-2.35.0}/docling/backend/json/docling_json_backend.py +0 -0
  33. {docling-2.33.0 → docling-2.35.0}/docling/backend/md_backend.py +0 -0
  34. {docling-2.33.0 → docling-2.35.0}/docling/backend/msexcel_backend.py +0 -0
  35. {docling-2.33.0 → docling-2.35.0}/docling/backend/mspowerpoint_backend.py +0 -0
  36. {docling-2.33.0 → docling-2.35.0}/docling/backend/msword_backend.py +0 -0
  37. {docling-2.33.0 → docling-2.35.0}/docling/backend/pdf_backend.py +0 -0
  38. {docling-2.33.0 → docling-2.35.0}/docling/backend/pypdfium2_backend.py +0 -0
  39. {docling-2.33.0 → docling-2.35.0}/docling/backend/xml/__init__.py +0 -0
  40. {docling-2.33.0 → docling-2.35.0}/docling/backend/xml/jats_backend.py +0 -0
  41. {docling-2.33.0 → docling-2.35.0}/docling/backend/xml/uspto_backend.py +0 -0
  42. {docling-2.33.0 → docling-2.35.0}/docling/chunking/__init__.py +0 -0
  43. {docling-2.33.0 → docling-2.35.0}/docling/cli/__init__.py +0 -0
  44. {docling-2.33.0 → docling-2.35.0}/docling/cli/models.py +0 -0
  45. {docling-2.33.0 → docling-2.35.0}/docling/cli/tools.py +0 -0
  46. {docling-2.33.0 → docling-2.35.0}/docling/datamodel/__init__.py +0 -0
  47. {docling-2.33.0 → docling-2.35.0}/docling/datamodel/pipeline_options.py +0 -0
  48. {docling-2.33.0 → docling-2.35.0}/docling/datamodel/settings.py +0 -0
  49. {docling-2.33.0 → docling-2.35.0}/docling/document_converter.py +0 -0
  50. {docling-2.33.0 → docling-2.35.0}/docling/exceptions.py +0 -0
  51. {docling-2.33.0 → docling-2.35.0}/docling/models/__init__.py +0 -0
  52. {docling-2.33.0 → docling-2.35.0}/docling/models/api_vlm_model.py +0 -0
  53. {docling-2.33.0 → docling-2.35.0}/docling/models/base_model.py +0 -0
  54. {docling-2.33.0 → docling-2.35.0}/docling/models/base_ocr_model.py +0 -0
  55. {docling-2.33.0 → docling-2.35.0}/docling/models/code_formula_model.py +0 -0
  56. {docling-2.33.0 → docling-2.35.0}/docling/models/document_picture_classifier.py +0 -0
  57. {docling-2.33.0 → docling-2.35.0}/docling/models/easyocr_model.py +0 -0
  58. {docling-2.33.0 → docling-2.35.0}/docling/models/factories/__init__.py +0 -0
  59. {docling-2.33.0 → docling-2.35.0}/docling/models/factories/base_factory.py +0 -0
  60. {docling-2.33.0 → docling-2.35.0}/docling/models/factories/ocr_factory.py +0 -0
  61. {docling-2.33.0 → docling-2.35.0}/docling/models/factories/picture_description_factory.py +0 -0
  62. {docling-2.33.0 → docling-2.35.0}/docling/models/hf_mlx_model.py +0 -0
  63. {docling-2.33.0 → docling-2.35.0}/docling/models/hf_vlm_model.py +0 -0
  64. {docling-2.33.0 → docling-2.35.0}/docling/models/ocr_mac_model.py +0 -0
  65. {docling-2.33.0 → docling-2.35.0}/docling/models/picture_description_api_model.py +0 -0
  66. {docling-2.33.0 → docling-2.35.0}/docling/models/picture_description_base_model.py +0 -0
  67. {docling-2.33.0 → docling-2.35.0}/docling/models/picture_description_vlm_model.py +0 -0
  68. {docling-2.33.0 → docling-2.35.0}/docling/models/plugins/__init__.py +0 -0
  69. {docling-2.33.0 → docling-2.35.0}/docling/models/plugins/defaults.py +0 -0
  70. {docling-2.33.0 → docling-2.35.0}/docling/models/rapid_ocr_model.py +0 -0
  71. {docling-2.33.0 → docling-2.35.0}/docling/models/readingorder_model.py +0 -0
  72. {docling-2.33.0 → docling-2.35.0}/docling/models/table_structure_model.py +0 -0
  73. {docling-2.33.0 → docling-2.35.0}/docling/pipeline/__init__.py +0 -0
  74. {docling-2.33.0 → docling-2.35.0}/docling/pipeline/base_pipeline.py +0 -0
  75. {docling-2.33.0 → docling-2.35.0}/docling/pipeline/simple_pipeline.py +0 -0
  76. {docling-2.33.0 → docling-2.35.0}/docling/pipeline/vlm_pipeline.py +0 -0
  77. {docling-2.33.0 → docling-2.35.0}/docling/py.typed +0 -0
  78. {docling-2.33.0 → docling-2.35.0}/docling/utils/__init__.py +0 -0
  79. {docling-2.33.0 → docling-2.35.0}/docling/utils/accelerator_utils.py +0 -0
  80. {docling-2.33.0 → docling-2.35.0}/docling/utils/api_image_request.py +0 -0
  81. {docling-2.33.0 → docling-2.35.0}/docling/utils/export.py +0 -0
  82. {docling-2.33.0 → docling-2.35.0}/docling/utils/glm_utils.py +0 -0
  83. {docling-2.33.0 → docling-2.35.0}/docling/utils/locks.py +0 -0
  84. {docling-2.33.0 → docling-2.35.0}/docling/utils/model_downloader.py +0 -0
  85. {docling-2.33.0 → docling-2.35.0}/docling/utils/profiling.py +0 -0
  86. {docling-2.33.0 → docling-2.35.0}/docling/utils/utils.py +0 -0
  87. {docling-2.33.0 → docling-2.35.0}/docling/utils/visualization.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.33.0
3
+ Version: 2.35.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/docling-project/docling
6
6
  License: MIT
@@ -29,7 +29,7 @@ Requires-Dist: accelerate (>=1.2.1,<2.0.0) ; (sys_platform != "darwin" or platfo
29
29
  Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
30
30
  Requires-Dist: certifi (>=2024.7.4)
31
31
  Requires-Dist: click (<8.2.0)
32
- Requires-Dist: docling-core[chunking] (>=2.29.0,<3.0.0)
32
+ Requires-Dist: docling-core[chunking] (>=2.31.2,<3.0.0)
33
33
  Requires-Dist: docling-ibm-models (>=3.4.0,<4.0.0)
34
34
  Requires-Dist: docling-parse (>=4.0.0,<5.0.0)
35
35
  Requires-Dist: easyocr (>=1.7,<2.0)
@@ -60,7 +60,7 @@ class DoclingParsePageBackend(PdfPageBackend):
60
60
  coord_origin=CoordOrigin.BOTTOMLEFT,
61
61
  ).to_top_left_origin(page_height=page_size.height * scale)
62
62
 
63
- overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
63
+ overlap_frac = cell_bbox.intersection_over_self(bbox)
64
64
 
65
65
  if overlap_frac > 0.5:
66
66
  if len(text_piece) > 0:
@@ -71,7 +71,7 @@ class DoclingParseV2PageBackend(PdfPageBackend):
71
71
  coord_origin=CoordOrigin.BOTTOMLEFT,
72
72
  ).to_top_left_origin(page_height=page_size.height * scale)
73
73
 
74
- overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
74
+ overlap_frac = cell_bbox.intersection_over_self(bbox)
75
75
 
76
76
  if overlap_frac > 0.5:
77
77
  if len(text_piece) > 0:
@@ -46,7 +46,7 @@ class DoclingParseV4PageBackend(PdfPageBackend):
46
46
  .scaled(scale)
47
47
  )
48
48
 
49
- overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
49
+ overlap_frac = cell_bbox.intersection_over_self(bbox)
50
50
 
51
51
  if overlap_frac > 0.5:
52
52
  if len(text_piece) > 0:
@@ -12,6 +12,12 @@ from typing import Annotated, Dict, List, Optional, Type
12
12
 
13
13
  import rich.table
14
14
  import typer
15
+ from docling_core.transforms.serializer.html import (
16
+ HTMLDocSerializer,
17
+ HTMLOutputStyle,
18
+ HTMLParams,
19
+ )
20
+ from docling_core.transforms.visualizer.layout_visualizer import LayoutVisualizer
15
21
  from docling_core.types.doc import ImageRefMode
16
22
  from docling_core.utils.file import resolve_source_to_path
17
23
  from pydantic import TypeAdapter
@@ -156,6 +162,7 @@ def export_documents(
156
162
  export_json: bool,
157
163
  export_html: bool,
158
164
  export_html_split_page: bool,
165
+ show_layout: bool,
159
166
  export_md: bool,
160
167
  export_txt: bool,
161
168
  export_doctags: bool,
@@ -189,9 +196,27 @@ def export_documents(
189
196
  if export_html_split_page:
190
197
  fname = output_dir / f"{doc_filename}.html"
191
198
  _log.info(f"writing HTML output to {fname}")
192
- conv_res.document.save_as_html(
193
- filename=fname, image_mode=image_export_mode, split_page_view=True
194
- )
199
+ if show_layout:
200
+ ser = HTMLDocSerializer(
201
+ doc=conv_res.document,
202
+ params=HTMLParams(
203
+ image_mode=image_export_mode,
204
+ output_style=HTMLOutputStyle.SPLIT_PAGE,
205
+ ),
206
+ )
207
+ visualizer = LayoutVisualizer()
208
+ visualizer.params.show_label = False
209
+ ser_res = ser.serialize(
210
+ visualizer=visualizer,
211
+ )
212
+ with open(fname, "w") as fw:
213
+ fw.write(ser_res.text)
214
+ else:
215
+ conv_res.document.save_as_html(
216
+ filename=fname,
217
+ image_mode=image_export_mode,
218
+ split_page_view=True,
219
+ )
195
220
 
196
221
  # Export Text format:
197
222
  if export_txt:
@@ -250,6 +275,13 @@ def convert( # noqa: C901
250
275
  to_formats: List[OutputFormat] = typer.Option(
251
276
  None, "--to", help="Specify output formats. Defaults to Markdown."
252
277
  ),
278
+ show_layout: Annotated[
279
+ bool,
280
+ typer.Option(
281
+ ...,
282
+ help="If enabled, the page images will show the bounding-boxes of the items.",
283
+ ),
284
+ ] = False,
253
285
  headers: str = typer.Option(
254
286
  None,
255
287
  "--headers",
@@ -596,6 +628,7 @@ def convert( # noqa: C901
596
628
  export_json=export_json,
597
629
  export_html=export_html,
598
630
  export_html_split_page=export_html_split_page,
631
+ show_layout=show_layout,
599
632
  export_md=export_md,
600
633
  export_txt=export_txt,
601
634
  export_doctags=export_doctags,
@@ -1,6 +1,9 @@
1
+ import math
2
+ from collections import defaultdict
1
3
  from enum import Enum
2
- from typing import TYPE_CHECKING, Dict, List, Optional, Union
4
+ from typing import TYPE_CHECKING, Annotated, Dict, List, Literal, Optional, Union
3
5
 
6
+ import numpy as np
4
7
  from docling_core.types.doc import (
5
8
  BoundingBox,
6
9
  DocItemLabel,
@@ -16,7 +19,7 @@ from docling_core.types.io import (
16
19
  DocumentStream,
17
20
  )
18
21
  from PIL.Image import Image
19
- from pydantic import BaseModel, ConfigDict
22
+ from pydantic import BaseModel, ConfigDict, Field, computed_field
20
23
 
21
24
  if TYPE_CHECKING:
22
25
  from docling.backend.pdf_backend import PdfPageBackend
@@ -298,3 +301,97 @@ class OpenAiApiResponse(BaseModel):
298
301
  choices: List[OpenAiResponseChoice]
299
302
  created: int
300
303
  usage: OpenAiResponseUsage
304
+
305
+
306
+ # Create a type alias for score values
307
+ ScoreValue = float
308
+
309
+
310
+ class QualityGrade(str, Enum):
311
+ POOR = "poor"
312
+ FAIR = "fair"
313
+ GOOD = "good"
314
+ EXCELLENT = "excellent"
315
+ UNSPECIFIED = "unspecified"
316
+
317
+
318
+ class PageConfidenceScores(BaseModel):
319
+ parse_score: ScoreValue = np.nan
320
+ layout_score: ScoreValue = np.nan
321
+ table_score: ScoreValue = np.nan
322
+ ocr_score: ScoreValue = np.nan
323
+
324
+ def _score_to_grade(self, score: ScoreValue) -> QualityGrade:
325
+ if score < 0.5:
326
+ return QualityGrade.POOR
327
+ elif score < 0.8:
328
+ return QualityGrade.FAIR
329
+ elif score < 0.9:
330
+ return QualityGrade.GOOD
331
+ elif score >= 0.9:
332
+ return QualityGrade.EXCELLENT
333
+
334
+ return QualityGrade.UNSPECIFIED
335
+
336
+ @computed_field # type: ignore
337
+ @property
338
+ def mean_grade(self) -> QualityGrade:
339
+ return self._score_to_grade(self.mean_score)
340
+
341
+ @computed_field # type: ignore
342
+ @property
343
+ def low_grade(self) -> QualityGrade:
344
+ return self._score_to_grade(self.low_score)
345
+
346
+ @computed_field # type: ignore
347
+ @property
348
+ def mean_score(self) -> ScoreValue:
349
+ return ScoreValue(
350
+ np.nanmean(
351
+ [
352
+ self.ocr_score,
353
+ self.table_score,
354
+ self.layout_score,
355
+ self.parse_score,
356
+ ]
357
+ )
358
+ )
359
+
360
+ @computed_field # type: ignore
361
+ @property
362
+ def low_score(self) -> ScoreValue:
363
+ return ScoreValue(
364
+ np.nanquantile(
365
+ [
366
+ self.ocr_score,
367
+ self.table_score,
368
+ self.layout_score,
369
+ self.parse_score,
370
+ ],
371
+ q=0.05,
372
+ )
373
+ )
374
+
375
+
376
+ class ConfidenceReport(PageConfidenceScores):
377
+ pages: Dict[int, PageConfidenceScores] = Field(
378
+ default_factory=lambda: defaultdict(PageConfidenceScores)
379
+ )
380
+
381
+ @computed_field # type: ignore
382
+ @property
383
+ def mean_score(self) -> ScoreValue:
384
+ return ScoreValue(
385
+ np.nanmean(
386
+ [c.mean_score for c in self.pages.values()],
387
+ )
388
+ )
389
+
390
+ @computed_field # type: ignore
391
+ @property
392
+ def low_score(self) -> ScoreValue:
393
+ return ScoreValue(
394
+ np.nanmean(
395
+ [c.low_score for c in self.pages.values()],
396
+ )
397
+ )
@@ -47,7 +47,7 @@ from docling_core.types.legacy_doc.document import (
47
47
  )
48
48
  from docling_core.utils.file import resolve_source_to_stream
49
49
  from docling_core.utils.legacy import docling_document_to_legacy
50
- from pydantic import BaseModel
50
+ from pydantic import BaseModel, Field
51
51
  from typing_extensions import deprecated
52
52
 
53
53
  from docling.backend.abstract_backend import (
@@ -56,6 +56,7 @@ from docling.backend.abstract_backend import (
56
56
  )
57
57
  from docling.datamodel.base_models import (
58
58
  AssembledUnit,
59
+ ConfidenceReport,
59
60
  ConversionStatus,
60
61
  DocumentStream,
61
62
  ErrorItem,
@@ -201,6 +202,7 @@ class ConversionResult(BaseModel):
201
202
  pages: List[Page] = []
202
203
  assembled: AssembledUnit = AssembledUnit()
203
204
  timings: Dict[str, ProfilingItem] = {}
205
+ confidence: ConfidenceReport = Field(default_factory=ConfidenceReport)
204
206
 
205
207
  document: DoclingDocument = _EMPTY_DOCLING_DOC
206
208
 
@@ -332,9 +334,9 @@ class _DocumentConversionInput(BaseModel):
332
334
  ) -> Optional[InputFormat]:
333
335
  """Guess the input format of a document by checking part of its content."""
334
336
  input_format: Optional[InputFormat] = None
335
- content_str = content.decode("utf-8")
336
337
 
337
338
  if mime == "application/xml":
339
+ content_str = content.decode("utf-8")
338
340
  match_doctype = re.search(r"<!DOCTYPE [^>]+>", content_str)
339
341
  if match_doctype:
340
342
  xml_doctype = match_doctype.group()
@@ -356,6 +358,7 @@ class _DocumentConversionInput(BaseModel):
356
358
  input_format = InputFormat.XML_JATS
357
359
 
358
360
  elif mime == "text/plain":
361
+ content_str = content.decode("utf-8")
359
362
  if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
360
363
  input_format = InputFormat.XML_USPTO
361
364
 
@@ -409,7 +412,11 @@ class _DocumentConversionInput(BaseModel):
409
412
  else:
410
413
  return "application/xml"
411
414
 
412
- if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
415
+ if re.match(
416
+ r"(<script.*?>.*?</script>\s*)?(<!doctype\s+html|<html|<head|<body)",
417
+ content_str,
418
+ re.DOTALL,
419
+ ):
413
420
  return "text/html"
414
421
 
415
422
  p = re.compile(
@@ -5,6 +5,7 @@ from collections.abc import Iterable
5
5
  from pathlib import Path
6
6
  from typing import Optional
7
7
 
8
+ import numpy as np
8
9
  from docling_core.types.doc import DocItemLabel
9
10
  from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
10
11
  from PIL import Image
@@ -184,6 +185,24 @@ class LayoutModel(BasePageModel):
184
185
  ).postprocess()
185
186
  # processed_clusters, processed_cells = clusters, page.cells
186
187
 
188
+ with warnings.catch_warnings():
189
+ warnings.filterwarnings(
190
+ "ignore",
191
+ "Mean of empty slice|invalid value encountered in scalar divide",
192
+ RuntimeWarning,
193
+ "numpy",
194
+ )
195
+
196
+ conv_res.confidence.pages[page.page_no].layout_score = float(
197
+ np.mean([c.confidence for c in processed_clusters])
198
+ )
199
+
200
+ conv_res.confidence.pages[page.page_no].ocr_score = float(
201
+ np.mean(
202
+ [c.confidence for c in processed_cells if c.from_ocr]
203
+ )
204
+ )
205
+
187
206
  page.cells = processed_cells
188
207
  page.predictions.layout = LayoutPrediction(
189
208
  clusters=processed_clusters
@@ -3,6 +3,7 @@ import re
3
3
  from collections.abc import Iterable
4
4
  from typing import List
5
5
 
6
+ import numpy as np
6
7
  from pydantic import BaseModel
7
8
 
8
9
  from docling.datamodel.base_models import (
@@ -1,7 +1,10 @@
1
+ import re
2
+ import warnings
1
3
  from collections.abc import Iterable
2
4
  from pathlib import Path
3
5
  from typing import Optional
4
6
 
7
+ import numpy as np
5
8
  from PIL import ImageDraw
6
9
  from pydantic import BaseModel
7
10
 
@@ -21,6 +24,14 @@ class PagePreprocessingModel(BasePageModel):
21
24
  def __init__(self, options: PagePreprocessingOptions):
22
25
  self.options = options
23
26
 
27
+ # Pre-compiled regex patterns for efficiency
28
+ self.GLYPH_RE = re.compile(r"GLYPH<[0-9A-Fa-f]+>")
29
+ self.SLASH_G_RE = re.compile(r"(?:/G\d+){2,}")
30
+ self.FRAG_RE = re.compile(r"\b[A-Za-z](?:/[a-z]{1,3}\.[a-z]{1,3}){2,}\b")
31
+ self.SLASH_NUMBER_GARBAGE_RE = re.compile(
32
+ r"(?:/\w+\s*){2,}"
33
+ ) # Two or more "/token " sequences
34
+
24
35
  def __call__(
25
36
  self, conv_res: ConversionResult, page_batch: Iterable[Page]
26
37
  ) -> Iterable[Page]:
@@ -60,6 +71,22 @@ class PagePreprocessingModel(BasePageModel):
60
71
  if self.options.create_parsed_page:
61
72
  page.parsed_page = page._backend.get_segmented_page()
62
73
 
74
+ # Rate the text quality from the PDF parser, and aggregate on page
75
+ text_scores = []
76
+ for c in page.cells:
77
+ score = self.rate_text_quality(c.text)
78
+ text_scores.append(score)
79
+
80
+ with warnings.catch_warnings():
81
+ warnings.filterwarnings(
82
+ "ignore", "Mean of empty slice", RuntimeWarning, "numpy"
83
+ )
84
+ conv_res.confidence.pages[page.page_no].parse_score = float(
85
+ np.nanquantile(
86
+ text_scores, q=0.10
87
+ ) # To emphasise problems in the parse_score, we take the 10% percentile score of all text cells.
88
+ )
89
+
63
90
  # DEBUG code:
64
91
  def draw_text_boxes(image, cells, show: bool = False):
65
92
  draw = ImageDraw.Draw(image)
@@ -88,3 +115,30 @@ class PagePreprocessingModel(BasePageModel):
88
115
  draw_text_boxes(page.get_image(scale=1.0), page.cells)
89
116
 
90
117
  return page
118
+
119
+ def rate_text_quality(self, text: str) -> float:
120
+ # Hard errors: if any of these patterns are found, return 0.0 immediately.
121
+ blacklist_chars = ["�"]
122
+ if (
123
+ any(text.find(c) >= 0 for c in blacklist_chars)
124
+ or self.GLYPH_RE.search(text)
125
+ or self.SLASH_G_RE.search(text)
126
+ or self.SLASH_NUMBER_GARBAGE_RE.match(
127
+ text
128
+ ) # Check if text is mostly slash-number pattern
129
+ ):
130
+ return 0.0
131
+
132
+ penalty = 0.0
133
+
134
+ # Apply a penalty only if the fragmented words pattern occurs at least three times.
135
+ frag_matches = self.FRAG_RE.findall(text)
136
+ if len(frag_matches) >= 3:
137
+ penalty += 0.1 * len(frag_matches)
138
+
139
+ # Additional heuristic: if the average token length is below 2, add a penalty.
140
+ # tokens = text.split()
141
+ # if tokens and (sum(map(len, tokens)) / len(tokens)) < 2:
142
+ # penalty += 0.2
143
+
144
+ return max(1.0 - penalty, 0.0)
@@ -2,6 +2,7 @@ import csv
2
2
  import io
3
3
  import logging
4
4
  import os
5
+ import subprocess
5
6
  import tempfile
6
7
  from collections.abc import Iterable
7
8
  from pathlib import Path
@@ -10,7 +11,7 @@ from typing import List, Optional, Tuple, Type
10
11
 
11
12
  import pandas as pd
12
13
  from docling_core.types.doc import BoundingBox, CoordOrigin
13
- from docling_core.types.doc.page import BoundingRectangle, TextCell
14
+ from docling_core.types.doc.page import TextCell
14
15
 
15
16
  from docling.datamodel.base_models import Page
16
17
  from docling.datamodel.document import ConversionResult
@@ -21,7 +22,11 @@ from docling.datamodel.pipeline_options import (
21
22
  )
22
23
  from docling.datamodel.settings import settings
23
24
  from docling.models.base_ocr_model import BaseOcrModel
24
- from docling.utils.ocr_utils import map_tesseract_script
25
+ from docling.utils.ocr_utils import (
26
+ map_tesseract_script,
27
+ parse_tesseract_orientation,
28
+ tesseract_box_to_bounding_rectangle,
29
+ )
25
30
  from docling.utils.profiling import TimeRecorder
26
31
 
27
32
  _log = logging.getLogger(__name__)
@@ -49,6 +54,7 @@ class TesseractOcrCliModel(BaseOcrModel):
49
54
  self._version: Optional[str] = None
50
55
  self._tesseract_languages: Optional[List[str]] = None
51
56
  self._script_prefix: Optional[str] = None
57
+ self._is_auto: bool = "auto" in self.options.lang
52
58
 
53
59
  if self.enabled:
54
60
  try:
@@ -93,14 +99,13 @@ class TesseractOcrCliModel(BaseOcrModel):
93
99
 
94
100
  return name, version
95
101
 
96
- def _run_tesseract(self, ifilename: str):
102
+ def _run_tesseract(self, ifilename: str, osd: pd.DataFrame):
97
103
  r"""
98
104
  Run tesseract CLI
99
105
  """
100
106
  cmd = [self.options.tesseract_cmd]
101
-
102
- if "auto" in self.options.lang:
103
- lang = self._detect_language(ifilename)
107
+ if self._is_auto:
108
+ lang = self._parse_language(osd)
104
109
  if lang is not None:
105
110
  cmd.append("-l")
106
111
  cmd.append(lang)
@@ -115,13 +120,12 @@ class TesseractOcrCliModel(BaseOcrModel):
115
120
  cmd += [ifilename, "stdout", "tsv"]
116
121
  _log.info("command: {}".format(" ".join(cmd)))
117
122
 
118
- proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
119
- output, _ = proc.communicate()
123
+ output = subprocess.run(cmd, stdout=PIPE, stderr=DEVNULL, check=True)
120
124
 
121
125
  # _log.info(output)
122
126
 
123
127
  # Decode the byte string to a regular string
124
- decoded_data = output.decode("utf-8")
128
+ decoded_data = output.stdout.decode("utf-8")
125
129
  # _log.info(decoded_data)
126
130
 
127
131
  # Read the TSV file generated by Tesseract
@@ -139,22 +143,24 @@ class TesseractOcrCliModel(BaseOcrModel):
139
143
 
140
144
  return df_filtered
141
145
 
142
- def _detect_language(self, ifilename: str):
146
+ def _perform_osd(self, ifilename: str) -> pd.DataFrame:
143
147
  r"""
144
148
  Run tesseract in PSM 0 mode to detect the language
145
149
  """
146
- assert self._tesseract_languages is not None
147
150
 
148
151
  cmd = [self.options.tesseract_cmd]
149
152
  cmd.extend(["--psm", "0", "-l", "osd", ifilename, "stdout"])
150
153
  _log.info("command: {}".format(" ".join(cmd)))
151
- proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
152
- output, _ = proc.communicate()
153
- decoded_data = output.decode("utf-8")
154
+ output = subprocess.run(cmd, capture_output=True, check=True)
155
+ decoded_data = output.stdout.decode("utf-8")
154
156
  df_detected = pd.read_csv(
155
157
  io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
156
158
  )
157
- scripts = df_detected.loc[df_detected["key"] == "Script"].value.tolist()
159
+ return df_detected
160
+
161
+ def _parse_language(self, df_osd: pd.DataFrame) -> Optional[str]:
162
+ assert self._tesseract_languages is not None
163
+ scripts = df_osd.loc[df_osd["key"] == "Script"].value.tolist()
158
164
  if len(scripts) == 0:
159
165
  _log.warning("Tesseract cannot detect the script of the page")
160
166
  return None
@@ -182,9 +188,8 @@ class TesseractOcrCliModel(BaseOcrModel):
182
188
  cmd = [self.options.tesseract_cmd]
183
189
  cmd.append("--list-langs")
184
190
  _log.info("command: {}".format(" ".join(cmd)))
185
- proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
186
- output, _ = proc.communicate()
187
- decoded_data = output.decode("utf-8")
191
+ output = subprocess.run(cmd, stdout=PIPE, stderr=DEVNULL, check=True)
192
+ decoded_data = output.stdout.decode("utf-8")
188
193
  df_list = pd.read_csv(io.StringIO(decoded_data), header=None)
189
194
  self._tesseract_languages = df_list[0].tolist()[1:]
190
195
 
@@ -203,7 +208,7 @@ class TesseractOcrCliModel(BaseOcrModel):
203
208
  yield from page_batch
204
209
  return
205
210
 
206
- for page in page_batch:
211
+ for page_i, page in enumerate(page_batch):
207
212
  assert page._backend is not None
208
213
  if not page._backend.is_valid():
209
214
  yield page
@@ -212,7 +217,7 @@ class TesseractOcrCliModel(BaseOcrModel):
212
217
  ocr_rects = self.get_ocr_rects(page)
213
218
 
214
219
  all_ocr_cells = []
215
- for ocr_rect in ocr_rects:
220
+ for ocr_rect_i, ocr_rect in enumerate(ocr_rects):
216
221
  # Skip zero area boxes
217
222
  if ocr_rect.area() == 0:
218
223
  continue
@@ -225,8 +230,42 @@ class TesseractOcrCliModel(BaseOcrModel):
225
230
  ) as image_file:
226
231
  fname = image_file.name
227
232
  high_res_image.save(image_file)
228
-
229
- df_result = self._run_tesseract(fname)
233
+ doc_orientation = 0
234
+ try:
235
+ df_osd = self._perform_osd(fname)
236
+ doc_orientation = _parse_orientation(df_osd)
237
+ except subprocess.CalledProcessError as exc:
238
+ _log.error(
239
+ "OSD failed (doc %s, page: %s, "
240
+ "OCR rectangle: %s, processed image file %s):\n %s",
241
+ conv_res.input.file,
242
+ page_i,
243
+ ocr_rect_i,
244
+ image_file,
245
+ exc.stderr,
246
+ )
247
+ # Skipping if OSD fail when in auto mode, otherwise proceed
248
+ # to OCR in the hope OCR will succeed while OSD failed
249
+ if self._is_auto:
250
+ continue
251
+ if doc_orientation != 0:
252
+ high_res_image = high_res_image.rotate(
253
+ -doc_orientation, expand=True
254
+ )
255
+ high_res_image.save(fname)
256
+ try:
257
+ df_result = self._run_tesseract(fname, df_osd)
258
+ except subprocess.CalledProcessError as exc:
259
+ _log.error(
260
+ "tesseract OCR failed (doc %s, page: %s, "
261
+ "OCR rectangle: %s, processed image file %s):\n %s",
262
+ conv_res.input.file,
263
+ page_i,
264
+ ocr_rect_i,
265
+ image_file,
266
+ exc.stderr,
267
+ )
268
+ continue
230
269
  finally:
231
270
  if os.path.exists(fname):
232
271
  os.remove(fname)
@@ -238,31 +277,30 @@ class TesseractOcrCliModel(BaseOcrModel):
238
277
  text = row["text"]
239
278
  conf = row["conf"]
240
279
 
241
- l = float(row["left"]) # noqa: E741
242
- b = float(row["top"])
243
- w = float(row["width"])
244
- h = float(row["height"])
245
-
246
- t = b + h
247
- r = l + w
248
-
280
+ left, top = float(row["left"]), float(row["top"])
281
+ right = left + float(row["width"])
282
+ bottom = top + row["height"]
283
+ bbox = BoundingBox(
284
+ l=left,
285
+ t=top,
286
+ r=right,
287
+ b=bottom,
288
+ coord_origin=CoordOrigin.TOPLEFT,
289
+ )
290
+ rect = tesseract_box_to_bounding_rectangle(
291
+ bbox,
292
+ original_offset=ocr_rect,
293
+ scale=self.scale,
294
+ orientation=doc_orientation,
295
+ im_size=high_res_image.size,
296
+ )
249
297
  cell = TextCell(
250
298
  index=ix,
251
299
  text=str(text),
252
300
  orig=str(text),
253
301
  from_ocr=True,
254
302
  confidence=conf / 100.0,
255
- rect=BoundingRectangle.from_bounding_box(
256
- BoundingBox.from_tuple(
257
- coord=(
258
- (l / self.scale) + ocr_rect.l,
259
- (b / self.scale) + ocr_rect.t,
260
- (r / self.scale) + ocr_rect.l,
261
- (t / self.scale) + ocr_rect.t,
262
- ),
263
- origin=CoordOrigin.TOPLEFT,
264
- )
265
- ),
303
+ rect=rect,
266
304
  )
267
305
  all_ocr_cells.append(cell)
268
306
 
@@ -278,3 +316,9 @@ class TesseractOcrCliModel(BaseOcrModel):
278
316
  @classmethod
279
317
  def get_options_type(cls) -> Type[OcrOptions]:
280
318
  return TesseractCliOcrOptions
319
+
320
+
321
+ def _parse_orientation(df_osd: pd.DataFrame) -> int:
322
+ orientations = df_osd.loc[df_osd["key"] == "Orientation in degrees"].value.tolist()
323
+ orientation = parse_tesseract_orientation(orientations[0].strip())
324
+ return orientation