docling 2.15.1__py3-none-any.whl → 2.17.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. docling/backend/abstract_backend.py +0 -1
  2. docling/backend/asciidoc_backend.py +0 -1
  3. docling/backend/docling_parse_backend.py +1 -1
  4. docling/backend/docling_parse_v2_backend.py +1 -1
  5. docling/backend/html_backend.py +4 -3
  6. docling/backend/json/__init__.py +0 -0
  7. docling/backend/json/docling_json_backend.py +58 -0
  8. docling/backend/md_backend.py +49 -36
  9. docling/backend/msexcel_backend.py +50 -38
  10. docling/backend/msword_backend.py +0 -1
  11. docling/backend/pdf_backend.py +0 -2
  12. docling/backend/pypdfium2_backend.py +1 -1
  13. docling/backend/xml/uspto_backend.py +25 -25
  14. docling/cli/main.py +18 -3
  15. docling/datamodel/base_models.py +30 -3
  16. docling/datamodel/document.py +4 -0
  17. docling/datamodel/pipeline_options.py +7 -9
  18. docling/document_converter.py +4 -0
  19. docling/models/base_model.py +62 -6
  20. docling/models/code_formula_model.py +245 -0
  21. docling/models/document_picture_classifier.py +187 -0
  22. docling/models/layout_model.py +10 -86
  23. docling/models/page_assemble_model.py +1 -33
  24. docling/models/rapid_ocr_model.py +1 -0
  25. docling/models/tesseract_ocr_cli_model.py +72 -5
  26. docling/models/tesseract_ocr_model.py +68 -20
  27. docling/pipeline/base_pipeline.py +40 -17
  28. docling/pipeline/standard_pdf_pipeline.py +31 -2
  29. docling/utils/glm_utils.py +4 -1
  30. docling/utils/ocr_utils.py +9 -0
  31. docling/utils/visualization.py +80 -0
  32. {docling-2.15.1.dist-info → docling-2.17.0.dist-info}/METADATA +17 -13
  33. docling-2.17.0.dist-info/RECORD +62 -0
  34. docling-2.15.1.dist-info/RECORD +0 -56
  35. {docling-2.15.1.dist-info → docling-2.17.0.dist-info}/LICENSE +0 -0
  36. {docling-2.15.1.dist-info → docling-2.17.0.dist-info}/WHEEL +0 -0
  37. {docling-2.15.1.dist-info → docling-2.17.0.dist-info}/entry_points.txt +0 -0
docling/backend/xml/uspto_backend.py CHANGED
@@ -389,7 +389,7 @@ class PatentUsptoIce(PatentUspto):
389
389
  if name == self.Element.TITLE.value:
390
390
  if text:
391
391
  self.parents[self.level + 1] = self.doc.add_title(
392
- parent=self.parents[self.level], # type: ignore[arg-type]
392
+ parent=self.parents[self.level],
393
393
  text=text,
394
394
  )
395
395
  self.level += 1
@@ -406,7 +406,7 @@ class PatentUsptoIce(PatentUspto):
406
406
  abstract_item = self.doc.add_heading(
407
407
  heading_text,
408
408
  level=heading_level,
409
- parent=self.parents[heading_level], # type: ignore[arg-type]
409
+ parent=self.parents[heading_level],
410
410
  )
411
411
  self.doc.add_text(
412
412
  label=DocItemLabel.PARAGRAPH,
@@ -434,7 +434,7 @@ class PatentUsptoIce(PatentUspto):
434
434
  claims_item = self.doc.add_heading(
435
435
  heading_text,
436
436
  level=heading_level,
437
- parent=self.parents[heading_level], # type: ignore[arg-type]
437
+ parent=self.parents[heading_level],
438
438
  )
439
439
  for text in self.claims:
440
440
  self.doc.add_text(
@@ -452,7 +452,7 @@ class PatentUsptoIce(PatentUspto):
452
452
  self.doc.add_text(
453
453
  label=DocItemLabel.PARAGRAPH,
454
454
  text=text,
455
- parent=self.parents[self.level], # type: ignore[arg-type]
455
+ parent=self.parents[self.level],
456
456
  )
457
457
  self.text = ""
458
458
 
@@ -460,7 +460,7 @@ class PatentUsptoIce(PatentUspto):
460
460
  self.parents[self.level + 1] = self.doc.add_heading(
461
461
  text=text,
462
462
  level=self.level,
463
- parent=self.parents[self.level], # type: ignore[arg-type]
463
+ parent=self.parents[self.level],
464
464
  )
465
465
  self.level += 1
466
466
  self.text = ""
@@ -470,7 +470,7 @@ class PatentUsptoIce(PatentUspto):
470
470
  empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
471
471
  self.doc.add_table(
472
472
  data=empty_table,
473
- parent=self.parents[self.level], # type: ignore[arg-type]
473
+ parent=self.parents[self.level],
474
474
  )
475
475
 
476
476
  def _apply_style(self, text: str, style_tag: str) -> str:
@@ -721,7 +721,7 @@ class PatentUsptoGrantV2(PatentUspto):
721
721
  if self.Element.TITLE.value in self.property and text.strip():
722
722
  title = text.strip()
723
723
  self.parents[self.level + 1] = self.doc.add_title(
724
- parent=self.parents[self.level], # type: ignore[arg-type]
724
+ parent=self.parents[self.level],
725
725
  text=title,
726
726
  )
727
727
  self.level += 1
@@ -749,7 +749,7 @@ class PatentUsptoGrantV2(PatentUspto):
749
749
  self.parents[self.level + 1] = self.doc.add_heading(
750
750
  text=text.strip(),
751
751
  level=self.level,
752
- parent=self.parents[self.level], # type: ignore[arg-type]
752
+ parent=self.parents[self.level],
753
753
  )
754
754
  self.level += 1
755
755
 
@@ -769,7 +769,7 @@ class PatentUsptoGrantV2(PatentUspto):
769
769
  claims_item = self.doc.add_heading(
770
770
  heading_text,
771
771
  level=heading_level,
772
- parent=self.parents[heading_level], # type: ignore[arg-type]
772
+ parent=self.parents[heading_level],
773
773
  )
774
774
  for text in self.claims:
775
775
  self.doc.add_text(
@@ -787,7 +787,7 @@ class PatentUsptoGrantV2(PatentUspto):
787
787
  abstract_item = self.doc.add_heading(
788
788
  heading_text,
789
789
  level=heading_level,
790
- parent=self.parents[heading_level], # type: ignore[arg-type]
790
+ parent=self.parents[heading_level],
791
791
  )
792
792
  self.doc.add_text(
793
793
  label=DocItemLabel.PARAGRAPH, text=abstract, parent=abstract_item
@@ -799,7 +799,7 @@ class PatentUsptoGrantV2(PatentUspto):
799
799
  self.doc.add_text(
800
800
  label=DocItemLabel.PARAGRAPH,
801
801
  text=paragraph,
802
- parent=self.parents[self.level], # type: ignore[arg-type]
802
+ parent=self.parents[self.level],
803
803
  )
804
804
  elif self.Element.CLAIM.value in self.property:
805
805
  # we may need a space after a paragraph in claim text
@@ -811,7 +811,7 @@ class PatentUsptoGrantV2(PatentUspto):
811
811
  empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
812
812
  self.doc.add_table(
813
813
  data=empty_table,
814
- parent=self.parents[self.level], # type: ignore[arg-type]
814
+ parent=self.parents[self.level],
815
815
  )
816
816
 
817
817
  def _apply_style(self, text: str, style_tag: str) -> str:
@@ -938,7 +938,7 @@ class PatentUsptoGrantAps(PatentUspto):
938
938
  self.parents[self.level + 1] = self.doc.add_heading(
939
939
  heading.value,
940
940
  level=self.level,
941
- parent=self.parents[self.level], # type: ignore[arg-type]
941
+ parent=self.parents[self.level],
942
942
  )
943
943
  self.level += 1
944
944
 
@@ -959,7 +959,7 @@ class PatentUsptoGrantAps(PatentUspto):
959
959
 
960
960
  if field == self.Field.TITLE.value:
961
961
  self.parents[self.level + 1] = self.doc.add_title(
962
- parent=self.parents[self.level], text=value # type: ignore[arg-type]
962
+ parent=self.parents[self.level], text=value
963
963
  )
964
964
  self.level += 1
965
965
 
@@ -971,14 +971,14 @@ class PatentUsptoGrantAps(PatentUspto):
971
971
  self.doc.add_text(
972
972
  label=DocItemLabel.PARAGRAPH,
973
973
  text=value,
974
- parent=self.parents[self.level], # type: ignore[arg-type]
974
+ parent=self.parents[self.level],
975
975
  )
976
976
 
977
977
  elif field == self.Field.NUMBER.value and section == self.Section.CLAIMS.value:
978
978
  self.doc.add_text(
979
979
  label=DocItemLabel.PARAGRAPH,
980
980
  text="",
981
- parent=self.parents[self.level], # type: ignore[arg-type]
981
+ parent=self.parents[self.level],
982
982
  )
983
983
 
984
984
  elif (
@@ -996,7 +996,7 @@ class PatentUsptoGrantAps(PatentUspto):
996
996
  last_claim = self.doc.add_text(
997
997
  label=DocItemLabel.PARAGRAPH,
998
998
  text="",
999
- parent=self.parents[self.level], # type: ignore[arg-type]
999
+ parent=self.parents[self.level],
1000
1000
  )
1001
1001
 
1002
1002
  last_claim.text += f" {value}" if last_claim.text else value
@@ -1012,7 +1012,7 @@ class PatentUsptoGrantAps(PatentUspto):
1012
1012
  self.parents[self.level + 1] = self.doc.add_heading(
1013
1013
  value,
1014
1014
  level=self.level,
1015
- parent=self.parents[self.level], # type: ignore[arg-type]
1015
+ parent=self.parents[self.level],
1016
1016
  )
1017
1017
  self.level += 1
1018
1018
 
@@ -1029,7 +1029,7 @@ class PatentUsptoGrantAps(PatentUspto):
1029
1029
  self.doc.add_text(
1030
1030
  label=DocItemLabel.PARAGRAPH,
1031
1031
  text=value,
1032
- parent=self.parents[self.level], # type: ignore[arg-type]
1032
+ parent=self.parents[self.level],
1033
1033
  )
1034
1034
 
1035
1035
  def parse(self, patent_content: str) -> Optional[DoclingDocument]:
@@ -1283,7 +1283,7 @@ class PatentUsptoAppV1(PatentUspto):
1283
1283
  title = text.strip()
1284
1284
  if title:
1285
1285
  self.parents[self.level + 1] = self.doc.add_text(
1286
- parent=self.parents[self.level], # type: ignore[arg-type]
1286
+ parent=self.parents[self.level],
1287
1287
  label=DocItemLabel.TITLE,
1288
1288
  text=title,
1289
1289
  )
@@ -1301,7 +1301,7 @@ class PatentUsptoAppV1(PatentUspto):
1301
1301
  abstract_item = self.doc.add_heading(
1302
1302
  heading_text,
1303
1303
  level=heading_level,
1304
- parent=self.parents[heading_level], # type: ignore[arg-type]
1304
+ parent=self.parents[heading_level],
1305
1305
  )
1306
1306
  self.doc.add_text(
1307
1307
  label=DocItemLabel.PARAGRAPH,
@@ -1331,7 +1331,7 @@ class PatentUsptoAppV1(PatentUspto):
1331
1331
  claims_item = self.doc.add_heading(
1332
1332
  heading_text,
1333
1333
  level=heading_level,
1334
- parent=self.parents[heading_level], # type: ignore[arg-type]
1334
+ parent=self.parents[heading_level],
1335
1335
  )
1336
1336
  for text in self.claims:
1337
1337
  self.doc.add_text(
@@ -1350,14 +1350,14 @@ class PatentUsptoAppV1(PatentUspto):
1350
1350
  self.parents[self.level + 1] = self.doc.add_heading(
1351
1351
  text=text,
1352
1352
  level=self.level,
1353
- parent=self.parents[self.level], # type: ignore[arg-type]
1353
+ parent=self.parents[self.level],
1354
1354
  )
1355
1355
  self.level += 1
1356
1356
  else:
1357
1357
  self.doc.add_text(
1358
1358
  label=DocItemLabel.PARAGRAPH,
1359
1359
  text=text,
1360
- parent=self.parents[self.level], # type: ignore[arg-type]
1360
+ parent=self.parents[self.level],
1361
1361
  )
1362
1362
  self.text = ""
1363
1363
 
@@ -1366,7 +1366,7 @@ class PatentUsptoAppV1(PatentUspto):
1366
1366
  empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
1367
1367
  self.doc.add_table(
1368
1368
  data=empty_table,
1369
- parent=self.parents[self.level], # type: ignore[arg-type]
1369
+ parent=self.parents[self.level],
1370
1370
  )
1371
1371
 
1372
1372
  def _apply_style(self, text: str, style_tag: str) -> str:
docling/cli/main.py CHANGED
@@ -1,18 +1,18 @@
1
1
  import importlib
2
- import json
3
2
  import logging
3
+ import platform
4
4
  import re
5
+ import sys
5
6
  import tempfile
6
7
  import time
7
8
  import warnings
8
- from enum import Enum
9
9
  from pathlib import Path
10
10
  from typing import Annotated, Dict, Iterable, List, Optional, Type
11
11
 
12
12
  import typer
13
13
  from docling_core.types.doc import ImageRefMode
14
14
  from docling_core.utils.file import resolve_source_to_path
15
- from pydantic import TypeAdapter, ValidationError
15
+ from pydantic import TypeAdapter
16
16
 
17
17
  from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
18
18
  from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
@@ -65,10 +65,15 @@ def version_callback(value: bool):
65
65
  docling_core_version = importlib.metadata.version("docling-core")
66
66
  docling_ibm_models_version = importlib.metadata.version("docling-ibm-models")
67
67
  docling_parse_version = importlib.metadata.version("docling-parse")
68
+ platform_str = platform.platform()
69
+ py_impl_version = sys.implementation.cache_tag
70
+ py_lang_version = platform.python_version()
68
71
  print(f"Docling version: {docling_version}")
69
72
  print(f"Docling Core version: {docling_core_version}")
70
73
  print(f"Docling IBM Models version: {docling_ibm_models_version}")
71
74
  print(f"Docling Parse version: {docling_parse_version}")
75
+ print(f"Python: {py_impl_version} ({py_lang_version})")
76
+ print(f"Platform: {platform_str}")
72
77
  raise typer.Exit()
73
78
 
74
79
 
@@ -206,6 +211,14 @@ def convert(
206
211
  TableFormerMode,
207
212
  typer.Option(..., help="The mode to use in the table structure model."),
208
213
  ] = TableFormerMode.FAST,
214
+ enrich_code: Annotated[
215
+ bool,
216
+ typer.Option(..., help="Enable the code enrichment model in the pipeline."),
217
+ ] = False,
218
+ enrich_formula: Annotated[
219
+ bool,
220
+ typer.Option(..., help="Enable the formula enrichment model in the pipeline."),
221
+ ] = False,
209
222
  artifacts_path: Annotated[
210
223
  Optional[Path],
211
224
  typer.Option(..., help="If provided, the location of the model artifacts."),
@@ -360,6 +373,8 @@ def convert(
360
373
  do_ocr=ocr,
361
374
  ocr_options=ocr_options,
362
375
  do_table_structure=True,
376
+ do_code_enrichment=enrich_code,
377
+ do_formula_enrichment=enrich_formula,
363
378
  document_timeout=document_timeout,
364
379
  )
365
380
  pipeline_options.table_structure_options.do_cell_matching = (
docling/datamodel/base_models.py CHANGED
@@ -4,6 +4,7 @@ from typing import TYPE_CHECKING, Dict, List, Optional, Union
4
4
  from docling_core.types.doc import (
5
5
  BoundingBox,
6
6
  DocItemLabel,
7
+ NodeItem,
7
8
  PictureDataType,
8
9
  Size,
9
10
  TableCell,
@@ -40,6 +41,7 @@ class InputFormat(str, Enum):
40
41
  MD = "md"
41
42
  XLSX = "xlsx"
42
43
  XML_USPTO = "xml_uspto"
44
+ JSON_DOCLING = "json_docling"
43
45
 
44
46
 
45
47
  class OutputFormat(str, Enum):
@@ -61,6 +63,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
61
63
  InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
62
64
  InputFormat.XLSX: ["xlsx"],
63
65
  InputFormat.XML_USPTO: ["xml", "txt"],
66
+ InputFormat.JSON_DOCLING: ["json"],
64
67
  }
65
68
 
66
69
  FormatToMimeType: Dict[InputFormat, List[str]] = {
@@ -89,6 +92,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
89
92
  "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
90
93
  ],
91
94
  InputFormat.XML_USPTO: ["application/xml", "text/plain"],
95
+ InputFormat.JSON_DOCLING: ["application/json"],
92
96
  }
93
97
 
94
98
  MimeTypeToFormat: dict[str, list[InputFormat]] = {
@@ -201,6 +205,13 @@ class AssembledUnit(BaseModel):
201
205
  headers: List[PageElement] = []
202
206
 
203
207
 
208
+ class ItemAndImageEnrichmentElement(BaseModel):
209
+ model_config = ConfigDict(arbitrary_types_allowed=True)
210
+
211
+ item: NodeItem
212
+ image: Image
213
+
214
+
204
215
  class Page(BaseModel):
205
216
  model_config = ConfigDict(arbitrary_types_allowed=True)
206
217
 
@@ -219,12 +230,28 @@ class Page(BaseModel):
219
230
  {}
220
231
  ) # Cache of images in different scales. By default it is cleared during assembling.
221
232
 
222
- def get_image(self, scale: float = 1.0) -> Optional[Image]:
233
+ def get_image(
234
+ self, scale: float = 1.0, cropbox: Optional[BoundingBox] = None
235
+ ) -> Optional[Image]:
223
236
  if self._backend is None:
224
237
  return self._image_cache.get(scale, None)
238
+
225
239
  if not scale in self._image_cache:
226
- self._image_cache[scale] = self._backend.get_page_image(scale=scale)
227
- return self._image_cache[scale]
240
+ if cropbox is None:
241
+ self._image_cache[scale] = self._backend.get_page_image(scale=scale)
242
+ else:
243
+ return self._backend.get_page_image(scale=scale, cropbox=cropbox)
244
+
245
+ if cropbox is None:
246
+ return self._image_cache[scale]
247
+ else:
248
+ page_im = self._image_cache[scale]
249
+ assert self.size is not None
250
+ return page_im.crop(
251
+ cropbox.to_top_left_origin(page_height=self.size.height)
252
+ .scaled(scale=scale)
253
+ .as_tuple()
254
+ )
228
255
 
229
256
  @property
230
257
  def image(self) -> Optional[Image]:
docling/datamodel/document.py CHANGED
@@ -350,6 +350,10 @@ class _DocumentConversionInput(BaseModel):
350
350
  mime = FormatToMimeType[InputFormat.HTML][0]
351
351
  elif ext in FormatToExtensions[InputFormat.MD]:
352
352
  mime = FormatToMimeType[InputFormat.MD][0]
353
+ elif ext in FormatToExtensions[InputFormat.JSON_DOCLING]:
354
+ mime = FormatToMimeType[InputFormat.JSON_DOCLING][0]
355
+ elif ext in FormatToExtensions[InputFormat.PDF]:
356
+ mime = FormatToMimeType[InputFormat.PDF][0]
353
357
  return mime
354
358
 
355
359
  @staticmethod
docling/datamodel/pipeline_options.py CHANGED
@@ -1,17 +1,11 @@
1
1
  import logging
2
2
  import os
3
- import warnings
4
3
  from enum import Enum
5
4
  from pathlib import Path
6
- from typing import Annotated, Any, Dict, List, Literal, Optional, Tuple, Type, Union
5
+ from typing import Any, List, Literal, Optional, Union
7
6
 
8
- from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
9
- from pydantic_settings import (
10
- BaseSettings,
11
- PydanticBaseSettingsSource,
12
- SettingsConfigDict,
13
- )
14
- from typing_extensions import deprecated
7
+ from pydantic import BaseModel, ConfigDict, Field, model_validator
8
+ from pydantic_settings import BaseSettings, SettingsConfigDict
15
9
 
16
10
  _log = logging.getLogger(__name__)
17
11
 
@@ -125,6 +119,7 @@ class RapidOcrOptions(OcrOptions):
125
119
  det_model_path: Optional[str] = None # same default as rapidocr
126
120
  cls_model_path: Optional[str] = None # same default as rapidocr
127
121
  rec_model_path: Optional[str] = None # same default as rapidocr
122
+ rec_keys_path: Optional[str] = None # same default as rapidocr
128
123
 
129
124
  model_config = ConfigDict(
130
125
  extra="forbid",
@@ -225,6 +220,9 @@ class PdfPipelineOptions(PipelineOptions):
225
220
  artifacts_path: Optional[Union[Path, str]] = None
226
221
  do_table_structure: bool = True # True: perform table structure extraction
227
222
  do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
223
+ do_code_enrichment: bool = False # True: perform code OCR
224
+ do_formula_enrichment: bool = False # True: perform formula OCR, return Latex code
225
+ do_picture_classification: bool = False # True: classify pictures in documents
228
226
 
229
227
  table_structure_options: TableStructureOptions = TableStructureOptions()
230
228
  ocr_options: Union[
docling/document_converter.py CHANGED
@@ -11,6 +11,7 @@ from docling.backend.abstract_backend import AbstractDocumentBackend
11
11
  from docling.backend.asciidoc_backend import AsciiDocBackend
12
12
  from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
13
13
  from docling.backend.html_backend import HTMLDocumentBackend
14
+ from docling.backend.json.docling_json_backend import DoclingJSONBackend
14
15
  from docling.backend.md_backend import MarkdownDocumentBackend
15
16
  from docling.backend.msexcel_backend import MsExcelDocumentBackend
16
17
  from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
@@ -136,6 +137,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
136
137
  InputFormat.PDF: FormatOption(
137
138
  pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
138
139
  ),
140
+ InputFormat.JSON_DOCLING: FormatOption(
141
+ pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
142
+ ),
139
143
  }
140
144
  if (options := format_to_default_options.get(format)) is not None:
141
145
  return options
docling/models/base_model.py CHANGED
@@ -1,9 +1,10 @@
1
1
  from abc import ABC, abstractmethod
2
- from typing import Any, Iterable
2
+ from typing import Any, Generic, Iterable, Optional
3
3
 
4
- from docling_core.types.doc import DoclingDocument, NodeItem
4
+ from docling_core.types.doc import BoundingBox, DoclingDocument, NodeItem, TextItem
5
+ from typing_extensions import TypeVar
5
6
 
6
- from docling.datamodel.base_models import Page
7
+ from docling.datamodel.base_models import ItemAndImageEnrichmentElement, Page
7
8
  from docling.datamodel.document import ConversionResult
8
9
 
9
10
 
@@ -15,14 +16,69 @@ class BasePageModel(ABC):
15
16
  pass
16
17
 
17
18
 
18
- class BaseEnrichmentModel(ABC):
19
+ EnrichElementT = TypeVar("EnrichElementT", default=NodeItem)
20
+
21
+
22
+ class GenericEnrichmentModel(ABC, Generic[EnrichElementT]):
19
23
 
20
24
  @abstractmethod
21
25
  def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
22
26
  pass
23
27
 
28
+ @abstractmethod
29
+ def prepare_element(
30
+ self, conv_res: ConversionResult, element: NodeItem
31
+ ) -> Optional[EnrichElementT]:
32
+ pass
33
+
24
34
  @abstractmethod
25
35
  def __call__(
26
- self, doc: DoclingDocument, element_batch: Iterable[NodeItem]
27
- ) -> Iterable[Any]:
36
+ self, doc: DoclingDocument, element_batch: Iterable[EnrichElementT]
37
+ ) -> Iterable[NodeItem]:
28
38
  pass
39
+
40
+
41
+ class BaseEnrichmentModel(GenericEnrichmentModel[NodeItem]):
42
+
43
+ def prepare_element(
44
+ self, conv_res: ConversionResult, element: NodeItem
45
+ ) -> Optional[NodeItem]:
46
+ if self.is_processable(doc=conv_res.document, element=element):
47
+ return element
48
+ return None
49
+
50
+
51
+ class BaseItemAndImageEnrichmentModel(
52
+ GenericEnrichmentModel[ItemAndImageEnrichmentElement]
53
+ ):
54
+
55
+ images_scale: float
56
+ expansion_factor: float = 0.0
57
+
58
+ def prepare_element(
59
+ self, conv_res: ConversionResult, element: NodeItem
60
+ ) -> Optional[ItemAndImageEnrichmentElement]:
61
+ if not self.is_processable(doc=conv_res.document, element=element):
62
+ return None
63
+
64
+ assert isinstance(element, TextItem)
65
+ element_prov = element.prov[0]
66
+
67
+ bbox = element_prov.bbox
68
+ width = bbox.r - bbox.l
69
+ height = bbox.t - bbox.b
70
+
71
+ # TODO: move to a utility in the BoundingBox class
72
+ expanded_bbox = BoundingBox(
73
+ l=bbox.l - width * self.expansion_factor,
74
+ t=bbox.t + height * self.expansion_factor,
75
+ r=bbox.r + width * self.expansion_factor,
76
+ b=bbox.b - height * self.expansion_factor,
77
+ coord_origin=bbox.coord_origin,
78
+ )
79
+
80
+ page_ix = element_prov.page_no - 1
81
+ cropped_image = conv_res.pages[page_ix].get_image(
82
+ scale=self.images_scale, cropbox=expanded_bbox
83
+ )
84
+ return ItemAndImageEnrichmentElement(item=element, image=cropped_image)