docling-2.29.0-py3-none-any.whl → docling-2.31.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. docling/backend/asciidoc_backend.py +7 -15
  2. docling/backend/csv_backend.py +1 -1
  3. docling/backend/docling_parse_backend.py +2 -2
  4. docling/backend/docling_parse_v2_backend.py +2 -2
  5. docling/backend/docling_parse_v4_backend.py +3 -4
  6. docling/backend/docx/latex/latex_dict.py +0 -5
  7. docling/backend/docx/latex/omml.py +4 -7
  8. docling/backend/html_backend.py +26 -9
  9. docling/backend/md_backend.py +5 -7
  10. docling/backend/msexcel_backend.py +271 -95
  11. docling/backend/mspowerpoint_backend.py +4 -7
  12. docling/backend/msword_backend.py +23 -15
  13. docling/backend/pdf_backend.py +2 -1
  14. docling/backend/pypdfium2_backend.py +3 -3
  15. docling/backend/xml/jats_backend.py +10 -13
  16. docling/backend/xml/uspto_backend.py +15 -19
  17. docling/cli/main.py +27 -9
  18. docling/cli/models.py +2 -3
  19. docling/datamodel/base_models.py +40 -5
  20. docling/datamodel/document.py +18 -10
  21. docling/datamodel/pipeline_options.py +29 -4
  22. docling/document_converter.py +5 -5
  23. docling/models/api_vlm_model.py +66 -0
  24. docling/models/base_model.py +2 -4
  25. docling/models/base_ocr_model.py +2 -2
  26. docling/models/code_formula_model.py +2 -1
  27. docling/models/document_picture_classifier.py +2 -1
  28. docling/models/easyocr_model.py +10 -11
  29. docling/models/factories/__init__.py +2 -2
  30. docling/models/factories/base_factory.py +1 -1
  31. docling/models/hf_mlx_model.py +4 -6
  32. docling/models/hf_vlm_model.py +7 -5
  33. docling/models/layout_model.py +2 -2
  34. docling/models/ocr_mac_model.py +3 -4
  35. docling/models/page_assemble_model.py +7 -12
  36. docling/models/page_preprocessing_model.py +2 -1
  37. docling/models/picture_description_api_model.py +9 -75
  38. docling/models/picture_description_base_model.py +16 -5
  39. docling/models/picture_description_vlm_model.py +2 -3
  40. docling/models/rapid_ocr_model.py +2 -3
  41. docling/models/readingorder_model.py +8 -23
  42. docling/models/table_structure_model.py +2 -6
  43. docling/models/tesseract_ocr_cli_model.py +17 -16
  44. docling/models/tesseract_ocr_model.py +8 -6
  45. docling/pipeline/base_pipeline.py +4 -8
  46. docling/pipeline/simple_pipeline.py +0 -1
  47. docling/pipeline/standard_pdf_pipeline.py +6 -3
  48. docling/pipeline/vlm_pipeline.py +27 -20
  49. docling/utils/api_image_request.py +61 -0
  50. docling/utils/export.py +2 -4
  51. docling/utils/glm_utils.py +2 -2
  52. docling/utils/layout_postprocessor.py +4 -2
  53. docling/utils/model_downloader.py +7 -7
  54. docling/utils/utils.py +1 -1
  55. {docling-2.29.0.dist-info → docling-2.31.0.dist-info}/METADATA +4 -3
  56. docling-2.31.0.dist-info/RECORD +86 -0
  57. docling-2.29.0.dist-info/RECORD +0 -84
  58. {docling-2.29.0.dist-info → docling-2.31.0.dist-info}/LICENSE +0 -0
  59. {docling-2.29.0.dist-info → docling-2.31.0.dist-info}/WHEEL +0 -0
  60. {docling-2.29.0.dist-info → docling-2.31.0.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,66 @@
1
+ from collections.abc import Iterable
2
+
3
+ from docling.datamodel.base_models import Page, VlmPrediction
4
+ from docling.datamodel.document import ConversionResult
5
+ from docling.datamodel.pipeline_options import ApiVlmOptions
6
+ from docling.exceptions import OperationNotAllowed
7
+ from docling.models.base_model import BasePageModel
8
+ from docling.utils.api_image_request import api_image_request
9
+ from docling.utils.profiling import TimeRecorder
10
+
11
+
12
class ApiVlmModel(BasePageModel):
    """Page model that fetches a VLM prediction for each page from a remote API.

    Every valid page is rendered to an RGB image, sent together with the
    configured prompt through ``api_image_request``, and the returned text is
    stored on the page as ``page.predictions.vlm_response``.
    """

    def __init__(
        self,
        enabled: bool,
        enable_remote_services: bool,
        vlm_options: ApiVlmOptions,
    ):
        """Store the options and, when enabled, prepare the request settings.

        Args:
            enabled: Whether this model should process pages at all.
            enable_remote_services: Explicit opt-in for contacting remote
                services; required whenever ``enabled`` is true.
            vlm_options: Options read by this model (``timeout``, ``prompt``,
                ``params``, ``scale``, ``url``, ``headers``).

        Raises:
            OperationNotAllowed: If the model is enabled but remote services
                were not explicitly allowed.
        """
        self.enabled = enabled
        self.vlm_options = vlm_options
        if self.enabled:
            if not enable_remote_services:
                raise OperationNotAllowed(
                    "Connections to remote services is only allowed when set explicitly. "
                    "pipeline_options.enable_remote_services=True, or using the CLI "
                    "--enable-remote-services."
                )

            self.timeout = self.vlm_options.timeout
            self.prompt_content = (
                f"This is a page from a document.\n{self.vlm_options.prompt}"
            )
            # "temperature" comes last so it overrides any value supplied
            # through vlm_options.params.
            self.params = {
                **self.vlm_options.params,
                "temperature": 0,
            }

    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
        """Yield each page of ``page_batch``, annotated with a VLM response.

        Pages whose backend is not valid are yielded untouched. When the model
        is disabled, the whole batch is passed through unchanged.
        """
        # A disabled model never set timeout/prompt_content/params in __init__,
        # so pass pages through instead of failing on the first attribute read.
        if not self.enabled:
            yield from page_batch
            return

        for page in page_batch:
            assert page._backend is not None
            if not page._backend.is_valid():
                yield page
            else:
                with TimeRecorder(conv_res, "vlm"):
                    assert page.size is not None

                    hi_res_image = page.get_image(scale=self.vlm_options.scale)
                    # The assert already guarantees an image, so no further
                    # truthiness check is needed before using it.
                    assert hi_res_image is not None
                    if hi_res_image.mode != "RGB":
                        hi_res_image = hi_res_image.convert("RGB")

                    page_tags = api_image_request(
                        image=hi_res_image,
                        prompt=self.prompt_content,
                        url=self.vlm_options.url,
                        timeout=self.timeout,
                        headers=self.vlm_options.headers,
                        **self.params,
                    )

                    page.predictions.vlm_response = VlmPrediction(text=page_tags)

                # Yield outside the timed section so time spent by the
                # consumer of this generator is not recorded as "vlm" time.
                yield page
@@ -1,5 +1,6 @@
1
1
  from abc import ABC, abstractmethod
2
- from typing import Any, Generic, Iterable, Optional, Protocol, Type
2
+ from collections.abc import Iterable
3
+ from typing import Generic, Optional, Protocol, Type
3
4
 
4
5
  from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem
5
6
  from typing_extensions import TypeVar
@@ -29,7 +30,6 @@ EnrichElementT = TypeVar("EnrichElementT", default=NodeItem)
29
30
 
30
31
 
31
32
  class GenericEnrichmentModel(ABC, Generic[EnrichElementT]):
32
-
33
33
  elements_batch_size: int = settings.perf.elements_batch_size
34
34
 
35
35
  @abstractmethod
@@ -50,7 +50,6 @@ class GenericEnrichmentModel(ABC, Generic[EnrichElementT]):
50
50
 
51
51
 
52
52
  class BaseEnrichmentModel(GenericEnrichmentModel[NodeItem]):
53
-
54
53
  def prepare_element(
55
54
  self, conv_res: ConversionResult, element: NodeItem
56
55
  ) -> Optional[NodeItem]:
@@ -62,7 +61,6 @@ class BaseEnrichmentModel(GenericEnrichmentModel[NodeItem]):
62
61
  class BaseItemAndImageEnrichmentModel(
63
62
  GenericEnrichmentModel[ItemAndImageEnrichmentElement]
64
63
  ):
65
-
66
64
  images_scale: float
67
65
  expansion_factor: float = 0.0
68
66
 
@@ -1,12 +1,12 @@
1
1
  import copy
2
2
  import logging
3
3
  from abc import abstractmethod
4
+ from collections.abc import Iterable
4
5
  from pathlib import Path
5
- from typing import Iterable, List, Optional, Type
6
+ from typing import List, Optional, Type
6
7
 
7
8
  import numpy as np
8
9
  from docling_core.types.doc import BoundingBox, CoordOrigin
9
- from docling_core.types.doc.page import BoundingRectangle, PdfTextCell, TextCell
10
10
  from PIL import Image, ImageDraw
11
11
  from rtree import index
12
12
  from scipy.ndimage import binary_dilation, find_objects, label
@@ -1,7 +1,8 @@
1
1
  import re
2
2
  from collections import Counter
3
+ from collections.abc import Iterable
3
4
  from pathlib import Path
4
- from typing import Iterable, List, Literal, Optional, Tuple, Union
5
+ from typing import List, Literal, Optional, Tuple, Union
5
6
 
6
7
  import numpy as np
7
8
  from docling_core.types.doc import (
@@ -1,5 +1,6 @@
1
+ from collections.abc import Iterable
1
2
  from pathlib import Path
2
- from typing import Iterable, List, Literal, Optional, Tuple, Union
3
+ from typing import List, Literal, Optional, Union
3
4
 
4
5
  import numpy as np
5
6
  from docling_core.types.doc import (
@@ -1,8 +1,9 @@
1
1
  import logging
2
2
  import warnings
3
3
  import zipfile
4
+ from collections.abc import Iterable
4
5
  from pathlib import Path
5
- from typing import Iterable, List, Optional, Type
6
+ from typing import List, Optional, Type
6
7
 
7
8
  import numpy
8
9
  from docling_core.types.doc import BoundingBox, CoordOrigin
@@ -58,12 +59,10 @@ class EasyOcrModel(BaseOcrModel):
58
59
  device = decide_device(accelerator_options.device)
59
60
  # Enable easyocr GPU if running on CUDA, MPS
60
61
  use_gpu = any(
61
- [
62
- device.startswith(x)
63
- for x in [
64
- AcceleratorDevice.CUDA.value,
65
- AcceleratorDevice.MPS.value,
66
- ]
62
+ device.startswith(x)
63
+ for x in [
64
+ AcceleratorDevice.CUDA.value,
65
+ AcceleratorDevice.MPS.value,
67
66
  ]
68
67
  )
69
68
  else:
@@ -98,8 +97,10 @@ class EasyOcrModel(BaseOcrModel):
98
97
  progress: bool = False,
99
98
  ) -> Path:
100
99
  # Models are located in https://github.com/JaidedAI/EasyOCR/blob/master/easyocr/config.py
101
- from easyocr.config import detection_models as det_models_dict
102
- from easyocr.config import recognition_models as rec_models_dict
100
+ from easyocr.config import (
101
+ detection_models as det_models_dict,
102
+ recognition_models as rec_models_dict,
103
+ )
103
104
 
104
105
  if local_dir is None:
105
106
  local_dir = settings.cache_dir / "models" / EasyOcrModel._model_repo_folder
@@ -126,13 +127,11 @@ class EasyOcrModel(BaseOcrModel):
126
127
  def __call__(
127
128
  self, conv_res: ConversionResult, page_batch: Iterable[Page]
128
129
  ) -> Iterable[Page]:
129
-
130
130
  if not self.enabled:
131
131
  yield from page_batch
132
132
  return
133
133
 
134
134
  for page in page_batch:
135
-
136
135
  assert page._backend is not None
137
136
  if not page._backend.is_valid():
138
137
  yield page
@@ -9,7 +9,7 @@ from docling.models.factories.picture_description_factory import (
9
9
  logger = logging.getLogger(__name__)
10
10
 
11
11
 
12
- @lru_cache()
12
+ @lru_cache
13
13
  def get_ocr_factory(allow_external_plugins: bool = False) -> OcrFactory:
14
14
  factory = OcrFactory()
15
15
  factory.load_from_plugins(allow_external_plugins=allow_external_plugins)
@@ -17,7 +17,7 @@ def get_ocr_factory(allow_external_plugins: bool = False) -> OcrFactory:
17
17
  return factory
18
18
 
19
19
 
20
- @lru_cache()
20
+ @lru_cache
21
21
  def get_picture_description_factory(
22
22
  allow_external_plugins: bool = False,
23
23
  ) -> PictureDescriptionFactory:
@@ -33,7 +33,7 @@ class BaseFactory(Generic[A], metaclass=ABCMeta):
33
33
 
34
34
  @property
35
35
  def registered_kind(self) -> list[str]:
36
- return list(opt.kind for opt in self._classes.keys())
36
+ return [opt.kind for opt in self._classes.keys()]
37
37
 
38
38
  def get_enum(self) -> enum.Enum:
39
39
  return enum.Enum(
@@ -1,25 +1,22 @@
1
1
  import logging
2
2
  import time
3
+ from collections.abc import Iterable
3
4
  from pathlib import Path
4
- from typing import Iterable, List, Optional
5
+ from typing import Optional
5
6
 
6
7
  from docling.datamodel.base_models import Page, VlmPrediction
7
8
  from docling.datamodel.document import ConversionResult
8
9
  from docling.datamodel.pipeline_options import (
9
- AcceleratorDevice,
10
10
  AcceleratorOptions,
11
11
  HuggingFaceVlmOptions,
12
12
  )
13
- from docling.datamodel.settings import settings
14
13
  from docling.models.base_model import BasePageModel
15
- from docling.utils.accelerator_utils import decide_device
16
14
  from docling.utils.profiling import TimeRecorder
17
15
 
18
16
  _log = logging.getLogger(__name__)
19
17
 
20
18
 
21
19
  class HuggingFaceMlxModel(BasePageModel):
22
-
23
20
  def __init__(
24
21
  self,
25
22
  enabled: bool,
@@ -32,7 +29,6 @@ class HuggingFaceMlxModel(BasePageModel):
32
29
  self.vlm_options = vlm_options
33
30
 
34
31
  if self.enabled:
35
-
36
32
  try:
37
33
  from mlx_vlm import generate, load # type: ignore
38
34
  from mlx_vlm.prompt_utils import apply_chat_template # type: ignore
@@ -125,6 +121,8 @@ class HuggingFaceMlxModel(BasePageModel):
125
121
  generation_time = time.time() - start_time
126
122
  page_tags = output
127
123
 
124
+ _log.debug(f"Generation time {generation_time:.2f} seconds.")
125
+
128
126
  # inference_time = time.time() - start_time
129
127
  # tokens_per_second = num_tokens / generation_time
130
128
  # print("")
@@ -1,16 +1,15 @@
1
1
  import logging
2
2
  import time
3
+ from collections.abc import Iterable
3
4
  from pathlib import Path
4
- from typing import Iterable, List, Optional
5
+ from typing import Optional
5
6
 
6
7
  from docling.datamodel.base_models import Page, VlmPrediction
7
8
  from docling.datamodel.document import ConversionResult
8
9
  from docling.datamodel.pipeline_options import (
9
- AcceleratorDevice,
10
10
  AcceleratorOptions,
11
11
  HuggingFaceVlmOptions,
12
12
  )
13
- from docling.datamodel.settings import settings
14
13
  from docling.models.base_model import BasePageModel
15
14
  from docling.utils.accelerator_utils import decide_device
16
15
  from docling.utils.profiling import TimeRecorder
@@ -19,7 +18,6 @@ _log = logging.getLogger(__name__)
19
18
 
20
19
 
21
20
  class HuggingFaceVlmModel(BasePageModel):
22
-
23
21
  def __init__(
24
22
  self,
25
23
  enabled: bool,
@@ -42,7 +40,7 @@ class HuggingFaceVlmModel(BasePageModel):
42
40
  device = decide_device(accelerator_options.device)
43
41
  self.device = device
44
42
 
45
- _log.debug("Available device for HuggingFace VLM: {}".format(device))
43
+ _log.debug(f"Available device for HuggingFace VLM: {device}")
46
44
 
47
45
  repo_cache_folder = vlm_options.repo_id.replace("/", "--")
48
46
 
@@ -168,6 +166,10 @@ class HuggingFaceVlmModel(BasePageModel):
168
166
  num_tokens = len(generated_ids[0])
169
167
  page_tags = generated_texts
170
168
 
169
+ _log.debug(
170
+ f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds."
171
+ )
172
+
171
173
  # inference_time = time.time() - start_time
172
174
  # tokens_per_second = num_tokens / generation_time
173
175
  # print("")
@@ -1,8 +1,9 @@
1
1
  import copy
2
2
  import logging
3
3
  import warnings
4
+ from collections.abc import Iterable
4
5
  from pathlib import Path
5
- from typing import Iterable, Optional, Union
6
+ from typing import Optional
6
7
 
7
8
  from docling_core.types.doc import DocItemLabel
8
9
  from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
@@ -142,7 +143,6 @@ class LayoutModel(BasePageModel):
142
143
  def __call__(
143
144
  self, conv_res: ConversionResult, page_batch: Iterable[Page]
144
145
  ) -> Iterable[Page]:
145
-
146
146
  for page in page_batch:
147
147
  assert page._backend is not None
148
148
  if not page._backend.is_valid():
@@ -1,8 +1,9 @@
1
1
  import logging
2
2
  import sys
3
3
  import tempfile
4
+ from collections.abc import Iterable
4
5
  from pathlib import Path
5
- from typing import Iterable, Optional, Tuple, Type
6
+ from typing import Optional, Type
6
7
 
7
8
  from docling_core.types.doc import BoundingBox, CoordOrigin
8
9
  from docling_core.types.doc.page import BoundingRectangle, TextCell
@@ -41,7 +42,7 @@ class OcrMacModel(BaseOcrModel):
41
42
 
42
43
  if self.enabled:
43
44
  if "darwin" != sys.platform:
44
- raise RuntimeError(f"OcrMac is only supported on Mac.")
45
+ raise RuntimeError("OcrMac is only supported on Mac.")
45
46
  install_errmsg = (
46
47
  "ocrmac is not correctly installed. "
47
48
  "Please install it via `pip install ocrmac` to use this OCR engine. "
@@ -58,7 +59,6 @@ class OcrMacModel(BaseOcrModel):
58
59
  def __call__(
59
60
  self, conv_res: ConversionResult, page_batch: Iterable[Page]
60
61
  ) -> Iterable[Page]:
61
-
62
62
  if not self.enabled:
63
63
  yield from page_batch
64
64
  return
@@ -69,7 +69,6 @@ class OcrMacModel(BaseOcrModel):
69
69
  yield page
70
70
  else:
71
71
  with TimeRecorder(conv_res, "ocr"):
72
-
73
72
  ocr_rects = self.get_ocr_rects(page)
74
73
 
75
74
  all_ocr_cells = []
@@ -1,6 +1,7 @@
1
1
  import logging
2
2
  import re
3
- from typing import Iterable, List
3
+ from collections.abc import Iterable
4
+ from typing import List
4
5
 
5
6
  from pydantic import BaseModel
6
7
 
@@ -53,9 +54,9 @@ class PageAssembleModel(BasePageModel):
53
54
  sanitized_text = "".join(lines)
54
55
 
55
56
  # Text normalization
56
- sanitized_text = sanitized_text.replace("⁄", "/")
57
- sanitized_text = sanitized_text.replace("’", "'")
58
- sanitized_text = sanitized_text.replace("‘", "'")
57
+ sanitized_text = sanitized_text.replace("⁄", "/") # noqa: RUF001
58
+ sanitized_text = sanitized_text.replace("’", "'") # noqa: RUF001
59
+ sanitized_text = sanitized_text.replace("‘", "'") # noqa: RUF001
59
60
  sanitized_text = sanitized_text.replace("“", '"')
60
61
  sanitized_text = sanitized_text.replace("”", '"')
61
62
  sanitized_text = sanitized_text.replace("•", "·")
@@ -71,7 +72,6 @@ class PageAssembleModel(BasePageModel):
71
72
  yield page
72
73
  else:
73
74
  with TimeRecorder(conv_res, "page_assemble"):
74
-
75
75
  assert page.predictions.layout is not None
76
76
 
77
77
  # assembles some JSON output page by page.
@@ -83,7 +83,6 @@ class PageAssembleModel(BasePageModel):
83
83
  for cluster in page.predictions.layout.clusters:
84
84
  # _log.info("Cluster label seen:", cluster.label)
85
85
  if cluster.label in LayoutModel.TEXT_ELEM_LABELS:
86
-
87
86
  textlines = [
88
87
  cell.text.replace("\x02", "-").strip()
89
88
  for cell in cluster.cells
@@ -109,9 +108,7 @@ class PageAssembleModel(BasePageModel):
109
108
  tbl = page.predictions.tablestructure.table_map.get(
110
109
  cluster.id, None
111
110
  )
112
- if (
113
- not tbl
114
- ): # fallback: add table without structure, if it isn't present
111
+ if not tbl: # fallback: add table without structure, if it isn't present
115
112
  tbl = Table(
116
113
  label=cluster.label,
117
114
  id=cluster.id,
@@ -130,9 +127,7 @@ class PageAssembleModel(BasePageModel):
130
127
  fig = page.predictions.figures_classification.figure_map.get(
131
128
  cluster.id, None
132
129
  )
133
- if (
134
- not fig
135
- ): # fallback: add figure without classification, if it isn't present
130
+ if not fig: # fallback: add figure without classification, if it isn't present
136
131
  fig = FigureElement(
137
132
  label=cluster.label,
138
133
  id=cluster.id,
@@ -1,5 +1,6 @@
1
+ from collections.abc import Iterable
1
2
  from pathlib import Path
2
- from typing import Iterable, Optional
3
+ from typing import Optional
3
4
 
4
5
  from PIL import ImageDraw
5
6
  from pydantic import BaseModel
@@ -1,12 +1,8 @@
1
- import base64
2
- import io
3
- import logging
1
+ from collections.abc import Iterable
4
2
  from pathlib import Path
5
- from typing import Iterable, List, Optional, Type, Union
3
+ from typing import Optional, Type, Union
6
4
 
7
- import requests
8
5
  from PIL import Image
9
- from pydantic import BaseModel, ConfigDict
10
6
 
11
7
  from docling.datamodel.pipeline_options import (
12
8
  AcceleratorOptions,
@@ -15,37 +11,7 @@ from docling.datamodel.pipeline_options import (
15
11
  )
16
12
  from docling.exceptions import OperationNotAllowed
17
13
  from docling.models.picture_description_base_model import PictureDescriptionBaseModel
18
-
19
- _log = logging.getLogger(__name__)
20
-
21
-
22
- class ChatMessage(BaseModel):
23
- role: str
24
- content: str
25
-
26
-
27
- class ResponseChoice(BaseModel):
28
- index: int
29
- message: ChatMessage
30
- finish_reason: str
31
-
32
-
33
- class ResponseUsage(BaseModel):
34
- prompt_tokens: int
35
- completion_tokens: int
36
- total_tokens: int
37
-
38
-
39
- class ApiResponse(BaseModel):
40
- model_config = ConfigDict(
41
- protected_namespaces=(),
42
- )
43
-
44
- id: str
45
- model: Optional[str] = None # returned by openai
46
- choices: List[ResponseChoice]
47
- created: int
48
- usage: ResponseUsage
14
+ from docling.utils.api_image_request import api_image_request
49
15
 
50
16
 
51
17
  class PictureDescriptionApiModel(PictureDescriptionBaseModel):
@@ -83,43 +49,11 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel):
83
49
  # Note: technically we could make a batch request here,
84
50
  # but not all APIs will allow for it. For example, vllm won't allow more than 1.
85
51
  for image in images:
86
- img_io = io.BytesIO()
87
- image.save(img_io, "PNG")
88
- image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")
89
-
90
- messages = [
91
- {
92
- "role": "user",
93
- "content": [
94
- {
95
- "type": "text",
96
- "text": self.options.prompt,
97
- },
98
- {
99
- "type": "image_url",
100
- "image_url": {
101
- "url": f"data:image/png;base64,{image_base64}"
102
- },
103
- },
104
- ],
105
- }
106
- ]
107
-
108
- payload = {
109
- "messages": messages,
110
- **self.options.params,
111
- }
112
-
113
- r = requests.post(
114
- str(self.options.url),
115
- headers=self.options.headers,
116
- json=payload,
52
+ yield api_image_request(
53
+ image=image,
54
+ prompt=self.options.prompt,
55
+ url=self.options.url,
117
56
  timeout=self.options.timeout,
57
+ headers=self.options.headers,
58
+ **self.options.params,
118
59
  )
119
- if not r.ok:
120
- _log.error(f"Error calling the API. Reponse was {r.text}")
121
- r.raise_for_status()
122
-
123
- api_resp = ApiResponse.model_validate_json(r.text)
124
- generated_text = api_resp.choices[0].message.content.strip()
125
- yield generated_text
@@ -1,12 +1,11 @@
1
- import logging
2
1
  from abc import abstractmethod
2
+ from collections.abc import Iterable
3
3
  from pathlib import Path
4
- from typing import Any, Iterable, List, Optional, Type, Union
4
+ from typing import List, Optional, Type, Union
5
5
 
6
6
  from docling_core.types.doc import (
7
7
  DoclingDocument,
8
8
  NodeItem,
9
- PictureClassificationClass,
10
9
  PictureItem,
11
10
  )
12
11
  from docling_core.types.doc.document import ( # TODO: move import to docling_core.types.doc
@@ -63,8 +62,20 @@ class PictureDescriptionBaseModel(
63
62
  elements: List[PictureItem] = []
64
63
  for el in element_batch:
65
64
  assert isinstance(el.item, PictureItem)
66
- elements.append(el.item)
67
- images.append(el.image)
65
+ describe_image = True
66
+ # Don't describe the image if it's smaller than the threshold
67
+ if len(el.item.prov) > 0:
68
+ prov = el.item.prov[0] # PictureItems have at most a single provenance
69
+ page = doc.pages.get(prov.page_no)
70
+ if page is not None:
71
+ page_area = page.size.width * page.size.height
72
+ if page_area > 0:
73
+ area_fraction = prov.bbox.area() / page_area
74
+ if area_fraction < self.options.picture_area_threshold:
75
+ describe_image = False
76
+ if describe_image:
77
+ elements.append(el.item)
78
+ images.append(el.image)
68
79
 
69
80
  outputs = self._annotate_images(images)
70
81
 
@@ -1,5 +1,6 @@
1
+ from collections.abc import Iterable
1
2
  from pathlib import Path
2
- from typing import Iterable, Optional, Type, Union
3
+ from typing import Optional, Type, Union
3
4
 
4
5
  from PIL import Image
5
6
 
@@ -13,7 +14,6 @@ from docling.utils.accelerator_utils import decide_device
13
14
 
14
15
 
15
16
  class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
16
-
17
17
  @classmethod
18
18
  def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]:
19
19
  return PictureDescriptionVlmOptions
@@ -36,7 +36,6 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
36
36
  self.options: PictureDescriptionVlmOptions
37
37
 
38
38
  if self.enabled:
39
-
40
39
  if artifacts_path is None:
41
40
  artifacts_path = self.download_models(repo_id=self.options.repo_id)
42
41
  else:
@@ -1,6 +1,7 @@
1
1
  import logging
2
+ from collections.abc import Iterable
2
3
  from pathlib import Path
3
- from typing import Iterable, Optional, Type
4
+ from typing import Optional, Type
4
5
 
5
6
  import numpy
6
7
  from docling_core.types.doc import BoundingBox, CoordOrigin
@@ -74,13 +75,11 @@ class RapidOcrModel(BaseOcrModel):
74
75
  def __call__(
75
76
  self, conv_res: ConversionResult, page_batch: Iterable[Page]
76
77
  ) -> Iterable[Page]:
77
-
78
78
  if not self.enabled:
79
79
  yield from page_batch
80
80
  return
81
81
 
82
82
  for page in page_batch:
83
-
84
83
  assert page._backend is not None
85
84
  if not page._backend.is_valid():
86
85
  yield page