docling 2.31.2__tar.gz → 2.32.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85):
  1. {docling-2.31.2 → docling-2.32.0}/PKG-INFO +1 -1
  2. {docling-2.31.2 → docling-2.32.0}/docling/datamodel/base_models.py +1 -0
  3. {docling-2.31.2 → docling-2.32.0}/docling/datamodel/pipeline_options.py +2 -0
  4. {docling-2.31.2 → docling-2.32.0}/docling/datamodel/settings.py +6 -4
  5. {docling-2.31.2 → docling-2.32.0}/docling/models/api_vlm_model.py +8 -3
  6. {docling-2.31.2 → docling-2.32.0}/docling/models/picture_description_api_model.py +7 -2
  7. {docling-2.31.2 → docling-2.32.0}/docling/models/tesseract_ocr_cli_model.py +1 -1
  8. {docling-2.31.2 → docling-2.32.0}/pyproject.toml +1 -1
  9. {docling-2.31.2 → docling-2.32.0}/LICENSE +0 -0
  10. {docling-2.31.2 → docling-2.32.0}/README.md +0 -0
  11. {docling-2.31.2 → docling-2.32.0}/docling/__init__.py +0 -0
  12. {docling-2.31.2 → docling-2.32.0}/docling/backend/__init__.py +0 -0
  13. {docling-2.31.2 → docling-2.32.0}/docling/backend/abstract_backend.py +0 -0
  14. {docling-2.31.2 → docling-2.32.0}/docling/backend/asciidoc_backend.py +0 -0
  15. {docling-2.31.2 → docling-2.32.0}/docling/backend/csv_backend.py +0 -0
  16. {docling-2.31.2 → docling-2.32.0}/docling/backend/docling_parse_backend.py +0 -0
  17. {docling-2.31.2 → docling-2.32.0}/docling/backend/docling_parse_v2_backend.py +0 -0
  18. {docling-2.31.2 → docling-2.32.0}/docling/backend/docling_parse_v4_backend.py +0 -0
  19. {docling-2.31.2 → docling-2.32.0}/docling/backend/docx/__init__.py +0 -0
  20. {docling-2.31.2 → docling-2.32.0}/docling/backend/docx/latex/__init__.py +0 -0
  21. {docling-2.31.2 → docling-2.32.0}/docling/backend/docx/latex/latex_dict.py +0 -0
  22. {docling-2.31.2 → docling-2.32.0}/docling/backend/docx/latex/omml.py +0 -0
  23. {docling-2.31.2 → docling-2.32.0}/docling/backend/html_backend.py +0 -0
  24. {docling-2.31.2 → docling-2.32.0}/docling/backend/json/__init__.py +0 -0
  25. {docling-2.31.2 → docling-2.32.0}/docling/backend/json/docling_json_backend.py +0 -0
  26. {docling-2.31.2 → docling-2.32.0}/docling/backend/md_backend.py +0 -0
  27. {docling-2.31.2 → docling-2.32.0}/docling/backend/msexcel_backend.py +0 -0
  28. {docling-2.31.2 → docling-2.32.0}/docling/backend/mspowerpoint_backend.py +0 -0
  29. {docling-2.31.2 → docling-2.32.0}/docling/backend/msword_backend.py +0 -0
  30. {docling-2.31.2 → docling-2.32.0}/docling/backend/pdf_backend.py +0 -0
  31. {docling-2.31.2 → docling-2.32.0}/docling/backend/pypdfium2_backend.py +0 -0
  32. {docling-2.31.2 → docling-2.32.0}/docling/backend/xml/__init__.py +0 -0
  33. {docling-2.31.2 → docling-2.32.0}/docling/backend/xml/jats_backend.py +0 -0
  34. {docling-2.31.2 → docling-2.32.0}/docling/backend/xml/uspto_backend.py +0 -0
  35. {docling-2.31.2 → docling-2.32.0}/docling/chunking/__init__.py +0 -0
  36. {docling-2.31.2 → docling-2.32.0}/docling/cli/__init__.py +0 -0
  37. {docling-2.31.2 → docling-2.32.0}/docling/cli/main.py +0 -0
  38. {docling-2.31.2 → docling-2.32.0}/docling/cli/models.py +0 -0
  39. {docling-2.31.2 → docling-2.32.0}/docling/cli/tools.py +0 -0
  40. {docling-2.31.2 → docling-2.32.0}/docling/datamodel/__init__.py +0 -0
  41. {docling-2.31.2 → docling-2.32.0}/docling/datamodel/document.py +0 -0
  42. {docling-2.31.2 → docling-2.32.0}/docling/document_converter.py +0 -0
  43. {docling-2.31.2 → docling-2.32.0}/docling/exceptions.py +0 -0
  44. {docling-2.31.2 → docling-2.32.0}/docling/models/__init__.py +0 -0
  45. {docling-2.31.2 → docling-2.32.0}/docling/models/base_model.py +0 -0
  46. {docling-2.31.2 → docling-2.32.0}/docling/models/base_ocr_model.py +0 -0
  47. {docling-2.31.2 → docling-2.32.0}/docling/models/code_formula_model.py +0 -0
  48. {docling-2.31.2 → docling-2.32.0}/docling/models/document_picture_classifier.py +0 -0
  49. {docling-2.31.2 → docling-2.32.0}/docling/models/easyocr_model.py +0 -0
  50. {docling-2.31.2 → docling-2.32.0}/docling/models/factories/__init__.py +0 -0
  51. {docling-2.31.2 → docling-2.32.0}/docling/models/factories/base_factory.py +0 -0
  52. {docling-2.31.2 → docling-2.32.0}/docling/models/factories/ocr_factory.py +0 -0
  53. {docling-2.31.2 → docling-2.32.0}/docling/models/factories/picture_description_factory.py +0 -0
  54. {docling-2.31.2 → docling-2.32.0}/docling/models/hf_mlx_model.py +0 -0
  55. {docling-2.31.2 → docling-2.32.0}/docling/models/hf_vlm_model.py +0 -0
  56. {docling-2.31.2 → docling-2.32.0}/docling/models/layout_model.py +0 -0
  57. {docling-2.31.2 → docling-2.32.0}/docling/models/ocr_mac_model.py +0 -0
  58. {docling-2.31.2 → docling-2.32.0}/docling/models/page_assemble_model.py +0 -0
  59. {docling-2.31.2 → docling-2.32.0}/docling/models/page_preprocessing_model.py +0 -0
  60. {docling-2.31.2 → docling-2.32.0}/docling/models/picture_description_base_model.py +0 -0
  61. {docling-2.31.2 → docling-2.32.0}/docling/models/picture_description_vlm_model.py +0 -0
  62. {docling-2.31.2 → docling-2.32.0}/docling/models/plugins/__init__.py +0 -0
  63. {docling-2.31.2 → docling-2.32.0}/docling/models/plugins/defaults.py +0 -0
  64. {docling-2.31.2 → docling-2.32.0}/docling/models/rapid_ocr_model.py +0 -0
  65. {docling-2.31.2 → docling-2.32.0}/docling/models/readingorder_model.py +0 -0
  66. {docling-2.31.2 → docling-2.32.0}/docling/models/table_structure_model.py +0 -0
  67. {docling-2.31.2 → docling-2.32.0}/docling/models/tesseract_ocr_model.py +0 -0
  68. {docling-2.31.2 → docling-2.32.0}/docling/pipeline/__init__.py +0 -0
  69. {docling-2.31.2 → docling-2.32.0}/docling/pipeline/base_pipeline.py +0 -0
  70. {docling-2.31.2 → docling-2.32.0}/docling/pipeline/simple_pipeline.py +0 -0
  71. {docling-2.31.2 → docling-2.32.0}/docling/pipeline/standard_pdf_pipeline.py +0 -0
  72. {docling-2.31.2 → docling-2.32.0}/docling/pipeline/vlm_pipeline.py +0 -0
  73. {docling-2.31.2 → docling-2.32.0}/docling/py.typed +0 -0
  74. {docling-2.31.2 → docling-2.32.0}/docling/utils/__init__.py +0 -0
  75. {docling-2.31.2 → docling-2.32.0}/docling/utils/accelerator_utils.py +0 -0
  76. {docling-2.31.2 → docling-2.32.0}/docling/utils/api_image_request.py +0 -0
  77. {docling-2.31.2 → docling-2.32.0}/docling/utils/export.py +0 -0
  78. {docling-2.31.2 → docling-2.32.0}/docling/utils/glm_utils.py +0 -0
  79. {docling-2.31.2 → docling-2.32.0}/docling/utils/layout_postprocessor.py +0 -0
  80. {docling-2.31.2 → docling-2.32.0}/docling/utils/locks.py +0 -0
  81. {docling-2.31.2 → docling-2.32.0}/docling/utils/model_downloader.py +0 -0
  82. {docling-2.31.2 → docling-2.32.0}/docling/utils/ocr_utils.py +0 -0
  83. {docling-2.31.2 → docling-2.32.0}/docling/utils/profiling.py +0 -0
  84. {docling-2.31.2 → docling-2.32.0}/docling/utils/utils.py +0 -0
  85. {docling-2.31.2 → docling-2.32.0}/docling/utils/visualization.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.31.2
3
+ Version: 2.32.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/docling-project/docling
6
6
  License: MIT
@@ -90,6 +90,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
90
90
  "image/tiff",
91
91
  "image/gif",
92
92
  "image/bmp",
93
+ "image/webp",
93
94
  ],
94
95
  InputFormat.PDF: ["application/pdf"],
95
96
  InputFormat.ASCIIDOC: ["text/asciidoc"],
@@ -225,6 +225,7 @@ class PictureDescriptionApiOptions(PictureDescriptionBaseOptions):
225
225
  headers: Dict[str, str] = {}
226
226
  params: Dict[str, Any] = {}
227
227
  timeout: float = 20
228
+ concurrency: int = 1
228
229
 
229
230
  prompt: str = "Describe this image in a few sentences."
230
231
  provenance: str = ""
@@ -295,6 +296,7 @@ class ApiVlmOptions(BaseVlmOptions):
295
296
  params: Dict[str, Any] = {}
296
297
  scale: float = 2.0
297
298
  timeout: float = 60
299
+ concurrency: int = 1
298
300
  response_format: ResponseFormat
299
301
 
300
302
 
@@ -56,13 +56,15 @@ class DebugSettings(BaseModel):
56
56
 
57
57
 
58
58
  class AppSettings(BaseSettings):
59
- model_config = SettingsConfigDict(env_prefix="DOCLING_", env_nested_delimiter="_")
59
+ model_config = SettingsConfigDict(
60
+ env_prefix="DOCLING_", env_nested_delimiter="_", env_nested_max_split=1
61
+ )
60
62
 
61
- perf: BatchConcurrencySettings
62
- debug: DebugSettings
63
+ perf: BatchConcurrencySettings = BatchConcurrencySettings()
64
+ debug: DebugSettings = DebugSettings()
63
65
 
64
66
  cache_dir: Path = Path.home() / ".cache" / "docling"
65
67
  artifacts_path: Optional[Path] = None
66
68
 
67
69
 
68
- settings = AppSettings(perf=BatchConcurrencySettings(), debug=DebugSettings())
70
+ settings = AppSettings()
@@ -1,4 +1,5 @@
1
1
  from collections.abc import Iterable
2
+ from concurrent.futures import ThreadPoolExecutor
2
3
 
3
4
  from docling.datamodel.base_models import Page, VlmPrediction
4
5
  from docling.datamodel.document import ConversionResult
@@ -27,6 +28,7 @@ class ApiVlmModel(BasePageModel):
27
28
  )
28
29
 
29
30
  self.timeout = self.vlm_options.timeout
31
+ self.concurrency = self.vlm_options.concurrency
30
32
  self.prompt_content = (
31
33
  f"This is a page from a document.\n{self.vlm_options.prompt}"
32
34
  )
@@ -38,10 +40,10 @@ class ApiVlmModel(BasePageModel):
38
40
  def __call__(
39
41
  self, conv_res: ConversionResult, page_batch: Iterable[Page]
40
42
  ) -> Iterable[Page]:
41
- for page in page_batch:
43
+ def _vlm_request(page):
42
44
  assert page._backend is not None
43
45
  if not page._backend.is_valid():
44
- yield page
46
+ return page
45
47
  else:
46
48
  with TimeRecorder(conv_res, "vlm"):
47
49
  assert page.size is not None
@@ -63,4 +65,7 @@ class ApiVlmModel(BasePageModel):
63
65
 
64
66
  page.predictions.vlm_response = VlmPrediction(text=page_tags)
65
67
 
66
- yield page
68
+ return page
69
+
70
+ with ThreadPoolExecutor(max_workers=self.concurrency) as executor:
71
+ yield from executor.map(_vlm_request, page_batch)
@@ -1,4 +1,5 @@
1
1
  from collections.abc import Iterable
2
+ from concurrent.futures import ThreadPoolExecutor
2
3
  from pathlib import Path
3
4
  from typing import Optional, Type, Union
4
5
 
@@ -37,6 +38,7 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel):
37
38
  accelerator_options=accelerator_options,
38
39
  )
39
40
  self.options: PictureDescriptionApiOptions
41
+ self.concurrency = self.options.concurrency
40
42
 
41
43
  if self.enabled:
42
44
  if not enable_remote_services:
@@ -48,8 +50,8 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel):
48
50
  def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
49
51
  # Note: technically we could make a batch request here,
50
52
  # but not all APIs will allow for it. For example, vllm won't allow more than 1.
51
- for image in images:
52
- yield api_image_request(
53
+ def _api_request(image):
54
+ return api_image_request(
53
55
  image=image,
54
56
  prompt=self.options.prompt,
55
57
  url=self.options.url,
@@ -57,3 +59,6 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel):
57
59
  headers=self.options.headers,
58
60
  **self.options.params,
59
61
  )
62
+
63
+ with ThreadPoolExecutor(max_workers=self.concurrency) as executor:
64
+ yield from executor.map(_api_request, images)
@@ -249,7 +249,7 @@ class TesseractOcrCliModel(BaseOcrModel):
249
249
  cell = TextCell(
250
250
  index=ix,
251
251
  text=str(text),
252
- orig=text,
252
+ orig=str(text),
253
253
  from_ocr=True,
254
254
  confidence=conf / 100.0,
255
255
  rect=BoundingRectangle.from_bounding_box(
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling"
3
- version = "2.31.2" # DO NOT EDIT, updated automatically
3
+ version = "2.32.0" # DO NOT EDIT, updated automatically
4
4
  description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
5
5
  authors = [
6
6
  "Christoph Auer <cau@zurich.ibm.com>",
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes