docling 2.31.1__tar.gz → 2.32.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-2.31.1 → docling-2.32.0}/PKG-INFO +2 -1
- {docling-2.31.1 → docling-2.32.0}/docling/backend/asciidoc_backend.py +1 -1
- {docling-2.31.1 → docling-2.32.0}/docling/datamodel/base_models.py +1 -0
- {docling-2.31.1 → docling-2.32.0}/docling/datamodel/pipeline_options.py +2 -0
- {docling-2.31.1 → docling-2.32.0}/docling/datamodel/settings.py +6 -4
- {docling-2.31.1 → docling-2.32.0}/docling/models/api_vlm_model.py +8 -3
- {docling-2.31.1 → docling-2.32.0}/docling/models/picture_description_api_model.py +7 -2
- {docling-2.31.1 → docling-2.32.0}/docling/models/tesseract_ocr_cli_model.py +1 -1
- {docling-2.31.1 → docling-2.32.0}/pyproject.toml +2 -1
- {docling-2.31.1 → docling-2.32.0}/LICENSE +0 -0
- {docling-2.31.1 → docling-2.32.0}/README.md +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/__init__.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/backend/__init__.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/backend/abstract_backend.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/backend/csv_backend.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/backend/docling_parse_backend.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/backend/docling_parse_v2_backend.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/backend/docling_parse_v4_backend.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/backend/docx/__init__.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/backend/docx/latex/__init__.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/backend/docx/latex/latex_dict.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/backend/docx/latex/omml.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/backend/html_backend.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/backend/json/__init__.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/backend/json/docling_json_backend.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/backend/md_backend.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/backend/msexcel_backend.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/backend/mspowerpoint_backend.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/backend/msword_backend.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/backend/pdf_backend.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/backend/pypdfium2_backend.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/backend/xml/__init__.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/backend/xml/jats_backend.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/backend/xml/uspto_backend.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/chunking/__init__.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/cli/__init__.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/cli/main.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/cli/models.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/cli/tools.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/datamodel/__init__.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/datamodel/document.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/document_converter.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/exceptions.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/models/__init__.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/models/base_model.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/models/base_ocr_model.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/models/code_formula_model.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/models/document_picture_classifier.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/models/easyocr_model.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/models/factories/__init__.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/models/factories/base_factory.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/models/factories/ocr_factory.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/models/factories/picture_description_factory.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/models/hf_mlx_model.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/models/hf_vlm_model.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/models/layout_model.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/models/ocr_mac_model.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/models/page_assemble_model.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/models/page_preprocessing_model.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/models/picture_description_base_model.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/models/picture_description_vlm_model.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/models/plugins/__init__.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/models/plugins/defaults.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/models/rapid_ocr_model.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/models/readingorder_model.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/models/table_structure_model.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/models/tesseract_ocr_model.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/pipeline/__init__.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/pipeline/base_pipeline.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/pipeline/simple_pipeline.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/pipeline/standard_pdf_pipeline.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/pipeline/vlm_pipeline.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/py.typed +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/utils/__init__.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/utils/accelerator_utils.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/utils/api_image_request.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/utils/export.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/utils/glm_utils.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/utils/layout_postprocessor.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/utils/locks.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/utils/model_downloader.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/utils/ocr_utils.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/utils/profiling.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/utils/utils.py +0 -0
- {docling-2.31.1 → docling-2.32.0}/docling/utils/visualization.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.31.1
|
3
|
+
Version: 2.32.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Home-page: https://github.com/docling-project/docling
|
6
6
|
License: MIT
|
@@ -28,6 +28,7 @@ Provides-Extra: vlm
|
|
28
28
|
Requires-Dist: accelerate (>=1.2.1,<2.0.0) ; (sys_platform != "darwin" or platform_machine != "x86_64") and (extra == "vlm")
|
29
29
|
Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
|
30
30
|
Requires-Dist: certifi (>=2024.7.4)
|
31
|
+
Requires-Dist: click (<8.2.0)
|
31
32
|
Requires-Dist: docling-core[chunking] (>=2.26.0,<3.0.0)
|
32
33
|
Requires-Dist: docling-ibm-models (>=3.4.0,<4.0.0)
|
33
34
|
Requires-Dist: docling-parse (>=4.0.0,<5.0.0)
|
@@ -287,7 +287,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|
287
287
|
|
288
288
|
# ========= Section headers
|
289
289
|
def _is_section_header(self, line):
|
290
|
-
return re.match(r"
|
290
|
+
return re.match(r"^==+\s+", line)
|
291
291
|
|
292
292
|
def _parse_section_header(self, line):
|
293
293
|
match = re.match(r"^(=+)\s+(.*)", line)
|
@@ -225,6 +225,7 @@ class PictureDescriptionApiOptions(PictureDescriptionBaseOptions):
|
|
225
225
|
headers: Dict[str, str] = {}
|
226
226
|
params: Dict[str, Any] = {}
|
227
227
|
timeout: float = 20
|
228
|
+
concurrency: int = 1
|
228
229
|
|
229
230
|
prompt: str = "Describe this image in a few sentences."
|
230
231
|
provenance: str = ""
|
@@ -295,6 +296,7 @@ class ApiVlmOptions(BaseVlmOptions):
|
|
295
296
|
params: Dict[str, Any] = {}
|
296
297
|
scale: float = 2.0
|
297
298
|
timeout: float = 60
|
299
|
+
concurrency: int = 1
|
298
300
|
response_format: ResponseFormat
|
299
301
|
|
300
302
|
|
@@ -56,13 +56,15 @@ class DebugSettings(BaseModel):
|
|
56
56
|
|
57
57
|
|
58
58
|
class AppSettings(BaseSettings):
|
59
|
-
model_config = SettingsConfigDict(
|
59
|
+
model_config = SettingsConfigDict(
|
60
|
+
env_prefix="DOCLING_", env_nested_delimiter="_", env_nested_max_split=1
|
61
|
+
)
|
60
62
|
|
61
|
-
perf: BatchConcurrencySettings
|
62
|
-
debug: DebugSettings
|
63
|
+
perf: BatchConcurrencySettings = BatchConcurrencySettings()
|
64
|
+
debug: DebugSettings = DebugSettings()
|
63
65
|
|
64
66
|
cache_dir: Path = Path.home() / ".cache" / "docling"
|
65
67
|
artifacts_path: Optional[Path] = None
|
66
68
|
|
67
69
|
|
68
|
-
settings = AppSettings(perf=BatchConcurrencySettings(), debug=DebugSettings())
|
70
|
+
settings = AppSettings()
|
@@ -1,4 +1,5 @@
|
|
1
1
|
from collections.abc import Iterable
|
2
|
+
from concurrent.futures import ThreadPoolExecutor
|
2
3
|
|
3
4
|
from docling.datamodel.base_models import Page, VlmPrediction
|
4
5
|
from docling.datamodel.document import ConversionResult
|
@@ -27,6 +28,7 @@ class ApiVlmModel(BasePageModel):
|
|
27
28
|
)
|
28
29
|
|
29
30
|
self.timeout = self.vlm_options.timeout
|
31
|
+
self.concurrency = self.vlm_options.concurrency
|
30
32
|
self.prompt_content = (
|
31
33
|
f"This is a page from a document.\n{self.vlm_options.prompt}"
|
32
34
|
)
|
@@ -38,10 +40,10 @@ class ApiVlmModel(BasePageModel):
|
|
38
40
|
def __call__(
|
39
41
|
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
40
42
|
) -> Iterable[Page]:
|
41
|
-
|
43
|
+
def _vlm_request(page):
|
42
44
|
assert page._backend is not None
|
43
45
|
if not page._backend.is_valid():
|
44
|
-
|
46
|
+
return page
|
45
47
|
else:
|
46
48
|
with TimeRecorder(conv_res, "vlm"):
|
47
49
|
assert page.size is not None
|
@@ -63,4 +65,7 @@ class ApiVlmModel(BasePageModel):
|
|
63
65
|
|
64
66
|
page.predictions.vlm_response = VlmPrediction(text=page_tags)
|
65
67
|
|
66
|
-
|
68
|
+
return page
|
69
|
+
|
70
|
+
with ThreadPoolExecutor(max_workers=self.concurrency) as executor:
|
71
|
+
yield from executor.map(_vlm_request, page_batch)
|
@@ -1,4 +1,5 @@
|
|
1
1
|
from collections.abc import Iterable
|
2
|
+
from concurrent.futures import ThreadPoolExecutor
|
2
3
|
from pathlib import Path
|
3
4
|
from typing import Optional, Type, Union
|
4
5
|
|
@@ -37,6 +38,7 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel):
|
|
37
38
|
accelerator_options=accelerator_options,
|
38
39
|
)
|
39
40
|
self.options: PictureDescriptionApiOptions
|
41
|
+
self.concurrency = self.options.concurrency
|
40
42
|
|
41
43
|
if self.enabled:
|
42
44
|
if not enable_remote_services:
|
@@ -48,8 +50,8 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel):
|
|
48
50
|
def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
|
49
51
|
# Note: technically we could make a batch request here,
|
50
52
|
# but not all APIs will allow for it. For example, vllm won't allow more than 1.
|
51
|
-
|
52
|
-
|
53
|
+
def _api_request(image):
|
54
|
+
return api_image_request(
|
53
55
|
image=image,
|
54
56
|
prompt=self.options.prompt,
|
55
57
|
url=self.options.url,
|
@@ -57,3 +59,6 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel):
|
|
57
59
|
headers=self.options.headers,
|
58
60
|
**self.options.params,
|
59
61
|
)
|
62
|
+
|
63
|
+
with ThreadPoolExecutor(max_workers=self.concurrency) as executor:
|
64
|
+
yield from executor.map(_api_request, images)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "docling"
|
3
|
-
version = "2.31.1" # DO NOT EDIT, updated automatically
|
3
|
+
version = "2.32.0" # DO NOT EDIT, updated automatically
|
4
4
|
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
|
5
5
|
authors = [
|
6
6
|
"Christoph Auer <cau@zurich.ibm.com>",
|
@@ -90,6 +90,7 @@ pillow = ">=10.0.0,<12.0.0"
|
|
90
90
|
tqdm = "^4.65.0"
|
91
91
|
pluggy = "^1.0.0"
|
92
92
|
pylatexenc = "^2.10"
|
93
|
+
click = "<8.2.0"
|
93
94
|
|
94
95
|
[tool.poetry.group.dev.dependencies]
|
95
96
|
python = "^3.9.2"
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|