docling 2.69.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling might be problematic. Click here for more details.
- docling/__init__.py +0 -0
- docling/backend/__init__.py +0 -0
- docling/backend/abstract_backend.py +84 -0
- docling/backend/asciidoc_backend.py +443 -0
- docling/backend/csv_backend.py +125 -0
- docling/backend/docling_parse_backend.py +237 -0
- docling/backend/docling_parse_v2_backend.py +276 -0
- docling/backend/docling_parse_v4_backend.py +260 -0
- docling/backend/docx/__init__.py +0 -0
- docling/backend/docx/drawingml/utils.py +131 -0
- docling/backend/docx/latex/__init__.py +0 -0
- docling/backend/docx/latex/latex_dict.py +274 -0
- docling/backend/docx/latex/omml.py +459 -0
- docling/backend/html_backend.py +1502 -0
- docling/backend/image_backend.py +188 -0
- docling/backend/json/__init__.py +0 -0
- docling/backend/json/docling_json_backend.py +58 -0
- docling/backend/md_backend.py +618 -0
- docling/backend/mets_gbs_backend.py +399 -0
- docling/backend/msexcel_backend.py +686 -0
- docling/backend/mspowerpoint_backend.py +398 -0
- docling/backend/msword_backend.py +1663 -0
- docling/backend/noop_backend.py +51 -0
- docling/backend/pdf_backend.py +82 -0
- docling/backend/pypdfium2_backend.py +417 -0
- docling/backend/webvtt_backend.py +572 -0
- docling/backend/xml/__init__.py +0 -0
- docling/backend/xml/jats_backend.py +819 -0
- docling/backend/xml/uspto_backend.py +1905 -0
- docling/chunking/__init__.py +12 -0
- docling/cli/__init__.py +0 -0
- docling/cli/main.py +974 -0
- docling/cli/models.py +196 -0
- docling/cli/tools.py +17 -0
- docling/datamodel/__init__.py +0 -0
- docling/datamodel/accelerator_options.py +69 -0
- docling/datamodel/asr_model_specs.py +494 -0
- docling/datamodel/backend_options.py +102 -0
- docling/datamodel/base_models.py +493 -0
- docling/datamodel/document.py +699 -0
- docling/datamodel/extraction.py +39 -0
- docling/datamodel/layout_model_specs.py +91 -0
- docling/datamodel/pipeline_options.py +457 -0
- docling/datamodel/pipeline_options_asr_model.py +78 -0
- docling/datamodel/pipeline_options_vlm_model.py +136 -0
- docling/datamodel/settings.py +65 -0
- docling/datamodel/vlm_model_specs.py +365 -0
- docling/document_converter.py +559 -0
- docling/document_extractor.py +327 -0
- docling/exceptions.py +10 -0
- docling/experimental/__init__.py +5 -0
- docling/experimental/datamodel/__init__.py +1 -0
- docling/experimental/datamodel/table_crops_layout_options.py +13 -0
- docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
- docling/experimental/models/__init__.py +3 -0
- docling/experimental/models/table_crops_layout_model.py +114 -0
- docling/experimental/pipeline/__init__.py +1 -0
- docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
- docling/models/__init__.py +0 -0
- docling/models/base_layout_model.py +39 -0
- docling/models/base_model.py +230 -0
- docling/models/base_ocr_model.py +241 -0
- docling/models/base_table_model.py +45 -0
- docling/models/extraction/__init__.py +0 -0
- docling/models/extraction/nuextract_transformers_model.py +305 -0
- docling/models/factories/__init__.py +47 -0
- docling/models/factories/base_factory.py +122 -0
- docling/models/factories/layout_factory.py +7 -0
- docling/models/factories/ocr_factory.py +11 -0
- docling/models/factories/picture_description_factory.py +11 -0
- docling/models/factories/table_factory.py +7 -0
- docling/models/picture_description_base_model.py +149 -0
- docling/models/plugins/__init__.py +0 -0
- docling/models/plugins/defaults.py +60 -0
- docling/models/stages/__init__.py +0 -0
- docling/models/stages/code_formula/__init__.py +0 -0
- docling/models/stages/code_formula/code_formula_model.py +342 -0
- docling/models/stages/layout/__init__.py +0 -0
- docling/models/stages/layout/layout_model.py +249 -0
- docling/models/stages/ocr/__init__.py +0 -0
- docling/models/stages/ocr/auto_ocr_model.py +132 -0
- docling/models/stages/ocr/easyocr_model.py +200 -0
- docling/models/stages/ocr/ocr_mac_model.py +145 -0
- docling/models/stages/ocr/rapid_ocr_model.py +328 -0
- docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
- docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
- docling/models/stages/page_assemble/__init__.py +0 -0
- docling/models/stages/page_assemble/page_assemble_model.py +156 -0
- docling/models/stages/page_preprocessing/__init__.py +0 -0
- docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
- docling/models/stages/picture_classifier/__init__.py +0 -0
- docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
- docling/models/stages/picture_description/__init__.py +0 -0
- docling/models/stages/picture_description/picture_description_api_model.py +66 -0
- docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
- docling/models/stages/reading_order/__init__.py +0 -0
- docling/models/stages/reading_order/readingorder_model.py +431 -0
- docling/models/stages/table_structure/__init__.py +0 -0
- docling/models/stages/table_structure/table_structure_model.py +305 -0
- docling/models/utils/__init__.py +0 -0
- docling/models/utils/generation_utils.py +157 -0
- docling/models/utils/hf_model_download.py +45 -0
- docling/models/vlm_pipeline_models/__init__.py +1 -0
- docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
- docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
- docling/models/vlm_pipeline_models/mlx_model.py +325 -0
- docling/models/vlm_pipeline_models/vllm_model.py +344 -0
- docling/pipeline/__init__.py +0 -0
- docling/pipeline/asr_pipeline.py +431 -0
- docling/pipeline/base_extraction_pipeline.py +72 -0
- docling/pipeline/base_pipeline.py +326 -0
- docling/pipeline/extraction_vlm_pipeline.py +207 -0
- docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
- docling/pipeline/simple_pipeline.py +55 -0
- docling/pipeline/standard_pdf_pipeline.py +859 -0
- docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
- docling/pipeline/vlm_pipeline.py +416 -0
- docling/py.typed +1 -0
- docling/utils/__init__.py +0 -0
- docling/utils/accelerator_utils.py +97 -0
- docling/utils/api_image_request.py +205 -0
- docling/utils/deepseekocr_utils.py +388 -0
- docling/utils/export.py +146 -0
- docling/utils/glm_utils.py +361 -0
- docling/utils/layout_postprocessor.py +683 -0
- docling/utils/locks.py +3 -0
- docling/utils/model_downloader.py +168 -0
- docling/utils/ocr_utils.py +69 -0
- docling/utils/orientation.py +65 -0
- docling/utils/profiling.py +65 -0
- docling/utils/utils.py +65 -0
- docling/utils/visualization.py +85 -0
- docling-2.69.0.dist-info/METADATA +237 -0
- docling-2.69.0.dist-info/RECORD +138 -0
- docling-2.69.0.dist-info/WHEEL +5 -0
- docling-2.69.0.dist-info/entry_points.txt +6 -0
- docling-2.69.0.dist-info/licenses/LICENSE +21 -0
- docling-2.69.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""Data models for document extraction functionality."""
|
|
2
|
+
|
|
3
|
+
from typing import Any, Dict, List, Optional, Type, Union
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel, Field
|
|
6
|
+
|
|
7
|
+
from docling.datamodel.base_models import ConversionStatus, ErrorItem, VlmStopReason
|
|
8
|
+
from docling.datamodel.document import InputDocument
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class ExtractedPageData(BaseModel):
    """Data model for extracted content from a single page."""

    # Required field; its description documents it as a 1-indexed page number.
    page_no: int = Field(description="1-indexed page number")
    # Structured payload for the page; stays None unless a value is supplied.
    extracted_data: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Extracted structured data from the page",
    )
    # Plain-text form of the extraction; optional as well.
    raw_text: Optional[str] = Field(default=None, description="Raw extracted text")
    # Page-level error messages; default_factory builds a new list per instance.
    errors: List[str] = Field(
        description="Any errors encountered during extraction for this page",
        default_factory=list,
    )
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class ExtractionResult(BaseModel):
    """Result of document extraction."""

    # The document the extraction ran on (required, no default).
    input: InputDocument
    # Defaults to ConversionStatus.PENDING.
    status: ConversionStatus = ConversionStatus.PENDING
    # Document-level errors; empty by default.
    errors: List[ErrorItem] = []

    # Pages field - always a list for consistency
    pages: List[ExtractedPageData] = Field(
        description="Extracted data from each page",
        default_factory=list,
    )
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# Type alias for template parameters. Accepted forms: a string, a dict with
# string keys, a BaseModel instance, or a BaseModel subclass.
ExtractionTemplateType = Union[
    str,
    Dict[str, Any],
    BaseModel,
    Type[BaseModel],
]
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from enum import Enum
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Optional
|
|
5
|
+
|
|
6
|
+
from pydantic import BaseModel
|
|
7
|
+
|
|
8
|
+
from docling.datamodel.accelerator_options import AcceleratorDevice
|
|
9
|
+
|
|
10
|
+
_log = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class LayoutModelConfig(BaseModel):
    # Describes one layout model. All four string fields are required.
    # NOTE(review): repo_id is presumably a Hugging Face repository id, going
    # by the "HuggingFace Layout Models" specs in this module -- confirm.
    name: str
    repo_id: str
    revision: str
    model_path: str
    # Accelerator devices the model may run on; defaults to all four listed.
    supported_devices: list[AcceleratorDevice] = [
        AcceleratorDevice.CPU,
        AcceleratorDevice.CUDA,
        AcceleratorDevice.MPS,
        AcceleratorDevice.XPU,
    ]

    @property
    def model_repo_folder(self) -> str:
        """Return ``repo_id`` with every "/" replaced by "--"."""
        return "--".join(self.repo_id.split("/"))
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# HuggingFace Layout Models


def _docling_layout_spec(name: str, repo_id: str) -> LayoutModelConfig:
    """Build a spec pinned to revision "main" with an empty model_path."""
    return LayoutModelConfig(
        name=name,
        repo_id=repo_id,
        revision="main",
        model_path="",
    )


# Default Docling Layout Model
# NOTE(review): LayoutOptions.model_spec in pipeline_options defaults to
# DOCLING_LAYOUT_HERON, not this spec -- confirm the "default" label.
DOCLING_LAYOUT_V2 = _docling_layout_spec(
    "docling_layout_v2",
    "docling-project/docling-layout-old",
)

DOCLING_LAYOUT_HERON = _docling_layout_spec(
    "docling_layout_heron",
    "docling-project/docling-layout-heron",
)

DOCLING_LAYOUT_HERON_101 = _docling_layout_spec(
    "docling_layout_heron_101",
    "docling-project/docling-layout-heron-101",
)

DOCLING_LAYOUT_EGRET_MEDIUM = _docling_layout_spec(
    "docling_layout_egret_medium",
    "docling-project/docling-layout-egret-medium",
)

DOCLING_LAYOUT_EGRET_LARGE = _docling_layout_spec(
    "docling_layout_egret_large",
    "docling-project/docling-layout-egret-large",
)

DOCLING_LAYOUT_EGRET_XLARGE = _docling_layout_spec(
    "docling_layout_egret_xlarge",
    "docling-project/docling-layout-egret-xlarge",
)

# Example for a hypothetical alternative model
# ALTERNATIVE_LAYOUT = LayoutModelConfig(
#     name="alternative_layout",
#     repo_id="someorg/alternative-layout",
#     revision="main",
#     model_path="model_artifacts/layout_alt",
# )
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
class LayoutModelType(str, Enum):
    # String-valued identifiers for the layout model specs; each value equals
    # the `name` given to the matching LayoutModelConfig constant.

    # Original Docling layout model.
    DOCLING_LAYOUT_V2 = "docling_layout_v2"

    # Heron family.
    DOCLING_LAYOUT_HERON = "docling_layout_heron"
    DOCLING_LAYOUT_HERON_101 = "docling_layout_heron_101"

    # Egret family, by size.
    DOCLING_LAYOUT_EGRET_MEDIUM = "docling_layout_egret_medium"
    DOCLING_LAYOUT_EGRET_LARGE = "docling_layout_egret_large"
    DOCLING_LAYOUT_EGRET_XLARGE = "docling_layout_egret_xlarge"

    # ALTERNATIVE_LAYOUT = "alternative_layout"
|
|
@@ -0,0 +1,457 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from datetime import datetime
|
|
3
|
+
from enum import Enum
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Annotated, Any, ClassVar, Dict, List, Literal, Optional, Union
|
|
6
|
+
|
|
7
|
+
from docling_core.types.doc import PictureClassificationLabel
|
|
8
|
+
from pydantic import (
|
|
9
|
+
AnyUrl,
|
|
10
|
+
BaseModel,
|
|
11
|
+
ConfigDict,
|
|
12
|
+
Field,
|
|
13
|
+
)
|
|
14
|
+
from typing_extensions import deprecated
|
|
15
|
+
|
|
16
|
+
from docling.datamodel import asr_model_specs, vlm_model_specs
|
|
17
|
+
|
|
18
|
+
# Import the following for backwards compatibility
|
|
19
|
+
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
|
|
20
|
+
from docling.datamodel.layout_model_specs import (
|
|
21
|
+
DOCLING_LAYOUT_EGRET_LARGE,
|
|
22
|
+
DOCLING_LAYOUT_EGRET_MEDIUM,
|
|
23
|
+
DOCLING_LAYOUT_EGRET_XLARGE,
|
|
24
|
+
DOCLING_LAYOUT_HERON,
|
|
25
|
+
DOCLING_LAYOUT_HERON_101,
|
|
26
|
+
DOCLING_LAYOUT_V2,
|
|
27
|
+
LayoutModelConfig,
|
|
28
|
+
)
|
|
29
|
+
from docling.datamodel.pipeline_options_asr_model import (
|
|
30
|
+
InlineAsrOptions,
|
|
31
|
+
)
|
|
32
|
+
from docling.datamodel.pipeline_options_vlm_model import (
|
|
33
|
+
ApiVlmOptions,
|
|
34
|
+
InferenceFramework,
|
|
35
|
+
InlineVlmOptions,
|
|
36
|
+
ResponseFormat,
|
|
37
|
+
)
|
|
38
|
+
from docling.datamodel.vlm_model_specs import (
|
|
39
|
+
GRANITE_VISION_OLLAMA as granite_vision_vlm_ollama_conversion_options,
|
|
40
|
+
GRANITE_VISION_TRANSFORMERS as granite_vision_vlm_conversion_options,
|
|
41
|
+
NU_EXTRACT_2B_TRANSFORMERS,
|
|
42
|
+
SMOLDOCLING_MLX as smoldocling_vlm_mlx_conversion_options,
|
|
43
|
+
SMOLDOCLING_TRANSFORMERS as smoldocling_vlm_conversion_options,
|
|
44
|
+
VlmModelType,
|
|
45
|
+
)
|
|
46
|
+
|
|
47
|
+
_log = logging.getLogger(__name__)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class BaseOptions(BaseModel):
    """Base class for options."""

    # Class-level tag, not a model field. No value is assigned here; the
    # concrete option classes in this module set it.
    kind: ClassVar[str]
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class TableFormerMode(str, Enum):
    """Modes for the TableFormer model."""

    # Two string-valued choices; members also behave as plain `str`.
    FAST = "fast"
    ACCURATE = "accurate"
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class BaseTableStructureOptions(BaseOptions):
    """Base options for table structure models."""

    # Declares no fields of its own; everything comes from BaseOptions.
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class TableStructureOptions(BaseTableStructureOptions):
    """Options for the table structure."""

    kind: ClassVar[str] = "docling_tableformer"

    # True: Matches predictions back to PDF cells. Can break table output if
    #       PDF cells are merged across table columns.
    # False: Let table structure model define the text cells, ignore PDF cells.
    do_cell_matching: bool = True

    # TableFormer variant to run; defaults to ACCURATE.
    mode: TableFormerMode = TableFormerMode.ACCURATE
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
class OcrOptions(BaseOptions):
    """OCR options."""

    # No default is given here, so `lang` is required on this base class.
    lang: Annotated[
        List[str],
        Field(
            description=(
                "List of OCR languages to use. "
                "The format must match the values of the OCR engine of choice."
            ),
            examples=[["deu", "eng"]],
        ),
    ]

    # Off by default.
    force_full_page_ocr: Annotated[
        bool,
        Field(
            description="If enabled, a full-page OCR is always applied.",
            examples=[False],
        ),
    ] = False

    # Defaults to 0.05.
    bitmap_area_threshold: Annotated[
        float,
        Field(
            description=(
                "Percentage of the page area for a bitmap "
                "to be processed with OCR."
            ),
            examples=[0.05, 0.1],
        ),
    ] = 0.05
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
class OcrAutoOptions(OcrOptions):
    """Options for pick OCR engine automatically."""

    kind: ClassVar[Literal["auto"]] = "auto"

    # Overrides the required base field with an empty-list default.
    lang: Annotated[
        List[str],
        Field(
            description=(
                "The automatic OCR engine will use the default values of the engine. "
                "Please specify the engine explicitly to change the language selection."
            ),
        ),
    ] = []
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
class RapidOcrOptions(OcrOptions):
    """Options for the RapidOCR engine."""

    kind: ClassVar[Literal["rapidocr"]] = "rapidocr"

    # English and Chinese are the most commonly used models and have been
    # tested with RapidOCR. However, language as a parameter is not supported
    # by rapidocr yet, and hence changing this option doesn't affect anything.
    #
    # For more details on supported languages by RapidOCR visit
    # https://rapidai.github.io/RapidOCRDocs/blog/2022/09/28/%E6%94%AF%E6%8C%81%E8%AF%86%E5%88%AB%E8%AF%AD%E8%A8%80/
    lang: List[str] = ["english", "chinese"]

    # For more details on the following options visit
    # https://rapidai.github.io/RapidOCRDocs/install_usage/api/RapidOCR/
    # https://rapidai.github.io/RapidOCRDocs/main/install_usage/rapidocr/usage/#__tabbed_3_4
    backend: Literal["onnxruntime", "openvino", "paddle", "torch"] = "onnxruntime"

    # The defaults below are the same as rapidocr's own defaults.
    text_score: float = 0.5

    use_det: Optional[bool] = None
    use_cls: Optional[bool] = None
    use_rec: Optional[bool] = None

    print_verbose: bool = False

    det_model_path: Optional[str] = None
    cls_model_path: Optional[str] = None
    rec_model_path: Optional[str] = None
    rec_keys_path: Optional[str] = None
    # Deprecated, please use font_path instead
    rec_font_path: Optional[str] = None
    font_path: Optional[str] = None

    # Dictionary to overwrite or pass-through additional parameters
    rapidocr_params: Dict[str, Any] = Field(default_factory=dict)

    # Unknown fields are rejected.
    model_config = ConfigDict(extra="forbid")
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
class EasyOcrOptions(OcrOptions):
    """Options for the EasyOCR engine."""

    kind: ClassVar[Literal["easyocr"]] = "easyocr"

    # Default languages, as two-letter codes.
    lang: List[str] = [
        "fr",
        "de",
        "es",
        "en",
    ]

    # None leaves the GPU decision unset here.
    use_gpu: Optional[bool] = None

    confidence_threshold: float = 0.5

    model_storage_directory: Optional[str] = None
    recog_network: Optional[str] = "standard"
    download_enabled: bool = True

    suppress_mps_warnings: bool = True

    # Unknown fields are rejected. The empty protected_namespaces tuple lets
    # `model_storage_directory` use the "model_" prefix.
    model_config = ConfigDict(extra="forbid", protected_namespaces=())
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
class TesseractCliOcrOptions(OcrOptions):
    """Options for the TesseractCli engine."""

    kind: ClassVar[Literal["tesseract"]] = "tesseract"

    # Default languages, as three-letter codes.
    lang: List[str] = [
        "fra",
        "deu",
        "spa",
        "eng",
    ]
    # Command used to invoke tesseract.
    tesseract_cmd: str = "tesseract"
    path: Optional[str] = None
    # Page Segmentation Mode (0-13), defaults to tesseract's default
    psm: Optional[int] = None

    # Unknown fields are rejected.
    model_config = ConfigDict(extra="forbid")
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
class TesseractOcrOptions(OcrOptions):
    """Options for the Tesseract engine."""

    kind: ClassVar[Literal["tesserocr"]] = "tesserocr"

    # Default languages, as three-letter codes.
    lang: List[str] = [
        "fra",
        "deu",
        "spa",
        "eng",
    ]
    path: Optional[str] = None
    # Page Segmentation Mode (0-13), defaults to tesseract's default
    psm: Optional[int] = None

    # Unknown fields are rejected.
    model_config = ConfigDict(extra="forbid")
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
class OcrMacOptions(OcrOptions):
    """Options for the Mac OCR engine."""

    kind: ClassVar[Literal["ocrmac"]] = "ocrmac"

    # Default languages, as language-region tags.
    lang: List[str] = [
        "fr-FR",
        "de-DE",
        "es-ES",
        "en-US",
    ]
    recognition: str = "accurate"
    framework: str = "vision"

    # Unknown fields are rejected.
    model_config = ConfigDict(extra="forbid")
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
class PictureDescriptionBaseOptions(BaseOptions):
    # Settings shared by the picture-description option classes.
    batch_size: int = 8
    scale: float = 2

    # Percentage of the area for a picture to be processed with the models.
    picture_area_threshold: float = 0.05

    # Optional allow and deny lists of picture classification labels, plus a
    # minimum classification confidence. All three are unset/zero by default.
    classification_allow: Optional[List[PictureClassificationLabel]] = None
    classification_deny: Optional[List[PictureClassificationLabel]] = None
    classification_min_confidence: float = 0.0
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
class PictureDescriptionApiOptions(PictureDescriptionBaseOptions):
    # Picture-description settings for the "api" kind.
    kind: ClassVar[Literal["api"]] = "api"

    # Endpoint to call; defaults to a chat-completions URL on localhost:8000.
    url: AnyUrl = AnyUrl("http://localhost:8000/v1/chat/completions")
    # Extra request headers and parameters; both empty by default.
    headers: Dict[str, str] = {}
    params: Dict[str, Any] = {}
    # NOTE(review): timeout is presumably in seconds -- confirm at the call site.
    timeout: float = 20
    concurrency: int = 1

    prompt: str = "Describe this image in a few sentences."
    provenance: str = ""
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions):
    # Picture-description settings for the "vlm" kind.
    kind: ClassVar[Literal["vlm"]] = "vlm"

    # Required: identifier of the model repository.
    repo_id: str
    prompt: str = "Describe this image in a few sentences."
    # Config from here https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationConfig
    generation_config: Dict[str, Any] = {
        "max_new_tokens": 200,
        "do_sample": False,
    }

    @property
    def repo_cache_folder(self) -> str:
        """Return ``repo_id`` with every "/" replaced by "--"."""
        return "--".join(self.repo_id.split("/"))
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
# Ready-made picture-description presets.

# SmolVLM (keeps the default prompt of PictureDescriptionVlmOptions)
smolvlm_picture_description = PictureDescriptionVlmOptions(
    repo_id="HuggingFaceTB/SmolVLM-256M-Instruct",
)

# GraniteVision (uses its own prompt)
granite_picture_description = PictureDescriptionVlmOptions(
    prompt="What is shown in this image?",
    repo_id="ibm-granite/granite-vision-3.3-2b",
)
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
# Enum for the PDF backend options. Members are also plain `str` values.
class PdfBackend(str, Enum):
    """Enum of valid PDF backends."""

    PYPDFIUM2 = "pypdfium2"

    # docling-parse backends, by version.
    DLPARSE_V1 = "dlparse_v1"
    DLPARSE_V2 = "dlparse_v2"
    DLPARSE_V4 = "dlparse_v4"
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
# Deprecated enum for the OCR engines; the deprecation message names the
# replacement.
# NOTE(review): "tesseract_cli" / "tesseract" here do not match the `kind`
# values of TesseractCliOcrOptions ("tesseract") and TesseractOcrOptions
# ("tesserocr") in this module -- confirm before mapping between the two.
@deprecated(
    "Use get_ocr_factory().registered_kind to get a list of registered OCR engines."
)
class OcrEngine(str, Enum):
    """Enum of valid OCR engines."""

    AUTO = "auto"

    EASYOCR = "easyocr"

    TESSERACT_CLI = "tesseract_cli"
    TESSERACT = "tesseract"

    OCRMAC = "ocrmac"

    RAPIDOCR = "rapidocr"
|
|
303
|
+
|
|
304
|
+
|
|
305
|
+
class PipelineOptions(BaseOptions):
    """Base pipeline options."""

    # No timeout unless a value is supplied.
    document_timeout: Annotated[
        Optional[float],
        Field(
            description=(
                "Maximum allowed processing time for a document before timing out. "
                "If None, no timeout is enforced."
            ),
            examples=[10.0, 20.0],
        ),
    ] = None

    # The default instance is created once, when this class body is evaluated.
    accelerator_options: Annotated[
        AcceleratorOptions,
        Field(
            description=(
                "Configuration options for hardware acceleration "
                "(e.g., GPU or optimized execution settings)."
            ),
        ),
    ] = AcceleratorOptions()

    # Off by default.
    enable_remote_services: Annotated[
        bool,
        Field(
            description=(
                "Enable calling external APIs or cloud services "
                "during pipeline execution."
            ),
            examples=[False],
        ),
    ] = False

    # Off by default.
    allow_external_plugins: Annotated[
        bool,
        Field(
            description=(
                "Allow loading external third-party plugins or modules. "
                "Disabled by default for safety."
            ),
            examples=[False],
        ),
    ] = False

    # Accepts a Path or a string; None by default.
    artifacts_path: Annotated[
        Optional[Union[Path, str]],
        Field(
            description=(
                "Filesystem path where pipeline artifacts should be stored. "
                "If None, artifacts will be fetched. "
                "You can use the utility `docling-tools models download` "
                "to pre-fetch the model artifacts."
            ),
            examples=["./artifacts", "/tmp/docling_outputs"],
        ),
    ] = None
|
|
346
|
+
|
|
347
|
+
|
|
348
|
+
class ConvertPipelineOptions(PipelineOptions):
    """Base convert pipeline options."""

    # True: classify pictures in documents
    do_picture_classification: bool = False

    # True: describe pictures in documents
    do_picture_description: bool = False
    # Defaults to the SmolVLM preset defined earlier in this module.
    picture_description_options: PictureDescriptionBaseOptions = smolvlm_picture_description
|
|
357
|
+
|
|
358
|
+
|
|
359
|
+
class PaginatedPipelineOptions(ConvertPipelineOptions):
    # Image-related settings added on top of ConvertPipelineOptions.
    # Scale defaults to 1.0.
    images_scale: float = 1.0

    # Both image-generation switches are off by default.
    generate_page_images: bool = False
    generate_picture_images: bool = False
|
|
363
|
+
|
|
364
|
+
|
|
365
|
+
class VlmPipelineOptions(PaginatedPipelineOptions):
    # Overrides the inherited default: page images are generated here.
    generate_page_images: bool = True

    # (To be used with vlms, or other generative models)
    # If True, text from backend will be used instead of generated text
    force_backend_text: bool = False

    # Either inline or API model options; defaults to the Granite-Docling
    # transformers spec from vlm_model_specs.
    vlm_options: Union[InlineVlmOptions, ApiVlmOptions] = vlm_model_specs.GRANITEDOCLING_TRANSFORMERS
|
|
374
|
+
|
|
375
|
+
|
|
376
|
+
class BaseLayoutOptions(BaseOptions):
    """Base options for layout models."""

    # Whether to keep clusters that contain no text cells
    keep_empty_clusters: bool = False

    # Skip cell-to-cluster assignment for VLM-only processing
    skip_cell_assignment: bool = False
|
|
385
|
+
|
|
386
|
+
|
|
387
|
+
class LayoutOptions(BaseLayoutOptions):
    """Options for layout processing."""

    kind: ClassVar[str] = "docling_layout_default"

    # Whether to create clusters for orphaned cells
    create_orphan_clusters: bool = True

    # Layout model spec to use; defaults to the Heron spec.
    model_spec: LayoutModelConfig = DOCLING_LAYOUT_HERON
|
|
393
|
+
|
|
394
|
+
|
|
395
|
+
class AsrPipelineOptions(PipelineOptions):
    # ASR model options; defaults to the WHISPER_TINY spec. A Union with a
    # single member is that member, so the annotation is written directly.
    asr_options: InlineAsrOptions = asr_model_specs.WHISPER_TINY
|
|
397
|
+
|
|
398
|
+
|
|
399
|
+
class VlmExtractionPipelineOptions(PipelineOptions):
    """Options for extraction pipeline."""

    # Inline VLM options; defaults to the NuExtract 2B transformers spec. A
    # Union with a single member is that member, so it is written directly.
    vlm_options: InlineVlmOptions = NU_EXTRACT_2B_TRANSFORMERS
|
|
403
|
+
|
|
404
|
+
|
|
405
|
+
class PdfPipelineOptions(PaginatedPipelineOptions):
    """Options for the PDF pipeline."""

    # True: perform table structure extraction
    do_table_structure: bool = True
    # True: perform OCR, replace programmatic PDF text
    do_ocr: bool = True
    # True: perform code OCR
    do_code_enrichment: bool = False
    # True: perform formula OCR, return Latex code
    do_formula_enrichment: bool = False
    # (To be used with vlms, or other generative models)
    # If True, text from backend will be used instead of generated text
    force_backend_text: bool = False

    # Per-stage option objects. Each default instance is created once, when
    # this class body is evaluated.
    table_structure_options: BaseTableStructureOptions = TableStructureOptions()
    ocr_options: OcrOptions = OcrAutoOptions()
    layout_options: BaseLayoutOptions = LayoutOptions()

    # Re-declared with the same defaults as PaginatedPipelineOptions.
    images_scale: float = 1.0
    generate_page_images: bool = False
    generate_picture_images: bool = False
    # Deprecated field; the message names the replacement.
    generate_table_images: bool = Field(
        default=False,
        deprecated=(
            "Field `generate_table_images` is deprecated. "
            "To obtain table images, set `PdfPipelineOptions.generate_page_images = True` "
            "before conversion and then use the `TableItem.get_image` function."
        ),
    )

    generate_parsed_pages: bool = False

    # --- Arguments for threaded PDF pipeline with batching and backpressure control ---

    # Batch sizes for different stages
    ocr_batch_size: int = 4
    layout_batch_size: int = 4
    table_batch_size: int = 4

    # Timing control
    batch_polling_interval_seconds: float = 0.5

    # Backpressure and queue control
    queue_max_size: int = 100
|
|
447
|
+
|
|
448
|
+
|
|
449
|
+
class ProcessingPipeline(str, Enum):
    # String-valued identifiers for the available processing pipelines.

    # PDF pipelines.
    LEGACY = "legacy"
    STANDARD = "standard"

    # Model-driven pipelines.
    VLM = "vlm"
    ASR = "asr"
|
|
454
|
+
|
|
455
|
+
|
|
456
|
+
class ThreadedPdfPipelineOptions(PdfPipelineOptions):
    """Pipeline options for the threaded PDF pipeline with batching and backpressure control"""

    # Declares no fields of its own; the batching, polling and queue settings
    # are all inherited from PdfPipelineOptions.
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
from enum import Enum
|
|
2
|
+
from typing import Any, Dict, List, Literal, Optional, Union
|
|
3
|
+
|
|
4
|
+
from pydantic import AnyUrl, BaseModel
|
|
5
|
+
from typing_extensions import deprecated
|
|
6
|
+
|
|
7
|
+
from docling.datamodel.accelerator_options import AcceleratorDevice
|
|
8
|
+
from docling.datamodel.pipeline_options_vlm_model import (
|
|
9
|
+
# InferenceFramework,
|
|
10
|
+
TransformersModelType,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class BaseAsrOptions(BaseModel):
    # Base model for ASR options. `kind` is a required string field here;
    # InlineAsrOptions narrows it to a fixed literal.
    kind: str
    # prompt: str
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class InferenceAsrFramework(str, Enum):
    # String-valued identifiers for the ASR inference frameworks.

    MLX = "mlx"

    # TRANSFORMERS = "transformers"  # disabled for now

    WHISPER = "whisper"
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class InlineAsrOptions(BaseAsrOptions):
    # Narrows the base `kind` field to a single allowed value.
    kind: Literal["inline_model_options"] = "inline_model_options"

    # Required: identifier of the model repository.
    repo_id: str

    verbose: bool = False
    timestamps: bool = True

    # Generation-related settings.
    # NOTE(review): max_time_chunk is presumably in seconds -- confirm.
    temperature: float = 0.0
    max_new_tokens: int = 256
    max_time_chunk: float = 30.0

    torch_dtype: Optional[str] = None
    # Accelerator devices this model may run on; defaults to all four listed.
    supported_devices: List[AcceleratorDevice] = [
        AcceleratorDevice.CPU,
        AcceleratorDevice.CUDA,
        AcceleratorDevice.MPS,
        AcceleratorDevice.XPU,
    ]

    @property
    def repo_cache_folder(self) -> str:
        """Return ``repo_id`` with every "/" replaced by "--"."""
        return "--".join(self.repo_id.split("/"))
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class InlineAsrNativeWhisperOptions(InlineAsrOptions):
    # Inline ASR options preset to the WHISPER inference framework.
    inference_framework: InferenceAsrFramework = InferenceAsrFramework.WHISPER

    language: str = "en"
    # Narrower than the parent default: CPU and CUDA only.
    supported_devices: List[AcceleratorDevice] = [
        AcceleratorDevice.CPU,
        AcceleratorDevice.CUDA,
    ]
    word_timestamps: bool = True
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class InlineAsrMlxWhisperOptions(InlineAsrOptions):
    """
    MLX Whisper options for Apple Silicon optimization.

    Uses mlx-whisper library for efficient inference on Apple Silicon devices.
    """

    inference_framework: InferenceAsrFramework = InferenceAsrFramework.MLX

    language: str = "en"
    # "transcribe" or "translate"
    task: str = "transcribe"
    # MLX is optimized for Apple Silicon, so only MPS is listed.
    supported_devices: List[AcceleratorDevice] = [AcceleratorDevice.MPS]
    word_timestamps: bool = True

    # Threshold for detecting speech
    no_speech_threshold: float = 0.6
    # Log probability threshold
    logprob_threshold: float = -1.0
    # Compression ratio threshold
    compression_ratio_threshold: float = 2.4
|