docling 2.69.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling might be problematic. Click here for more details.
- docling/__init__.py +0 -0
- docling/backend/__init__.py +0 -0
- docling/backend/abstract_backend.py +84 -0
- docling/backend/asciidoc_backend.py +443 -0
- docling/backend/csv_backend.py +125 -0
- docling/backend/docling_parse_backend.py +237 -0
- docling/backend/docling_parse_v2_backend.py +276 -0
- docling/backend/docling_parse_v4_backend.py +260 -0
- docling/backend/docx/__init__.py +0 -0
- docling/backend/docx/drawingml/utils.py +131 -0
- docling/backend/docx/latex/__init__.py +0 -0
- docling/backend/docx/latex/latex_dict.py +274 -0
- docling/backend/docx/latex/omml.py +459 -0
- docling/backend/html_backend.py +1502 -0
- docling/backend/image_backend.py +188 -0
- docling/backend/json/__init__.py +0 -0
- docling/backend/json/docling_json_backend.py +58 -0
- docling/backend/md_backend.py +618 -0
- docling/backend/mets_gbs_backend.py +399 -0
- docling/backend/msexcel_backend.py +686 -0
- docling/backend/mspowerpoint_backend.py +398 -0
- docling/backend/msword_backend.py +1663 -0
- docling/backend/noop_backend.py +51 -0
- docling/backend/pdf_backend.py +82 -0
- docling/backend/pypdfium2_backend.py +417 -0
- docling/backend/webvtt_backend.py +572 -0
- docling/backend/xml/__init__.py +0 -0
- docling/backend/xml/jats_backend.py +819 -0
- docling/backend/xml/uspto_backend.py +1905 -0
- docling/chunking/__init__.py +12 -0
- docling/cli/__init__.py +0 -0
- docling/cli/main.py +974 -0
- docling/cli/models.py +196 -0
- docling/cli/tools.py +17 -0
- docling/datamodel/__init__.py +0 -0
- docling/datamodel/accelerator_options.py +69 -0
- docling/datamodel/asr_model_specs.py +494 -0
- docling/datamodel/backend_options.py +102 -0
- docling/datamodel/base_models.py +493 -0
- docling/datamodel/document.py +699 -0
- docling/datamodel/extraction.py +39 -0
- docling/datamodel/layout_model_specs.py +91 -0
- docling/datamodel/pipeline_options.py +457 -0
- docling/datamodel/pipeline_options_asr_model.py +78 -0
- docling/datamodel/pipeline_options_vlm_model.py +136 -0
- docling/datamodel/settings.py +65 -0
- docling/datamodel/vlm_model_specs.py +365 -0
- docling/document_converter.py +559 -0
- docling/document_extractor.py +327 -0
- docling/exceptions.py +10 -0
- docling/experimental/__init__.py +5 -0
- docling/experimental/datamodel/__init__.py +1 -0
- docling/experimental/datamodel/table_crops_layout_options.py +13 -0
- docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
- docling/experimental/models/__init__.py +3 -0
- docling/experimental/models/table_crops_layout_model.py +114 -0
- docling/experimental/pipeline/__init__.py +1 -0
- docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
- docling/models/__init__.py +0 -0
- docling/models/base_layout_model.py +39 -0
- docling/models/base_model.py +230 -0
- docling/models/base_ocr_model.py +241 -0
- docling/models/base_table_model.py +45 -0
- docling/models/extraction/__init__.py +0 -0
- docling/models/extraction/nuextract_transformers_model.py +305 -0
- docling/models/factories/__init__.py +47 -0
- docling/models/factories/base_factory.py +122 -0
- docling/models/factories/layout_factory.py +7 -0
- docling/models/factories/ocr_factory.py +11 -0
- docling/models/factories/picture_description_factory.py +11 -0
- docling/models/factories/table_factory.py +7 -0
- docling/models/picture_description_base_model.py +149 -0
- docling/models/plugins/__init__.py +0 -0
- docling/models/plugins/defaults.py +60 -0
- docling/models/stages/__init__.py +0 -0
- docling/models/stages/code_formula/__init__.py +0 -0
- docling/models/stages/code_formula/code_formula_model.py +342 -0
- docling/models/stages/layout/__init__.py +0 -0
- docling/models/stages/layout/layout_model.py +249 -0
- docling/models/stages/ocr/__init__.py +0 -0
- docling/models/stages/ocr/auto_ocr_model.py +132 -0
- docling/models/stages/ocr/easyocr_model.py +200 -0
- docling/models/stages/ocr/ocr_mac_model.py +145 -0
- docling/models/stages/ocr/rapid_ocr_model.py +328 -0
- docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
- docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
- docling/models/stages/page_assemble/__init__.py +0 -0
- docling/models/stages/page_assemble/page_assemble_model.py +156 -0
- docling/models/stages/page_preprocessing/__init__.py +0 -0
- docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
- docling/models/stages/picture_classifier/__init__.py +0 -0
- docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
- docling/models/stages/picture_description/__init__.py +0 -0
- docling/models/stages/picture_description/picture_description_api_model.py +66 -0
- docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
- docling/models/stages/reading_order/__init__.py +0 -0
- docling/models/stages/reading_order/readingorder_model.py +431 -0
- docling/models/stages/table_structure/__init__.py +0 -0
- docling/models/stages/table_structure/table_structure_model.py +305 -0
- docling/models/utils/__init__.py +0 -0
- docling/models/utils/generation_utils.py +157 -0
- docling/models/utils/hf_model_download.py +45 -0
- docling/models/vlm_pipeline_models/__init__.py +1 -0
- docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
- docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
- docling/models/vlm_pipeline_models/mlx_model.py +325 -0
- docling/models/vlm_pipeline_models/vllm_model.py +344 -0
- docling/pipeline/__init__.py +0 -0
- docling/pipeline/asr_pipeline.py +431 -0
- docling/pipeline/base_extraction_pipeline.py +72 -0
- docling/pipeline/base_pipeline.py +326 -0
- docling/pipeline/extraction_vlm_pipeline.py +207 -0
- docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
- docling/pipeline/simple_pipeline.py +55 -0
- docling/pipeline/standard_pdf_pipeline.py +859 -0
- docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
- docling/pipeline/vlm_pipeline.py +416 -0
- docling/py.typed +1 -0
- docling/utils/__init__.py +0 -0
- docling/utils/accelerator_utils.py +97 -0
- docling/utils/api_image_request.py +205 -0
- docling/utils/deepseekocr_utils.py +388 -0
- docling/utils/export.py +146 -0
- docling/utils/glm_utils.py +361 -0
- docling/utils/layout_postprocessor.py +683 -0
- docling/utils/locks.py +3 -0
- docling/utils/model_downloader.py +168 -0
- docling/utils/ocr_utils.py +69 -0
- docling/utils/orientation.py +65 -0
- docling/utils/profiling.py +65 -0
- docling/utils/utils.py +65 -0
- docling/utils/visualization.py +85 -0
- docling-2.69.0.dist-info/METADATA +237 -0
- docling-2.69.0.dist-info/RECORD +138 -0
- docling-2.69.0.dist-info/WHEEL +5 -0
- docling-2.69.0.dist-info/entry_points.txt +6 -0
- docling-2.69.0.dist-info/licenses/LICENSE +21 -0
- docling-2.69.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
import base64
|
|
2
|
+
import json
|
|
3
|
+
import logging
|
|
4
|
+
from io import BytesIO
|
|
5
|
+
from typing import Dict, List, Optional, Tuple
|
|
6
|
+
|
|
7
|
+
import requests
|
|
8
|
+
from PIL import Image
|
|
9
|
+
from pydantic import AnyUrl
|
|
10
|
+
|
|
11
|
+
from docling.datamodel.base_models import OpenAiApiResponse, VlmStopReason
|
|
12
|
+
from docling.models.utils.generation_utils import GenerationStopper
|
|
13
|
+
|
|
14
|
+
_log = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def api_image_request(
    image: Image.Image,
    prompt: str,
    url: AnyUrl,
    timeout: float = 20,
    headers: Optional[dict[str, str]] = None,
    **params,
) -> Tuple[str, Optional[int], VlmStopReason]:
    """Send one image+prompt request to an OpenAI-compatible chat API.

    Args:
        image: Image to send; it is re-encoded as PNG before upload.
        prompt: Text prompt accompanying the image.
        url: Endpoint of the OpenAI-compatible chat-completions API.
        timeout: Request timeout in seconds.
        headers: Optional extra HTTP headers (e.g. authorization).
        **params: Additional payload fields forwarded verbatim
            (e.g. model, temperature, max_tokens).

    Returns:
        Tuple of (generated text, total token count reported by the server,
        stop reason). On any failure — PNG encoding, HTTP, or response
        parsing — the error is logged and ``("", 0,
        VlmStopReason.UNSPECIFIED)`` is returned instead of raising.
    """
    img_io = BytesIO()
    # Copy to work around PIL images whose reported width/height is
    # inconsistent with the actual byte data, then normalize to RGBA.
    image = image.copy()
    image = image.convert("RGBA")
    try:
        image.save(img_io, "PNG")
    except Exception as e:
        _log.error(f"Error, corrupted PNG of size: {image.size}: {e}")
        return "", 0, VlmStopReason.UNSPECIFIED

    try:
        image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")

        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/png;base64,{image_base64}"
                        },
                    },
                    {
                        "type": "text",
                        "text": prompt,
                    },
                ],
            }
        ]

        payload = {
            "messages": messages,
            **params,
        }

        headers = headers or {}

        r = requests.post(
            str(url),
            headers=headers,
            json=payload,
            timeout=timeout,
        )
        if not r.ok:
            # Deliberately not raising here: a failed response falls through
            # to the parse step, whose exception path returns the fallback.
            _log.error(f"Error calling the API. Response was {r.text}")

        api_resp = OpenAiApiResponse.model_validate_json(r.text)
        generated_text = api_resp.choices[0].message.content.strip()
        num_tokens = api_resp.usage.total_tokens
        stop_reason = (
            VlmStopReason.LENGTH
            if api_resp.choices[0].finish_reason == "length"
            else VlmStopReason.END_OF_SEQUENCE
        )
        return generated_text, num_tokens, stop_reason
    except Exception as e:
        _log.error(f"Error, could not process request: {e}")
        return "", 0, VlmStopReason.UNSPECIFIED
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def api_image_request_streaming(
    image: Image.Image,
    prompt: str,
    url: AnyUrl,
    *,
    timeout: float = 20,
    headers: Optional[dict[str, str]] = None,
    generation_stoppers: list[GenerationStopper] = [],
    **params,
) -> Tuple[str, Optional[int]]:
    """
    Stream a chat completion from an OpenAI-compatible server (e.g., vLLM).
    Parses SSE lines: 'data: {json}\\n\\n', terminated by 'data: [DONE]'.
    Accumulates text and calls stopper.should_stop(window) as chunks arrive.
    If stopper triggers, the HTTP connection is closed to abort server-side generation.

    Args:
        image: Image to send; encoded as PNG.
        prompt: Text prompt accompanying the image.
        url: Endpoint of the OpenAI-compatible chat-completions API.
        timeout: Request timeout in seconds.
        headers: Optional extra HTTP headers (e.g. authorization).
        generation_stoppers: Stoppers polled on every received text chunk;
            a match aborts the stream early.
        **params: Additional payload fields forwarded verbatim.

    Returns:
        Tuple of (accumulated generated text, last total token count the
        server reported in a 'usage' field, or None if none was seen).
    """
    img_io = BytesIO()
    image.save(img_io, "PNG")
    image_b64 = base64.b64encode(img_io.getvalue()).decode("utf-8")

    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{image_b64}"},
                },
                {"type": "text", "text": prompt},
            ],
        }
    ]

    payload = {
        "messages": messages,
        "stream": True,  # <-- critical for SSE streaming
        **params,
    }

    # Debug: Log the payload to verify temperature is included
    _log.debug(f"API streaming request payload: {json.dumps(payload, indent=2)}")

    # Some servers require Accept: text/event-stream for SSE.
    # It's safe to set it; OpenAI-compatible servers tolerate it.
    hdrs = {"Accept": "text/event-stream", **(headers or {})}

    # Try to force temperature via header if server ignores payload parameter
    if "temperature" in params:
        hdrs["X-Temperature"] = str(params["temperature"])

    # Bug fix: initialize BEFORE the loop. Previously num_tokens was first
    # assigned inside the chunk loop, so a stream with no data chunks made
    # the final return raise UnboundLocalError.
    num_tokens: Optional[int] = None

    # Stream the HTTP response
    with requests.post(
        str(url), headers=hdrs, json=payload, timeout=timeout, stream=True
    ) as r:
        if not r.ok:
            _log.error(
                f"Error calling the API {url} in streaming mode. Response was {r.text}"
            )
            r.raise_for_status()

        full_text: list[str] = []
        for raw_line in r.iter_lines(decode_unicode=True):
            if not raw_line:  # keep-alives / blank lines
                continue
            if not raw_line.startswith("data:"):
                # Some proxies inject comments; ignore anything not starting with 'data:'
                continue

            data = raw_line[len("data:") :].strip()
            if data == "[DONE]":
                break

            try:
                obj = json.loads(data)
            except json.JSONDecodeError:
                _log.debug("Skipping non-JSON SSE chunk: %r", data[:200])
                continue

            # OpenAI-compatible delta format
            # obj["choices"][0]["delta"]["content"] may be None or missing (e.g., tool calls)
            try:
                delta = obj["choices"][0].get("delta") or {}
                piece = delta.get("content") or ""
            except (KeyError, IndexError) as e:
                _log.debug("Unexpected SSE chunk shape: %s", e)
                piece = ""

            # Keep the most recent token count. Previously every chunk reset
            # num_tokens to None, so a trailing chunk without 'usage'
            # clobbered a count an earlier chunk had already reported.
            try:
                if "usage" in obj and obj["usage"]:
                    num_tokens = obj["usage"].get("total_tokens")
            except Exception as e:
                _log.debug("Usage key not included in response: %s", e)

            if piece:
                full_text.append(piece)
                for stopper in generation_stoppers:
                    # Respect stopper's lookback window. We use a simple string window which
                    # works with the GenerationStopper interface.
                    lookback = max(1, stopper.lookback_tokens())
                    window = "".join(full_text)[-lookback:]
                    if stopper.should_stop(window):
                        # Returning exits the 'with' block, which closes the
                        # connection. vLLM/OpenAI-compatible servers detect
                        # the client disconnect and abort server-side.
                        return "".join(full_text), num_tokens

        return "".join(full_text), num_tokens
|
|
@@ -0,0 +1,388 @@
|
|
|
1
|
+
"""Utilities for parsing DeepSeek OCR annotated markdown format."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import re
|
|
5
|
+
from typing import Optional, Union
|
|
6
|
+
|
|
7
|
+
from docling_core.types.doc import (
|
|
8
|
+
BoundingBox,
|
|
9
|
+
CoordOrigin,
|
|
10
|
+
DocItemLabel,
|
|
11
|
+
DoclingDocument,
|
|
12
|
+
DocumentOrigin,
|
|
13
|
+
ImageRef,
|
|
14
|
+
ProvenanceItem,
|
|
15
|
+
RefItem,
|
|
16
|
+
Size,
|
|
17
|
+
TableCell,
|
|
18
|
+
TableData,
|
|
19
|
+
TextItem,
|
|
20
|
+
)
|
|
21
|
+
from lxml import etree
|
|
22
|
+
from PIL import Image as PILImage
|
|
23
|
+
|
|
24
|
+
_log = logging.getLogger(__name__)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _parse_table_html(html_content: str) -> TableData:
    """Convert an HTML ``<table>`` fragment into a ``TableData`` grid.

    The first ``<table>...</table>`` span found in *html_content* is parsed;
    ``colspan``/``rowspan`` attributes are honoured via an occupancy grid so
    that spanning cells claim their full rectangle. Any parse failure is
    logged and yields an empty table rather than an exception.

    Args:
        html_content: HTML string containing a ``<table>`` element.

    Returns:
        TableData with the parsed table structure (empty on failure).
    """
    found = re.search(
        r"<table[^>]*>.*?</table>", html_content, re.DOTALL | re.IGNORECASE
    )
    if found is None:
        # Nothing that looks like a table — return an empty structure.
        return TableData(num_rows=0, num_cols=0, table_cells=[])

    try:
        root = etree.fromstring(found.group(0), etree.HTMLParser())

        trs = root.xpath(".//tr")
        if not trs:
            return TableData(num_rows=0, num_cols=0, table_cells=[])

        n_rows = len(trs)
        # Grid width = widest row, counting each cell's colspan.
        n_cols = max(
            sum(int(td.get("colspan", "1")) for td in tr.xpath("./td | ./th"))
            for tr in trs
        )

        # Occupancy grid: a non-None entry means that slot is already
        # claimed by some (possibly spanning) cell.
        occupied: list[list[Union[None, str]]] = [
            [None] * n_cols for _ in range(n_rows)
        ]
        result = TableData(num_rows=n_rows, num_cols=n_cols, table_cells=[])

        for r_idx, tr in enumerate(trs):
            c_idx = 0
            for td in tr.xpath("./td | ./th"):
                # Skip slots claimed by rowspans from earlier rows.
                while c_idx < n_cols and occupied[r_idx][c_idx] is not None:
                    c_idx += 1
                if c_idx >= n_cols:
                    break

                cell_text = "".join(td.itertext()).strip()
                c_span = int(td.get("colspan", "1"))
                r_span = int(td.get("rowspan", "1"))
                is_header = td.tag.lower() == "th"

                # Claim the full span rectangle, clipped to the grid.
                for rr in range(r_idx, min(r_idx + r_span, n_rows)):
                    for cc in range(c_idx, min(c_idx + c_span, n_cols)):
                        occupied[rr][cc] = cell_text

                result.table_cells.append(
                    TableCell(
                        text=cell_text,
                        row_span=r_span,
                        col_span=c_span,
                        start_row_offset_idx=r_idx,
                        end_row_offset_idx=r_idx + r_span,
                        start_col_offset_idx=c_idx,
                        end_col_offset_idx=c_idx + c_span,
                        column_header=is_header and r_idx == 0,
                        row_header=is_header and c_idx == 0,
                    )
                )

                c_idx += c_span

        return result

    except Exception as e:
        _log.warning(f"Failed to parse table HTML: {e}")
        return TableData(num_rows=0, num_cols=0, table_cells=[])
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def _collect_annotation_content(
|
|
123
|
+
lines: list[str],
|
|
124
|
+
i: int,
|
|
125
|
+
label_str: str,
|
|
126
|
+
annotation_pattern: str,
|
|
127
|
+
visited_lines: set[int],
|
|
128
|
+
) -> tuple[str, int]:
|
|
129
|
+
"""Collect content for an annotation.
|
|
130
|
+
|
|
131
|
+
Args:
|
|
132
|
+
lines: All lines from the document
|
|
133
|
+
i: Current line index (after annotation line)
|
|
134
|
+
label_str: The annotation label (e.g., 'table', 'text')
|
|
135
|
+
annotation_pattern: Regex pattern to match annotations
|
|
136
|
+
visited_lines: Set of already visited line indices
|
|
137
|
+
|
|
138
|
+
Returns:
|
|
139
|
+
Tuple of (content string, next line index)
|
|
140
|
+
"""
|
|
141
|
+
content_lines = []
|
|
142
|
+
|
|
143
|
+
# Special handling for table: extract only <table>...</table>
|
|
144
|
+
if label_str == "table":
|
|
145
|
+
table_started = False
|
|
146
|
+
ii = i
|
|
147
|
+
while ii < len(lines):
|
|
148
|
+
line = lines[ii]
|
|
149
|
+
if "<table" in line.lower():
|
|
150
|
+
table_started = True
|
|
151
|
+
if table_started:
|
|
152
|
+
visited_lines.add(ii)
|
|
153
|
+
content_lines.append(line.rstrip())
|
|
154
|
+
if table_started and "</table>" in line.lower():
|
|
155
|
+
break
|
|
156
|
+
ii += 1
|
|
157
|
+
else:
|
|
158
|
+
# Original logic for other labels
|
|
159
|
+
while i < len(lines):
|
|
160
|
+
content_line = lines[i].strip()
|
|
161
|
+
if content_line:
|
|
162
|
+
if re.match(annotation_pattern, content_line):
|
|
163
|
+
break
|
|
164
|
+
visited_lines.add(i)
|
|
165
|
+
content_lines.append(lines[i].rstrip())
|
|
166
|
+
i += 1
|
|
167
|
+
if label_str not in ["figure", "image"]:
|
|
168
|
+
break
|
|
169
|
+
else:
|
|
170
|
+
i += 1
|
|
171
|
+
if content_lines:
|
|
172
|
+
break
|
|
173
|
+
|
|
174
|
+
return "\n".join(content_lines), i
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def _strip_md_heading(content: str) -> tuple[str, int]:
    """Strip leading markdown '#' characters from *content*.

    Returns:
        Tuple of (content without the leading hashes, stripped of
        surrounding whitespace, number of leading '#' characters).
        If *content* does not start with '#', it is returned unchanged
        with a count of 0.
    """
    if not content.startswith("#"):
        return content, 0
    hash_count = 0
    for char in content:
        if char == "#":
            hash_count += 1
        else:
            break
    return content[hash_count:].strip(), hash_count


def _process_annotation_item(
    label_str: str,
    content: str,
    prov: ProvenanceItem,
    caption_item: Optional[Union[TextItem, RefItem]],
    page_doc: DoclingDocument,
    label_map: dict[str, DocItemLabel],
) -> None:
    """Process and add a single annotation item to the document.

    Dispatches on *label_str*: pictures and tables are added with an
    optional linked caption; titles and sub-titles get their markdown
    '#' prefixes stripped; everything else becomes a text item.

    Args:
        label_str: The annotation label.
        content: The content text.
        prov: Provenance information.
        caption_item: Optional caption item to link.
        page_doc: Document to add item to.
        label_map: Mapping of label strings to DocItemLabel.
    """
    doc_label = label_map.get(label_str, DocItemLabel.TEXT)

    if label_str in ["figure", "image"]:
        page_doc.add_picture(caption=caption_item, prov=prov)
    elif label_str == "table":
        table_data = _parse_table_html(content)
        page_doc.add_table(data=table_data, caption=caption_item, prov=prov)
    elif label_str == "title":
        clean_content, _ = _strip_md_heading(content)
        page_doc.add_title(text=clean_content, prov=prov)
    elif label_str == "sub_title":
        clean_content, hash_count = _strip_md_heading(content)
        # '## Heading' -> level 1, '### Heading' -> level 2, etc.;
        # no or a single '#' maps to level 1.
        heading_level = hash_count - 1 if hash_count > 1 else 1
        page_doc.add_heading(text=clean_content, level=heading_level, prov=prov)
    else:
        page_doc.add_text(label=doc_label, text=content, prov=prov)
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def parse_deepseekocr_markdown(
    content: str,
    original_page_size: Size,
    page_no: int,
    filename: str = "file",
    page_image: Optional[PILImage.Image] = None,
) -> DoclingDocument:
    """Parse DeepSeek OCR markdown with label[[x1, y1, x2, y2]] format.

    This function parses markdown content that has been annotated with bounding box
    coordinates for different document elements. Annotations are collected in a
    first pass, then emitted in a second pass so that a caption annotation
    immediately following its table/figure/image can be linked to it.

    Labels supported:
    - text: Standard body text
    - title: Main document or section titles
    - sub_title: Secondary headings or sub-headers
    - table: Tabular data
    - table_caption: Descriptive text for tables
    - figure: Image-based elements or diagrams
    - figure_caption: Titles or descriptions for figures/images
    - header / footer: Content at top or bottom margins of pages

    Args:
        content: The annotated markdown content string
        original_page_size: Target page size; annotation coordinates (in a
            0-1000 grid) are scaled into this coordinate system
        page_no: Page number assigned to the produced page and provenance
        filename: Source filename (default: "file")
        page_image: Optional PIL Image of the page

    Returns:
        DoclingDocument with parsed content
    """
    # Label mapping
    label_map = {
        "text": DocItemLabel.TEXT,
        "title": DocItemLabel.TITLE,
        "sub_title": DocItemLabel.SECTION_HEADER,
        "table": DocItemLabel.TABLE,
        "table_caption": DocItemLabel.CAPTION,
        "figure": DocItemLabel.PICTURE,
        "figure_caption": DocItemLabel.CAPTION,
        "image": DocItemLabel.PICTURE,
        "image_caption": DocItemLabel.CAPTION,
        "header": DocItemLabel.PAGE_HEADER,
        "footer": DocItemLabel.PAGE_FOOTER,
    }

    # Pattern to match: <|ref|>label<|/ref|><|det|>[[x1, y1, x2, y2]]<|/det|> or label[[x1, y1, x2, y2]]
    annotation_pattern = r"^(?:<\|ref\|>)?(\w+)(?:<\|/ref\|>)?(?:<\|det\|>)?\[\[([0-9., ]+)\]\](?:<\|/det\|>)?\s*$"

    # Create a new document
    origin = DocumentOrigin(
        filename=filename,
        mimetype="text/markdown",
        binary_hash=0,
    )
    page_doc = DoclingDocument(name=filename.rsplit(".", 1)[0], origin=origin)

    # Get page dimensions - use original page size if provided, otherwise image size
    pg_width = original_page_size.width
    pg_height = original_page_size.height

    # Calculate scale factor for bbox conversion
    # VLM produces bboxes in unit of 1000
    scale_x = pg_width / 1000
    scale_y = pg_height / 1000

    # Calculate DPI for the image (72 dpi baseline scaled by the ratio of
    # image pixels to page width)
    image_dpi = 72
    if page_image is not None:
        image_dpi = int(72 * page_image.width / pg_width)

    # Add page metadata
    page_doc.add_page(
        page_no=page_no,
        size=Size(width=pg_width, height=pg_height),
        image=ImageRef.from_pil(image=page_image, dpi=image_dpi)
        if page_image
        else None,
    )

    # Split into lines and parse - collect all annotations first.
    # visited_lines records content lines already consumed by
    # _collect_annotation_content so they are not scanned twice.
    lines = content.split("\n")
    annotations: list[tuple[str, str, ProvenanceItem]] = []
    i = 0
    visited_lines: set[int] = set()

    while i < len(lines):
        if i in visited_lines:
            i += 1
            continue

        line = lines[i].strip()
        match = re.match(annotation_pattern, line)
        if match:
            label_str = match.group(1)
            coords_str = match.group(2)

            try:
                coords = [float(x.strip()) for x in coords_str.split(",")]
                if len(coords) == 4:
                    # Scale bounding box from image coordinates to original page coordinates
                    bbox = BoundingBox(
                        l=coords[0] * scale_x,
                        t=coords[1] * scale_y,
                        r=coords[2] * scale_x,
                        b=coords[3] * scale_y,
                        coord_origin=CoordOrigin.TOPLEFT,
                    )
                    prov = ProvenanceItem(page_no=page_no, bbox=bbox, charspan=[0, 0])

                    # Get the content (next non-empty line)
                    i += 1
                    content_text, i = _collect_annotation_content(
                        lines, i, label_str, annotation_pattern, visited_lines
                    )
                    annotations.append((label_str, content_text, prov))
                    continue
            except (ValueError, IndexError):
                # Malformed coordinates — treat the line as plain text and move on.
                pass
        i += 1

    # Process annotations and link captions that appear AFTER tables/figures
    for idx, (label_str, content_text, prov) in enumerate(annotations):
        # Check if NEXT annotation is a caption for this table/figure/image
        # (caption appears AFTER table in the file: table[[...]] then table_caption[[...]])
        caption_item = None
        if label_str in ["table", "figure", "image"] and idx + 1 < len(annotations):
            next_label, next_content, next_prov = annotations[idx + 1]
            if (
                (label_str == "table" and next_label == "table_caption")
                or (label_str == "figure" and next_label == "figure_caption")
                or (label_str == "image" and next_label == "image_caption")
            ):
                # Create caption item
                caption_label = label_map.get(next_label, DocItemLabel.CAPTION)
                caption_item = page_doc.add_text(
                    label=caption_label,
                    text=next_content,
                    prov=next_prov,
                )

        # Skip if this is a caption that was already processed (i.e. it was
        # attached to the immediately preceding table/figure/image above)
        if label_str in ["figure_caption", "table_caption", "image_caption"]:
            if idx > 0:
                prev_label = annotations[idx - 1][0]
                if (
                    (label_str == "table_caption" and prev_label == "table")
                    or (label_str == "figure_caption" and prev_label == "figure")
                    or (label_str == "image_caption" and prev_label == "image")
                ):
                    continue

        # Add the item
        _process_annotation_item(
            label_str, content_text, prov, caption_item, page_doc, label_map
        )

    return page_doc
|