docling 2.69.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling might be problematic. Click here for more details.

Files changed (138)
  1. docling/__init__.py +0 -0
  2. docling/backend/__init__.py +0 -0
  3. docling/backend/abstract_backend.py +84 -0
  4. docling/backend/asciidoc_backend.py +443 -0
  5. docling/backend/csv_backend.py +125 -0
  6. docling/backend/docling_parse_backend.py +237 -0
  7. docling/backend/docling_parse_v2_backend.py +276 -0
  8. docling/backend/docling_parse_v4_backend.py +260 -0
  9. docling/backend/docx/__init__.py +0 -0
  10. docling/backend/docx/drawingml/utils.py +131 -0
  11. docling/backend/docx/latex/__init__.py +0 -0
  12. docling/backend/docx/latex/latex_dict.py +274 -0
  13. docling/backend/docx/latex/omml.py +459 -0
  14. docling/backend/html_backend.py +1502 -0
  15. docling/backend/image_backend.py +188 -0
  16. docling/backend/json/__init__.py +0 -0
  17. docling/backend/json/docling_json_backend.py +58 -0
  18. docling/backend/md_backend.py +618 -0
  19. docling/backend/mets_gbs_backend.py +399 -0
  20. docling/backend/msexcel_backend.py +686 -0
  21. docling/backend/mspowerpoint_backend.py +398 -0
  22. docling/backend/msword_backend.py +1663 -0
  23. docling/backend/noop_backend.py +51 -0
  24. docling/backend/pdf_backend.py +82 -0
  25. docling/backend/pypdfium2_backend.py +417 -0
  26. docling/backend/webvtt_backend.py +572 -0
  27. docling/backend/xml/__init__.py +0 -0
  28. docling/backend/xml/jats_backend.py +819 -0
  29. docling/backend/xml/uspto_backend.py +1905 -0
  30. docling/chunking/__init__.py +12 -0
  31. docling/cli/__init__.py +0 -0
  32. docling/cli/main.py +974 -0
  33. docling/cli/models.py +196 -0
  34. docling/cli/tools.py +17 -0
  35. docling/datamodel/__init__.py +0 -0
  36. docling/datamodel/accelerator_options.py +69 -0
  37. docling/datamodel/asr_model_specs.py +494 -0
  38. docling/datamodel/backend_options.py +102 -0
  39. docling/datamodel/base_models.py +493 -0
  40. docling/datamodel/document.py +699 -0
  41. docling/datamodel/extraction.py +39 -0
  42. docling/datamodel/layout_model_specs.py +91 -0
  43. docling/datamodel/pipeline_options.py +457 -0
  44. docling/datamodel/pipeline_options_asr_model.py +78 -0
  45. docling/datamodel/pipeline_options_vlm_model.py +136 -0
  46. docling/datamodel/settings.py +65 -0
  47. docling/datamodel/vlm_model_specs.py +365 -0
  48. docling/document_converter.py +559 -0
  49. docling/document_extractor.py +327 -0
  50. docling/exceptions.py +10 -0
  51. docling/experimental/__init__.py +5 -0
  52. docling/experimental/datamodel/__init__.py +1 -0
  53. docling/experimental/datamodel/table_crops_layout_options.py +13 -0
  54. docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
  55. docling/experimental/models/__init__.py +3 -0
  56. docling/experimental/models/table_crops_layout_model.py +114 -0
  57. docling/experimental/pipeline/__init__.py +1 -0
  58. docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
  59. docling/models/__init__.py +0 -0
  60. docling/models/base_layout_model.py +39 -0
  61. docling/models/base_model.py +230 -0
  62. docling/models/base_ocr_model.py +241 -0
  63. docling/models/base_table_model.py +45 -0
  64. docling/models/extraction/__init__.py +0 -0
  65. docling/models/extraction/nuextract_transformers_model.py +305 -0
  66. docling/models/factories/__init__.py +47 -0
  67. docling/models/factories/base_factory.py +122 -0
  68. docling/models/factories/layout_factory.py +7 -0
  69. docling/models/factories/ocr_factory.py +11 -0
  70. docling/models/factories/picture_description_factory.py +11 -0
  71. docling/models/factories/table_factory.py +7 -0
  72. docling/models/picture_description_base_model.py +149 -0
  73. docling/models/plugins/__init__.py +0 -0
  74. docling/models/plugins/defaults.py +60 -0
  75. docling/models/stages/__init__.py +0 -0
  76. docling/models/stages/code_formula/__init__.py +0 -0
  77. docling/models/stages/code_formula/code_formula_model.py +342 -0
  78. docling/models/stages/layout/__init__.py +0 -0
  79. docling/models/stages/layout/layout_model.py +249 -0
  80. docling/models/stages/ocr/__init__.py +0 -0
  81. docling/models/stages/ocr/auto_ocr_model.py +132 -0
  82. docling/models/stages/ocr/easyocr_model.py +200 -0
  83. docling/models/stages/ocr/ocr_mac_model.py +145 -0
  84. docling/models/stages/ocr/rapid_ocr_model.py +328 -0
  85. docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
  86. docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
  87. docling/models/stages/page_assemble/__init__.py +0 -0
  88. docling/models/stages/page_assemble/page_assemble_model.py +156 -0
  89. docling/models/stages/page_preprocessing/__init__.py +0 -0
  90. docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
  91. docling/models/stages/picture_classifier/__init__.py +0 -0
  92. docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
  93. docling/models/stages/picture_description/__init__.py +0 -0
  94. docling/models/stages/picture_description/picture_description_api_model.py +66 -0
  95. docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
  96. docling/models/stages/reading_order/__init__.py +0 -0
  97. docling/models/stages/reading_order/readingorder_model.py +431 -0
  98. docling/models/stages/table_structure/__init__.py +0 -0
  99. docling/models/stages/table_structure/table_structure_model.py +305 -0
  100. docling/models/utils/__init__.py +0 -0
  101. docling/models/utils/generation_utils.py +157 -0
  102. docling/models/utils/hf_model_download.py +45 -0
  103. docling/models/vlm_pipeline_models/__init__.py +1 -0
  104. docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
  105. docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
  106. docling/models/vlm_pipeline_models/mlx_model.py +325 -0
  107. docling/models/vlm_pipeline_models/vllm_model.py +344 -0
  108. docling/pipeline/__init__.py +0 -0
  109. docling/pipeline/asr_pipeline.py +431 -0
  110. docling/pipeline/base_extraction_pipeline.py +72 -0
  111. docling/pipeline/base_pipeline.py +326 -0
  112. docling/pipeline/extraction_vlm_pipeline.py +207 -0
  113. docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
  114. docling/pipeline/simple_pipeline.py +55 -0
  115. docling/pipeline/standard_pdf_pipeline.py +859 -0
  116. docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
  117. docling/pipeline/vlm_pipeline.py +416 -0
  118. docling/py.typed +1 -0
  119. docling/utils/__init__.py +0 -0
  120. docling/utils/accelerator_utils.py +97 -0
  121. docling/utils/api_image_request.py +205 -0
  122. docling/utils/deepseekocr_utils.py +388 -0
  123. docling/utils/export.py +146 -0
  124. docling/utils/glm_utils.py +361 -0
  125. docling/utils/layout_postprocessor.py +683 -0
  126. docling/utils/locks.py +3 -0
  127. docling/utils/model_downloader.py +168 -0
  128. docling/utils/ocr_utils.py +69 -0
  129. docling/utils/orientation.py +65 -0
  130. docling/utils/profiling.py +65 -0
  131. docling/utils/utils.py +65 -0
  132. docling/utils/visualization.py +85 -0
  133. docling-2.69.0.dist-info/METADATA +237 -0
  134. docling-2.69.0.dist-info/RECORD +138 -0
  135. docling-2.69.0.dist-info/WHEEL +5 -0
  136. docling-2.69.0.dist-info/entry_points.txt +6 -0
  137. docling-2.69.0.dist-info/licenses/LICENSE +21 -0
  138. docling-2.69.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,205 @@
1
+ import base64
2
+ import json
3
+ import logging
4
+ from io import BytesIO
5
+ from typing import Dict, List, Optional, Tuple
6
+
7
+ import requests
8
+ from PIL import Image
9
+ from pydantic import AnyUrl
10
+
11
+ from docling.datamodel.base_models import OpenAiApiResponse, VlmStopReason
12
+ from docling.models.utils.generation_utils import GenerationStopper
13
+
14
+ _log = logging.getLogger(__name__)
15
+
16
+
17
def api_image_request(
    image: Image.Image,
    prompt: str,
    url: AnyUrl,
    timeout: float = 20,
    headers: Optional[dict[str, str]] = None,
    **params,
) -> Tuple[str, Optional[int], VlmStopReason]:
    """Send one image+prompt chat-completion request to an OpenAI-compatible API.

    Args:
        image: Image to send; re-encoded as PNG before upload.
        prompt: User prompt accompanying the image.
        url: Endpoint of the OpenAI-compatible chat-completions API.
        timeout: HTTP request timeout in seconds.
        headers: Optional extra HTTP headers (e.g. authorization).
        **params: Additional payload parameters forwarded verbatim
            (e.g. model, temperature, max_tokens).

    Returns:
        Tuple of (generated text, total token count, stop reason). On any
        encoding or request failure, returns ("", 0, VlmStopReason.UNSPECIFIED).
    """
    img_io = BytesIO()
    image = (
        image.copy()
    )  # Fix for inconsistent PIL image width/height to actual byte data
    image = image.convert("RGBA")
    try:
        image.save(img_io, "PNG")
    except Exception as e:
        # Fixed typo in log message ("corrupter" -> "corrupted").
        _log.error(f"Error, corrupted PNG of size: {image.size}: {e}")
        return "", 0, VlmStopReason.UNSPECIFIED

    try:
        image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")

        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/png;base64,{image_base64}"
                        },
                    },
                    {
                        "type": "text",
                        "text": prompt,
                    },
                ],
            }
        ]

        payload = {
            "messages": messages,
            **params,
        }

        r = requests.post(
            str(url),
            headers=headers or {},
            json=payload,
            timeout=timeout,
        )
        if not r.ok:
            # Log and fall through: the response body is still parsed so the
            # caller gets the structured failure return below on a bad payload.
            _log.error(f"Error calling the API. Response was {r.text}")

        api_resp = OpenAiApiResponse.model_validate_json(r.text)
        generated_text = api_resp.choices[0].message.content.strip()
        num_tokens = api_resp.usage.total_tokens
        stop_reason = (
            VlmStopReason.LENGTH
            if api_resp.choices[0].finish_reason == "length"
            else VlmStopReason.END_OF_SEQUENCE
        )
        return generated_text, num_tokens, stop_reason
    except Exception as e:
        _log.error(f"Error, could not process request: {e}")
        return "", 0, VlmStopReason.UNSPECIFIED
92
+
93
+
94
def api_image_request_streaming(
    image: Image.Image,
    prompt: str,
    url: AnyUrl,
    *,
    timeout: float = 20,
    headers: Optional[dict[str, str]] = None,
    generation_stoppers: Optional[list[GenerationStopper]] = None,
    **params,
) -> Tuple[str, Optional[int]]:
    """
    Stream a chat completion from an OpenAI-compatible server (e.g., vLLM).

    Parses SSE lines: 'data: {json}\\n\\n', terminated by 'data: [DONE]'.
    Accumulates text and calls stopper.should_stop(window) as chunks arrive.
    If a stopper triggers, the HTTP connection is closed to abort server-side
    generation.

    Args:
        image: Image to send; re-encoded as PNG before upload.
        prompt: User prompt accompanying the image.
        url: Endpoint of the OpenAI-compatible chat-completions API.
        timeout: HTTP request timeout in seconds.
        headers: Optional extra HTTP headers (e.g. authorization).
        generation_stoppers: Stoppers polled on each streamed chunk; any one
            returning True aborts the stream early. Defaults to no stoppers.
        **params: Additional payload parameters forwarded verbatim.

    Returns:
        Tuple of (accumulated generated text, total token count if the server
        reported usage in any streamed chunk, else None).
    """
    # Avoid the mutable-default-argument pitfall; None means "no stoppers".
    stoppers = generation_stoppers or []

    img_io = BytesIO()
    image.save(img_io, "PNG")
    image_b64 = base64.b64encode(img_io.getvalue()).decode("utf-8")

    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{image_b64}"},
                },
                {"type": "text", "text": prompt},
            ],
        }
    ]

    payload = {
        "messages": messages,
        "stream": True,  # <-- critical for SSE streaming
        **params,
    }

    # Debug: Log the payload to verify temperature is included
    _log.debug(f"API streaming request payload: {json.dumps(payload, indent=2)}")

    # Some servers require Accept: text/event-stream for SSE.
    # It's safe to set it; OpenAI-compatible servers tolerate it.
    hdrs = {"Accept": "text/event-stream", **(headers or {})}

    # Try to force temperature via header if server ignores payload parameter
    if "temperature" in params:
        hdrs["X-Temperature"] = str(params["temperature"])

    # Stream the HTTP response
    with requests.post(
        str(url), headers=hdrs, json=payload, timeout=timeout, stream=True
    ) as r:
        if not r.ok:
            _log.error(
                f"Error calling the API {url} in streaming mode. Response was {r.text}"
            )
            r.raise_for_status()

        full_text: list[str] = []
        # BUGFIX: initialize before the loop. Previously num_tokens was only
        # ever assigned inside the loop body, so an empty stream raised
        # NameError at the final return, and every chunk reset it to None,
        # discarding a usage report delivered in an earlier chunk.
        num_tokens: Optional[int] = None

        for raw_line in r.iter_lines(decode_unicode=True):
            if not raw_line:  # keep-alives / blank lines
                continue
            if not raw_line.startswith("data:"):
                # Some proxies inject comments; ignore anything not starting with 'data:'
                continue

            data = raw_line[len("data:") :].strip()
            if data == "[DONE]":
                break

            try:
                obj = json.loads(data)
            except json.JSONDecodeError:
                _log.debug("Skipping non-JSON SSE chunk: %r", data[:200])
                continue

            # OpenAI-compatible delta format
            # obj["choices"][0]["delta"]["content"] may be None or missing (e.g., tool calls)
            try:
                delta = obj["choices"][0].get("delta") or {}
                piece = delta.get("content") or ""
            except (KeyError, IndexError) as e:
                _log.debug("Unexpected SSE chunk shape: %s", e)
                piece = ""

            # Remember the most recent usage report (servers typically send it
            # in the final chunk) instead of resetting it on every chunk.
            usage = obj.get("usage")
            if isinstance(usage, dict):
                reported = usage.get("total_tokens")
                if reported is not None:
                    num_tokens = reported

            if piece:
                full_text.append(piece)
                for stopper in stoppers:
                    # Respect stopper's lookback window. We use a simple string window which
                    # works with the GenerationStopper interface.
                    lookback = max(1, stopper.lookback_tokens())
                    window = "".join(full_text)[-lookback:]
                    if stopper.should_stop(window):
                        # Returning exits the 'with' block, which closes the
                        # connection; vLLM/OpenAI-compatible servers detect the
                        # client disconnect and abort the request server-side.
                        return "".join(full_text), num_tokens

        return "".join(full_text), num_tokens
@@ -0,0 +1,388 @@
1
+ """Utilities for parsing DeepSeek OCR annotated markdown format."""
2
+
3
+ import logging
4
+ import re
5
+ from typing import Optional, Union
6
+
7
+ from docling_core.types.doc import (
8
+ BoundingBox,
9
+ CoordOrigin,
10
+ DocItemLabel,
11
+ DoclingDocument,
12
+ DocumentOrigin,
13
+ ImageRef,
14
+ ProvenanceItem,
15
+ RefItem,
16
+ Size,
17
+ TableCell,
18
+ TableData,
19
+ TextItem,
20
+ )
21
+ from lxml import etree
22
+ from PIL import Image as PILImage
23
+
24
+ _log = logging.getLogger(__name__)
25
+
26
+
27
def _parse_table_html(html_content: str) -> TableData:
    """Convert an HTML ``<table>`` fragment into a docling TableData grid.

    Args:
        html_content: HTML string containing a ``<table>`` element, possibly
            wrapped in other content.

    Returns:
        TableData with the parsed table structure; an empty TableData when no
        table is found or parsing fails.
    """
    table_match = re.search(
        r"<table[^>]*>.*?</table>", html_content, re.DOTALL | re.IGNORECASE
    )
    if table_match is None:
        # No table found, return empty table
        return TableData(num_rows=0, num_cols=0, table_cells=[])

    try:
        # Parse the extracted fragment with lxml's forgiving HTML parser.
        root = etree.fromstring(table_match.group(0), etree.HTMLParser())

        tr_elements = root.xpath(".//tr")
        if not tr_elements:
            return TableData(num_rows=0, num_cols=0, table_cells=[])

        total_rows = len(tr_elements)

        # First pass: the column count is the widest row, counting colspans.
        total_cols = 0
        for tr in tr_elements:
            row_width = sum(
                int(td.get("colspan", "1")) for td in tr.xpath("./td | ./th")
            )
            total_cols = max(total_cols, row_width)

        # Occupancy grid: marks positions claimed by row/col spans so later
        # cells are shifted past them.
        occupancy: list[list[Union[None, str]]] = [
            [None] * total_cols for _ in range(total_rows)
        ]
        result = TableData(num_rows=total_rows, num_cols=total_cols, table_cells=[])

        # Second pass: place each cell at the first free column of its row.
        for r, tr in enumerate(tr_elements):
            c = 0
            for cell_el in tr.xpath("./td | ./th"):
                while c < total_cols and occupancy[r][c] is not None:
                    c += 1
                if c >= total_cols:
                    break

                cell_text = "".join(cell_el.itertext()).strip()
                span_cols = int(cell_el.get("colspan", "1"))
                span_rows = int(cell_el.get("rowspan", "1"))
                header_cell = cell_el.tag.lower() == "th"

                # Claim every grid position covered by this cell's spans.
                for rr in range(r, min(r + span_rows, total_rows)):
                    for cc in range(c, min(c + span_cols, total_cols)):
                        occupancy[rr][cc] = cell_text

                result.table_cells.append(
                    TableCell(
                        text=cell_text,
                        row_span=span_rows,
                        col_span=span_cols,
                        start_row_offset_idx=r,
                        end_row_offset_idx=r + span_rows,
                        start_col_offset_idx=c,
                        end_col_offset_idx=c + span_cols,
                        column_header=header_cell and r == 0,
                        row_header=header_cell and c == 0,
                    )
                )
                c += span_cols

        return result

    except Exception as e:
        _log.warning(f"Failed to parse table HTML: {e}")
        return TableData(num_rows=0, num_cols=0, table_cells=[])
120
+
121
+
122
+ def _collect_annotation_content(
123
+ lines: list[str],
124
+ i: int,
125
+ label_str: str,
126
+ annotation_pattern: str,
127
+ visited_lines: set[int],
128
+ ) -> tuple[str, int]:
129
+ """Collect content for an annotation.
130
+
131
+ Args:
132
+ lines: All lines from the document
133
+ i: Current line index (after annotation line)
134
+ label_str: The annotation label (e.g., 'table', 'text')
135
+ annotation_pattern: Regex pattern to match annotations
136
+ visited_lines: Set of already visited line indices
137
+
138
+ Returns:
139
+ Tuple of (content string, next line index)
140
+ """
141
+ content_lines = []
142
+
143
+ # Special handling for table: extract only <table>...</table>
144
+ if label_str == "table":
145
+ table_started = False
146
+ ii = i
147
+ while ii < len(lines):
148
+ line = lines[ii]
149
+ if "<table" in line.lower():
150
+ table_started = True
151
+ if table_started:
152
+ visited_lines.add(ii)
153
+ content_lines.append(line.rstrip())
154
+ if table_started and "</table>" in line.lower():
155
+ break
156
+ ii += 1
157
+ else:
158
+ # Original logic for other labels
159
+ while i < len(lines):
160
+ content_line = lines[i].strip()
161
+ if content_line:
162
+ if re.match(annotation_pattern, content_line):
163
+ break
164
+ visited_lines.add(i)
165
+ content_lines.append(lines[i].rstrip())
166
+ i += 1
167
+ if label_str not in ["figure", "image"]:
168
+ break
169
+ else:
170
+ i += 1
171
+ if content_lines:
172
+ break
173
+
174
+ return "\n".join(content_lines), i
175
+
176
+
177
def _count_leading_hashes(text: str) -> int:
    """Return the number of consecutive '#' characters at the start of *text*."""
    count = 0
    for char in text:
        if char != "#":
            break
        count += 1
    return count


def _process_annotation_item(
    label_str: str,
    content: str,
    prov: ProvenanceItem,
    caption_item: Optional[Union[TextItem, RefItem]],
    page_doc: DoclingDocument,
    label_map: dict[str, DocItemLabel],
) -> None:
    """Process and add a single annotation item to the document.

    Markdown heading markers ('#') are stripped from title/sub_title content;
    the duplicated hash-counting logic is factored into _count_leading_hashes.

    Args:
        label_str: The annotation label
        content: The content text
        prov: Provenance information
        caption_item: Optional caption item to link
        page_doc: Document to add item to
        label_map: Mapping of label strings to DocItemLabel
    """
    doc_label = label_map.get(label_str, DocItemLabel.TEXT)

    if label_str in ["figure", "image"]:
        page_doc.add_picture(caption=caption_item, prov=prov)
    elif label_str == "table":
        table_data = _parse_table_html(content)
        page_doc.add_table(data=table_data, caption=caption_item, prov=prov)
    elif label_str == "title":
        clean_content = content
        if content.startswith("#"):
            clean_content = content[_count_leading_hashes(content):].strip()
        page_doc.add_title(text=clean_content, prov=prov)
    elif label_str == "sub_title":
        heading_level = 1
        clean_content = content
        if content.startswith("#"):
            hash_count = _count_leading_hashes(content)
            # '##' maps to level 1, '###' to level 2, etc.; a single '#'
            # keeps the default level 1.
            if hash_count > 1:
                heading_level = hash_count - 1
            clean_content = content[hash_count:].strip()
        page_doc.add_heading(text=clean_content, level=heading_level, prov=prov)
    else:
        page_doc.add_text(label=doc_label, text=content, prov=prov)
229
+
230
+
231
def parse_deepseekocr_markdown(
    content: str,
    original_page_size: Size,
    page_no: int,
    filename: str = "file",
    page_image: Optional[PILImage.Image] = None,
) -> DoclingDocument:
    """Parse DeepSeek OCR markdown with label[[x1, y1, x2, y2]] format.

    This function parses markdown content that has been annotated with bounding box
    coordinates for different document elements.

    Labels supported:
    - text: Standard body text
    - title: Main document or section titles
    - sub_title: Secondary headings or sub-headers
    - table: Tabular data
    - table_caption: Descriptive text for tables
    - figure: Image-based elements or diagrams
    - figure_caption: Titles or descriptions for figures/images
    - header / footer: Content at top or bottom margins of pages

    Args:
        content: The annotated markdown content string
        original_page_size: Size of the original page; bounding boxes are
            scaled from the model's 1000-unit coordinate space to this size
        page_no: Page number used for page metadata and provenance
        filename: Source filename (default: "file")
        page_image: Optional PIL Image of the page, attached as the page image

    Returns:
        DoclingDocument with parsed content
    """
    # Label mapping from annotation labels to docling item labels; unknown
    # labels fall back to DocItemLabel.TEXT in _process_annotation_item.
    label_map = {
        "text": DocItemLabel.TEXT,
        "title": DocItemLabel.TITLE,
        "sub_title": DocItemLabel.SECTION_HEADER,
        "table": DocItemLabel.TABLE,
        "table_caption": DocItemLabel.CAPTION,
        "figure": DocItemLabel.PICTURE,
        "figure_caption": DocItemLabel.CAPTION,
        "image": DocItemLabel.PICTURE,
        "image_caption": DocItemLabel.CAPTION,
        "header": DocItemLabel.PAGE_HEADER,
        "footer": DocItemLabel.PAGE_FOOTER,
    }

    # Pattern to match: <|ref|>label<|/ref|><|det|>[[x1, y1, x2, y2]]<|/det|> or label[[x1, y1, x2, y2]]
    annotation_pattern = r"^(?:<\|ref\|>)?(\w+)(?:<\|/ref\|>)?(?:<\|det\|>)?\[\[([0-9., ]+)\]\](?:<\|/det\|>)?\s*$"

    # Create a new document; binary_hash is 0 because no source binary is
    # available for OCR-derived markdown.
    origin = DocumentOrigin(
        filename=filename,
        mimetype="text/markdown",
        binary_hash=0,
    )
    page_doc = DoclingDocument(name=filename.rsplit(".", 1)[0], origin=origin)

    # Page dimensions always come from the caller-supplied original page size.
    pg_width = original_page_size.width
    pg_height = original_page_size.height

    # Calculate scale factor for bbox conversion.
    # The VLM produces bboxes in a 1000-unit coordinate space.
    scale_x = pg_width / 1000
    scale_y = pg_height / 1000

    # Calculate DPI for the image: 72 dpi corresponds to a 1:1 mapping between
    # image pixels and page units; a larger image raises the effective dpi.
    image_dpi = 72
    if page_image is not None:
        image_dpi = int(72 * page_image.width / pg_width)

    # Add page metadata
    page_doc.add_page(
        page_no=page_no,
        size=Size(width=pg_width, height=pg_height),
        image=ImageRef.from_pil(image=page_image, dpi=image_dpi)
        if page_image
        else None,
    )

    # Pass 1: split into lines and collect all (label, content, provenance)
    # annotations without adding anything to the document yet.
    lines = content.split("\n")
    annotations = []
    i = 0
    visited_lines: set[int] = set()

    while i < len(lines):
        # Lines already consumed as annotation content are skipped here.
        if i in visited_lines:
            i += 1
            continue

        line = lines[i].strip()
        match = re.match(annotation_pattern, line)
        if match:
            label_str = match.group(1)
            coords_str = match.group(2)

            try:
                coords = [float(x.strip()) for x in coords_str.split(",")]
                if len(coords) == 4:
                    # Scale bounding box from model coordinates to original page coordinates
                    bbox = BoundingBox(
                        l=coords[0] * scale_x,
                        t=coords[1] * scale_y,
                        r=coords[2] * scale_x,
                        b=coords[3] * scale_y,
                        coord_origin=CoordOrigin.TOPLEFT,
                    )
                    prov = ProvenanceItem(page_no=page_no, bbox=bbox, charspan=[0, 0])

                    # Get the content (next non-empty line)
                    i += 1
                    content_text, i = _collect_annotation_content(
                        lines, i, label_str, annotation_pattern, visited_lines
                    )
                    annotations.append((label_str, content_text, prov))
                    continue
            except (ValueError, IndexError):
                # Malformed coordinates: fall through and treat as a plain line.
                pass
        i += 1

    # Pass 2: process annotations and link captions that appear AFTER
    # tables/figures/images.
    for idx, (label_str, content_text, prov) in enumerate(annotations):
        # Check if NEXT annotation is a caption for this table/figure/image
        # (caption appears AFTER table in the file: table[[...]] then table_caption[[...]])
        caption_item = None
        if label_str in ["table", "figure", "image"] and idx + 1 < len(annotations):
            next_label, next_content, next_prov = annotations[idx + 1]
            if (
                (label_str == "table" and next_label == "table_caption")
                or (label_str == "figure" and next_label == "figure_caption")
                or (label_str == "image" and next_label == "image_caption")
            ):
                # Create caption item
                caption_label = label_map.get(next_label, DocItemLabel.CAPTION)
                caption_item = page_doc.add_text(
                    label=caption_label,
                    text=next_content,
                    prov=next_prov,
                )

        # Skip if this is a caption that was already processed by the
        # preceding table/figure/image annotation.
        if label_str in ["figure_caption", "table_caption", "image_caption"]:
            if idx > 0:
                prev_label = annotations[idx - 1][0]
                if (
                    (label_str == "table_caption" and prev_label == "table")
                    or (label_str == "figure_caption" and prev_label == "figure")
                    or (label_str == "image_caption" and prev_label == "image")
                ):
                    continue

        # Add the item
        _process_annotation_item(
            label_str, content_text, prov, caption_item, page_doc, label_map
        )

    return page_doc