docling 2.1.0__py3-none-any.whl → 2.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -13,6 +13,7 @@ if TYPE_CHECKING:
 class AbstractDocumentBackend(ABC):
     @abstractmethod
     def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
+        self.file = in_doc.file
         self.path_or_stream = path_or_stream
         self.document_hash = in_doc.document_hash
         self.input_format = in_doc.format
@@ -0,0 +1,435 @@
+import logging
+import os
+import re
+from io import BytesIO
+from pathlib import Path
+from typing import Set, Union
+
+from docling_core.types.doc import (
+    DocItem,
+    DocItemLabel,
+    DoclingDocument,
+    DocumentOrigin,
+    GroupItem,
+    GroupLabel,
+    ImageRef,
+    NodeItem,
+    Size,
+    TableCell,
+    TableData,
+)
+from pydantic import AnyUrl
+
+from docling.backend.abstract_backend import DeclarativeDocumentBackend
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.document import InputDocument
+
+_log = logging.getLogger(__name__)
+
+
+class AsciiDocBackend(DeclarativeDocumentBackend):
+
+    def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
+        super().__init__(in_doc, path_or_stream)
+
+        self.path_or_stream = path_or_stream
+
+        try:
+            if isinstance(self.path_or_stream, BytesIO):
+                text_stream = self.path_or_stream.getvalue().decode("utf-8")
+                self.lines = text_stream.split("\n")
+            if isinstance(self.path_or_stream, Path):
+                with open(self.path_or_stream, "r", encoding="utf-8") as f:
+                    self.lines = f.readlines()
+            self.valid = True
+
+        except Exception as e:
+            raise RuntimeError(
+                f"Could not initialize AsciiDoc backend for file with hash {self.document_hash}."
+            ) from e
+        return
+
+    def is_valid(self) -> bool:
+        return self.valid
+
+    @classmethod
+    def supports_pagination(cls) -> bool:
+        return False
+
+    def unload(self):
+        return
+
+    @classmethod
+    def supported_formats(cls) -> Set[InputFormat]:
+        return {InputFormat.ASCIIDOC}
+
+    def convert(self) -> DoclingDocument:
+        """
+        Parses the ASCII into a structured document model.
+        """
+
+        origin = DocumentOrigin(
+            filename=self.file.name or "file",
+            mimetype="text/asciidoc",
+            binary_hash=self.document_hash,
+        )
+
+        doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
+
+        doc = self._parse(doc)
+
+        return doc
+
+    def _parse(self, doc: DoclingDocument):
+        """
+        Main function that orchestrates the parsing by yielding components:
+        title, section headers, text, lists, and tables.
+        """
+
+        content = ""
+
+        in_list = False
+        in_table = False
+
+        text_data: list[str] = []
+        table_data: list[str] = []
+        caption_data: list[str] = []
+
+        # parents: dict[int, Union[DocItem, GroupItem, None]] = {}
+        parents: dict[int, Union[GroupItem, None]] = {}
+        # indents: dict[int, Union[DocItem, GroupItem, None]] = {}
+        indents: dict[int, Union[GroupItem, None]] = {}
+
+        for i in range(0, 10):
+            parents[i] = None
+            indents[i] = None
+
+        for line in self.lines:
+            # line = line.strip()
+
+            # Title
+            if self._is_title(line):
+                item = self._parse_title(line)
+                level = item["level"]
+
+                parents[level] = doc.add_text(
+                    text=item["text"], label=DocItemLabel.TITLE
+                )
+
+            # Section headers
+            elif self._is_section_header(line):
+                item = self._parse_section_header(line)
+                level = item["level"]
+
+                parents[level] = doc.add_heading(
+                    text=item["text"], level=item["level"], parent=parents[level - 1]
+                )
+                for k, v in parents.items():
+                    if k > level:
+                        parents[k] = None
+
+            # Lists
+            elif self._is_list_item(line):
+
+                _log.debug(f"line: {line}")
+                item = self._parse_list_item(line)
+                _log.debug(f"parsed list-item: {item}")
+
+                level = self._get_current_level(parents)
+
+                if not in_list:
+                    in_list = True
+
+                    parents[level + 1] = doc.add_group(
+                        parent=parents[level], name="list", label=GroupLabel.LIST
+                    )
+                    indents[level + 1] = item["indent"]
+
+                elif in_list and item["indent"] > indents[level]:
+                    parents[level + 1] = doc.add_group(
+                        parent=parents[level], name="list", label=GroupLabel.LIST
+                    )
+                    indents[level + 1] = item["indent"]
+
+                elif in_list and item["indent"] < indents[level]:
+
+                    # print(item["indent"], " => ", indents[level])
+                    while item["indent"] < indents[level]:
+                        # print(item["indent"], " => ", indents[level])
+                        parents[level] = None
+                        indents[level] = None
+                        level -= 1
+
+                doc.add_list_item(
+                    item["text"], parent=self._get_current_parent(parents)
+                )
+
+            elif in_list and not self._is_list_item(line):
+                in_list = False
+
+                level = self._get_current_level(parents)
+                parents[level] = None
+
+            # Tables
+            elif line.strip() == "|===" and not in_table:  # start of table
+                in_table = True
+
+            elif self._is_table_line(line):  # within a table
+                in_table = True
+                table_data.append(self._parse_table_line(line))
+
+            elif in_table and (
+                (not self._is_table_line(line)) or line.strip() == "|==="
+            ):  # end of table
+
+                caption = None
+                if len(caption_data) > 0:
+                    caption = doc.add_text(
+                        text=" ".join(caption_data), label=DocItemLabel.CAPTION
+                    )
+
+                caption_data = []
+
+                data = self._populate_table_as_grid(table_data)
+                doc.add_table(
+                    data=data, parent=self._get_current_parent(parents), caption=caption
+                )
+
+                in_table = False
+                table_data = []
+
+            # Picture
+            elif self._is_picture(line):
+
+                caption = None
+                if len(caption_data) > 0:
+                    caption = doc.add_text(
+                        text=" ".join(caption_data), label=DocItemLabel.CAPTION
+                    )
+
+                caption_data = []
+
+                item = self._parse_picture(line)
+
+                size = None
+                if "width" in item and "height" in item:
+                    size = Size(width=int(item["width"]), height=int(item["height"]))
+
+                uri = None
+                if (
+                    "uri" in item
+                    and not item["uri"].startswith("http")
+                    and item["uri"].startswith("//")
+                ):
+                    uri = "file:" + item["uri"]
+                elif (
+                    "uri" in item
+                    and not item["uri"].startswith("http")
+                    and item["uri"].startswith("/")
+                ):
+                    uri = "file:/" + item["uri"]
+                elif "uri" in item and not item["uri"].startswith("http"):
+                    uri = "file://" + item["uri"]
+
+                image = ImageRef(mimetype="image/png", size=size, dpi=70, uri=uri)
+                doc.add_picture(image=image, caption=caption)
+
+            # Caption
+            elif self._is_caption(line) and len(caption_data) == 0:
+                item = self._parse_caption(line)
+                caption_data.append(item["text"])
+
+            elif (
+                len(line.strip()) > 0 and len(caption_data) > 0
+            ):  # allow multiline captions
+                item = self._parse_text(line)
+                caption_data.append(item["text"])
+
+            # Plain text
+            elif len(line.strip()) == 0 and len(text_data) > 0:
+                doc.add_text(
+                    text=" ".join(text_data),
+                    label=DocItemLabel.PARAGRAPH,
+                    parent=self._get_current_parent(parents),
+                )
+                text_data = []
+
+            elif len(line.strip()) > 0:  # allow multiline texts
+
+                item = self._parse_text(line)
+                text_data.append(item["text"])
+
+        if len(text_data) > 0:
+            doc.add_text(
+                text=" ".join(text_data),
+                label=DocItemLabel.PARAGRAPH,
+                parent=self._get_current_parent(parents),
+            )
+            text_data = []
+
+        if in_table and len(table_data) > 0:
+            data = self._populate_table_as_grid(table_data)
+            doc.add_table(data=data, parent=self._get_current_parent(parents))
+
+            in_table = False
+            table_data = []
+
+        return doc
+
+    def _get_current_level(self, parents):
+        for k, v in parents.items():
+            if v == None and k > 0:
+                return k - 1
+
+        return 0
+
+    def _get_current_parent(self, parents):
+        for k, v in parents.items():
+            if v == None and k > 0:
+                return parents[k - 1]
+
+        return None
+
+    # ========= Title
+    def _is_title(self, line):
+        return re.match(r"^= ", line)
+
+    def _parse_title(self, line):
+        return {"type": "title", "text": line[2:].strip(), "level": 0}
+
+    # ========= Section headers
+    def _is_section_header(self, line):
+        return re.match(r"^==+", line)
+
+    def _parse_section_header(self, line):
+        match = re.match(r"^(=+)\s+(.*)", line)
+
+        marker = match.group(1)  # The list marker (e.g., "*", "-", "1.")
+        text = match.group(2)  # The actual text of the list item
+
+        header_level = marker.count("=")  # number of '=' represents level
+        return {
+            "type": "header",
+            "level": header_level - 1,
+            "text": text.strip(),
+        }
+
+    # ========= Lists
+    def _is_list_item(self, line):
+        return re.match(r"^(\s)*(\*|-|\d+\.|\w+\.) ", line)
+
+    def _parse_list_item(self, line):
+        """Extract the item marker (number or bullet symbol) and the text of the item."""
+
+        match = re.match(r"^(\s*)(\*|-|\d+\.)\s+(.*)", line)
+        if match:
+            indent = match.group(1)
+            marker = match.group(2)  # The list marker (e.g., "*", "-", "1.")
+            text = match.group(3)  # The actual text of the list item
+
+            if marker == "*" or marker == "-":
+                return {
+                    "type": "list_item",
+                    "marker": marker,
+                    "text": text.strip(),
+                    "numbered": False,
+                    "indent": 0 if indent == None else len(indent),
+                }
+            else:
+                return {
+                    "type": "list_item",
+                    "marker": marker,
+                    "text": text.strip(),
+                    "numbered": True,
+                    "indent": 0 if indent == None else len(indent),
+                }
+        else:
+            # Fallback if no match
+            return {
+                "type": "list_item",
+                "marker": "-",
+                "text": line,
+                "numbered": False,
+                "indent": 0,
+            }
+
+    # ========= Tables
+    def _is_table_line(self, line):
+        return re.match(r"^\|.*\|", line)
+
+    def _parse_table_line(self, line):
+        # Split table cells and trim extra spaces
+        return [cell.strip() for cell in line.split("|") if cell.strip()]
+
+    def _populate_table_as_grid(self, table_data):
+
+        num_rows = len(table_data)
+
+        # Adjust the table data into a grid format
+        num_cols = max(len(row) for row in table_data)
+
+        data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
+        for row_idx, row in enumerate(table_data):
+            # Pad rows with empty strings to match column count
+            # grid.append(row + [''] * (max_cols - len(row)))
+
+            for col_idx, text in enumerate(row):
+                row_span = 1
+                col_span = 1
+
+                cell = TableCell(
+                    text=text,
+                    row_span=row_span,
+                    col_span=col_span,
+                    start_row_offset_idx=row_idx,
+                    end_row_offset_idx=row_idx + row_span,
+                    start_col_offset_idx=col_idx,
+                    end_col_offset_idx=col_idx + col_span,
+                    col_header=False,
+                    row_header=False,
+                )
+                data.table_cells.append(cell)
+
+        return data
+
+    # ========= Pictures
+    def _is_picture(self, line):
+        return re.match(r"^image::", line)
+
+    def _parse_picture(self, line):
+        """
+        Parse an image macro, extracting its path and attributes.
+        Syntax: image::path/to/image.png[Alt Text, width=200, height=150, align=center]
+        """
+        mtch = re.match(r"^image::(.+)\[(.*)\]$", line)
+        if mtch:
+            picture_path = mtch.group(1).strip()
+            attributes = mtch.group(2).split(",")
+            picture_info = {"type": "picture", "uri": picture_path}
+
+            # Extract optional attributes (alt text, width, height, alignment)
+            if attributes:
+                picture_info["alt"] = attributes[0].strip() if attributes[0] else ""
+                for attr in attributes[1:]:
+                    key, value = attr.split("=")
+                    picture_info[key.strip()] = value.strip()
+
+            return picture_info
+
+        return {"type": "picture", "uri": line}
+
+    # ========= Captions
+    def _is_caption(self, line):
+        return re.match(r"^\.(.+)", line)
+
+    def _parse_caption(self, line):
+        mtch = re.match(r"^\.(.+)", line)
+        if mtch:
+            text = mtch.group(1)
+            return {"type": "caption", "text": text}
+
+        return {"type": "caption", "text": ""}
+
+    # ========= Plain text
+    def _parse_text(self, line):
+        return {"type": "text", "text": line.strip()}
@@ -6,7 +6,7 @@ from typing import Iterable, List, Optional, Union
 
 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin, Size
-from docling_parse.docling_parse import pdf_parser
+from docling_parse.docling_parse import pdf_parser_v1
 from PIL import Image, ImageDraw
 from pypdfium2 import PdfPage
 
@@ -19,7 +19,7 @@ _log = logging.getLogger(__name__)
 
 class DoclingParsePageBackend(PdfPageBackend):
     def __init__(
-        self, parser: pdf_parser, document_hash: str, page_no: int, page_obj: PdfPage
+        self, parser: pdf_parser_v1, document_hash: str, page_no: int, page_obj: PdfPage
     ):
         self._ppage = page_obj
         parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)
@@ -192,7 +192,7 @@ class DoclingParseDocumentBackend(PdfDocumentBackend):
         super().__init__(in_doc, path_or_stream)
 
         self._pdoc = pdfium.PdfDocument(self.path_or_stream)
-        self.parser = pdf_parser()
+        self.parser = pdf_parser_v1()
 
         success = False
         if isinstance(self.path_or_stream, BytesIO):
@@ -26,9 +26,9 @@ class DoclingParseV2PageBackend(PdfPageBackend):
         self._ppage = page_obj
         parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)
 
-        self.valid = "pages" in parsed_page
+        self.valid = "pages" in parsed_page and len(parsed_page["pages"]) == 1
         if self.valid:
-            self._dpage = parsed_page["pages"][page_no]
+            self._dpage = parsed_page["pages"][0]
         else:
             _log.info(
                 f"An error occured when loading page {page_no} of document {document_hash}."
@@ -223,7 +223,15 @@ class DoclingParseV2DocumentBackend(PdfDocumentBackend):
             )
 
     def page_count(self) -> int:
-        return len(self._pdoc) # To be replaced with docling-parse API
+        # return len(self._pdoc) # To be replaced with docling-parse API
+
+        len_1 = len(self._pdoc)
+        len_2 = self.parser.number_of_pages(self.document_hash)
+
+        if len_1 != len_2:
+            _log.error(f"Inconsistent number of pages: {len_1}!={len_2}")
+
+        return len_2
 
     def load_page(self, page_no: int) -> DoclingParseV2PageBackend:
         return DoclingParseV2PageBackend(
@@ -7,6 +7,7 @@ from bs4 import BeautifulSoup
 from docling_core.types.doc import (
     DocItemLabel,
     DoclingDocument,
+    DocumentOrigin,
     GroupLabel,
     TableCell,
     TableData,
@@ -66,7 +67,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
 
     def convert(self) -> DoclingDocument:
         # access self.path_or_stream to load stuff
-        doc = DoclingDocument(name="dummy")
+        origin = DocumentOrigin(
+            filename=self.file.name or "file",
+            mimetype="text/html",
+            binary_hash=self.document_hash,
+        )
+
+        doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
         _log.debug("Trying to convert HTML...")
 
         if self.is_valid():
@@ -0,0 +1,293 @@
+import logging
+from io import BytesIO
+from pathlib import Path
+from typing import Set, Union
+
+import marko
+import marko.ext
+import marko.ext.gfm
+import marko.inline
+from docling_core.types.doc import (
+    DocItemLabel,
+    DoclingDocument,
+    DocumentOrigin,
+    GroupLabel,
+    TableCell,
+    TableData,
+)
+from marko import Markdown
+
+from docling.backend.abstract_backend import DeclarativeDocumentBackend
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.document import InputDocument
+
+_log = logging.getLogger(__name__)
+
+
+class MarkdownDocumentBackend(DeclarativeDocumentBackend):
+    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
+        super().__init__(in_doc, path_or_stream)
+
+        _log.debug("MD INIT!!!")
+
+        # Markdown file:
+        self.path_or_stream = path_or_stream
+        self.valid = True
+        self.markdown = ""  # To store original Markdown string
+
+        self.in_table = False
+        self.md_table_buffer: list[str] = []
+        self.inline_text_buffer = ""
+
+        try:
+            if isinstance(self.path_or_stream, BytesIO):
+                text_stream = self.path_or_stream.getvalue().decode("utf-8")
+                self.markdown = text_stream
+            if isinstance(self.path_or_stream, Path):
+                with open(self.path_or_stream, "r", encoding="utf-8") as f:
+                    md_content = f.read()
+                    self.markdown = md_content
+            self.valid = True
+
+            _log.debug(self.markdown)
+        except Exception as e:
+            raise RuntimeError(
+                f"Could not initialize MD backend for file with hash {self.document_hash}."
+            ) from e
+        return
+
+    def close_table(self, doc=None):
+        if self.in_table:
+            _log.debug("=== TABLE START ===")
+            for md_table_row in self.md_table_buffer:
+                _log.debug(md_table_row)
+            _log.debug("=== TABLE END ===")
+            tcells = []
+            result_table = []
+            for n, md_table_row in enumerate(self.md_table_buffer):
+                data = []
+                if n == 0:
+                    header = [t.strip() for t in md_table_row.split("|")[1:-1]]
+                    for value in header:
+                        data.append(value)
+                    result_table.append(data)
+                if n > 1:
+                    values = [t.strip() for t in md_table_row.split("|")[1:-1]]
+                    for value in values:
+                        data.append(value)
+                    result_table.append(data)
+
+            for trow_ind, trow in enumerate(result_table):
+                for tcol_ind, cellval in enumerate(trow):
+                    row_span = (
+                        1  # currently supporting just simple tables (without spans)
+                    )
+                    col_span = (
+                        1  # currently supporting just simple tables (without spans)
+                    )
+                    icell = TableCell(
+                        text=cellval.strip(),
+                        row_span=row_span,
+                        col_span=col_span,
+                        start_row_offset_idx=trow_ind,
+                        end_row_offset_idx=trow_ind + row_span,
+                        start_col_offset_idx=tcol_ind,
+                        end_col_offset_idx=tcol_ind + col_span,
+                        col_header=False,
+                        row_header=False,
+                    )
+                    tcells.append(icell)
+
+            num_rows = len(result_table)
+            num_cols = len(result_table[0])
+            self.in_table = False
+            self.md_table_buffer = []  # clean table markdown buffer
+            # Initialize Docling TableData
+            data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=tcells)
+            # Populate
+            for tcell in tcells:
+                data.table_cells.append(tcell)
+            if len(tcells) > 0:
+                doc.add_table(data=data)
+        return
+
+    def process_inline_text(self, parent_element, doc=None):
+        # self.inline_text_buffer += str(text_in)
+        txt = self.inline_text_buffer.strip()
+        if len(txt) > 0:
+            doc.add_text(
+                label=DocItemLabel.PARAGRAPH,
+                parent=parent_element,
+                text=txt,
+            )
+        self.inline_text_buffer = ""
+
+    def iterate_elements(self, element, depth=0, doc=None, parent_element=None):
+        # Iterates over all elements in the AST
+        # Check for different element types and process relevant details
+        if isinstance(element, marko.block.Heading):
+            self.close_table(doc)
+            self.process_inline_text(parent_element, doc)
+            _log.debug(
+                f" - Heading level {element.level}, content: {element.children[0].children}"
+            )
+            if element.level == 1:
+                doc_label = DocItemLabel.TITLE
+            else:
+                doc_label = DocItemLabel.SECTION_HEADER
+            snippet_text = element.children[0].children.strip()
+
+            parent_element = doc.add_text(
+                label=doc_label, parent=parent_element, text=snippet_text
+            )
+
+        elif isinstance(element, marko.block.List):
+            self.close_table(doc)
+            self.process_inline_text(parent_element, doc)
+            _log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
+            list_label = GroupLabel.LIST
+            if element.ordered:
+                list_label = GroupLabel.ORDERED_LIST
+            parent_element = doc.add_group(
+                label=list_label, name=f"list", parent=parent_element
+            )
+
+        elif isinstance(element, marko.block.ListItem):
+            self.close_table(doc)
+            self.process_inline_text(parent_element, doc)
+            _log.debug(" - List item")
+
+            snippet_text = str(element.children[0].children[0].children)
+            is_numbered = False
+            if parent_element.label == GroupLabel.ORDERED_LIST:
+                is_numbered = True
+            doc.add_list_item(
+                enumerated=is_numbered, parent=parent_element, text=snippet_text
+            )
+
+        elif isinstance(element, marko.inline.Image):
+            self.close_table(doc)
+            self.process_inline_text(parent_element, doc)
+            _log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
+            doc.add_picture(parent=parent_element, caption=element.title)
+
+        elif isinstance(element, marko.block.Paragraph):
+            self.process_inline_text(parent_element, doc)
+
+        elif isinstance(element, marko.inline.RawText):
+            _log.debug(f" - Paragraph (raw text): {element.children}")
+            snippet_text = str(element.children).strip()
+            # Detect start of the table:
+            if "|" in snippet_text:
+                # most likely part of the markdown table
+                self.in_table = True
+                if len(self.md_table_buffer) > 0:
+                    self.md_table_buffer[len(self.md_table_buffer) - 1] += str(
+                        snippet_text
+                    )
+                else:
+                    self.md_table_buffer.append(snippet_text)
+            else:
+                self.close_table(doc)
+                self.in_table = False
+                # most likely just inline text
+                self.inline_text_buffer += str(
+                    element.children
+                )  # do not strip an inline text, as it may contain important spaces
+
+        elif isinstance(element, marko.inline.CodeSpan):
+            self.close_table(doc)
+            self.process_inline_text(parent_element, doc)
+            _log.debug(f" - Code Span: {element.children}")
+            snippet_text = str(element.children).strip()
+            doc.add_text(
+                label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
+            )
+
+        elif isinstance(element, marko.block.CodeBlock):
+            self.close_table(doc)
+            self.process_inline_text(parent_element, doc)
+            _log.debug(f" - Code Block: {element.children}")
+            snippet_text = str(element.children[0].children).strip()
+            doc.add_text(
+                label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
+            )
+
+        elif isinstance(element, marko.block.FencedCode):
+            self.close_table(doc)
+            self.process_inline_text(parent_element, doc)
+            _log.debug(f" - Code Block: {element.children}")
+            snippet_text = str(element.children[0].children).strip()
+            doc.add_text(
+                label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
+            )
+
+        elif isinstance(element, marko.inline.LineBreak):
+            self.process_inline_text(parent_element, doc)
+            if self.in_table:
+                _log.debug("Line break in a table")
+                self.md_table_buffer.append("")
+
+        elif isinstance(element, marko.block.HTMLBlock):
+            self.process_inline_text(parent_element, doc)
+            self.close_table(doc)
+            _log.debug("HTML Block: {}".format(element))
+            if (
+                len(element.children) > 0
+            ):  # If Marko doesn't return any content for HTML block, skip it
+                snippet_text = str(element.children).strip()
+                doc.add_text(
+                    label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
+                )
+        else:
+            if not isinstance(element, str):
+                self.close_table(doc)
+                _log.debug("Some other element: {}".format(element))
+
+        # Iterate through the element's children (if any)
+        if not isinstance(element, marko.block.ListItem):
+            if not isinstance(element, marko.block.Heading):
+                if not isinstance(element, marko.block.FencedCode):
+                    # if not isinstance(element, marko.block.Paragraph):
+                    if hasattr(element, "children"):
+                        for child in element.children:
+                            self.iterate_elements(child, depth + 1, doc, parent_element)
+
+    def is_valid(self) -> bool:
+        return self.valid
+
+    def unload(self):
+        if isinstance(self.path_or_stream, BytesIO):
+            self.path_or_stream.close()
+        self.path_or_stream = None
+
+    @classmethod
+    def supports_pagination(cls) -> bool:
+        return False
+
+    @classmethod
+    def supported_formats(cls) -> Set[InputFormat]:
+        return {InputFormat.MD}
+
+    def convert(self) -> DoclingDocument:
+        _log.debug("converting Markdown...")
+
+        origin = DocumentOrigin(
+            filename=self.file.name or "file",
+            mimetype="text/markdown",
+            binary_hash=self.document_hash,
+        )
+
+        doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
+
+        if self.is_valid():
+            # Parse the markdown into an abstract syntax tree (AST)
+            marko_parser = Markdown()
+            parsed_ast = marko_parser.parse(self.markdown)
+            # Start iterating from the root of the AST
+            self.iterate_elements(parsed_ast, 0, doc, None)
+        else:
+            raise RuntimeError(
+                f"Cannot convert md with {self.document_hash} because the backend failed to init."
+            )
+        return doc
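A companion sketch for the Markdown backend, this time from an in-memory stream rather than a path. It is not part of the diff; it only assumes that `DocumentStream` exposes the `name` and `stream` fields that `_guess_format` reads further down, so the `.md` suffix triggers the new extension-based mime fallback:

```python
from io import BytesIO

from docling.datamodel.base_models import DocumentStream
from docling.document_converter import DocumentConverter

md_bytes = b"# Title\n\nA paragraph with a list:\n\n- item one\n- item two\n"
source = DocumentStream(name="sample.md", stream=BytesIO(md_bytes))

# filetype cannot identify plain Markdown, so the ".md" extension in the name
# is what resolves the stream to InputFormat.MD and MarkdownDocumentBackend.
result = DocumentConverter().convert(source)
print(result.document.export_to_markdown())
```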
@@ -83,21 +83,14 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
         # Parses the PPTX into a structured document model.
         # origin = DocumentOrigin(filename=self.path_or_stream.name, mimetype=next(iter(FormatToMimeType.get(InputFormat.PPTX))), binary_hash=self.document_hash)
 
-        fname = ""
-        if isinstance(self.path_or_stream, Path):
-            fname = self.path_or_stream.name
-
         origin = DocumentOrigin(
-            filename=fname,
+            filename=self.file.name or "file",
             mimetype="application/vnd.ms-powerpoint",
             binary_hash=self.document_hash,
         )
-        if len(fname) > 0:
-            docname = Path(fname).stem
-        else:
-            docname = "stream"
+
         doc = DoclingDocument(
-            name=docname, origin=origin
+            name=self.file.stem or "file", origin=origin
         )  # must add origin information
         doc = self.walk_linear(self.pptx_obj, doc)
 
@@ -119,10 +112,16 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
 
     def handle_text_elements(self, shape, parent_slide, slide_ind, doc):
         is_a_list = False
+        is_list_group_created = False
        enum_list_item_value = 0
+        new_list = None
+        bullet_type = "None"
+        list_text = ""
+        list_label = GroupLabel.LIST
+        prov = self.generate_prov(shape, slide_ind, shape.text.strip())
+
+        # Identify if shape contains lists
         for paragraph in shape.text_frame.paragraphs:
-            enum_list_item_value += 1
-            bullet_type = "None"
             # Check if paragraph is a bullet point using the `element` XML
             p = paragraph._element
             if (
@@ -143,29 +142,32 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
             if paragraph.level > 0:
                 # Most likely a sub-list
                 is_a_list = True
-            list_text = paragraph.text.strip()
-
-        prov = self.generate_prov(shape, slide_ind, shape.text.strip())
 
         if is_a_list:
             # Determine if this is an unordered list or an ordered list.
             # Set GroupLabel.ORDERED_LIST when it fits.
-            list_label = GroupLabel.LIST
             if bullet_type == "Numbered":
                 list_label = GroupLabel.ORDERED_LIST
 
-            new_list = doc.add_group(
-                label=list_label, name=f"list", parent=parent_slide
-            )
-        else:
-            new_list = None
-
         if is_a_list:
             _log.debug("LIST DETECTED!")
         else:
             _log.debug("No List")
 
-        # for e in p.iter():
+        # If there is a list inside of the shape, create a new docling list to assign list items to
+        # if is_a_list:
+        #     new_list = doc.add_group(
+        #         label=list_label, name=f"list", parent=parent_slide
+        #     )
+
+        # Iterate through paragraphs to build up text
+        for paragraph in shape.text_frame.paragraphs:
+            # p_text = paragraph.text.strip()
+            p = paragraph._element
+            enum_list_item_value += 1
+            inline_paragraph_text = ""
+            inline_list_item_text = ""
+
             for e in p.iterfind(".//a:r", namespaces={"a": self.namespaces["a"]}):
                 if len(e.text.strip()) > 0:
                     e_is_a_list_item = False
@@ -187,15 +189,17 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
                         e_is_a_list_item = False
 
                     if e_is_a_list_item:
+                        if len(inline_paragraph_text) > 0:
+                            # output accumulated inline text:
+                            doc.add_text(
+                                label=doc_label,
+                                parent=parent_slide,
+                                text=inline_paragraph_text,
+                                prov=prov,
+                            )
                         # Set marker and enumerated arguments if this is an enumeration element.
-                        enum_marker = str(enum_list_item_value) + "."
-                        doc.add_list_item(
-                            marker=enum_marker,
-                            enumerated=is_numbered,
-                            parent=new_list,
-                            text=list_text,
-                            prov=prov,
-                        )
+                        inline_list_item_text += e.text
+                        # print(e.text)
                     else:
                         # Assign proper label to the text, depending if it's a Title or Section Header
                         # For other types of text, assign - PARAGRAPH
@@ -210,15 +214,34 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
                             doc_label = DocItemLabel.TITLE
                         elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
                             DocItemLabel.SECTION_HEADER
-
                         enum_list_item_value = 0
+                        inline_paragraph_text += e.text
 
-                        doc.add_text(
-                            label=doc_label,
-                            parent=parent_slide,
-                            text=list_text,
-                            prov=prov,
-                        )
+            if len(inline_paragraph_text) > 0:
+                # output accumulated inline text:
+                doc.add_text(
+                    label=doc_label,
+                    parent=parent_slide,
+                    text=inline_paragraph_text,
+                    prov=prov,
+                )
+
+            if len(inline_list_item_text) > 0:
+                enum_marker = ""
+                if is_numbered:
+                    enum_marker = str(enum_list_item_value) + "."
+                if not is_list_group_created:
+                    new_list = doc.add_group(
+                        label=list_label, name=f"list", parent=parent_slide
+                    )
+                    is_list_group_created = True
+                doc.add_list_item(
+                    marker=enum_marker,
+                    enumerated=is_numbered,
+                    parent=new_list,
+                    text=inline_list_item_text,
+                    prov=prov,
+                )
         return
 
     def handle_title(self, shape, parent_slide, slide_ind, doc):
@@ -311,7 +334,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
         if len(tcells) > 0:
             # If table is not fully empty...
             # Create Docling table
-            doc.add_table(data=data, prov=prov)
+            doc.add_table(parent=parent_slide, data=data, prov=prov)
         return
 
     def walk_linear(self, pptx_obj, doc) -> DoclingDocument:
@@ -85,20 +85,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
     def convert(self) -> DoclingDocument:
         # Parses the DOCX into a structured document model.
 
-        fname = ""
-        if isinstance(self.path_or_stream, Path):
-            fname = self.path_or_stream.name
-
         origin = DocumentOrigin(
-            filename=fname,
+            filename=self.file.name or "file",
             mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
             binary_hash=self.document_hash,
         )
-        if len(fname) > 0:
-            docname = Path(fname).stem
-        else:
-            docname = "stream"
-        doc = DoclingDocument(name=docname, origin=origin)
+
+        doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
         if self.is_valid():
             assert self.docx_obj is not None
             doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
@@ -30,6 +30,8 @@ class InputFormat(str, Enum):
     HTML = "html"
     IMAGE = "image"
     PDF = "pdf"
+    ASCIIDOC = "asciidoc"
+    MD = "md"
 
 
 class OutputFormat(str, Enum):
@@ -43,29 +45,33 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
     InputFormat.DOCX: ["docx", "dotx", "docm", "dotm"],
     InputFormat.PPTX: ["pptx", "potx", "ppsx", "pptm", "potm", "ppsm"],
     InputFormat.PDF: ["pdf"],
+    InputFormat.MD: ["md"],
     InputFormat.HTML: ["html", "htm", "xhtml"],
     InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
+    InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
 }
 
-FormatToMimeType: Dict[InputFormat, Set[str]] = {
-    InputFormat.DOCX: {
+FormatToMimeType: Dict[InputFormat, List[str]] = {
+    InputFormat.DOCX: [
         "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
         "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
-    },
-    InputFormat.PPTX: {
+    ],
+    InputFormat.PPTX: [
         "application/vnd.openxmlformats-officedocument.presentationml.template",
         "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
         "application/vnd.openxmlformats-officedocument.presentationml.presentation",
-    },
-    InputFormat.HTML: {"text/html", "application/xhtml+xml"},
-    InputFormat.IMAGE: {
+    ],
+    InputFormat.HTML: ["text/html", "application/xhtml+xml"],
+    InputFormat.IMAGE: [
         "image/png",
         "image/jpeg",
         "image/tiff",
         "image/gif",
         "image/bmp",
-    },
-    InputFormat.PDF: {"application/pdf"},
+    ],
+    InputFormat.PDF: ["application/pdf"],
+    InputFormat.ASCIIDOC: ["text/asciidoc"],
+    InputFormat.MD: ["text/markdown", "text/x-markdown"],
 }
 MimeTypeToFormat = {
     mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes
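Because `MimeTypeToFormat` is derived from `FormatToMimeType` by the comprehension above, the new AsciiDoc and Markdown entries resolve in both directions. A quick illustrative check (not part of the diff):

```python
from docling.datamodel.base_models import FormatToMimeType, InputFormat, MimeTypeToFormat

assert MimeTypeToFormat["text/markdown"] is InputFormat.MD
assert MimeTypeToFormat["text/asciidoc"] is InputFormat.ASCIIDOC
assert FormatToMimeType[InputFormat.MD][0] == "text/markdown"
```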
@@ -45,6 +45,8 @@ from docling.datamodel.base_models import (
     ConversionStatus,
     DocumentStream,
     ErrorItem,
+    FormatToExtensions,
+    FormatToMimeType,
     InputFormat,
     MimeTypeToFormat,
     Page,
@@ -143,11 +145,13 @@ class InputDocument(BaseModel):
                 self.valid = False
 
         except (FileNotFoundError, OSError) as e:
+            self.valid = False
            _log.exception(
                f"File {self.file.name} not found or cannot be opened.", exc_info=e
            )
            # raise
        except RuntimeError as e:
+            self.valid = False
            _log.exception(
                f"An unexpected error occurred while opening the document {self.file.name}",
                exc_info=e,
@@ -166,6 +170,8 @@
            )
 
        self._backend = backend(self, path_or_stream=path_or_stream)
+        if not self._backend.is_valid():
+            self.valid = False
 
 
 class DocumentFormat(str, Enum):
@@ -480,26 +486,48 @@ class _DocumentConversionInput(BaseModel):
             else:
                 raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}")
 
-    def _guess_format(self, obj):
-        content = None
+    def _guess_format(self, obj: Union[Path, DocumentStream]):
+        content = b""  # empty binary blob
+        format = None
+
         if isinstance(obj, Path):
             mime = filetype.guess_mime(str(obj))
             if mime is None:
+                ext = obj.suffix[1:]
+                mime = self._mime_from_extension(ext)
+            if mime is None:  # must guess from
                 with obj.open("rb") as f:
                     content = f.read(1024)  # Read first 1KB
 
         elif isinstance(obj, DocumentStream):
-            obj.stream.seek(0)
             content = obj.stream.read(8192)
             obj.stream.seek(0)
             mime = filetype.guess_mime(content)
+            if mime is None:
+                ext = (
+                    obj.name.rsplit(".", 1)[-1]
+                    if ("." in obj.name and not obj.name.startswith("."))
+                    else ""
+                )
+                mime = self._mime_from_extension(ext)
 
-        if mime is None:
-            mime = self._detect_html_xhtml(content)
+        mime = mime or self._detect_html_xhtml(content)
+        mime = mime or "text/plain"
 
         format = MimeTypeToFormat.get(mime)
         return format
 
+    def _mime_from_extension(self, ext):
+        mime = None
+        if ext in FormatToExtensions[InputFormat.ASCIIDOC]:
+            mime = FormatToMimeType[InputFormat.ASCIIDOC][0]
+        elif ext in FormatToExtensions[InputFormat.HTML]:
+            mime = FormatToMimeType[InputFormat.HTML][0]
+        elif ext in FormatToExtensions[InputFormat.MD]:
+            mime = FormatToMimeType[InputFormat.MD][0]
+
+        return mime
+
     def _detect_html_xhtml(self, content):
         content_str = content.decode("ascii", errors="ignore").lower()
         # Remove XML comments
@@ -8,8 +8,10 @@ from typing import Dict, Iterable, Iterator, List, Optional, Type
 from pydantic import BaseModel, ConfigDict, model_validator, validate_call
 
 from docling.backend.abstract_backend import AbstractDocumentBackend
+from docling.backend.asciidoc_backend import AsciiDocBackend
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.html_backend import HTMLDocumentBackend
+from docling.backend.md_backend import MarkdownDocumentBackend
 from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
 from docling.backend.msword_backend import MsWordDocumentBackend
 from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
@@ -52,6 +54,16 @@ class PowerpointFormatOption(FormatOption):
     backend: Type[AbstractDocumentBackend] = MsPowerpointDocumentBackend
 
 
+class MarkdownFormatOption(FormatOption):
+    pipeline_cls: Type = SimplePipeline
+    backend: Type[AbstractDocumentBackend] = MarkdownDocumentBackend
+
+
+class AsciiDocFormatOption(FormatOption):
+    pipeline_cls: Type = SimplePipeline
+    backend: Type[AbstractDocumentBackend] = AsciiDocBackend
+
+
 class HTMLFormatOption(FormatOption):
     pipeline_cls: Type = SimplePipeline
     backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
@@ -74,6 +86,12 @@ _format_to_default_options = {
     InputFormat.PPTX: FormatOption(
         pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
     ),
+    InputFormat.MD: FormatOption(
+        pipeline_cls=SimplePipeline, backend=MarkdownDocumentBackend
+    ),
+    InputFormat.ASCIIDOC: FormatOption(
+        pipeline_cls=SimplePipeline, backend=AsciiDocBackend
+    ),
     InputFormat.HTML: FormatOption(
         pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
     ),
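The defaults registered above can be overridden per format when constructing a converter. A sketch under the assumption that `DocumentConverter` accepts a `format_options` mapping keyed by `InputFormat` (mirroring `_format_to_default_options`); that constructor is not shown in this diff:

```python
from docling.datamodel.base_models import InputFormat
from docling.document_converter import (
    AsciiDocFormatOption,
    DocumentConverter,
    MarkdownFormatOption,
)

# Both option classes default to SimplePipeline with their respective backends,
# so no arguments are needed here.
converter = DocumentConverter(
    format_options={
        InputFormat.MD: MarkdownFormatOption(),
        InputFormat.ASCIIDOC: AsciiDocFormatOption(),
    }
)
```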
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 2.1.0
+Version: 2.2.0
 Summary: Docling PDF conversion package
 Home-page: https://github.com/DS4SD/docling
 License: MIT
@@ -22,13 +22,14 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Provides-Extra: tesserocr
 Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
 Requires-Dist: certifi (>=2024.7.4)
-Requires-Dist: deepsearch-glm (>=0.25.0,<0.26.0)
-Requires-Dist: docling-core (>=2.0.0,<3.0.0)
+Requires-Dist: deepsearch-glm (>=0.26.1,<0.27.0)
+Requires-Dist: docling-core (>=2.1.0,<3.0.0)
 Requires-Dist: docling-ibm-models (>=2.0.1,<3.0.0)
-Requires-Dist: docling-parse (>=1.6.0,<2.0.0)
+Requires-Dist: docling-parse (>=2.0.0,<3.0.0)
 Requires-Dist: easyocr (>=1.7,<2.0)
 Requires-Dist: filetype (>=1.2.0,<2.0.0)
 Requires-Dist: huggingface_hub (>=0.23,<1)
+Requires-Dist: marko (>=2.1.2,<3.0.0)
 Requires-Dist: pandas (>=2.1.4,<3.0.0)
 Requires-Dist: pyarrow (>=16.1.0,<17.0.0)
 Requires-Dist: pydantic (>=2.0.0,<3.0.0)
@@ -50,7 +51,7 @@ Description-Content-Type: text/markdown
 
 <p align="center">
   <a href="https://github.com/ds4sd/docling">
-  <img loading="lazy" alt="Docling" src="docs/assets/docling_processing.png" width="100%"/>
+  <img loading="lazy" alt="Docling" src="https://github.com/DS4SD/docling/raw/main/docs/assets/docling_processing.png" width="100%"/>
   </a>
 </p>
 
@@ -100,7 +101,7 @@ To convert individual documents, use `convert()`, for example:
 ```python
 from docling.document_converter import DocumentConverter
 
-source = "https://arxiv.org/pdf/2408.09869"  # PDF path or URL
+source = "https://arxiv.org/pdf/2408.09869"  # document per local path or URL
 converter = DocumentConverter()
 result = converter.convert(source)
 print(result.document.export_to_markdown())  # output: "## Docling Technical Report[...]"
@@ -144,6 +145,6 @@ If you use Docling in your projects, please consider citing the following:
 
 ## License
 
-The Docling codebase is under MIT license.
+The Docling codebase is under MIT license.
 For individual model usage, please refer to the model licenses found in the original packages.
 
@@ -1,21 +1,23 @@
 docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/backend/abstract_backend.py,sha256=8Lh1gf1P9AnzlwB989OVBgLmokTpfI0LxYRfuvYTqoo,1646
-docling/backend/docling_parse_backend.py,sha256=UgBpopZIP5YkhwhybiqDnqVsSqv9DAAPFkafhfL0pPo,7623
-docling/backend/docling_parse_v2_backend.py,sha256=VY7MsiyqjN3Vl0UkyezriiVJMLbLRrQVuKjWaTgIUwY,8336
-docling/backend/html_backend.py,sha256=MlhEXaA0tgX_tLuQLnkex43gsKqpqHWnbkssxY4n_kc,14753
-docling/backend/mspowerpoint_backend.py,sha256=2UYfMMeWwgDtvIKQELCA-bYv5Z-rGvbMiBNcidNL_uE,14332
-docling/backend/msword_backend.py,sha256=4SDqZAZxLr6VV50OU3MRBAV8SwZMCyJCUbNVMVUpitc,17659
+docling/backend/abstract_backend.py,sha256=-or6kWVV7egQeyIuN-vI0Tr7Q1htalBZSlhgq_G2RdU,1678
+docling/backend/asciidoc_backend.py,sha256=WW0eIanPIObcg5ci9YcnqFxwipmqRFsRY8zjZDdKvJA,14116
+docling/backend/docling_parse_backend.py,sha256=TaIMli9vePd3fz9L6S4t75JPYZDpgYBLRGfWjbc9Hbk,7632
+docling/backend/docling_parse_v2_backend.py,sha256=QlVU8NgqKvVCa99E8oDa2Xvy__kq30C-myGY3o9Qoq4,8588
+docling/backend/html_backend.py,sha256=wfh5PWEwoqsCXxFCQbFBdJvEtlqZhXgqfPfTYETWHfE,14974
+docling/backend/md_backend.py,sha256=osYiNLnep9UgLq8mUH9bmwG3kP9RXxt69I8LlyeJN6g,11505
+docling/backend/mspowerpoint_backend.py,sha256=J472AIH_IXvGg3D0FDmXhue1At_VSBD6n15c64Kxttw,15446
+docling/backend/msword_backend.py,sha256=6bY0ebOaeSbpskUJY5t5pOf4a2VclWzeHeSo-vzsaO0,17470
 docling/backend/pdf_backend.py,sha256=unnw7QiRE1VXg6Pj-eYrtnFGrp5SSYiI324OlFxyv6c,2050
 docling/backend/pypdfium2_backend.py,sha256=MJX6fQqwK3r967fyAAs-RA_YIkeQvhgsLkQAgaBTgaE,8995
 docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/cli/main.py,sha256=NRVGz0z-3EBwYNMJGVnLtDBcfOeutaUyYdkM0ymRnGA,8008
 docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/datamodel/base_models.py,sha256=Ha-DoRZoksjHSZHWqUSiQ79MTBEfY5ur8U_LVtyBRYU,5153
-docling/datamodel/document.py,sha256=FZQyJtHSeGBrZwFf-GGXDu-Dyp4iIl7VbVnTupmlUqk,19532
+docling/datamodel/base_models.py,sha256=Mx0xR6YmRP8thu8CjOxjbGHLUJctqIvFwRZQ-8tQowY,5380
+docling/datamodel/document.py,sha256=mkPXDms9jtPFY1pfBSicNaVRZwbbfzYFUj0dJDbMgG8,20612
 docling/datamodel/pipeline_options.py,sha256=WNjluKC-Ww63ifkGMHwws8zIDHnOS1z5Hw7_j3S0qao,2446
 docling/datamodel/settings.py,sha256=KBFVeQviR1hoCFjA1ZwuLuQ6EAAYR7saIa6EUYiOkHI,767
-docling/document_converter.py,sha256=S_t9hs2uZfXC38LC0hTaAihrSJIrCvnTiuY5SvUccgk,9587
+docling/document_converter.py,sha256=T-Y2pWwbCIofW209XJ3wlc5TiGeQqMbDqgzcVWyZ_0Y,10227
 docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/models/base_model.py,sha256=wSBGAIAbLqrqP_SMtkzXMuyFvvzjVU6iCqgSNnGIR4Y,603
 docling/models/base_ocr_model.py,sha256=SYelQRValiUo6M_p_9-J7CqNIOFO-EkK58j90SMsKQY,5028
@@ -35,8 +37,8 @@ docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
 docling/utils/layout_utils.py,sha256=vlN0rc8i0ayRGn3WnaG-pdmqEL00KKGl2zez3Gj-hrk,32074
 docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
-docling-2.1.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
-docling-2.1.0.dist-info/METADATA,sha256=SorLD4OMK1dU3bX5eqnw5GHqPrPwdhQ7JfYvOyajE20,6109
-docling-2.1.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-docling-2.1.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
-docling-2.1.0.dist-info/RECORD,,
+docling-2.2.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
+docling-2.2.0.dist-info/METADATA,sha256=TkaywA2l2ImdMc9WpUYWUQy3n50zG9Y9eC7ziElBlU0,6205
+docling-2.2.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+docling-2.2.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
+docling-2.2.0.dist-info/RECORD,,