docling 2.0.0__py3-none-any.whl → 2.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,293 @@
1
+ import logging
2
+ from io import BytesIO
3
+ from pathlib import Path
4
+ from typing import Set, Union
5
+
6
+ import marko
7
+ import marko.ext
8
+ import marko.ext.gfm
9
+ import marko.inline
10
+ from docling_core.types.doc import (
11
+ DocItemLabel,
12
+ DoclingDocument,
13
+ DocumentOrigin,
14
+ GroupLabel,
15
+ TableCell,
16
+ TableData,
17
+ )
18
+ from marko import Markdown
19
+
20
+ from docling.backend.abstract_backend import DeclarativeDocumentBackend
21
+ from docling.datamodel.base_models import InputFormat
22
+ from docling.datamodel.document import InputDocument
23
+
24
+ _log = logging.getLogger(__name__)
25
+
26
+
27
class MarkdownDocumentBackend(DeclarativeDocumentBackend):
    """Declarative backend that turns Markdown text into a ``DoclingDocument``.

    The Markdown source is parsed with marko into an AST, which is walked
    recursively by :meth:`iterate_elements`. Two pieces of state are carried
    across AST nodes while walking:

    * ``inline_text_buffer`` accumulates raw inline text until the next
      structural element flushes it as a paragraph.
    * ``md_table_buffer`` accumulates pipe-delimited lines (one string per
      table row) until :meth:`close_table` converts them into a table.
    """

    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        """Read the Markdown source as UTF-8 text.

        Args:
            in_doc: The input document this backend belongs to.
            path_or_stream: In-memory bytes or a filesystem path holding the
                Markdown source.

        Raises:
            RuntimeError: If the source cannot be read or decoded.
        """
        super().__init__(in_doc, path_or_stream)

        _log.debug("MD INIT!!!")

        # Markdown file:
        self.path_or_stream = path_or_stream
        self.valid = True
        self.markdown = ""  # To store original Markdown string

        self.in_table = False
        self.md_table_buffer: list[str] = []
        self.inline_text_buffer = ""

        try:
            # The two source kinds are mutually exclusive, hence if/elif.
            if isinstance(self.path_or_stream, BytesIO):
                self.markdown = self.path_or_stream.getvalue().decode("utf-8")
            elif isinstance(self.path_or_stream, Path):
                with open(self.path_or_stream, "r", encoding="utf-8") as f:
                    self.markdown = f.read()

            _log.debug(self.markdown)
        except Exception as e:
            raise RuntimeError(
                f"Could not initialize MD backend for file with hash {self.document_hash}."
            ) from e

    @staticmethod
    def _plain_text(element) -> str:
        """Return the concatenated raw text found in an AST (sub)tree.

        Leaf nodes in this walk carry their text as a ``str`` in ``children``;
        container nodes carry a sequence of child nodes. Line breaks are
        rendered as a single space so that wrapped text stays on one line.
        """
        if isinstance(element, str):
            return element
        if isinstance(element, marko.inline.LineBreak):
            return " "
        children = getattr(element, "children", None)
        if isinstance(children, str):
            return children
        if not children:
            return ""
        return "".join(
            MarkdownDocumentBackend._plain_text(child) for child in children
        )

    def close_table(self, doc=None):
        """Flush buffered table rows into ``doc`` as a table, if a table is open.

        Buffer line 0 is taken as the header row, line 1 as the header
        delimiter (skipped), and all further lines as body rows. Only simple
        tables without row/column spans are supported. Does nothing when no
        table is currently open.
        """
        if not self.in_table:
            return

        _log.debug("=== TABLE START ===")
        for md_table_row in self.md_table_buffer:
            _log.debug(md_table_row)
        _log.debug("=== TABLE END ===")

        result_table = []
        for n, md_table_row in enumerate(self.md_table_buffer):
            if n == 1:
                # Second buffered line is the |---|---| delimiter, not data.
                continue
            # Drop the (empty) fragments outside the leading/trailing pipe.
            cells = [t.strip() for t in md_table_row.split("|")[1:-1]]
            if n > 0 and not cells:
                # Blank lines are pushed into the buffer on line breaks; they
                # must not become empty table rows.
                continue
            result_table.append(cells)

        # Reset table state before emitting so the buffer is always cleared.
        self.in_table = False
        self.md_table_buffer = []  # clean table markdown buffer

        tcells = [
            TableCell(
                text=cellval.strip(),
                # currently supporting just simple tables (without spans)
                row_span=1,
                col_span=1,
                start_row_offset_idx=trow_ind,
                end_row_offset_idx=trow_ind + 1,
                start_col_offset_idx=tcol_ind,
                end_col_offset_idx=tcol_ind + 1,
                col_header=False,
                row_header=False,
            )
            for trow_ind, trow in enumerate(result_table)
            for tcol_ind, cellval in enumerate(trow)
        ]

        if tcells:
            # The cells are handed over exactly once; appending them again
            # after construction would duplicate every cell in the table.
            data = TableData(
                num_rows=len(result_table),
                # Widest row, so that no cell falls outside the declared grid.
                num_cols=max(len(trow) for trow in result_table),
                table_cells=tcells,
            )
            doc.add_table(data=data)

    def process_inline_text(self, parent_element, doc=None):
        """Emit the accumulated inline text as a paragraph and clear the buffer.

        Nothing is added to ``doc`` when the buffer holds only whitespace.
        """
        txt = self.inline_text_buffer.strip()
        if txt:
            doc.add_text(
                label=DocItemLabel.PARAGRAPH,
                parent=parent_element,
                text=txt,
            )
        self.inline_text_buffer = ""

    def iterate_elements(self, element, depth=0, doc=None, parent_element=None):
        """Recursively walk the marko AST and add matching items to ``doc``.

        Args:
            element: Current AST node (or a plain string leaf).
            depth: Nesting depth of ``element``; passed down on recursion.
            doc: Document being populated.
            parent_element: Document item new items are attached to.
        """
        # Check for different element types and process relevant details
        if isinstance(element, marko.block.Heading):
            self.close_table(doc)
            self.process_inline_text(parent_element, doc)
            # Collect text from all inline children so that empty headings or
            # headings starting with emphasis/links/code do not fail.
            snippet_text = self._plain_text(element).strip()
            _log.debug(
                f" - Heading level {element.level}, content: {snippet_text}"
            )
            if element.level == 1:
                doc_label = DocItemLabel.TITLE
            else:
                doc_label = DocItemLabel.SECTION_HEADER

            # Headings are not recursed into and later siblings keep the
            # caller's parent, so the created item is not kept as a parent.
            doc.add_text(label=doc_label, parent=parent_element, text=snippet_text)

        elif isinstance(element, marko.block.List):
            self.close_table(doc)
            self.process_inline_text(parent_element, doc)
            _log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
            list_label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
            # The list items (children) are attached to this new group.
            parent_element = doc.add_group(
                label=list_label, name="list", parent=parent_element
            )

        elif isinstance(element, marko.block.ListItem):
            self.close_table(doc)
            self.process_inline_text(parent_element, doc)
            _log.debug(" - List item")

            # Text of the item's first block; tolerant of empty items and of
            # items whose text starts with a non-raw inline element.
            snippet_text = (
                self._plain_text(element.children[0]) if element.children else ""
            )
            is_numbered = parent_element.label == GroupLabel.ORDERED_LIST
            doc.add_list_item(
                enumerated=is_numbered, parent=parent_element, text=snippet_text
            )

        elif isinstance(element, marko.inline.Image):
            self.close_table(doc)
            self.process_inline_text(parent_element, doc)
            _log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
            # The picture caption has to be a document text item rather than a
            # bare string, so create one only when the image carries a title.
            fig_caption = None
            if element.title:
                fig_caption = doc.add_text(
                    label=DocItemLabel.CAPTION, text=element.title
                )
            doc.add_picture(parent=parent_element, caption=fig_caption)

        elif isinstance(element, marko.block.Paragraph):
            self.process_inline_text(parent_element, doc)

        elif isinstance(element, marko.inline.RawText):
            _log.debug(f" - Paragraph (raw text): {element.children}")
            snippet_text = str(element.children).strip()
            # Detect start of the table:
            if "|" in snippet_text:
                # most likely part of the markdown table
                self.in_table = True
                if self.md_table_buffer:
                    # Continue the current row; a line break opens a new one.
                    self.md_table_buffer[-1] += snippet_text
                else:
                    self.md_table_buffer.append(snippet_text)
            else:
                self.close_table(doc)
                self.in_table = False
                # most likely just inline text
                # do not strip an inline text, as it may contain important spaces
                self.inline_text_buffer += str(element.children)

        elif isinstance(element, marko.inline.CodeSpan):
            self.close_table(doc)
            self.process_inline_text(parent_element, doc)
            _log.debug(f" - Code Span: {element.children}")
            snippet_text = str(element.children).strip()
            doc.add_text(
                label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
            )

        elif isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode)):
            self.close_table(doc)
            self.process_inline_text(parent_element, doc)
            _log.debug(f" - Code Block: {element.children}")
            snippet_text = str(element.children[0].children).strip()
            doc.add_text(
                label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
            )

        elif isinstance(element, marko.inline.LineBreak):
            self.process_inline_text(parent_element, doc)
            if self.in_table:
                _log.debug("Line break in a table")
                # Start a new (still empty) table row.
                self.md_table_buffer.append("")

        elif isinstance(element, marko.block.HTMLBlock):
            self.process_inline_text(parent_element, doc)
            self.close_table(doc)
            _log.debug("HTML Block: {}".format(element))
            # marko keeps the raw HTML in ``body``; fall back to ``children``
            # in case the element exposes its content there instead.
            html_source = getattr(element, "body", None)
            if html_source is None:
                html_source = getattr(element, "children", "")
            if (
                len(html_source) > 0
            ):  # If Marko doesn't return any content for HTML block, skip it
                snippet_text = str(html_source).strip()
                doc.add_text(
                    label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
                )
        else:
            if not isinstance(element, str):
                self.close_table(doc)
                _log.debug("Some other element: {}".format(element))

        # Iterate through the element's children (if any). Elements whose
        # text was already consumed above are not descended into; otherwise
        # the same text would be emitted a second time as paragraph text.
        already_consumed = (
            marko.block.ListItem,
            marko.block.Heading,
            marko.block.FencedCode,
            marko.block.CodeBlock,
        )
        if isinstance(element, already_consumed):
            return
        children = getattr(element, "children", None)
        if children is None or isinstance(children, str):
            # Leaf node: a string payload has no sub-elements to visit.
            return
        for child in children:
            self.iterate_elements(child, depth + 1, doc, parent_element)

    def is_valid(self) -> bool:
        """Return whether the backend was initialised successfully."""
        return self.valid

    def unload(self):
        """Release the underlying source; closes it when it is a ``BytesIO``."""
        if isinstance(self.path_or_stream, BytesIO):
            self.path_or_stream.close()
        self.path_or_stream = None

    @classmethod
    def supports_pagination(cls) -> bool:
        """Markdown documents are not paginated."""
        return False

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        """Return the input formats handled by this backend."""
        return {InputFormat.MD}

    def convert(self) -> DoclingDocument:
        """Parse the loaded Markdown and build the resulting document.

        Returns:
            The populated ``DoclingDocument``.

        Raises:
            RuntimeError: If the backend is not valid.
        """
        _log.debug("converting Markdown...")

        origin = DocumentOrigin(
            filename=self.file.name or "file",
            mimetype="text/markdown",
            binary_hash=self.document_hash,
        )

        doc = DoclingDocument(name=self.file.stem or "file", origin=origin)

        if self.is_valid():
            # Parse the markdown into an abstract syntax tree (AST)
            marko_parser = Markdown()
            parsed_ast = marko_parser.parse(self.markdown)
            # Start iterating from the root of the AST
            self.iterate_elements(parsed_ast, 0, doc, None)
            # Text and table rows are only emitted when a *following* element
            # flushes them, so whatever is still buffered at the end of the
            # document has to be flushed explicitly or it would be lost.
            self.close_table(doc)
            self.process_inline_text(None, doc)
        else:
            raise RuntimeError(
                f"Cannot convert md with {self.document_hash} because the backend failed to init."
            )
        return doc
@@ -83,21 +83,14 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
83
83
  # Parses the PPTX into a structured document model.
84
84
  # origin = DocumentOrigin(filename=self.path_or_stream.name, mimetype=next(iter(FormatToMimeType.get(InputFormat.PPTX))), binary_hash=self.document_hash)
85
85
 
86
- fname = ""
87
- if isinstance(self.path_or_stream, Path):
88
- fname = self.path_or_stream.name
89
-
90
86
  origin = DocumentOrigin(
91
- filename=fname,
87
+ filename=self.file.name or "file",
92
88
  mimetype="application/vnd.ms-powerpoint",
93
89
  binary_hash=self.document_hash,
94
90
  )
95
- if len(fname) > 0:
96
- docname = Path(fname).stem
97
- else:
98
- docname = "stream"
91
+
99
92
  doc = DoclingDocument(
100
- name=docname, origin=origin
93
+ name=self.file.stem or "file", origin=origin
101
94
  ) # must add origin information
102
95
  doc = self.walk_linear(self.pptx_obj, doc)
103
96
 
@@ -119,10 +112,16 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
119
112
 
120
113
  def handle_text_elements(self, shape, parent_slide, slide_ind, doc):
121
114
  is_a_list = False
115
+ is_list_group_created = False
122
116
  enum_list_item_value = 0
117
+ new_list = None
118
+ bullet_type = "None"
119
+ list_text = ""
120
+ list_label = GroupLabel.LIST
121
+ prov = self.generate_prov(shape, slide_ind, shape.text.strip())
122
+
123
+ # Identify if shape contains lists
123
124
  for paragraph in shape.text_frame.paragraphs:
124
- enum_list_item_value += 1
125
- bullet_type = "None"
126
125
  # Check if paragraph is a bullet point using the `element` XML
127
126
  p = paragraph._element
128
127
  if (
@@ -143,29 +142,32 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
143
142
  if paragraph.level > 0:
144
143
  # Most likely a sub-list
145
144
  is_a_list = True
146
- list_text = paragraph.text.strip()
147
-
148
- prov = self.generate_prov(shape, slide_ind, shape.text.strip())
149
145
 
150
146
  if is_a_list:
151
147
  # Determine if this is an unordered list or an ordered list.
152
148
  # Set GroupLabel.ORDERED_LIST when it fits.
153
- list_label = GroupLabel.LIST
154
149
  if bullet_type == "Numbered":
155
150
  list_label = GroupLabel.ORDERED_LIST
156
151
 
157
- new_list = doc.add_group(
158
- label=list_label, name=f"list", parent=parent_slide
159
- )
160
- else:
161
- new_list = None
162
-
163
152
  if is_a_list:
164
153
  _log.debug("LIST DETECTED!")
165
154
  else:
166
155
  _log.debug("No List")
167
156
 
168
- # for e in p.iter():
157
+ # If there is a list inside of the shape, create a new docling list to assign list items to
158
+ # if is_a_list:
159
+ # new_list = doc.add_group(
160
+ # label=list_label, name=f"list", parent=parent_slide
161
+ # )
162
+
163
+ # Iterate through paragraphs to build up text
164
+ for paragraph in shape.text_frame.paragraphs:
165
+ # p_text = paragraph.text.strip()
166
+ p = paragraph._element
167
+ enum_list_item_value += 1
168
+ inline_paragraph_text = ""
169
+ inline_list_item_text = ""
170
+
169
171
  for e in p.iterfind(".//a:r", namespaces={"a": self.namespaces["a"]}):
170
172
  if len(e.text.strip()) > 0:
171
173
  e_is_a_list_item = False
@@ -187,15 +189,17 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
187
189
  e_is_a_list_item = False
188
190
 
189
191
  if e_is_a_list_item:
192
+ if len(inline_paragraph_text) > 0:
193
+ # output accumulated inline text:
194
+ doc.add_text(
195
+ label=doc_label,
196
+ parent=parent_slide,
197
+ text=inline_paragraph_text,
198
+ prov=prov,
199
+ )
190
200
  # Set marker and enumerated arguments if this is an enumeration element.
191
- enum_marker = str(enum_list_item_value) + "."
192
- doc.add_list_item(
193
- marker=enum_marker,
194
- enumerated=is_numbered,
195
- parent=new_list,
196
- text=list_text,
197
- prov=prov,
198
- )
201
+ inline_list_item_text += e.text
202
+ # print(e.text)
199
203
  else:
200
204
  # Assign proper label to the text, depending if it's a Title or Section Header
201
205
  # For other types of text, assign - PARAGRAPH
@@ -210,15 +214,34 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
210
214
  doc_label = DocItemLabel.TITLE
211
215
  elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
212
216
  DocItemLabel.SECTION_HEADER
213
-
214
217
  enum_list_item_value = 0
218
+ inline_paragraph_text += e.text
215
219
 
216
- doc.add_text(
217
- label=doc_label,
218
- parent=parent_slide,
219
- text=list_text,
220
- prov=prov,
221
- )
220
+ if len(inline_paragraph_text) > 0:
221
+ # output accumulated inline text:
222
+ doc.add_text(
223
+ label=doc_label,
224
+ parent=parent_slide,
225
+ text=inline_paragraph_text,
226
+ prov=prov,
227
+ )
228
+
229
+ if len(inline_list_item_text) > 0:
230
+ enum_marker = ""
231
+ if is_numbered:
232
+ enum_marker = str(enum_list_item_value) + "."
233
+ if not is_list_group_created:
234
+ new_list = doc.add_group(
235
+ label=list_label, name=f"list", parent=parent_slide
236
+ )
237
+ is_list_group_created = True
238
+ doc.add_list_item(
239
+ marker=enum_marker,
240
+ enumerated=is_numbered,
241
+ parent=new_list,
242
+ text=inline_list_item_text,
243
+ prov=prov,
244
+ )
222
245
  return
223
246
 
224
247
  def handle_title(self, shape, parent_slide, slide_ind, doc):
@@ -311,7 +334,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
311
334
  if len(tcells) > 0:
312
335
  # If table is not fully empty...
313
336
  # Create Docling table
314
- doc.add_table(data=data, prov=prov)
337
+ doc.add_table(parent=parent_slide, data=data, prov=prov)
315
338
  return
316
339
 
317
340
  def walk_linear(self, pptx_obj, doc) -> DoclingDocument:
@@ -85,20 +85,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
85
85
  def convert(self) -> DoclingDocument:
86
86
  # Parses the DOCX into a structured document model.
87
87
 
88
- fname = ""
89
- if isinstance(self.path_or_stream, Path):
90
- fname = self.path_or_stream.name
91
-
92
88
  origin = DocumentOrigin(
93
- filename=fname,
89
+ filename=self.file.name or "file",
94
90
  mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
95
91
  binary_hash=self.document_hash,
96
92
  )
97
- if len(fname) > 0:
98
- docname = Path(fname).stem
99
- else:
100
- docname = "stream"
101
- doc = DoclingDocument(name=docname, origin=origin)
93
+
94
+ doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
102
95
  if self.is_valid():
103
96
  assert self.docx_obj is not None
104
97
  doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
@@ -30,6 +30,8 @@ class InputFormat(str, Enum):
30
30
  HTML = "html"
31
31
  IMAGE = "image"
32
32
  PDF = "pdf"
33
+ ASCIIDOC = "asciidoc"
34
+ MD = "md"
33
35
 
34
36
 
35
37
  class OutputFormat(str, Enum):
@@ -43,29 +45,33 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
43
45
  InputFormat.DOCX: ["docx", "dotx", "docm", "dotm"],
44
46
  InputFormat.PPTX: ["pptx", "potx", "ppsx", "pptm", "potm", "ppsm"],
45
47
  InputFormat.PDF: ["pdf"],
48
+ InputFormat.MD: ["md"],
46
49
  InputFormat.HTML: ["html", "htm", "xhtml"],
47
50
  InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
51
+ InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
48
52
  }
49
53
 
50
- FormatToMimeType: Dict[InputFormat, Set[str]] = {
51
- InputFormat.DOCX: {
54
+ FormatToMimeType: Dict[InputFormat, List[str]] = {
55
+ InputFormat.DOCX: [
52
56
  "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
53
57
  "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
54
- },
55
- InputFormat.PPTX: {
58
+ ],
59
+ InputFormat.PPTX: [
56
60
  "application/vnd.openxmlformats-officedocument.presentationml.template",
57
61
  "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
58
62
  "application/vnd.openxmlformats-officedocument.presentationml.presentation",
59
- },
60
- InputFormat.HTML: {"text/html", "application/xhtml+xml"},
61
- InputFormat.IMAGE: {
63
+ ],
64
+ InputFormat.HTML: ["text/html", "application/xhtml+xml"],
65
+ InputFormat.IMAGE: [
62
66
  "image/png",
63
67
  "image/jpeg",
64
68
  "image/tiff",
65
69
  "image/gif",
66
70
  "image/bmp",
67
- },
68
- InputFormat.PDF: {"application/pdf"},
71
+ ],
72
+ InputFormat.PDF: ["application/pdf"],
73
+ InputFormat.ASCIIDOC: ["text/asciidoc"],
74
+ InputFormat.MD: ["text/markdown", "text/x-markdown"],
69
75
  }
70
76
  MimeTypeToFormat = {
71
77
  mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes
@@ -6,12 +6,6 @@ from pathlib import Path, PurePath
6
6
  from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Type, Union
7
7
 
8
8
  import filetype
9
- from docling_core.types import BaseText
10
- from docling_core.types import Document as DsDocument
11
- from docling_core.types import DocumentDescription as DsDocumentDescription
12
- from docling_core.types import FileInfoObject as DsFileInfoObject
13
- from docling_core.types import PageDimensions, PageReference, Prov, Ref
14
- from docling_core.types import Table as DsSchemaTable
15
9
  from docling_core.types.doc import (
16
10
  DocItem,
17
11
  DocItemLabel,
@@ -22,7 +16,22 @@ from docling_core.types.doc import (
22
16
  TextItem,
23
17
  )
24
18
  from docling_core.types.doc.document import ListItem
25
- from docling_core.types.legacy_doc.base import Figure, GlmTableCell, TableCell
19
+ from docling_core.types.legacy_doc.base import (
20
+ BaseText,
21
+ Figure,
22
+ GlmTableCell,
23
+ PageDimensions,
24
+ PageReference,
25
+ Prov,
26
+ Ref,
27
+ )
28
+ from docling_core.types.legacy_doc.base import Table as DsSchemaTable
29
+ from docling_core.types.legacy_doc.base import TableCell
30
+ from docling_core.types.legacy_doc.document import (
31
+ CCSDocumentDescription as DsDocumentDescription,
32
+ )
33
+ from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
34
+ from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
26
35
  from docling_core.utils.file import resolve_file_source
27
36
  from pydantic import BaseModel
28
37
  from typing_extensions import deprecated
@@ -36,6 +45,8 @@ from docling.datamodel.base_models import (
36
45
  ConversionStatus,
37
46
  DocumentStream,
38
47
  ErrorItem,
48
+ FormatToExtensions,
49
+ FormatToMimeType,
39
50
  InputFormat,
40
51
  MimeTypeToFormat,
41
52
  Page,
@@ -134,11 +145,13 @@ class InputDocument(BaseModel):
134
145
  self.valid = False
135
146
 
136
147
  except (FileNotFoundError, OSError) as e:
148
+ self.valid = False
137
149
  _log.exception(
138
150
  f"File {self.file.name} not found or cannot be opened.", exc_info=e
139
151
  )
140
152
  # raise
141
153
  except RuntimeError as e:
154
+ self.valid = False
142
155
  _log.exception(
143
156
  f"An unexpected error occurred while opening the document {self.file.name}",
144
157
  exc_info=e,
@@ -157,6 +170,8 @@ class InputDocument(BaseModel):
157
170
  )
158
171
 
159
172
  self._backend = backend(self, path_or_stream=path_or_stream)
173
+ if not self._backend.is_valid():
174
+ self.valid = False
160
175
 
161
176
 
162
177
  class DocumentFormat(str, Enum):
@@ -471,26 +486,48 @@ class _DocumentConversionInput(BaseModel):
471
486
  else:
472
487
  raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}")
473
488
 
474
- def _guess_format(self, obj):
475
- content = None
489
+ def _guess_format(self, obj: Union[Path, DocumentStream]):
490
+ content = b"" # empty binary blob
491
+ format = None
492
+
476
493
  if isinstance(obj, Path):
477
494
  mime = filetype.guess_mime(str(obj))
478
495
  if mime is None:
496
+ ext = obj.suffix[1:]
497
+ mime = self._mime_from_extension(ext)
498
+ if mime is None: # must guess from
479
499
  with obj.open("rb") as f:
480
500
  content = f.read(1024) # Read first 1KB
481
501
 
482
502
  elif isinstance(obj, DocumentStream):
483
- obj.stream.seek(0)
484
503
  content = obj.stream.read(8192)
485
504
  obj.stream.seek(0)
486
505
  mime = filetype.guess_mime(content)
506
+ if mime is None:
507
+ ext = (
508
+ obj.name.rsplit(".", 1)[-1]
509
+ if ("." in obj.name and not obj.name.startswith("."))
510
+ else ""
511
+ )
512
+ mime = self._mime_from_extension(ext)
487
513
 
488
- if mime is None:
489
- mime = self._detect_html_xhtml(content)
514
+ mime = mime or self._detect_html_xhtml(content)
515
+ mime = mime or "text/plain"
490
516
 
491
517
  format = MimeTypeToFormat.get(mime)
492
518
  return format
493
519
 
520
def _mime_from_extension(self, ext):
    """Map a bare file extension to the primary MIME type of its format.

    Only AsciiDoc, HTML and Markdown are looked up here, in that order.
    The comparison ignores letter case, so ``"MD"`` and ``"md"`` resolve
    to the same type.

    Args:
        ext: File extension without the leading dot; may be empty.

    Returns:
        The first MIME type registered for the matching format, or ``None``
        when the extension is empty or not recognised.
    """
    if not ext:
        return None

    # Registered extensions are lower-case; normalise the candidate to match.
    normalized_ext = ext.lower()
    for fmt in (InputFormat.ASCIIDOC, InputFormat.HTML, InputFormat.MD):
        if normalized_ext in FormatToExtensions[fmt]:
            return FormatToMimeType[fmt][0]

    return None
530
+
494
531
  def _detect_html_xhtml(self, content):
495
532
  content_str = content.decode("ascii", errors="ignore").lower()
496
533
  # Remove XML comments
@@ -22,6 +22,9 @@ class TableStructureOptions(BaseModel):
22
22
 
23
23
class OcrOptions(BaseModel):
    """Base model for OCR options; engine-specific option models subclass it."""

    # presumably identifies which OCR engine these options belong to — confirm
    kind: str
    bitmap_area_threshold: float = (
        0.05  # percentage of the area for a bitmap to be processed with OCR (NOTE(review): 0.05 looks like a fraction, i.e. 5% — confirm)
    )
25
28
 
26
29
 
27
30
  class EasyOcrOptions(OcrOptions):