docling 1.19.1__py3-none-any.whl → 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. docling/backend/abstract_backend.py +32 -37
  2. docling/backend/docling_parse_backend.py +16 -12
  3. docling/backend/docling_parse_v2_backend.py +240 -0
  4. docling/backend/html_backend.py +425 -0
  5. docling/backend/mspowerpoint_backend.py +375 -0
  6. docling/backend/msword_backend.py +509 -0
  7. docling/backend/pdf_backend.py +78 -0
  8. docling/backend/pypdfium2_backend.py +15 -10
  9. docling/cli/main.py +61 -60
  10. docling/datamodel/base_models.py +73 -193
  11. docling/datamodel/document.py +379 -324
  12. docling/datamodel/pipeline_options.py +16 -0
  13. docling/datamodel/settings.py +1 -0
  14. docling/document_converter.py +215 -252
  15. docling/models/base_model.py +25 -0
  16. docling/models/base_ocr_model.py +19 -6
  17. docling/models/ds_glm_model.py +220 -22
  18. docling/models/easyocr_model.py +45 -40
  19. docling/models/layout_model.py +130 -114
  20. docling/models/page_assemble_model.py +119 -95
  21. docling/models/page_preprocessing_model.py +61 -0
  22. docling/models/table_structure_model.py +122 -111
  23. docling/models/tesseract_ocr_cli_model.py +63 -56
  24. docling/models/tesseract_ocr_model.py +58 -50
  25. docling/pipeline/base_pipeline.py +190 -0
  26. docling/pipeline/simple_pipeline.py +59 -0
  27. docling/pipeline/standard_pdf_pipeline.py +198 -0
  28. docling/utils/export.py +4 -3
  29. docling/utils/layout_utils.py +17 -11
  30. docling-2.1.0.dist-info/METADATA +149 -0
  31. docling-2.1.0.dist-info/RECORD +42 -0
  32. docling/pipeline/base_model_pipeline.py +0 -18
  33. docling/pipeline/standard_model_pipeline.py +0 -66
  34. docling-1.19.1.dist-info/METADATA +0 -380
  35. docling-1.19.1.dist-info/RECORD +0 -34
  36. {docling-1.19.1.dist-info → docling-2.1.0.dist-info}/LICENSE +0 -0
  37. {docling-1.19.1.dist-info → docling-2.1.0.dist-info}/WHEEL +0 -0
  38. {docling-1.19.1.dist-info → docling-2.1.0.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,425 @@
1
+ import logging
2
+ from io import BytesIO
3
+ from pathlib import Path
4
+ from typing import Set, Union
5
+
6
+ from bs4 import BeautifulSoup
7
+ from docling_core.types.doc import (
8
+ DocItemLabel,
9
+ DoclingDocument,
10
+ GroupLabel,
11
+ TableCell,
12
+ TableData,
13
+ )
14
+
15
+ from docling.backend.abstract_backend import DeclarativeDocumentBackend
16
+ from docling.datamodel.base_models import InputFormat
17
+ from docling.datamodel.document import InputDocument
18
+
19
+ _log = logging.getLogger(__name__)
20
+
21
+
22
+ class HTMLDocumentBackend(DeclarativeDocumentBackend):
23
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
    """Parse the HTML input and reset the hierarchy bookkeeping.

    Args:
        in_doc: Input document descriptor, forwarded to the base class.
        path_or_stream: Either an in-memory ``BytesIO`` holding UTF-8 encoded
            HTML, or a ``Path`` to a UTF-8 encoded HTML file.

    Raises:
        RuntimeError: If reading, decoding or parsing the input fails; the
            original exception is chained as the cause.
    """
    super().__init__(in_doc, path_or_stream)
    _log.debug("About to init HTML backend...")
    self.soup = None
    # HTML file:
    self.path_or_stream = path_or_stream
    # Initialise the parents for the hierarchy: one empty slot per level.
    self.max_levels = 10
    self.level = 0
    self.parents = {depth: None for depth in range(self.max_levels)}  # type: ignore
    # Per-tag-name occurrence counters, filled in by analyse_element().
    self.labels = {}  # type: ignore

    try:
        source = self.path_or_stream
        if isinstance(source, BytesIO):
            markup = source.getvalue().decode("utf-8")
            self.soup = BeautifulSoup(markup, "html.parser")
        if isinstance(source, Path):
            markup = source.read_text(encoding="utf-8")
            self.soup = BeautifulSoup(markup, "html.parser")
    except Exception as e:
        # NOTE(review): document_hash is not set in this method; presumably
        # the base class provides it -- confirm.
        raise RuntimeError(
            f"Could not initialize HTML backend for file with hash {self.document_hash}."
        ) from e
49
+
50
def is_valid(self) -> bool:
    """Report whether a parsed HTML tree is available (``self.soup`` is set)."""
    parsed = self.soup
    return parsed is not None
52
+
53
@classmethod
def supports_pagination(cls) -> bool:
    """Return ``False``: this backend does not report pagination support."""
    paginated = False
    return paginated
56
+
57
def unload(self):
    """Release the input: close it if it is a ``BytesIO``, then drop the reference."""
    source = self.path_or_stream
    if isinstance(source, BytesIO):
        source.close()

    self.path_or_stream = None
62
+
63
@classmethod
def supported_formats(cls) -> Set[InputFormat]:
    """Return the set of input formats this backend handles: HTML only."""
    formats = {InputFormat.HTML}
    return formats
66
+
67
def convert(self) -> DoclingDocument:
    """Convert the parsed HTML tree into a ``DoclingDocument``.

    ``<br>`` tags are replaced by newline characters before the tree is
    walked. When the markup has no ``<body>`` element (an HTML fragment --
    the ``html.parser`` tree builder does not synthesise one) the whole
    soup is used as the root instead of failing on ``None``.

    Returns:
        The document populated by ``walk``.

    Raises:
        RuntimeError: If the backend failed to initialise (no parsed tree).
    """
    # access self.path_or_stream to load stuff
    doc = DoclingDocument(name="dummy")
    _log.debug("Trying to convert HTML...")

    if not self.is_valid():
        raise RuntimeError(
            f"Cannot convert doc with {self.document_hash} because the backend failed to init."
        )

    assert self.soup is not None
    # Fragments without <body> would otherwise raise AttributeError on None.
    content = self.soup.body or self.soup

    # Replace <br> tags with newline characters
    for br in content.find_all("br"):
        br.replace_with("\n")

    return self.walk(content, doc)
83
+
84
def walk(self, element, doc):
    """Visit the direct children of ``element`` and hand each to ``analyse_element``.

    The traversal is best-effort and never raises:

    * a node without a ``children`` attribute contributes nothing
      (``analyse_element`` forwards every unhandled node here);
    * if handling a child fails, the error is logged and the remaining
      children of ``element`` are skipped.

    Args:
        element: Node whose children are visited.
        doc: Document being populated.

    Returns:
        The same ``doc`` object that was passed in.
    """
    try:
        children = element.children
    except AttributeError:
        # Leaf node (or None): nothing to descend into.
        return doc

    # Iterate over elements in the body of the document
    for idx, child in enumerate(children):
        try:
            self.analyse_element(child, idx, doc)
        except Exception as exc_child:
            # Use %-style placeholders so logging can actually format the
            # message; extra positional args without placeholders break it.
            _log.error(" -> error treating child: %s", exc_child)
            _log.error(" => element: %s\n", child)
            # Stop processing this element's remaining children.
            break

    return doc
100
+
101
def analyse_element(self, element, idx, doc):
    """Count the element's tag name and dispatch it to the matching handler.

    Headers (h1-h6), paragraphs, lists, list items, tables, figures and
    images each have a dedicated ``handle_*`` method; every other node is
    descended into via ``walk``.
    """
    tag = element.name

    # Keep a running count of how often each tag name was encountered.
    self.labels[tag] = self.labels.get(tag, 0) + 1

    handler_names = {
        "h1": "handle_header",
        "h2": "handle_header",
        "h3": "handle_header",
        "h4": "handle_header",
        "h5": "handle_header",
        "h6": "handle_header",
        "p": "handle_paragraph",
        "ul": "handle_list",
        "ol": "handle_list",
        "li": "handle_listitem",
        "table": "handle_table",
        "figure": "handle_figure",
        "img": "handle_image",
    }

    handler_name = handler_names.get(tag)
    if handler_name is None:
        self.walk(element, doc)
        return

    getattr(self, handler_name)(element, idx, doc)
128
+
129
def get_direct_text(self, item):
    """Return the stripped first direct text child of ``item``, or ``""``.

    Only direct children are searched (``recursive=False``), so text that
    lives inside nested elements such as sub-lists is ignored.
    """
    direct = item.find(string=True, recursive=False)
    return direct.strip() if isinstance(direct, str) else ""
137
+
138
# Function to recursively extract text from all child nodes
def extract_text_recursively(self, item):
    """Collect the text of ``item`` and of all its descendants.

    Args:
        item: A plain string, or a node that ``get_direct_text`` accepts
            and that can be iterated over to obtain its children.

    Returns:
        ``[item]`` when ``item`` is a plain string (kept for backward
        compatibility); otherwise a single string made of the item's direct
        text followed by the text of every child, joined by single spaces.
    """
    if isinstance(item, str):
        return [item]

    result = [self.get_direct_text(item)]

    try:
        children = iter(item)
    except TypeError:
        _log.warning("item has no children")
        return " ".join(result)

    # Iterate over the children (and their text and tails)
    for child in children:
        try:
            # Recursively get the child's text content
            child_text = self.extract_text_recursively(child)
        except Exception as exc:
            # Best effort: one unreadable child must not lose the rest.
            _log.debug("skipping child while extracting text: %s", exc)
            continue

        if isinstance(child_text, str):
            # A nested node yields one joined string; append it whole.
            # Extending with it would split it into single characters.
            result.append(child_text)
        else:
            result.extend(child_text)

    return " ".join(result)
160
+
161
def handle_header(self, element, idx, doc):
    """Handles header tags (h1, h2, etc.).

    An ``h1`` resets the whole parent hierarchy and is added as the title
    at level 1. Any other header is added as a section header below the
    parent one level up; skipped levels are bridged with section groups,
    and when moving back up, the deeper parent slots are cleared.
    """
    hlevel = int(element.name.replace("h", ""))
    text = element.text.strip()

    if hlevel == 1:
        # A new title starts a fresh hierarchy.
        for depth in list(self.parents):
            self.parents[depth] = None

        self.level = 1
        self.parents[self.level] = doc.add_text(
            parent=self.parents[0], label=DocItemLabel.TITLE, text=text
        )
        return

    if hlevel > self.level:
        # add invisible group for every level that was skipped
        for depth in range(self.level + 1, hlevel):
            self.parents[depth] = doc.add_group(
                name=f"header-{depth}",
                label=GroupLabel.SECTION,
                parent=self.parents[depth - 1],
            )
    elif hlevel < self.level:
        # remove the tail
        for depth in list(self.parents):
            if depth > hlevel:
                self.parents[depth] = None

    self.parents[hlevel] = doc.add_text(
        parent=self.parents[hlevel - 1],
        label=DocItemLabel.SECTION_HEADER,
        text=text,
    )
    self.level = hlevel
209
+
210
def handle_paragraph(self, element, idx, doc):
    """Handles paragraph tags (p).

    The stripped text is added as a paragraph under the current parent;
    paragraphs with no text or only whitespace are skipped.
    """
    raw_text = element.text
    if raw_text is None:
        return

    label = DocItemLabel.PARAGRAPH
    text = raw_text.strip()
    if text:
        doc.add_text(parent=self.parents[self.level], label=label, text=text)
219
+
220
def handle_list(self, element, idx, doc):
    """Handles list tags (ul, ol) and their list items.

    A list group (unordered or ordered) is registered one level below the
    current parent, the list's children are walked at that deeper level,
    and the level is restored afterwards.
    """
    group_spec = None
    if element.name == "ul":
        group_spec = ("list", GroupLabel.LIST)
    elif element.name == "ol":
        group_spec = ("ordered list", GroupLabel.ORDERED_LIST)

    if group_spec is not None:
        # create a list group
        group_name, group_label = group_spec
        self.parents[self.level + 1] = doc.add_group(
            parent=self.parents[self.level],
            name=group_name,
            label=group_label,
        )

    self.level += 1
    self.walk(element, doc)

    # NOTE(review): the level is still incremented here, so this resets the
    # slot *below* the list group, not the group's own slot -- confirm this
    # is intended.
    self.parents[self.level + 1] = None
    self.level -= 1
241
+
242
def handle_listitem(self, element, idx, doc):
    """Handles listitem tags (li).

    The item is added under the current parent. Inside an ordered list it
    is enumerated with the marker ``"<position>."``, where the position is
    derived from the number of children the parent already has. The same
    marker format is used whether or not the item contains a nested list.

    An item that contains a nested ``ul``/``ol`` contributes only its
    direct text and becomes the parent while its children are walked.
    """
    nested_lists = element.find(["ul", "ol"])

    parent_list = self.parents[self.level]
    index_in_list = len(parent_list.children) + 1

    if parent_list.label == GroupLabel.ORDERED_LIST:
        marker = f"{index_in_list}."
        enumerated = True
    else:
        marker = ""
        enumerated = False

    if nested_lists:
        # Only the item's own text; nested lists are handled by walk().
        text = self.get_direct_text(element)

        # create a list-item
        self.parents[self.level + 1] = doc.add_list_item(
            text=text,
            enumerated=enumerated,
            marker=marker,
            parent=parent_list,
        )
        self.level += 1

        self.walk(element, doc)

        # NOTE(review): the level is still incremented here, so this resets
        # the slot below the list item rather than the item's own slot --
        # kept as is, confirm intent.
        self.parents[self.level + 1] = None
        self.level -= 1

    elif isinstance(element.text, str):
        doc.add_list_item(
            text=element.text.strip(),
            enumerated=enumerated,
            marker=marker,
            parent=parent_list,
        )
    else:
        # %-style placeholder so the element really ends up in the message.
        _log.warning("list-item has no text: %s", element)
289
+
290
def handle_table(self, element, idx, doc):
    """Handles table tags.

    Builds one ``TableCell`` per ``<td>``/``<th>`` and adds the resulting
    ``TableData`` to ``doc`` under the current parent. Tables that contain
    nested tables are skipped with a warning.

    Cell placement honours ``colspan``/``rowspan``: positions covered by a
    span from an earlier cell are skipped when locating the next free
    column. Missing, non-numeric or non-positive span values count as 1,
    and a rowspan reaching past the last row is clipped to the table. The
    column count is the widest extent any cell actually occupies.

    A row whose cells are all ``<th>`` is treated as a column-header row;
    a ``<th>`` in any other row is flagged as a row header.
    """

    def parse_span(raw):
        # Tolerate malformed span attributes instead of aborting the table.
        try:
            return max(1, int(raw))
        except (TypeError, ValueError):
            return 1

    if element.find("table") is not None:
        _log.warning("detected nested tables: skipping for now")
        return

    rows = element.find_all("tr")
    num_rows = len(rows)
    num_cols = 0

    occupied = set()  # (row, col) grid positions already covered by a cell
    table_cells = []

    # Iterate over the rows in the table
    for row_idx, row in enumerate(rows):
        # For each row, find all the column cells (both <td> and <th>)
        cells = row.find_all(["td", "th"])

        # A row without any <td> is a column-header row.
        col_header = all(html_cell.name != "td" for html_cell in cells)

        col_idx = 0
        for html_cell in cells:
            try:
                text = self.extract_table_cell_text(html_cell)
            except Exception as exc:
                # Library code must never terminate the host process; fall
                # back to the raw cell text instead.
                _log.warning(
                    "could not extract table-cell text, using raw text: %s", exc
                )
                text = html_cell.text

            col_span = parse_span(html_cell.get("colspan", 1))
            row_span = min(
                parse_span(html_cell.get("rowspan", 1)), num_rows - row_idx
            )

            # Skip positions taken by spans of previously placed cells.
            while (row_idx, col_idx) in occupied:
                col_idx += 1
            occupied.update(
                (row_idx + r, col_idx + c)
                for r in range(row_span)
                for c in range(col_span)
            )
            num_cols = max(num_cols, col_idx + col_span)

            table_cells.append(
                TableCell(
                    text=text,
                    row_span=row_span,
                    col_span=col_span,
                    start_row_offset_idx=row_idx,
                    end_row_offset_idx=row_idx + row_span,
                    start_col_offset_idx=col_idx,
                    end_col_offset_idx=col_idx + col_span,
                    col_header=col_header,
                    row_header=((not col_header) and html_cell.name == "th"),
                )
            )

    data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=table_cells)
    doc.add_table(data=data, parent=self.parents[self.level])
362
+
363
def get_list_text(self, list_element, level=0):
    """Recursively extract text from <ul> or <ol> with proper indentation.

    Returns a list of lines, one per direct ``<li>``: ordered lists are
    numbered from 1 (``"1. text"``), unordered ones use ``"*"``. Each line
    is prefixed with one space per nesting ``level`` and is followed by the
    lines of the first nested list found inside the item. Any element other
    than ``ol``/``ul`` yields an empty list.
    """
    kind = list_element.name
    if kind not in ("ol", "ul"):
        return []

    indent = " " * level
    lines = []
    for position, li in enumerate(list_element.find_all("li", recursive=False), 1):
        # Numbers for ordered lists, a bullet character for unordered ones.
        prefix = f"{position}." if kind == "ol" else "*"
        lines.append(f"{indent}{prefix} {li.get_text(strip=True)}")

        # Handle nested lists
        nested_list = li.find(["ul", "ol"])
        if nested_list:
            lines.extend(self.get_list_text(nested_list, level + 1))

    return lines
388
+
389
def extract_table_cell_text(self, cell):
    """Extract text from a table cell.

    The cell's plain ``text`` is returned in every case; cells that contain
    lists are only flagged with a debug message for now.
    """
    if cell.find(["ul", "ol"]) is not None:
        _log.debug(
            "should extract the content correctly for table-cells with lists ..."
        )
    return cell.text
399
+
400
def handle_figure(self, element, idx, doc):
    """Handles figure tags (figure).

    A picture is added under the current parent. If the figure contains a
    ``<figcaption>``, the text of its children is concatenated, stripped
    and added as a caption item that is attached to the picture.
    """
    # Extract the image URI from the <img> tag
    # image_uri = root.xpath('//figure//img/@src')[0]

    caption_tag = element.find(["figcaption"])

    fig_caption = None
    if caption_tag is not None:
        caption_text = "".join(piece.text for piece in caption_tag).strip()
        fig_caption = doc.add_text(label=DocItemLabel.CAPTION, text=caption_text)

    doc.add_picture(parent=self.parents[self.level], caption=fig_caption)
422
+
423
def handle_image(self, element, idx, doc):
    """Handles image tags (img) by adding a caption-less picture under the current parent."""
    current_parent = self.parents[self.level]
    doc.add_picture(parent=current_parent, caption=None)