docling 2.23.0__tar.gz → 2.24.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. {docling-2.23.0 → docling-2.24.0}/PKG-INFO +3 -4
  2. {docling-2.23.0 → docling-2.24.0}/docling/backend/html_backend.py +152 -149
  3. {docling-2.23.0 → docling-2.24.0}/docling/backend/xml/jats_backend.py +6 -68
  4. {docling-2.23.0 → docling-2.24.0}/docling/backend/xml/uspto_backend.py +48 -27
  5. {docling-2.23.0 → docling-2.24.0}/docling/models/page_assemble_model.py +8 -0
  6. docling-2.24.0/docling/models/readingorder_model.py +389 -0
  7. {docling-2.23.0 → docling-2.24.0}/docling/models/tesseract_ocr_cli_model.py +3 -1
  8. {docling-2.23.0 → docling-2.24.0}/docling/pipeline/standard_pdf_pipeline.py +2 -2
  9. {docling-2.23.0 → docling-2.24.0}/pyproject.toml +3 -6
  10. docling-2.23.0/docling/models/ds_glm_model.py +0 -386
  11. {docling-2.23.0 → docling-2.24.0}/LICENSE +0 -0
  12. {docling-2.23.0 → docling-2.24.0}/README.md +0 -0
  13. {docling-2.23.0 → docling-2.24.0}/docling/__init__.py +0 -0
  14. {docling-2.23.0 → docling-2.24.0}/docling/backend/__init__.py +0 -0
  15. {docling-2.23.0 → docling-2.24.0}/docling/backend/abstract_backend.py +0 -0
  16. {docling-2.23.0 → docling-2.24.0}/docling/backend/asciidoc_backend.py +0 -0
  17. {docling-2.23.0 → docling-2.24.0}/docling/backend/csv_backend.py +0 -0
  18. {docling-2.23.0 → docling-2.24.0}/docling/backend/docling_parse_backend.py +0 -0
  19. {docling-2.23.0 → docling-2.24.0}/docling/backend/docling_parse_v2_backend.py +0 -0
  20. {docling-2.23.0 → docling-2.24.0}/docling/backend/json/__init__.py +0 -0
  21. {docling-2.23.0 → docling-2.24.0}/docling/backend/json/docling_json_backend.py +0 -0
  22. {docling-2.23.0 → docling-2.24.0}/docling/backend/md_backend.py +0 -0
  23. {docling-2.23.0 → docling-2.24.0}/docling/backend/msexcel_backend.py +0 -0
  24. {docling-2.23.0 → docling-2.24.0}/docling/backend/mspowerpoint_backend.py +0 -0
  25. {docling-2.23.0 → docling-2.24.0}/docling/backend/msword_backend.py +0 -0
  26. {docling-2.23.0 → docling-2.24.0}/docling/backend/pdf_backend.py +0 -0
  27. {docling-2.23.0 → docling-2.24.0}/docling/backend/pypdfium2_backend.py +0 -0
  28. {docling-2.23.0 → docling-2.24.0}/docling/backend/xml/__init__.py +0 -0
  29. {docling-2.23.0 → docling-2.24.0}/docling/chunking/__init__.py +0 -0
  30. {docling-2.23.0 → docling-2.24.0}/docling/cli/__init__.py +0 -0
  31. {docling-2.23.0 → docling-2.24.0}/docling/cli/main.py +0 -0
  32. {docling-2.23.0 → docling-2.24.0}/docling/cli/models.py +0 -0
  33. {docling-2.23.0 → docling-2.24.0}/docling/cli/tools.py +0 -0
  34. {docling-2.23.0 → docling-2.24.0}/docling/datamodel/__init__.py +0 -0
  35. {docling-2.23.0 → docling-2.24.0}/docling/datamodel/base_models.py +0 -0
  36. {docling-2.23.0 → docling-2.24.0}/docling/datamodel/document.py +0 -0
  37. {docling-2.23.0 → docling-2.24.0}/docling/datamodel/pipeline_options.py +0 -0
  38. {docling-2.23.0 → docling-2.24.0}/docling/datamodel/settings.py +0 -0
  39. {docling-2.23.0 → docling-2.24.0}/docling/document_converter.py +0 -0
  40. {docling-2.23.0 → docling-2.24.0}/docling/exceptions.py +0 -0
  41. {docling-2.23.0 → docling-2.24.0}/docling/models/__init__.py +0 -0
  42. {docling-2.23.0 → docling-2.24.0}/docling/models/base_model.py +0 -0
  43. {docling-2.23.0 → docling-2.24.0}/docling/models/base_ocr_model.py +0 -0
  44. {docling-2.23.0 → docling-2.24.0}/docling/models/code_formula_model.py +0 -0
  45. {docling-2.23.0 → docling-2.24.0}/docling/models/document_picture_classifier.py +0 -0
  46. {docling-2.23.0 → docling-2.24.0}/docling/models/easyocr_model.py +0 -0
  47. {docling-2.23.0 → docling-2.24.0}/docling/models/layout_model.py +0 -0
  48. {docling-2.23.0 → docling-2.24.0}/docling/models/ocr_mac_model.py +0 -0
  49. {docling-2.23.0 → docling-2.24.0}/docling/models/page_preprocessing_model.py +0 -0
  50. {docling-2.23.0 → docling-2.24.0}/docling/models/picture_description_api_model.py +0 -0
  51. {docling-2.23.0 → docling-2.24.0}/docling/models/picture_description_base_model.py +0 -0
  52. {docling-2.23.0 → docling-2.24.0}/docling/models/picture_description_vlm_model.py +0 -0
  53. {docling-2.23.0 → docling-2.24.0}/docling/models/rapid_ocr_model.py +0 -0
  54. {docling-2.23.0 → docling-2.24.0}/docling/models/table_structure_model.py +0 -0
  55. {docling-2.23.0 → docling-2.24.0}/docling/models/tesseract_ocr_model.py +0 -0
  56. {docling-2.23.0 → docling-2.24.0}/docling/pipeline/__init__.py +0 -0
  57. {docling-2.23.0 → docling-2.24.0}/docling/pipeline/base_pipeline.py +0 -0
  58. {docling-2.23.0 → docling-2.24.0}/docling/pipeline/simple_pipeline.py +0 -0
  59. {docling-2.23.0 → docling-2.24.0}/docling/py.typed +0 -0
  60. {docling-2.23.0 → docling-2.24.0}/docling/utils/__init__.py +0 -0
  61. {docling-2.23.0 → docling-2.24.0}/docling/utils/accelerator_utils.py +0 -0
  62. {docling-2.23.0 → docling-2.24.0}/docling/utils/export.py +0 -0
  63. {docling-2.23.0 → docling-2.24.0}/docling/utils/glm_utils.py +0 -0
  64. {docling-2.23.0 → docling-2.24.0}/docling/utils/layout_postprocessor.py +0 -0
  65. {docling-2.23.0 → docling-2.24.0}/docling/utils/model_downloader.py +0 -0
  66. {docling-2.23.0 → docling-2.24.0}/docling/utils/ocr_utils.py +0 -0
  67. {docling-2.23.0 → docling-2.24.0}/docling/utils/profiling.py +0 -0
  68. {docling-2.23.0 → docling-2.24.0}/docling/utils/utils.py +0 -0
  69. {docling-2.23.0 → docling-2.24.0}/docling/utils/visualization.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.23.0
3
+ Version: 2.24.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -25,11 +25,10 @@ Provides-Extra: ocrmac
25
25
  Provides-Extra: rapidocr
26
26
  Provides-Extra: tesserocr
27
27
  Provides-Extra: vlm
28
- Requires-Dist: beautifulsoup4 (>=4.12.3,<4.13.0)
28
+ Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
29
29
  Requires-Dist: certifi (>=2024.7.4)
30
- Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
31
30
  Requires-Dist: docling-core[chunking] (>=2.19.0,<3.0.0)
32
- Requires-Dist: docling-ibm-models (>=3.3.0,<4.0.0)
31
+ Requires-Dist: docling-ibm-models (>=3.4.0,<4.0.0)
33
32
  Requires-Dist: docling-parse (>=3.3.0,<4.0.0)
34
33
  Requires-Dist: easyocr (>=1.7,<2.0)
35
34
  Requires-Dist: filetype (>=1.2.0,<2.0.0)
@@ -1,17 +1,20 @@
1
1
  import logging
2
2
  from io import BytesIO
3
3
  from pathlib import Path
4
- from typing import Optional, Set, Union
4
+ from typing import Optional, Union, cast
5
5
 
6
- from bs4 import BeautifulSoup, Tag
6
+ from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
7
7
  from docling_core.types.doc import (
8
+ DocItem,
8
9
  DocItemLabel,
9
10
  DoclingDocument,
10
11
  DocumentOrigin,
12
+ GroupItem,
11
13
  GroupLabel,
12
14
  TableCell,
13
15
  TableData,
14
16
  )
17
+ from typing_extensions import override
15
18
 
16
19
  from docling.backend.abstract_backend import DeclarativeDocumentBackend
17
20
  from docling.datamodel.base_models import InputFormat
@@ -21,6 +24,7 @@ _log = logging.getLogger(__name__)
21
24
 
22
25
 
23
26
  class HTMLDocumentBackend(DeclarativeDocumentBackend):
27
+ @override
24
28
  def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
25
29
  super().__init__(in_doc, path_or_stream)
26
30
  _log.debug("About to init HTML backend...")
@@ -30,10 +34,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
30
34
  # Initialise the parents for the hierarchy
31
35
  self.max_levels = 10
32
36
  self.level = 0
33
- self.parents = {} # type: ignore
37
+ self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {}
34
38
  for i in range(0, self.max_levels):
35
39
  self.parents[i] = None
36
- self.labels = {} # type: ignore
37
40
 
38
41
  try:
39
42
  if isinstance(self.path_or_stream, BytesIO):
@@ -48,13 +51,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
48
51
  f"Could not initialize HTML backend for file with hash {self.document_hash}."
49
52
  ) from e
50
53
 
54
+ @override
51
55
  def is_valid(self) -> bool:
52
56
  return self.soup is not None
53
57
 
54
58
  @classmethod
59
+ @override
55
60
  def supports_pagination(cls) -> bool:
56
61
  return False
57
62
 
63
+ @override
58
64
  def unload(self):
59
65
  if isinstance(self.path_or_stream, BytesIO):
60
66
  self.path_or_stream.close()
@@ -62,9 +68,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
62
68
  self.path_or_stream = None
63
69
 
64
70
  @classmethod
65
- def supported_formats(cls) -> Set[InputFormat]:
71
+ @override
72
+ def supported_formats(cls) -> set[InputFormat]:
66
73
  return {InputFormat.HTML}
67
74
 
75
+ @override
68
76
  def convert(self) -> DoclingDocument:
69
77
  # access self.path_or_stream to load stuff
70
78
  origin = DocumentOrigin(
@@ -80,98 +88,73 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
80
88
  assert self.soup is not None
81
89
  content = self.soup.body or self.soup
82
90
  # Replace <br> tags with newline characters
83
- for br in content.find_all("br"):
84
- br.replace_with("\n")
85
- doc = self.walk(content, doc)
91
+ for br in content("br"):
92
+ br.replace_with(NavigableString("\n"))
93
+ self.walk(content, doc)
86
94
  else:
87
95
  raise RuntimeError(
88
96
  f"Cannot convert doc with {self.document_hash} because the backend failed to init."
89
97
  )
90
98
  return doc
91
99
 
92
- def walk(self, element: Tag, doc: DoclingDocument):
93
- try:
94
- # Iterate over elements in the body of the document
95
- for idx, element in enumerate(element.children):
100
+ def walk(self, tag: Tag, doc: DoclingDocument) -> None:
101
+ # Iterate over elements in the body of the document
102
+ for element in tag.children:
103
+ if isinstance(element, Tag):
96
104
  try:
97
- self.analyse_element(element, idx, doc)
105
+ self.analyze_tag(cast(Tag, element), doc)
98
106
  except Exception as exc_child:
99
-
100
- _log.error(" -> error treating child: ", exc_child)
101
- _log.error(" => element: ", element, "\n")
107
+ _log.error(
108
+ f"Error processing child from tag{tag.name}: {exc_child}"
109
+ )
102
110
  raise exc_child
103
111
 
104
- except Exception as exc:
105
- pass
106
-
107
- return doc
108
-
109
- def analyse_element(self, element: Tag, idx: int, doc: DoclingDocument):
110
- """
111
- if element.name!=None:
112
- _log.debug("\t"*self.level, idx, "\t", f"{element.name} ({self.level})")
113
- """
114
-
115
- if element.name in self.labels:
116
- self.labels[element.name] += 1
112
+ return
113
+
114
+ def analyze_tag(self, tag: Tag, doc: DoclingDocument) -> None:
115
+ if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
116
+ self.handle_header(tag, doc)
117
+ elif tag.name in ["p"]:
118
+ self.handle_paragraph(tag, doc)
119
+ elif tag.name in ["pre"]:
120
+ self.handle_code(tag, doc)
121
+ elif tag.name in ["ul", "ol"]:
122
+ self.handle_list(tag, doc)
123
+ elif tag.name in ["li"]:
124
+ self.handle_list_item(tag, doc)
125
+ elif tag.name == "table":
126
+ self.handle_table(tag, doc)
127
+ elif tag.name == "figure":
128
+ self.handle_figure(tag, doc)
129
+ elif tag.name == "img":
130
+ self.handle_image(doc)
117
131
  else:
118
- self.labels[element.name] = 1
119
-
120
- if element.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
121
- self.handle_header(element, idx, doc)
122
- elif element.name in ["p"]:
123
- self.handle_paragraph(element, idx, doc)
124
- elif element.name in ["pre"]:
125
- self.handle_code(element, idx, doc)
126
- elif element.name in ["ul", "ol"]:
127
- self.handle_list(element, idx, doc)
128
- elif element.name in ["li"]:
129
- self.handle_listitem(element, idx, doc)
130
- elif element.name == "table":
131
- self.handle_table(element, idx, doc)
132
- elif element.name == "figure":
133
- self.handle_figure(element, idx, doc)
134
- elif element.name == "img":
135
- self.handle_image(element, idx, doc)
136
- else:
137
- self.walk(element, doc)
132
+ self.walk(tag, doc)
138
133
 
139
- def get_direct_text(self, item: Tag):
140
- """Get the direct text of the <li> element (ignoring nested lists)."""
141
- text = item.find(string=True, recursive=False)
142
- if isinstance(text, str):
143
- return text.strip()
134
+ def get_text(self, item: PageElement) -> str:
135
+ """Get the text content of a tag."""
136
+ parts: list[str] = self.extract_text_recursively(item)
144
137
 
145
- return ""
138
+ return "".join(parts) + " "
146
139
 
147
140
  # Function to recursively extract text from all child nodes
148
- def extract_text_recursively(self, item: Tag):
149
- result = []
141
+ def extract_text_recursively(self, item: PageElement) -> list[str]:
142
+ result: list[str] = []
150
143
 
151
- if isinstance(item, str):
144
+ if isinstance(item, NavigableString):
152
145
  return [item]
153
146
 
154
- if item.name not in ["ul", "ol"]:
155
- try:
156
- # Iterate over the children (and their text and tails)
157
- for child in item:
158
- try:
159
- # Recursively get the child's text content
160
- result.extend(self.extract_text_recursively(child))
161
- except:
162
- pass
163
- except:
164
- _log.warn("item has no children")
165
- pass
166
-
167
- return "".join(result) + " "
168
-
169
- def handle_header(self, element: Tag, idx: int, doc: DoclingDocument):
147
+ tag = cast(Tag, item)
148
+ if tag.name not in ["ul", "ol"]:
149
+ for child in tag:
150
+ # Recursively get the child's text content
151
+ result.extend(self.extract_text_recursively(child))
152
+
153
+ return ["".join(result) + " "]
154
+
155
+ def handle_header(self, element: Tag, doc: DoclingDocument) -> None:
170
156
  """Handles header tags (h1, h2, etc.)."""
171
157
  hlevel = int(element.name.replace("h", ""))
172
- slevel = hlevel - 1
173
-
174
- label = DocItemLabel.SECTION_HEADER
175
158
  text = element.text.strip()
176
159
 
177
160
  if hlevel == 1:
@@ -197,7 +180,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
197
180
  elif hlevel < self.level:
198
181
 
199
182
  # remove the tail
200
- for key, val in self.parents.items():
183
+ for key in self.parents.keys():
201
184
  if key > hlevel:
202
185
  self.parents[key] = None
203
186
  self.level = hlevel
@@ -208,27 +191,24 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
208
191
  level=hlevel,
209
192
  )
210
193
 
211
- def handle_code(self, element: Tag, idx: int, doc: DoclingDocument):
194
+ def handle_code(self, element: Tag, doc: DoclingDocument) -> None:
212
195
  """Handles monospace code snippets (pre)."""
213
196
  if element.text is None:
214
197
  return
215
198
  text = element.text.strip()
216
- label = DocItemLabel.CODE
217
- if len(text) == 0:
218
- return
219
- doc.add_code(parent=self.parents[self.level], text=text)
199
+ if text:
200
+ doc.add_code(parent=self.parents[self.level], text=text)
220
201
 
221
- def handle_paragraph(self, element: Tag, idx: int, doc: DoclingDocument):
202
+ def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None:
222
203
  """Handles paragraph tags (p)."""
223
204
  if element.text is None:
224
205
  return
225
206
  text = element.text.strip()
226
207
  label = DocItemLabel.PARAGRAPH
227
- if len(text) == 0:
228
- return
229
- doc.add_text(parent=self.parents[self.level], label=label, text=text)
208
+ if text:
209
+ doc.add_text(parent=self.parents[self.level], label=label, text=text)
230
210
 
231
- def handle_list(self, element: Tag, idx: int, doc: DoclingDocument):
211
+ def handle_list(self, element: Tag, doc: DoclingDocument) -> None:
232
212
  """Handles list tags (ul, ol) and their list items."""
233
213
 
234
214
  if element.name == "ul":
@@ -250,25 +230,28 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
250
230
  self.parents[self.level + 1] = None
251
231
  self.level -= 1
252
232
 
253
- def handle_listitem(self, element: Tag, idx: int, doc: DoclingDocument):
233
+ def handle_list_item(self, element: Tag, doc: DoclingDocument) -> None:
254
234
  """Handles listitem tags (li)."""
255
- nested_lists = element.find(["ul", "ol"])
235
+ nested_list = element.find(["ul", "ol"])
256
236
 
257
- parent_list_label = self.parents[self.level].label
258
- index_in_list = len(self.parents[self.level].children) + 1
237
+ parent = self.parents[self.level]
238
+ if parent is None:
239
+ _log.warning(f"list-item has no parent in DoclingDocument: {element}")
240
+ return
241
+ parent_label: str = parent.label
242
+ index_in_list = len(parent.children) + 1
259
243
 
260
- if nested_lists:
261
- name = element.name
244
+ if nested_list:
262
245
  # Text in list item can be hidden within hierarchy, hence
263
246
  # we need to extract it recursively
264
- text = self.extract_text_recursively(element)
247
+ text: str = self.get_text(element)
265
248
  # Flatten text, remove break lines:
266
249
  text = text.replace("\n", "").replace("\r", "")
267
250
  text = " ".join(text.split()).strip()
268
251
 
269
252
  marker = ""
270
253
  enumerated = False
271
- if parent_list_label == GroupLabel.ORDERED_LIST:
254
+ if parent_label == GroupLabel.ORDERED_LIST:
272
255
  marker = str(index_in_list)
273
256
  enumerated = True
274
257
 
@@ -278,7 +261,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
278
261
  text=text,
279
262
  enumerated=enumerated,
280
263
  marker=marker,
281
- parent=self.parents[self.level],
264
+ parent=parent,
282
265
  )
283
266
  self.level += 1
284
267
 
@@ -287,74 +270,94 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
287
270
  self.parents[self.level + 1] = None
288
271
  self.level -= 1
289
272
 
290
- elif isinstance(element.text, str):
273
+ elif element.text.strip():
291
274
  text = element.text.strip()
292
275
 
293
276
  marker = ""
294
277
  enumerated = False
295
- if parent_list_label == GroupLabel.ORDERED_LIST:
278
+ if parent_label == GroupLabel.ORDERED_LIST:
296
279
  marker = f"{str(index_in_list)}."
297
280
  enumerated = True
298
281
  doc.add_list_item(
299
282
  text=text,
300
283
  enumerated=enumerated,
301
284
  marker=marker,
302
- parent=self.parents[self.level],
285
+ parent=parent,
303
286
  )
304
287
  else:
305
- _log.warn("list-item has no text: ", element)
306
-
307
- def handle_table(self, element: Tag, idx: int, doc: DoclingDocument):
308
- """Handles table tags."""
288
+ _log.warning(f"list-item has no text: {element}")
309
289
 
290
+ @staticmethod
291
+ def parse_table_data(element: Tag) -> Optional[TableData]:
310
292
  nested_tables = element.find("table")
311
293
  if nested_tables is not None:
312
- _log.warn("detected nested tables: skipping for now")
313
- return
294
+ _log.warning("Skipping nested table.")
295
+ return None
314
296
 
315
297
  # Count the number of rows (number of <tr> elements)
316
- num_rows = len(element.find_all("tr"))
298
+ num_rows = len(element("tr"))
317
299
 
318
300
  # Find the number of columns (taking into account colspan)
319
301
  num_cols = 0
320
- for row in element.find_all("tr"):
302
+ for row in element("tr"):
321
303
  col_count = 0
322
- for cell in row.find_all(["td", "th"]):
323
- colspan = int(cell.get("colspan", 1))
304
+ if not isinstance(row, Tag):
305
+ continue
306
+ for cell in row(["td", "th"]):
307
+ if not isinstance(row, Tag):
308
+ continue
309
+ val = cast(Tag, cell).get("colspan", "1")
310
+ colspan = int(val) if (isinstance(val, str) and val.isnumeric()) else 1
324
311
  col_count += colspan
325
312
  num_cols = max(num_cols, col_count)
326
313
 
327
- grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
314
+ grid: list = [[None for _ in range(num_cols)] for _ in range(num_rows)]
328
315
 
329
316
  data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
330
317
 
331
318
  # Iterate over the rows in the table
332
- for row_idx, row in enumerate(element.find_all("tr")):
319
+ for row_idx, row in enumerate(element("tr")):
320
+ if not isinstance(row, Tag):
321
+ continue
333
322
 
334
323
  # For each row, find all the column cells (both <td> and <th>)
335
- cells = row.find_all(["td", "th"])
324
+ cells = row(["td", "th"])
336
325
 
337
326
  # Check if each cell in the row is a header -> means it is a column header
338
327
  col_header = True
339
- for j, html_cell in enumerate(cells):
340
- if html_cell.name == "td":
328
+ for html_cell in cells:
329
+ if isinstance(html_cell, Tag) and html_cell.name == "td":
341
330
  col_header = False
342
331
 
332
+ # Extract the text content of each cell
343
333
  col_idx = 0
344
- # Extract and print the text content of each cell
345
- for _, html_cell in enumerate(cells):
346
-
334
+ for html_cell in cells:
335
+ if not isinstance(html_cell, Tag):
336
+ continue
337
+
338
+ # extract inline formulas
339
+ for formula in html_cell("inline-formula"):
340
+ math_parts = formula.text.split("$$")
341
+ if len(math_parts) == 3:
342
+ math_formula = f"$${math_parts[1]}$$"
343
+ formula.replace_with(NavigableString(math_formula))
344
+
345
+ # TODO: extract content correctly from table-cells with lists
347
346
  text = html_cell.text
348
- try:
349
- text = self.extract_table_cell_text(html_cell)
350
- except Exception as exc:
351
- _log.warn("exception: ", exc)
352
- exit(-1)
353
347
 
354
348
  # label = html_cell.name
355
-
356
- col_span = int(html_cell.get("colspan", 1))
357
- row_span = int(html_cell.get("rowspan", 1))
349
+ col_val = html_cell.get("colspan", "1")
350
+ col_span = (
351
+ int(col_val)
352
+ if isinstance(col_val, str) and col_val.isnumeric()
353
+ else 1
354
+ )
355
+ row_val = html_cell.get("rowspan", "1")
356
+ row_span = (
357
+ int(row_val)
358
+ if isinstance(row_val, str) and row_val.isnumeric()
359
+ else 1
360
+ )
358
361
 
359
362
  while grid[row_idx][col_idx] is not None:
360
363
  col_idx += 1
@@ -362,7 +365,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
362
365
  for c in range(col_span):
363
366
  grid[row_idx + r][col_idx + c] = text
364
367
 
365
- cell = TableCell(
368
+ table_cell = TableCell(
366
369
  text=text,
367
370
  row_span=row_span,
368
371
  col_span=col_span,
@@ -373,57 +376,57 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
373
376
  col_header=col_header,
374
377
  row_header=((not col_header) and html_cell.name == "th"),
375
378
  )
376
- data.table_cells.append(cell)
379
+ data.table_cells.append(table_cell)
377
380
 
378
- doc.add_table(data=data, parent=self.parents[self.level])
381
+ return data
379
382
 
380
- def get_list_text(self, list_element: Tag, level=0):
383
+ def handle_table(self, element: Tag, doc: DoclingDocument) -> None:
384
+ """Handles table tags."""
385
+
386
+ table_data = HTMLDocumentBackend.parse_table_data(element)
387
+
388
+ if table_data is not None:
389
+ doc.add_table(data=table_data, parent=self.parents[self.level])
390
+
391
+ def get_list_text(self, list_element: Tag, level: int = 0) -> list[str]:
381
392
  """Recursively extract text from <ul> or <ol> with proper indentation."""
382
393
  result = []
383
394
  bullet_char = "*" # Default bullet character for unordered lists
384
395
 
385
396
  if list_element.name == "ol": # For ordered lists, use numbers
386
- for i, li in enumerate(list_element.find_all("li", recursive=False), 1):
397
+ for i, li in enumerate(list_element("li", recursive=False), 1):
398
+ if not isinstance(li, Tag):
399
+ continue
387
400
  # Add numbering for ordered lists
388
401
  result.append(f"{' ' * level}{i}. {li.get_text(strip=True)}")
389
402
  # Handle nested lists
390
403
  nested_list = li.find(["ul", "ol"])
391
- if nested_list:
404
+ if isinstance(nested_list, Tag):
392
405
  result.extend(self.get_list_text(nested_list, level + 1))
393
406
  elif list_element.name == "ul": # For unordered lists, use bullet points
394
- for li in list_element.find_all("li", recursive=False):
407
+ for li in list_element("li", recursive=False):
408
+ if not isinstance(li, Tag):
409
+ continue
395
410
  # Add bullet points for unordered lists
396
411
  result.append(
397
412
  f"{' ' * level}{bullet_char} {li.get_text(strip=True)}"
398
413
  )
399
414
  # Handle nested lists
400
415
  nested_list = li.find(["ul", "ol"])
401
- if nested_list:
416
+ if isinstance(nested_list, Tag):
402
417
  result.extend(self.get_list_text(nested_list, level + 1))
403
418
 
404
419
  return result
405
420
 
406
- def extract_table_cell_text(self, cell: Tag):
407
- """Extract text from a table cell, including lists with indents."""
408
- contains_lists = cell.find(["ul", "ol"])
409
- if contains_lists is None:
410
- return cell.text
411
- else:
412
- _log.debug(
413
- "should extract the content correctly for table-cells with lists ..."
414
- )
415
- return cell.text
416
-
417
- def handle_figure(self, element: Tag, idx: int, doc: DoclingDocument):
421
+ def handle_figure(self, element: Tag, doc: DoclingDocument) -> None:
418
422
  """Handles image tags (img)."""
419
423
 
420
424
  # Extract the image URI from the <img> tag
421
425
  # image_uri = root.xpath('//figure//img/@src')[0]
422
426
 
423
427
  contains_captions = element.find(["figcaption"])
424
- if contains_captions is None:
428
+ if not isinstance(contains_captions, Tag):
425
429
  doc.add_picture(parent=self.parents[self.level], caption=None)
426
-
427
430
  else:
428
431
  texts = []
429
432
  for item in contains_captions:
@@ -437,6 +440,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
437
440
  caption=fig_caption,
438
441
  )
439
442
 
440
- def handle_image(self, element: Tag, idx, doc: DoclingDocument):
443
+ def handle_image(self, doc: DoclingDocument) -> None:
441
444
  """Handles image tags (img)."""
442
445
  doc.add_picture(parent=self.parents[self.level], caption=None)
@@ -4,7 +4,7 @@ from io import BytesIO
4
4
  from pathlib import Path
5
5
  from typing import Final, Optional, Union
6
6
 
7
- from bs4 import BeautifulSoup
7
+ from bs4 import BeautifulSoup, Tag
8
8
  from docling_core.types.doc import (
9
9
  DocItemLabel,
10
10
  DoclingDocument,
@@ -12,14 +12,13 @@ from docling_core.types.doc import (
12
12
  GroupItem,
13
13
  GroupLabel,
14
14
  NodeItem,
15
- TableCell,
16
- TableData,
17
15
  TextItem,
18
16
  )
19
17
  from lxml import etree
20
18
  from typing_extensions import TypedDict, override
21
19
 
22
20
  from docling.backend.abstract_backend import DeclarativeDocumentBackend
21
+ from docling.backend.html_backend import HTMLDocumentBackend
23
22
  from docling.datamodel.base_models import InputFormat
24
23
  from docling.datamodel.document import InputDocument
25
24
 
@@ -540,71 +539,10 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
540
539
  ) -> None:
541
540
  soup = BeautifulSoup(table_xml_component["content"], "html.parser")
542
541
  table_tag = soup.find("table")
543
-
544
- nested_tables = table_tag.find("table")
545
- if nested_tables:
546
- _log.warning(f"Skipping nested table in {str(self.file)}")
542
+ if not isinstance(table_tag, Tag):
547
543
  return
548
544
 
549
- # Count the number of rows (number of <tr> elements)
550
- num_rows = len(table_tag.find_all("tr"))
551
-
552
- # Find the number of columns (taking into account colspan)
553
- num_cols = 0
554
- for row in table_tag.find_all("tr"):
555
- col_count = 0
556
- for cell in row.find_all(["td", "th"]):
557
- colspan = int(cell.get("colspan", 1))
558
- col_count += colspan
559
- num_cols = max(num_cols, col_count)
560
-
561
- grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
562
-
563
- data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
564
-
565
- # Iterate over the rows in the table
566
- for row_idx, row in enumerate(table_tag.find_all("tr")):
567
- # For each row, find all the column cells (both <td> and <th>)
568
- cells = row.find_all(["td", "th"])
569
-
570
- # Check if each cell in the row is a header -> means it is a column header
571
- col_header = True
572
- for j, html_cell in enumerate(cells):
573
- if html_cell.name == "td":
574
- col_header = False
575
-
576
- # Extract and print the text content of each cell
577
- col_idx = 0
578
- for _, html_cell in enumerate(cells):
579
- # extract inline formulas
580
- for formula in html_cell.find_all("inline-formula"):
581
- math_parts = formula.text.split("$$")
582
- if len(math_parts) == 3:
583
- math_formula = f"$${math_parts[1]}$$"
584
- formula.replaceWith(math_formula)
585
- text = html_cell.text
586
-
587
- col_span = int(html_cell.get("colspan", 1))
588
- row_span = int(html_cell.get("rowspan", 1))
589
-
590
- while grid[row_idx][col_idx] is not None:
591
- col_idx += 1
592
- for r in range(row_span):
593
- for c in range(col_span):
594
- grid[row_idx + r][col_idx + c] = text
595
-
596
- cell = TableCell(
597
- text=text,
598
- row_span=row_span,
599
- col_span=col_span,
600
- start_row_offset_idx=row_idx,
601
- end_row_offset_idx=row_idx + row_span,
602
- start_col_offset_idx=col_idx,
603
- end_col_offset_idx=col_idx + col_span,
604
- col_header=col_header,
605
- row_header=((not col_header) and html_cell.name == "th"),
606
- )
607
- data.table_cells.append(cell)
545
+ data = HTMLDocumentBackend.parse_table_data(table_tag)
608
546
 
609
547
  # TODO: format label vs caption once styling is supported
610
548
  label = table_xml_component["label"]
@@ -616,7 +554,8 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
616
554
  else None
617
555
  )
618
556
 
619
- doc.add_table(data=data, parent=parent, caption=table_caption)
557
+ if data is not None:
558
+ doc.add_table(data=data, parent=parent, caption=table_caption)
620
559
 
621
560
  return
622
561
 
@@ -673,7 +612,6 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
673
612
  def _walk_linear(
674
613
  self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
675
614
  ) -> str:
676
- # _log.debug(f"Walking on {node.tag} with {len(list(node))} children")
677
615
  skip_tags = ["term"]
678
616
  flush_tags = ["ack", "sec", "list", "boxed-text", "disp-formula", "fig"]
679
617
  new_parent: NodeItem = parent