docling 2.23.0__py3-none-any.whl → 2.23.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,17 +1,20 @@
1
1
  import logging
2
2
  from io import BytesIO
3
3
  from pathlib import Path
4
- from typing import Optional, Set, Union
4
+ from typing import Optional, Union, cast
5
5
 
6
- from bs4 import BeautifulSoup, Tag
6
+ from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
7
7
  from docling_core.types.doc import (
8
+ DocItem,
8
9
  DocItemLabel,
9
10
  DoclingDocument,
10
11
  DocumentOrigin,
12
+ GroupItem,
11
13
  GroupLabel,
12
14
  TableCell,
13
15
  TableData,
14
16
  )
17
+ from typing_extensions import override
15
18
 
16
19
  from docling.backend.abstract_backend import DeclarativeDocumentBackend
17
20
  from docling.datamodel.base_models import InputFormat
@@ -21,6 +24,7 @@ _log = logging.getLogger(__name__)
21
24
 
22
25
 
23
26
  class HTMLDocumentBackend(DeclarativeDocumentBackend):
27
+ @override
24
28
  def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
25
29
  super().__init__(in_doc, path_or_stream)
26
30
  _log.debug("About to init HTML backend...")
@@ -30,10 +34,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
30
34
  # Initialise the parents for the hierarchy
31
35
  self.max_levels = 10
32
36
  self.level = 0
33
- self.parents = {} # type: ignore
37
+ self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {}
34
38
  for i in range(0, self.max_levels):
35
39
  self.parents[i] = None
36
- self.labels = {} # type: ignore
37
40
 
38
41
  try:
39
42
  if isinstance(self.path_or_stream, BytesIO):
@@ -48,13 +51,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
48
51
  f"Could not initialize HTML backend for file with hash {self.document_hash}."
49
52
  ) from e
50
53
 
54
+ @override
51
55
  def is_valid(self) -> bool:
52
56
  return self.soup is not None
53
57
 
54
58
  @classmethod
59
+ @override
55
60
  def supports_pagination(cls) -> bool:
56
61
  return False
57
62
 
63
+ @override
58
64
  def unload(self):
59
65
  if isinstance(self.path_or_stream, BytesIO):
60
66
  self.path_or_stream.close()
@@ -62,9 +68,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
62
68
  self.path_or_stream = None
63
69
 
64
70
  @classmethod
65
- def supported_formats(cls) -> Set[InputFormat]:
71
+ @override
72
+ def supported_formats(cls) -> set[InputFormat]:
66
73
  return {InputFormat.HTML}
67
74
 
75
+ @override
68
76
  def convert(self) -> DoclingDocument:
69
77
  # access self.path_or_stream to load stuff
70
78
  origin = DocumentOrigin(
@@ -80,98 +88,73 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
80
88
  assert self.soup is not None
81
89
  content = self.soup.body or self.soup
82
90
  # Replace <br> tags with newline characters
83
- for br in content.find_all("br"):
84
- br.replace_with("\n")
85
- doc = self.walk(content, doc)
91
+ for br in content("br"):
92
+ br.replace_with(NavigableString("\n"))
93
+ self.walk(content, doc)
86
94
  else:
87
95
  raise RuntimeError(
88
96
  f"Cannot convert doc with {self.document_hash} because the backend failed to init."
89
97
  )
90
98
  return doc
91
99
 
92
- def walk(self, element: Tag, doc: DoclingDocument):
93
- try:
94
- # Iterate over elements in the body of the document
95
- for idx, element in enumerate(element.children):
100
+ def walk(self, tag: Tag, doc: DoclingDocument) -> None:
101
+ # Iterate over elements in the body of the document
102
+ for element in tag.children:
103
+ if isinstance(element, Tag):
96
104
  try:
97
- self.analyse_element(element, idx, doc)
105
+ self.analyze_tag(cast(Tag, element), doc)
98
106
  except Exception as exc_child:
99
-
100
- _log.error(" -> error treating child: ", exc_child)
101
- _log.error(" => element: ", element, "\n")
107
+ _log.error(
108
+ f"Error processing child from tag{tag.name}: {exc_child}"
109
+ )
102
110
  raise exc_child
103
111
 
104
- except Exception as exc:
105
- pass
106
-
107
- return doc
108
-
109
- def analyse_element(self, element: Tag, idx: int, doc: DoclingDocument):
110
- """
111
- if element.name!=None:
112
- _log.debug("\t"*self.level, idx, "\t", f"{element.name} ({self.level})")
113
- """
114
-
115
- if element.name in self.labels:
116
- self.labels[element.name] += 1
112
+ return
113
+
114
+ def analyze_tag(self, tag: Tag, doc: DoclingDocument) -> None:
115
+ if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
116
+ self.handle_header(tag, doc)
117
+ elif tag.name in ["p"]:
118
+ self.handle_paragraph(tag, doc)
119
+ elif tag.name in ["pre"]:
120
+ self.handle_code(tag, doc)
121
+ elif tag.name in ["ul", "ol"]:
122
+ self.handle_list(tag, doc)
123
+ elif tag.name in ["li"]:
124
+ self.handle_list_item(tag, doc)
125
+ elif tag.name == "table":
126
+ self.handle_table(tag, doc)
127
+ elif tag.name == "figure":
128
+ self.handle_figure(tag, doc)
129
+ elif tag.name == "img":
130
+ self.handle_image(doc)
117
131
  else:
118
- self.labels[element.name] = 1
119
-
120
- if element.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
121
- self.handle_header(element, idx, doc)
122
- elif element.name in ["p"]:
123
- self.handle_paragraph(element, idx, doc)
124
- elif element.name in ["pre"]:
125
- self.handle_code(element, idx, doc)
126
- elif element.name in ["ul", "ol"]:
127
- self.handle_list(element, idx, doc)
128
- elif element.name in ["li"]:
129
- self.handle_listitem(element, idx, doc)
130
- elif element.name == "table":
131
- self.handle_table(element, idx, doc)
132
- elif element.name == "figure":
133
- self.handle_figure(element, idx, doc)
134
- elif element.name == "img":
135
- self.handle_image(element, idx, doc)
136
- else:
137
- self.walk(element, doc)
132
+ self.walk(tag, doc)
138
133
 
139
- def get_direct_text(self, item: Tag):
140
- """Get the direct text of the <li> element (ignoring nested lists)."""
141
- text = item.find(string=True, recursive=False)
142
- if isinstance(text, str):
143
- return text.strip()
134
+ def get_text(self, item: PageElement) -> str:
135
+ """Get the text content of a tag."""
136
+ parts: list[str] = self.extract_text_recursively(item)
144
137
 
145
- return ""
138
+ return "".join(parts) + " "
146
139
 
147
140
  # Function to recursively extract text from all child nodes
148
- def extract_text_recursively(self, item: Tag):
149
- result = []
141
+ def extract_text_recursively(self, item: PageElement) -> list[str]:
142
+ result: list[str] = []
150
143
 
151
- if isinstance(item, str):
144
+ if isinstance(item, NavigableString):
152
145
  return [item]
153
146
 
154
- if item.name not in ["ul", "ol"]:
155
- try:
156
- # Iterate over the children (and their text and tails)
157
- for child in item:
158
- try:
159
- # Recursively get the child's text content
160
- result.extend(self.extract_text_recursively(child))
161
- except:
162
- pass
163
- except:
164
- _log.warn("item has no children")
165
- pass
166
-
167
- return "".join(result) + " "
168
-
169
- def handle_header(self, element: Tag, idx: int, doc: DoclingDocument):
147
+ tag = cast(Tag, item)
148
+ if tag.name not in ["ul", "ol"]:
149
+ for child in tag:
150
+ # Recursively get the child's text content
151
+ result.extend(self.extract_text_recursively(child))
152
+
153
+ return ["".join(result) + " "]
154
+
155
+ def handle_header(self, element: Tag, doc: DoclingDocument) -> None:
170
156
  """Handles header tags (h1, h2, etc.)."""
171
157
  hlevel = int(element.name.replace("h", ""))
172
- slevel = hlevel - 1
173
-
174
- label = DocItemLabel.SECTION_HEADER
175
158
  text = element.text.strip()
176
159
 
177
160
  if hlevel == 1:
@@ -197,7 +180,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
197
180
  elif hlevel < self.level:
198
181
 
199
182
  # remove the tail
200
- for key, val in self.parents.items():
183
+ for key in self.parents.keys():
201
184
  if key > hlevel:
202
185
  self.parents[key] = None
203
186
  self.level = hlevel
@@ -208,27 +191,24 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
208
191
  level=hlevel,
209
192
  )
210
193
 
211
- def handle_code(self, element: Tag, idx: int, doc: DoclingDocument):
194
+ def handle_code(self, element: Tag, doc: DoclingDocument) -> None:
212
195
  """Handles monospace code snippets (pre)."""
213
196
  if element.text is None:
214
197
  return
215
198
  text = element.text.strip()
216
- label = DocItemLabel.CODE
217
- if len(text) == 0:
218
- return
219
- doc.add_code(parent=self.parents[self.level], text=text)
199
+ if text:
200
+ doc.add_code(parent=self.parents[self.level], text=text)
220
201
 
221
- def handle_paragraph(self, element: Tag, idx: int, doc: DoclingDocument):
202
+ def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None:
222
203
  """Handles paragraph tags (p)."""
223
204
  if element.text is None:
224
205
  return
225
206
  text = element.text.strip()
226
207
  label = DocItemLabel.PARAGRAPH
227
- if len(text) == 0:
228
- return
229
- doc.add_text(parent=self.parents[self.level], label=label, text=text)
208
+ if text:
209
+ doc.add_text(parent=self.parents[self.level], label=label, text=text)
230
210
 
231
- def handle_list(self, element: Tag, idx: int, doc: DoclingDocument):
211
+ def handle_list(self, element: Tag, doc: DoclingDocument) -> None:
232
212
  """Handles list tags (ul, ol) and their list items."""
233
213
 
234
214
  if element.name == "ul":
@@ -250,25 +230,28 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
250
230
  self.parents[self.level + 1] = None
251
231
  self.level -= 1
252
232
 
253
- def handle_listitem(self, element: Tag, idx: int, doc: DoclingDocument):
233
+ def handle_list_item(self, element: Tag, doc: DoclingDocument) -> None:
254
234
  """Handles listitem tags (li)."""
255
- nested_lists = element.find(["ul", "ol"])
235
+ nested_list = element.find(["ul", "ol"])
256
236
 
257
- parent_list_label = self.parents[self.level].label
258
- index_in_list = len(self.parents[self.level].children) + 1
237
+ parent = self.parents[self.level]
238
+ if parent is None:
239
+ _log.warning(f"list-item has no parent in DoclingDocument: {element}")
240
+ return
241
+ parent_label: str = parent.label
242
+ index_in_list = len(parent.children) + 1
259
243
 
260
- if nested_lists:
261
- name = element.name
244
+ if nested_list:
262
245
  # Text in list item can be hidden within hierarchy, hence
263
246
  # we need to extract it recursively
264
- text = self.extract_text_recursively(element)
247
+ text: str = self.get_text(element)
265
248
  # Flatten text, remove break lines:
266
249
  text = text.replace("\n", "").replace("\r", "")
267
250
  text = " ".join(text.split()).strip()
268
251
 
269
252
  marker = ""
270
253
  enumerated = False
271
- if parent_list_label == GroupLabel.ORDERED_LIST:
254
+ if parent_label == GroupLabel.ORDERED_LIST:
272
255
  marker = str(index_in_list)
273
256
  enumerated = True
274
257
 
@@ -278,7 +261,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
278
261
  text=text,
279
262
  enumerated=enumerated,
280
263
  marker=marker,
281
- parent=self.parents[self.level],
264
+ parent=parent,
282
265
  )
283
266
  self.level += 1
284
267
 
@@ -287,74 +270,94 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
287
270
  self.parents[self.level + 1] = None
288
271
  self.level -= 1
289
272
 
290
- elif isinstance(element.text, str):
273
+ elif element.text.strip():
291
274
  text = element.text.strip()
292
275
 
293
276
  marker = ""
294
277
  enumerated = False
295
- if parent_list_label == GroupLabel.ORDERED_LIST:
278
+ if parent_label == GroupLabel.ORDERED_LIST:
296
279
  marker = f"{str(index_in_list)}."
297
280
  enumerated = True
298
281
  doc.add_list_item(
299
282
  text=text,
300
283
  enumerated=enumerated,
301
284
  marker=marker,
302
- parent=self.parents[self.level],
285
+ parent=parent,
303
286
  )
304
287
  else:
305
- _log.warn("list-item has no text: ", element)
306
-
307
- def handle_table(self, element: Tag, idx: int, doc: DoclingDocument):
308
- """Handles table tags."""
288
+ _log.warning(f"list-item has no text: {element}")
309
289
 
290
+ @staticmethod
291
+ def parse_table_data(element: Tag) -> Optional[TableData]:
310
292
  nested_tables = element.find("table")
311
293
  if nested_tables is not None:
312
- _log.warn("detected nested tables: skipping for now")
313
- return
294
+ _log.warning("Skipping nested table.")
295
+ return None
314
296
 
315
297
  # Count the number of rows (number of <tr> elements)
316
- num_rows = len(element.find_all("tr"))
298
+ num_rows = len(element("tr"))
317
299
 
318
300
  # Find the number of columns (taking into account colspan)
319
301
  num_cols = 0
320
- for row in element.find_all("tr"):
302
+ for row in element("tr"):
321
303
  col_count = 0
322
- for cell in row.find_all(["td", "th"]):
323
- colspan = int(cell.get("colspan", 1))
304
+ if not isinstance(row, Tag):
305
+ continue
306
+ for cell in row(["td", "th"]):
307
+ if not isinstance(row, Tag):
308
+ continue
309
+ val = cast(Tag, cell).get("colspan", "1")
310
+ colspan = int(val) if (isinstance(val, str) and val.isnumeric()) else 1
324
311
  col_count += colspan
325
312
  num_cols = max(num_cols, col_count)
326
313
 
327
- grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
314
+ grid: list = [[None for _ in range(num_cols)] for _ in range(num_rows)]
328
315
 
329
316
  data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
330
317
 
331
318
  # Iterate over the rows in the table
332
- for row_idx, row in enumerate(element.find_all("tr")):
319
+ for row_idx, row in enumerate(element("tr")):
320
+ if not isinstance(row, Tag):
321
+ continue
333
322
 
334
323
  # For each row, find all the column cells (both <td> and <th>)
335
- cells = row.find_all(["td", "th"])
324
+ cells = row(["td", "th"])
336
325
 
337
326
  # Check if each cell in the row is a header -> means it is a column header
338
327
  col_header = True
339
- for j, html_cell in enumerate(cells):
340
- if html_cell.name == "td":
328
+ for html_cell in cells:
329
+ if isinstance(html_cell, Tag) and html_cell.name == "td":
341
330
  col_header = False
342
331
 
332
+ # Extract the text content of each cell
343
333
  col_idx = 0
344
- # Extract and print the text content of each cell
345
- for _, html_cell in enumerate(cells):
346
-
334
+ for html_cell in cells:
335
+ if not isinstance(html_cell, Tag):
336
+ continue
337
+
338
+ # extract inline formulas
339
+ for formula in html_cell("inline-formula"):
340
+ math_parts = formula.text.split("$$")
341
+ if len(math_parts) == 3:
342
+ math_formula = f"$${math_parts[1]}$$"
343
+ formula.replace_with(NavigableString(math_formula))
344
+
345
+ # TODO: extract content correctly from table-cells with lists
347
346
  text = html_cell.text
348
- try:
349
- text = self.extract_table_cell_text(html_cell)
350
- except Exception as exc:
351
- _log.warn("exception: ", exc)
352
- exit(-1)
353
347
 
354
348
  # label = html_cell.name
355
-
356
- col_span = int(html_cell.get("colspan", 1))
357
- row_span = int(html_cell.get("rowspan", 1))
349
+ col_val = html_cell.get("colspan", "1")
350
+ col_span = (
351
+ int(col_val)
352
+ if isinstance(col_val, str) and col_val.isnumeric()
353
+ else 1
354
+ )
355
+ row_val = html_cell.get("rowspan", "1")
356
+ row_span = (
357
+ int(row_val)
358
+ if isinstance(row_val, str) and row_val.isnumeric()
359
+ else 1
360
+ )
358
361
 
359
362
  while grid[row_idx][col_idx] is not None:
360
363
  col_idx += 1
@@ -362,7 +365,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
362
365
  for c in range(col_span):
363
366
  grid[row_idx + r][col_idx + c] = text
364
367
 
365
- cell = TableCell(
368
+ table_cell = TableCell(
366
369
  text=text,
367
370
  row_span=row_span,
368
371
  col_span=col_span,
@@ -373,57 +376,57 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
373
376
  col_header=col_header,
374
377
  row_header=((not col_header) and html_cell.name == "th"),
375
378
  )
376
- data.table_cells.append(cell)
379
+ data.table_cells.append(table_cell)
377
380
 
378
- doc.add_table(data=data, parent=self.parents[self.level])
381
+ return data
379
382
 
380
- def get_list_text(self, list_element: Tag, level=0):
383
+ def handle_table(self, element: Tag, doc: DoclingDocument) -> None:
384
+ """Handles table tags."""
385
+
386
+ table_data = HTMLDocumentBackend.parse_table_data(element)
387
+
388
+ if table_data is not None:
389
+ doc.add_table(data=table_data, parent=self.parents[self.level])
390
+
391
+ def get_list_text(self, list_element: Tag, level: int = 0) -> list[str]:
381
392
  """Recursively extract text from <ul> or <ol> with proper indentation."""
382
393
  result = []
383
394
  bullet_char = "*" # Default bullet character for unordered lists
384
395
 
385
396
  if list_element.name == "ol": # For ordered lists, use numbers
386
- for i, li in enumerate(list_element.find_all("li", recursive=False), 1):
397
+ for i, li in enumerate(list_element("li", recursive=False), 1):
398
+ if not isinstance(li, Tag):
399
+ continue
387
400
  # Add numbering for ordered lists
388
401
  result.append(f"{' ' * level}{i}. {li.get_text(strip=True)}")
389
402
  # Handle nested lists
390
403
  nested_list = li.find(["ul", "ol"])
391
- if nested_list:
404
+ if isinstance(nested_list, Tag):
392
405
  result.extend(self.get_list_text(nested_list, level + 1))
393
406
  elif list_element.name == "ul": # For unordered lists, use bullet points
394
- for li in list_element.find_all("li", recursive=False):
407
+ for li in list_element("li", recursive=False):
408
+ if not isinstance(li, Tag):
409
+ continue
395
410
  # Add bullet points for unordered lists
396
411
  result.append(
397
412
  f"{' ' * level}{bullet_char} {li.get_text(strip=True)}"
398
413
  )
399
414
  # Handle nested lists
400
415
  nested_list = li.find(["ul", "ol"])
401
- if nested_list:
416
+ if isinstance(nested_list, Tag):
402
417
  result.extend(self.get_list_text(nested_list, level + 1))
403
418
 
404
419
  return result
405
420
 
406
- def extract_table_cell_text(self, cell: Tag):
407
- """Extract text from a table cell, including lists with indents."""
408
- contains_lists = cell.find(["ul", "ol"])
409
- if contains_lists is None:
410
- return cell.text
411
- else:
412
- _log.debug(
413
- "should extract the content correctly for table-cells with lists ..."
414
- )
415
- return cell.text
416
-
417
- def handle_figure(self, element: Tag, idx: int, doc: DoclingDocument):
421
+ def handle_figure(self, element: Tag, doc: DoclingDocument) -> None:
418
422
  """Handles image tags (img)."""
419
423
 
420
424
  # Extract the image URI from the <img> tag
421
425
  # image_uri = root.xpath('//figure//img/@src')[0]
422
426
 
423
427
  contains_captions = element.find(["figcaption"])
424
- if contains_captions is None:
428
+ if not isinstance(contains_captions, Tag):
425
429
  doc.add_picture(parent=self.parents[self.level], caption=None)
426
-
427
430
  else:
428
431
  texts = []
429
432
  for item in contains_captions:
@@ -437,6 +440,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
437
440
  caption=fig_caption,
438
441
  )
439
442
 
440
- def handle_image(self, element: Tag, idx, doc: DoclingDocument):
443
+ def handle_image(self, doc: DoclingDocument) -> None:
441
444
  """Handles image tags (img)."""
442
445
  doc.add_picture(parent=self.parents[self.level], caption=None)
@@ -4,7 +4,7 @@ from io import BytesIO
4
4
  from pathlib import Path
5
5
  from typing import Final, Optional, Union
6
6
 
7
- from bs4 import BeautifulSoup
7
+ from bs4 import BeautifulSoup, Tag
8
8
  from docling_core.types.doc import (
9
9
  DocItemLabel,
10
10
  DoclingDocument,
@@ -12,14 +12,13 @@ from docling_core.types.doc import (
12
12
  GroupItem,
13
13
  GroupLabel,
14
14
  NodeItem,
15
- TableCell,
16
- TableData,
17
15
  TextItem,
18
16
  )
19
17
  from lxml import etree
20
18
  from typing_extensions import TypedDict, override
21
19
 
22
20
  from docling.backend.abstract_backend import DeclarativeDocumentBackend
21
+ from docling.backend.html_backend import HTMLDocumentBackend
23
22
  from docling.datamodel.base_models import InputFormat
24
23
  from docling.datamodel.document import InputDocument
25
24
 
@@ -540,71 +539,10 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
540
539
  ) -> None:
541
540
  soup = BeautifulSoup(table_xml_component["content"], "html.parser")
542
541
  table_tag = soup.find("table")
543
-
544
- nested_tables = table_tag.find("table")
545
- if nested_tables:
546
- _log.warning(f"Skipping nested table in {str(self.file)}")
542
+ if not isinstance(table_tag, Tag):
547
543
  return
548
544
 
549
- # Count the number of rows (number of <tr> elements)
550
- num_rows = len(table_tag.find_all("tr"))
551
-
552
- # Find the number of columns (taking into account colspan)
553
- num_cols = 0
554
- for row in table_tag.find_all("tr"):
555
- col_count = 0
556
- for cell in row.find_all(["td", "th"]):
557
- colspan = int(cell.get("colspan", 1))
558
- col_count += colspan
559
- num_cols = max(num_cols, col_count)
560
-
561
- grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
562
-
563
- data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
564
-
565
- # Iterate over the rows in the table
566
- for row_idx, row in enumerate(table_tag.find_all("tr")):
567
- # For each row, find all the column cells (both <td> and <th>)
568
- cells = row.find_all(["td", "th"])
569
-
570
- # Check if each cell in the row is a header -> means it is a column header
571
- col_header = True
572
- for j, html_cell in enumerate(cells):
573
- if html_cell.name == "td":
574
- col_header = False
575
-
576
- # Extract and print the text content of each cell
577
- col_idx = 0
578
- for _, html_cell in enumerate(cells):
579
- # extract inline formulas
580
- for formula in html_cell.find_all("inline-formula"):
581
- math_parts = formula.text.split("$$")
582
- if len(math_parts) == 3:
583
- math_formula = f"$${math_parts[1]}$$"
584
- formula.replaceWith(math_formula)
585
- text = html_cell.text
586
-
587
- col_span = int(html_cell.get("colspan", 1))
588
- row_span = int(html_cell.get("rowspan", 1))
589
-
590
- while grid[row_idx][col_idx] is not None:
591
- col_idx += 1
592
- for r in range(row_span):
593
- for c in range(col_span):
594
- grid[row_idx + r][col_idx + c] = text
595
-
596
- cell = TableCell(
597
- text=text,
598
- row_span=row_span,
599
- col_span=col_span,
600
- start_row_offset_idx=row_idx,
601
- end_row_offset_idx=row_idx + row_span,
602
- start_col_offset_idx=col_idx,
603
- end_col_offset_idx=col_idx + col_span,
604
- col_header=col_header,
605
- row_header=((not col_header) and html_cell.name == "th"),
606
- )
607
- data.table_cells.append(cell)
545
+ data = HTMLDocumentBackend.parse_table_data(table_tag)
608
546
 
609
547
  # TODO: format label vs caption once styling is supported
610
548
  label = table_xml_component["label"]
@@ -616,7 +554,8 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
616
554
  else None
617
555
  )
618
556
 
619
- doc.add_table(data=data, parent=parent, caption=table_caption)
557
+ if data is not None:
558
+ doc.add_table(data=data, parent=parent, caption=table_caption)
620
559
 
621
560
  return
622
561
 
@@ -673,7 +612,6 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
673
612
  def _walk_linear(
674
613
  self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
675
614
  ) -> str:
676
- # _log.debug(f"Walking on {node.tag} with {len(list(node))} children")
677
615
  skip_tags = ["term"]
678
616
  flush_tags = ["ack", "sec", "list", "boxed-text", "disp-formula", "fig"]
679
617
  new_parent: NodeItem = parent
@@ -14,7 +14,7 @@ from abc import ABC, abstractmethod
14
14
  from enum import Enum, unique
15
15
  from io import BytesIO
16
16
  from pathlib import Path
17
- from typing import Any, Final, Optional, Union
17
+ from typing import Final, Optional, Union
18
18
 
19
19
  from bs4 import BeautifulSoup, Tag
20
20
  from docling_core.types.doc import (
@@ -1406,6 +1406,10 @@ class XmlTable:
1406
1406
  http://oasis-open.org/specs/soextblx.dtd
1407
1407
  """
1408
1408
 
1409
+ class ColInfo(TypedDict):
1410
+ ncols: int
1411
+ colinfo: list[dict]
1412
+
1409
1413
  class MinColInfoType(TypedDict):
1410
1414
  offset: list[int]
1411
1415
  colwidth: list[int]
@@ -1425,7 +1429,7 @@ class XmlTable:
1425
1429
  self.empty_text = ""
1426
1430
  self._soup = BeautifulSoup(input, features="xml")
1427
1431
 
1428
- def _create_tg_range(self, tgs: list[dict[str, Any]]) -> dict[int, ColInfoType]:
1432
+ def _create_tg_range(self, tgs: list[ColInfo]) -> dict[int, ColInfoType]:
1429
1433
  """Create a unified range along the table groups.
1430
1434
 
1431
1435
  Args:
@@ -1532,19 +1536,26 @@ class XmlTable:
1532
1536
  Returns:
1533
1537
  A docling table object.
1534
1538
  """
1535
- tgs_align = []
1536
- tg_secs = table.find_all("tgroup")
1539
+ tgs_align: list[XmlTable.ColInfo] = []
1540
+ tg_secs = table("tgroup")
1537
1541
  if tg_secs:
1538
1542
  for tg_sec in tg_secs:
1539
- ncols = tg_sec.get("cols", None)
1540
- if ncols:
1541
- ncols = int(ncols)
1542
- tg_align = {"ncols": ncols, "colinfo": []}
1543
- cs_secs = tg_sec.find_all("colspec")
1543
+ if not isinstance(tg_sec, Tag):
1544
+ continue
1545
+ col_val = tg_sec.get("cols")
1546
+ ncols = (
1547
+ int(col_val)
1548
+ if isinstance(col_val, str) and col_val.isnumeric()
1549
+ else 1
1550
+ )
1551
+ tg_align: XmlTable.ColInfo = {"ncols": ncols, "colinfo": []}
1552
+ cs_secs = tg_sec("colspec")
1544
1553
  if cs_secs:
1545
1554
  for cs_sec in cs_secs:
1546
- colname = cs_sec.get("colname", None)
1547
- colwidth = cs_sec.get("colwidth", None)
1555
+ if not isinstance(cs_sec, Tag):
1556
+ continue
1557
+ colname = cs_sec.get("colname")
1558
+ colwidth = cs_sec.get("colwidth")
1548
1559
  tg_align["colinfo"].append(
1549
1560
  {"colname": colname, "colwidth": colwidth}
1550
1561
  )
@@ -1565,16 +1576,23 @@ class XmlTable:
1565
1576
  table_data: list[TableCell] = []
1566
1577
  i_row_global = 0
1567
1578
  is_row_empty: bool = True
1568
- tg_secs = table.find_all("tgroup")
1579
+ tg_secs = table("tgroup")
1569
1580
  if tg_secs:
1570
1581
  for itg, tg_sec in enumerate(tg_secs):
1582
+ if not isinstance(tg_sec, Tag):
1583
+ continue
1571
1584
  tg_range = tgs_range[itg]
1572
- row_secs = tg_sec.find_all(["row", "tr"])
1585
+ row_secs = tg_sec(["row", "tr"])
1573
1586
 
1574
1587
  if row_secs:
1575
1588
  for row_sec in row_secs:
1576
- entry_secs = row_sec.find_all(["entry", "td"])
1577
- is_header: bool = row_sec.parent.name in ["thead"]
1589
+ if not isinstance(row_sec, Tag):
1590
+ continue
1591
+ entry_secs = row_sec(["entry", "td"])
1592
+ is_header: bool = (
1593
+ row_sec.parent is not None
1594
+ and row_sec.parent.name == "thead"
1595
+ )
1578
1596
 
1579
1597
  ncols = 0
1580
1598
  local_row: list[TableCell] = []
@@ -1582,23 +1600,26 @@ class XmlTable:
1582
1600
  if entry_secs:
1583
1601
  wrong_nbr_cols = False
1584
1602
  for ientry, entry_sec in enumerate(entry_secs):
1603
+ if not isinstance(entry_sec, Tag):
1604
+ continue
1585
1605
  text = entry_sec.get_text().strip()
1586
1606
 
1587
1607
  # start-end
1588
- namest = entry_sec.attrs.get("namest", None)
1589
- nameend = entry_sec.attrs.get("nameend", None)
1590
- if isinstance(namest, str) and namest.isnumeric():
1591
- namest = int(namest)
1592
- else:
1593
- namest = ientry + 1
1608
+ namest = entry_sec.get("namest")
1609
+ nameend = entry_sec.get("nameend")
1610
+ start = (
1611
+ int(namest)
1612
+ if isinstance(namest, str) and namest.isnumeric()
1613
+ else ientry + 1
1614
+ )
1594
1615
  if isinstance(nameend, str) and nameend.isnumeric():
1595
- nameend = int(nameend)
1616
+ end = int(nameend)
1596
1617
  shift = 0
1597
1618
  else:
1598
- nameend = ientry + 2
1619
+ end = ientry + 2
1599
1620
  shift = 1
1600
1621
 
1601
- if nameend > len(tg_range["cell_offst"]):
1622
+ if end > len(tg_range["cell_offst"]):
1602
1623
  wrong_nbr_cols = True
1603
1624
  self.nbr_messages += 1
1604
1625
  if self.nbr_messages <= self.max_nbr_messages:
@@ -1608,8 +1629,8 @@ class XmlTable:
1608
1629
  break
1609
1630
 
1610
1631
  range_ = [
1611
- tg_range["cell_offst"][namest - 1],
1612
- tg_range["cell_offst"][nameend - 1] - shift,
1632
+ tg_range["cell_offst"][start - 1],
1633
+ tg_range["cell_offst"][end - 1] - shift,
1613
1634
  ]
1614
1635
 
1615
1636
  # add row and replicate cell if needed
@@ -1668,7 +1689,7 @@ class XmlTable:
1668
1689
  A docling table data.
1669
1690
  """
1670
1691
  section = self._soup.find("table")
1671
- if section is not None:
1692
+ if isinstance(section, Tag):
1672
1693
  table = self._parse_table(section)
1673
1694
  if table.num_rows == 0 or table.num_cols == 0:
1674
1695
  _log.warning("The parsed USPTO table is empty")
@@ -114,7 +114,9 @@ class TesseractOcrCliModel(BaseOcrModel):
114
114
  # _log.info("df: ", df.head())
115
115
 
116
116
  # Filter rows that contain actual text (ignore header or empty rows)
117
- df_filtered = df[df["text"].notnull() & (df["text"].str.strip() != "")]
117
+ df_filtered = df[
118
+ df["text"].notnull() & (df["text"].apply(str).str.strip() != "")
119
+ ]
118
120
 
119
121
  return df_filtered
120
122
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.23.0
3
+ Version: 2.23.1
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -25,7 +25,7 @@ Provides-Extra: ocrmac
25
25
  Provides-Extra: rapidocr
26
26
  Provides-Extra: tesserocr
27
27
  Provides-Extra: vlm
28
- Requires-Dist: beautifulsoup4 (>=4.12.3,<4.13.0)
28
+ Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
29
29
  Requires-Dist: certifi (>=2024.7.4)
30
30
  Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
31
31
  Requires-Dist: docling-core[chunking] (>=2.19.0,<3.0.0)
@@ -5,7 +5,7 @@ docling/backend/asciidoc_backend.py,sha256=zyHxlG_BvlLwvpdNca3P6aopxOJZw8wbDFkJQ
5
5
  docling/backend/csv_backend.py,sha256=xuId4JGEXjoyPgO9Fy9hQ5C-ezXvJwv0TGB8fyFHgWM,4533
6
6
  docling/backend/docling_parse_backend.py,sha256=hEEJibI1oJS0LAnFoIs6gMshS3bCqGtVxHnDNvBGZuA,7649
7
7
  docling/backend/docling_parse_v2_backend.py,sha256=IpwrBrtLGwNRl5AYO-o3NjEfNRsAkuMhzvDt2HXb9Ko,8655
8
- docling/backend/html_backend.py,sha256=YTPLZiEEEuGaP6G62skK3wXJ0KftuqBCl8erNXeJyoE,15893
8
+ docling/backend/html_backend.py,sha256=BxYvYmgcio6IqROMFKgyYyoankcNUccalCeYlmTE4fk,16094
9
9
  docling/backend/json/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
10
  docling/backend/json/docling_json_backend.py,sha256=LlFMVoZrrCfVwbDuRbNN4Xg96Lujh4xxrTBt9jGhY9I,1984
11
11
  docling/backend/md_backend.py,sha256=NaVfcnEH-5bwVovjn76EobF6B6Wm8AhaTZ4E8k0TUPo,16826
@@ -15,8 +15,8 @@ docling/backend/msword_backend.py,sha256=V4miLIcOH8DDlSCm25F_DALBW60Uf9JoSS0TB4y
15
15
  docling/backend/pdf_backend.py,sha256=17Pr8dWsD1C4FYUprrwMM9trDGW-JYLjrcScx1Ul4io,2048
16
16
  docling/backend/pypdfium2_backend.py,sha256=QSPfp903ZtSpoNqPmcIek0HmvETrJ1kkwrdxnF5pjS0,9014
17
17
  docling/backend/xml/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
- docling/backend/xml/jats_backend.py,sha256=JI1iibmrob9Gv9y7zoFncavQ0oJaGWnQoLkozAIiTQU,27513
19
- docling/backend/xml/uspto_backend.py,sha256=a5GxWLj2SUR5Of8TWJinhef1gKyaQSjHPVXvGiN8yG8,70324
18
+ docling/backend/xml/jats_backend.py,sha256=HXailrDjiwu4swwFnXy3lNfRtLZmkBBp4yqafCvdr7s,24945
19
+ docling/backend/xml/uspto_backend.py,sha256=IGUNeF2xpLeaVrX6nKb-jXgtSYD2ozULsrDPcrI1IbQ,71040
20
20
  docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
21
21
  docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
22
  docling/cli/main.py,sha256=pCJ_GFgxsgZ0soz32OhMl-CWi7YXIrvax_m9Qw4UhMs,16839
@@ -45,7 +45,7 @@ docling/models/picture_description_base_model.py,sha256=rZLIW1_CaRAw_EP3zuI8ktC0
45
45
  docling/models/picture_description_vlm_model.py,sha256=a2vYUdlcA0--_8neY0tTiU8reCf29NCbVMKwWdMy2QQ,3653
46
46
  docling/models/rapid_ocr_model.py,sha256=2HXmurNRPP6qyqn7U5h9NQIs8zi0TMHf56CpcKQk0fU,5038
47
47
  docling/models/table_structure_model.py,sha256=UIqWlw_9JNfGsO86c00rPb4GCg-yNliKEwyhCqlsZbM,11225
48
- docling/models/tesseract_ocr_cli_model.py,sha256=b2Is5x2gZLS6mQWnKe0y7p6UU6hRTHDfoH4D2RQ5mx0,9310
48
+ docling/models/tesseract_ocr_cli_model.py,sha256=F5EhS4NDEmLkPq-a0P7o2LrzjmJgACzlYXTDvtD3NtY,9343
49
49
  docling/models/tesseract_ocr_model.py,sha256=ikGu6QNknLG64c9yYIb0Ix6MGhBzOoa1ODbNc8MT5r8,8508
50
50
  docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
51
51
  docling/pipeline/base_pipeline.py,sha256=9ABK-Cr235bxE5vweoIA5rgBZV_EF8qFxAqLI27H_Pg,8749
@@ -62,8 +62,8 @@ docling/utils/ocr_utils.py,sha256=F7iOOjqolUcImUzir4qjDQd4QWSO3s6JC4WRn3U7uY4,26
62
62
  docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
63
63
  docling/utils/utils.py,sha256=0ozCk7zUkYzxRVmYoIB2zA1lqjQOuaQzxfGuf1wmKW4,1866
64
64
  docling/utils/visualization.py,sha256=4pn-80fVuE04ken7hUg5Ar47ndRSL9MWBgdHM-1g1zU,2735
65
- docling-2.23.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
66
- docling-2.23.0.dist-info/METADATA,sha256=O4EJYC_yjLCFfKnhnzgSW4qGLOHaatDWDXsQS2EJDjU,8720
67
- docling-2.23.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
68
- docling-2.23.0.dist-info/entry_points.txt,sha256=cFrINXsORijdm2EWJzf1m9_rDxH9G9W1fP385-9atY4,84
69
- docling-2.23.0.dist-info/RECORD,,
65
+ docling-2.23.1.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
66
+ docling-2.23.1.dist-info/METADATA,sha256=PDA1FnXkfCr0QYxO_s4bVRhACiGkXzpycTLTmqKmJ6c,8719
67
+ docling-2.23.1.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
68
+ docling-2.23.1.dist-info/entry_points.txt,sha256=cFrINXsORijdm2EWJzf1m9_rDxH9G9W1fP385-9atY4,84
69
+ docling-2.23.1.dist-info/RECORD,,