docling 2.53.0__py3-none-any.whl → 2.55.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/asciidoc_backend.py +1 -1
- docling/backend/html_backend.py +254 -136
- docling/backend/md_backend.py +4 -1
- docling/backend/msword_backend.py +177 -76
- docling/backend/webvtt_backend.py +572 -0
- docling/backend/xml/jats_backend.py +111 -7
- docling/backend/xml/uspto_backend.py +1 -1
- docling/cli/main.py +5 -0
- docling/datamodel/base_models.py +23 -23
- docling/datamodel/document.py +2 -0
- docling/datamodel/pipeline_options_vlm_model.py +13 -2
- docling/datamodel/vlm_model_specs.py +9 -0
- docling/document_converter.py +4 -0
- docling/models/api_vlm_model.py +45 -16
- docling/models/base_model.py +2 -1
- docling/models/readingorder_model.py +1 -1
- docling/models/table_structure_model.py +3 -3
- docling/models/utils/generation_utils.py +157 -0
- docling/models/utils/hf_model_download.py +6 -1
- docling/models/vlm_models_inline/hf_transformers_model.py +75 -14
- docling/models/vlm_models_inline/mlx_model.py +58 -1
- docling/models/vlm_models_inline/vllm_model.py +189 -124
- docling/utils/api_image_request.py +107 -1
- {docling-2.53.0.dist-info → docling-2.55.0.dist-info}/METADATA +5 -5
- {docling-2.53.0.dist-info → docling-2.55.0.dist-info}/RECORD +29 -27
- {docling-2.53.0.dist-info → docling-2.55.0.dist-info}/WHEEL +0 -0
- {docling-2.53.0.dist-info → docling-2.55.0.dist-info}/entry_points.txt +0 -0
- {docling-2.53.0.dist-info → docling-2.55.0.dist-info}/licenses/LICENSE +0 -0
- {docling-2.53.0.dist-info → docling-2.55.0.dist-info}/top_level.txt +0 -0
|
@@ -78,7 +78,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|
|
78
78
|
|
|
79
79
|
return doc
|
|
80
80
|
|
|
81
|
-
def _parse(self, doc: DoclingDocument):
|
|
81
|
+
def _parse(self, doc: DoclingDocument):
|
|
82
82
|
"""
|
|
83
83
|
Main function that orchestrates the parsing by yielding components:
|
|
84
84
|
title, section headers, text, lists, and tables.
|
docling/backend/html_backend.py
CHANGED
|
@@ -17,8 +17,11 @@ from docling_core.types.doc import (
|
|
|
17
17
|
DocumentOrigin,
|
|
18
18
|
GroupItem,
|
|
19
19
|
GroupLabel,
|
|
20
|
+
RefItem,
|
|
21
|
+
RichTableCell,
|
|
20
22
|
TableCell,
|
|
21
23
|
TableData,
|
|
24
|
+
TableItem,
|
|
22
25
|
TextItem,
|
|
23
26
|
)
|
|
24
27
|
from docling_core.types.doc.document import ContentLayer, Formatting, Script
|
|
@@ -276,10 +279,175 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
|
276
279
|
# reset context
|
|
277
280
|
self.ctx = _Context()
|
|
278
281
|
self._walk(content, doc)
|
|
279
|
-
|
|
280
282
|
return doc
|
|
281
283
|
|
|
282
|
-
|
|
284
|
+
@staticmethod
def group_cell_elements(
    group_name: str,
    doc: DoclingDocument,
    provs_in_cell: list[RefItem],
    docling_table: TableItem,
) -> RefItem:
    """Move the items of one table cell under a freshly created group.

    A group labelled ``GroupLabel.UNSPECIFIED`` is added to ``doc`` with
    ``docling_table`` as its parent. Every reference in ``provs_in_cell`` is
    appended to the group's children, detached from the children of its
    previous parent (when it is listed there) and re-pointed at the group.

    Args:
        group_name: Name given to the new group.
        doc: Document that owns the referenced items and receives the group.
        provs_in_cell: References to the items that belong to the cell.
        docling_table: Table item that becomes the parent of the group.

    Returns:
        A reference to the new group.
    """
    cell_group = doc.add_group(
        label=GroupLabel.UNSPECIFIED,
        name=group_name,
        parent=docling_table,
    )
    for child_ref in provs_in_cell:
        cell_group.children.append(child_ref)
        child_item = child_ref.resolve(doc)
        previous_parent = child_item.parent.resolve(doc)
        # Detach from the old parent so the item is not listed twice.
        own_ref = child_item.get_ref()
        if own_ref in previous_parent.children:
            previous_parent.children.remove(own_ref)
        child_item.parent = cell_group.get_ref()
    return cell_group.get_ref()
|
|
305
|
+
|
|
306
|
+
@staticmethod
|
|
307
|
+
def process_rich_table_cells(
|
|
308
|
+
provs_in_cell: list[RefItem],
|
|
309
|
+
group_name: str,
|
|
310
|
+
doc: DoclingDocument,
|
|
311
|
+
docling_table: TableItem,
|
|
312
|
+
) -> tuple[bool, RefItem]:
|
|
313
|
+
rich_table_cell = False
|
|
314
|
+
ref_for_rich_cell = provs_in_cell[0]
|
|
315
|
+
if len(provs_in_cell) > 1:
|
|
316
|
+
# Cell has multiple elements, we need to group them
|
|
317
|
+
rich_table_cell = True
|
|
318
|
+
ref_for_rich_cell = HTMLDocumentBackend.group_cell_elements(
|
|
319
|
+
group_name, doc, provs_in_cell, docling_table
|
|
320
|
+
)
|
|
321
|
+
elif len(provs_in_cell) == 1:
|
|
322
|
+
item_ref = provs_in_cell[0]
|
|
323
|
+
pr_item = item_ref.resolve(doc)
|
|
324
|
+
if isinstance(pr_item, TextItem):
|
|
325
|
+
# Cell has only one element and it's just a text
|
|
326
|
+
rich_table_cell = False
|
|
327
|
+
doc.delete_items(node_items=[pr_item])
|
|
328
|
+
else:
|
|
329
|
+
rich_table_cell = True
|
|
330
|
+
ref_for_rich_cell = HTMLDocumentBackend.group_cell_elements(
|
|
331
|
+
group_name, doc, provs_in_cell, docling_table
|
|
332
|
+
)
|
|
333
|
+
|
|
334
|
+
return rich_table_cell, ref_for_rich_cell
|
|
335
|
+
|
|
336
|
+
def parse_table_data(
    self,
    element: Tag,
    doc: DoclingDocument,
    docling_table: TableItem,
    num_rows: int,
    num_cols: int,
) -> Optional[TableData]:
    """Add the cells of an HTML ``<table>`` element to ``docling_table``.

    Only the direct ``<tr>`` children of ``element`` (after unwrapping
    ``<thead>``/``<tbody>``) and their direct ``<td>``/``<th>`` children are
    visited, so rows of nested tables are not mixed into this table. Each
    cell's sub-tree is walked with ``self._walk``; depending on what that
    produced the cell is added as a ``RichTableCell`` or a ``TableCell``
    through ``doc.add_table_cell``.

    Note: ``element`` is modified in place (sections are unwrapped and
    ``inline-formula`` tags may be replaced by text).

    Args:
        element: The ``<table>`` tag to parse.
        doc: Document being built.
        docling_table: Table item that receives the cells.
        num_rows: Number of grid rows, as computed by the caller.
        num_cols: Number of grid columns, as computed by the caller.

    Returns:
        The ``TableData`` object created here. This method does not append
        cells to it; cells are attached to ``docling_table`` instead.
    """
    # Make the rows direct children of the table element.
    for section in cast(
        list[Tag], element.find_all(["thead", "tbody"], recursive=False)
    ):
        section.unwrap()

    _log.debug(f"The table has {num_rows} rows and {num_cols} cols.")
    # Occupancy grid used to skip columns already covered by spanning cells.
    grid: list = [[None for _ in range(num_cols)] for _ in range(num_rows)]
    data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])

    start_row_span = 0
    row_idx = -1

    # Not recursive on purpose: nested tables are handled while walking cells.
    for row in element("tr", recursive=False):
        if not isinstance(row, Tag):
            continue
        cells = row(["td", "th"], recursive=False)

        # A row is a column header when it has no <td>; it is a "row header"
        # row when, in addition, none of its <th> cells has a row span of 1.
        col_header = True
        row_header = True
        for html_cell in cells:
            if isinstance(html_cell, Tag):
                _, row_span = HTMLDocumentBackend._get_cell_spans(html_cell)
                if html_cell.name == "td":
                    col_header = False
                    row_header = False
                elif row_span == 1:
                    row_header = False
        if not row_header:
            row_idx += 1
            start_row_span = 0
        else:
            # Row-header rows do not advance row_idx; they are offset instead.
            start_row_span += 1

        grid_row = row_idx + start_row_span
        col_idx = 0
        for html_cell in cells:
            if not isinstance(html_cell, Tag):
                continue

            # Replace inline formulas by their "$$...$$" text.
            for formula in html_cell("inline-formula"):
                math_parts = formula.text.split("$$")
                if len(math_parts) == 3:
                    math_formula = f"$${math_parts[1]}$$"
                    formula.replace_with(NavigableString(math_formula))

            # Parse the cell sub-tree; the returned refs are the document
            # items created for the cell content.
            provs_in_cell: list[RefItem] = self._walk(html_cell, doc)

            rich_table_cell = False
            ref_for_rich_cell = None
            if len(provs_in_cell) > 0:
                group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{start_row_span + row_idx}"
                rich_table_cell, ref_for_rich_cell = (
                    HTMLDocumentBackend.process_rich_table_cells(
                        provs_in_cell, group_name, doc, docling_table
                    )
                )

            text = self.get_text(html_cell).strip()
            col_span, row_span = self._get_cell_spans(html_cell)
            if row_header:
                row_span -= 1

            # Skip columns already occupied by a spanning cell. The row bound
            # prevents an IndexError when header-only rows run past num_rows.
            while (
                grid_row < num_rows
                and col_idx < num_cols
                and grid[grid_row][col_idx] is not None
            ):
                col_idx += 1

            # Mark the grid slots covered by this cell.
            for r in range(start_row_span, start_row_span + row_span):
                for c in range(col_span):
                    if row_idx + r < num_rows and col_idx + c < num_cols:
                        grid[row_idx + r][col_idx + c] = text

            cell_kwargs = dict(
                text=text,
                row_span=row_span,
                col_span=col_span,
                start_row_offset_idx=start_row_span + row_idx,
                end_row_offset_idx=start_row_span + row_idx + row_span,
                start_col_offset_idx=col_idx,
                end_col_offset_idx=col_idx + col_span,
                column_header=col_header,
                row_header=((not col_header) and html_cell.name == "th"),
            )
            if rich_table_cell:
                # ref points to an artificial group around the cell children
                cell = RichTableCell(ref=ref_for_rich_cell, **cell_kwargs)
            else:
                cell = TableCell(**cell_kwargs)
            doc.add_table_cell(table_item=docling_table, cell=cell)
    return data
|
|
449
|
+
|
|
450
|
+
def _walk(self, element: Tag, doc: DoclingDocument) -> list[RefItem]:
|
|
283
451
|
"""Parse an XML tag by recursively walking its content.
|
|
284
452
|
|
|
285
453
|
While walking, the method buffers inline text across tags like <b> or <span>,
|
|
@@ -289,17 +457,18 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
|
289
457
|
element: The XML tag to parse.
|
|
290
458
|
doc: The Docling document to be updated with the parsed content.
|
|
291
459
|
"""
|
|
460
|
+
added_refs: list[RefItem] = []
|
|
292
461
|
buffer: AnnotatedTextList = AnnotatedTextList()
|
|
293
462
|
|
|
294
463
|
def flush_buffer():
|
|
295
464
|
if not buffer:
|
|
296
|
-
return
|
|
465
|
+
return added_refs
|
|
297
466
|
annotated_text_list: AnnotatedTextList = buffer.simplify_text_elements()
|
|
298
467
|
parts = annotated_text_list.split_by_newline()
|
|
299
468
|
buffer.clear()
|
|
300
469
|
|
|
301
470
|
if not "".join([el.text for el in annotated_text_list]):
|
|
302
|
-
return
|
|
471
|
+
return added_refs
|
|
303
472
|
|
|
304
473
|
for annotated_text_list in parts:
|
|
305
474
|
with self._use_inline_group(annotated_text_list, doc):
|
|
@@ -309,15 +478,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
|
309
478
|
annotated_text.text.strip()
|
|
310
479
|
)
|
|
311
480
|
if annotated_text.code:
|
|
312
|
-
doc.add_code(
|
|
481
|
+
docling_code2 = doc.add_code(
|
|
313
482
|
parent=self.parents[self.level],
|
|
314
483
|
text=seg_clean,
|
|
315
484
|
content_layer=self.content_layer,
|
|
316
485
|
formatting=annotated_text.formatting,
|
|
317
486
|
hyperlink=annotated_text.hyperlink,
|
|
318
487
|
)
|
|
488
|
+
added_refs.append(docling_code2.get_ref())
|
|
319
489
|
else:
|
|
320
|
-
doc.add_text(
|
|
490
|
+
docling_text2 = doc.add_text(
|
|
321
491
|
parent=self.parents[self.level],
|
|
322
492
|
label=DocItemLabel.TEXT,
|
|
323
493
|
text=seg_clean,
|
|
@@ -325,25 +495,31 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
|
325
495
|
formatting=annotated_text.formatting,
|
|
326
496
|
hyperlink=annotated_text.hyperlink,
|
|
327
497
|
)
|
|
498
|
+
added_refs.append(docling_text2.get_ref())
|
|
328
499
|
|
|
329
500
|
for node in element.contents:
|
|
330
501
|
if isinstance(node, Tag):
|
|
331
502
|
name = node.name.lower()
|
|
332
503
|
if name == "img":
|
|
333
504
|
flush_buffer()
|
|
334
|
-
self._emit_image(node, doc)
|
|
505
|
+
im_ref3 = self._emit_image(node, doc)
|
|
506
|
+
added_refs.append(im_ref3)
|
|
335
507
|
elif name in _FORMAT_TAG_MAP:
|
|
336
508
|
with self._use_format([name]):
|
|
337
|
-
self._walk(node, doc)
|
|
509
|
+
wk = self._walk(node, doc)
|
|
510
|
+
added_refs.extend(wk)
|
|
338
511
|
elif name == "a":
|
|
339
512
|
with self._use_hyperlink(node):
|
|
340
|
-
self._walk(node, doc)
|
|
513
|
+
wk2 = self._walk(node, doc)
|
|
514
|
+
added_refs.extend(wk2)
|
|
341
515
|
elif name in _BLOCK_TAGS:
|
|
342
516
|
flush_buffer()
|
|
343
|
-
self._handle_block(node, doc)
|
|
517
|
+
blk = self._handle_block(node, doc)
|
|
518
|
+
added_refs.extend(blk)
|
|
344
519
|
elif node.find(_BLOCK_TAGS):
|
|
345
520
|
flush_buffer()
|
|
346
|
-
self._walk(node, doc)
|
|
521
|
+
wk3 = self._walk(node, doc)
|
|
522
|
+
added_refs.extend(wk3)
|
|
347
523
|
else:
|
|
348
524
|
buffer.extend(
|
|
349
525
|
self._extract_text_and_hyperlink_recursively(
|
|
@@ -363,6 +539,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
|
363
539
|
)
|
|
364
540
|
|
|
365
541
|
flush_buffer()
|
|
542
|
+
return added_refs
|
|
366
543
|
|
|
367
544
|
@staticmethod
|
|
368
545
|
def _collect_parent_format_tags(item: PageElement) -> list[str]:
|
|
@@ -581,7 +758,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
|
581
758
|
self.level -= 1
|
|
582
759
|
self.content_layer = current_layer
|
|
583
760
|
|
|
584
|
-
def _handle_heading(self, tag: Tag, doc: DoclingDocument) ->
|
|
761
|
+
def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> list[RefItem]:
|
|
762
|
+
added_ref = []
|
|
585
763
|
tag_name = tag.name.lower()
|
|
586
764
|
# set default content layer to BODY as soon as we encounter a heading
|
|
587
765
|
self.content_layer = ContentLayer.BODY
|
|
@@ -596,12 +774,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
|
596
774
|
for key in self.parents.keys():
|
|
597
775
|
self.parents[key] = None
|
|
598
776
|
self.level = 0
|
|
599
|
-
self.parents[self.level + 1] = doc.add_title(
|
|
777
|
+
docling_title = self.parents[self.level + 1] = doc.add_title(
|
|
600
778
|
text_clean,
|
|
601
779
|
content_layer=self.content_layer,
|
|
602
780
|
formatting=annotated_text.formatting,
|
|
603
781
|
hyperlink=annotated_text.hyperlink,
|
|
604
782
|
)
|
|
783
|
+
added_ref = [docling_title.get_ref()]
|
|
605
784
|
# the other levels need to be lowered by 1 if a title was set
|
|
606
785
|
else:
|
|
607
786
|
level -= 1
|
|
@@ -623,7 +802,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
|
623
802
|
_log.debug(f"Remove the tail of level {key}")
|
|
624
803
|
self.parents[key] = None
|
|
625
804
|
self.level = level
|
|
626
|
-
self.parents[self.level + 1] = doc.add_heading(
|
|
805
|
+
docling_heading = self.parents[self.level + 1] = doc.add_heading(
|
|
627
806
|
parent=self.parents[self.level],
|
|
628
807
|
text=text_clean,
|
|
629
808
|
orig=annotated_text.text,
|
|
@@ -632,12 +811,15 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
|
632
811
|
formatting=annotated_text.formatting,
|
|
633
812
|
hyperlink=annotated_text.hyperlink,
|
|
634
813
|
)
|
|
814
|
+
added_ref = [docling_heading.get_ref()]
|
|
635
815
|
self.level += 1
|
|
636
816
|
for img_tag in tag("img"):
|
|
637
817
|
if isinstance(img_tag, Tag):
|
|
638
|
-
self._emit_image(img_tag, doc)
|
|
818
|
+
im_ref = self._emit_image(img_tag, doc)
|
|
819
|
+
added_ref.append(im_ref)
|
|
820
|
+
return added_ref
|
|
639
821
|
|
|
640
|
-
def _handle_list(self, tag: Tag, doc: DoclingDocument) ->
|
|
822
|
+
def _handle_list(self, tag: Tag, doc: DoclingDocument) -> RefItem:
|
|
641
823
|
tag_name = tag.name.lower()
|
|
642
824
|
start: Optional[int] = None
|
|
643
825
|
name: str = ""
|
|
@@ -765,20 +947,50 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
|
765
947
|
|
|
766
948
|
self.parents[self.level + 1] = None
|
|
767
949
|
self.level -= 1
|
|
950
|
+
return list_group.get_ref()
|
|
951
|
+
|
|
952
|
+
@staticmethod
def get_html_table_row_col(tag: Tag) -> tuple[int, int]:
    """Count the grid rows and columns of an HTML ``<table>`` element.

    ``<thead>``/``<tbody>`` children are unwrapped first (``tag`` is modified
    in place), then only the direct ``<tr>`` children and their direct
    ``<td>``/``<th>`` children are inspected, so nested tables are ignored.

    The column count is the largest sum of column spans found in a row. A row
    adds to the row count only if it has at least one ``<td>`` cell or one
    cell whose row span is 1.

    Args:
        tag: The ``<table>`` tag to measure.

    Returns:
        A ``(num_rows, num_cols)`` tuple.
    """
    for section in cast(list[Tag], tag.find_all(["thead", "tbody"], recursive=False)):
        section.unwrap()

    num_rows: int = 0
    num_cols: int = 0
    for row in tag("tr", recursive=False):
        if not isinstance(row, Tag):
            continue
        col_count = 0
        is_row_header = True
        for cell in row(["td", "th"], recursive=False):
            # Check the cell itself (the row was already validated above).
            if not isinstance(cell, Tag):
                continue
            col_span, row_span = HTMLDocumentBackend._get_cell_spans(cell)
            col_count += col_span
            if cell.name == "td" or row_span == 1:
                is_row_header = False
        num_cols = max(num_cols, col_count)
        if not is_row_header:
            num_rows += 1
    return num_rows, num_cols
|
|
768
976
|
|
|
769
|
-
def _handle_block(self, tag: Tag, doc: DoclingDocument) ->
|
|
977
|
+
def _handle_block(self, tag: Tag, doc: DoclingDocument) -> list[RefItem]:
|
|
978
|
+
added_refs = []
|
|
770
979
|
tag_name = tag.name.lower()
|
|
771
980
|
|
|
772
981
|
if tag_name == "figure":
|
|
773
982
|
img_tag = tag.find("img")
|
|
774
983
|
if isinstance(img_tag, Tag):
|
|
775
|
-
self._emit_image(img_tag, doc)
|
|
984
|
+
im_ref = self._emit_image(img_tag, doc)
|
|
985
|
+
added_refs.append(im_ref)
|
|
776
986
|
|
|
777
987
|
elif tag_name in {"h1", "h2", "h3", "h4", "h5", "h6"}:
|
|
778
|
-
self._handle_heading(tag, doc)
|
|
988
|
+
heading_refs = self._handle_heading(tag, doc)
|
|
989
|
+
added_refs.extend(heading_refs)
|
|
779
990
|
|
|
780
991
|
elif tag_name in {"ul", "ol"}:
|
|
781
|
-
self._handle_list(tag, doc)
|
|
992
|
+
list_ref = self._handle_list(tag, doc)
|
|
993
|
+
added_refs.append(list_ref)
|
|
782
994
|
|
|
783
995
|
elif tag_name in {"p", "address", "summary"}:
|
|
784
996
|
text_list = self._extract_text_and_hyperlink_recursively(
|
|
@@ -791,15 +1003,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
|
791
1003
|
if seg := annotated_text.text.strip():
|
|
792
1004
|
seg_clean = HTMLDocumentBackend._clean_unicode(seg)
|
|
793
1005
|
if annotated_text.code:
|
|
794
|
-
doc.add_code(
|
|
1006
|
+
docling_code = doc.add_code(
|
|
795
1007
|
parent=self.parents[self.level],
|
|
796
1008
|
text=seg_clean,
|
|
797
1009
|
content_layer=self.content_layer,
|
|
798
1010
|
formatting=annotated_text.formatting,
|
|
799
1011
|
hyperlink=annotated_text.hyperlink,
|
|
800
1012
|
)
|
|
1013
|
+
added_refs.append(docling_code.get_ref())
|
|
801
1014
|
else:
|
|
802
|
-
doc.add_text(
|
|
1015
|
+
docling_text = doc.add_text(
|
|
803
1016
|
parent=self.parents[self.level],
|
|
804
1017
|
label=DocItemLabel.TEXT,
|
|
805
1018
|
text=seg_clean,
|
|
@@ -807,22 +1020,27 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
|
807
1020
|
formatting=annotated_text.formatting,
|
|
808
1021
|
hyperlink=annotated_text.hyperlink,
|
|
809
1022
|
)
|
|
1023
|
+
added_refs.append(docling_text.get_ref())
|
|
810
1024
|
|
|
811
1025
|
for img_tag in tag("img"):
|
|
812
1026
|
if isinstance(img_tag, Tag):
|
|
813
1027
|
self._emit_image(img_tag, doc)
|
|
814
1028
|
|
|
815
1029
|
elif tag_name == "table":
|
|
816
|
-
|
|
1030
|
+
num_rows, num_cols = self.get_html_table_row_col(tag)
|
|
1031
|
+
data_e = TableData(num_rows=num_rows, num_cols=num_cols)
|
|
1032
|
+
docling_table = doc.add_table(
|
|
1033
|
+
data=data_e,
|
|
1034
|
+
parent=self.parents[self.level],
|
|
1035
|
+
content_layer=self.content_layer,
|
|
1036
|
+
)
|
|
1037
|
+
added_refs.append(docling_table.get_ref())
|
|
1038
|
+
self.parse_table_data(tag, doc, docling_table, num_rows, num_cols)
|
|
1039
|
+
|
|
817
1040
|
for img_tag in tag("img"):
|
|
818
1041
|
if isinstance(img_tag, Tag):
|
|
819
|
-
self._emit_image(tag, doc)
|
|
820
|
-
|
|
821
|
-
doc.add_table(
|
|
822
|
-
data=data,
|
|
823
|
-
parent=self.parents[self.level],
|
|
824
|
-
content_layer=self.content_layer,
|
|
825
|
-
)
|
|
1042
|
+
im_ref2 = self._emit_image(tag, doc)
|
|
1043
|
+
added_refs.append(im_ref2)
|
|
826
1044
|
|
|
827
1045
|
elif tag_name in {"pre"}:
|
|
828
1046
|
# handle monospace code snippets (pre).
|
|
@@ -835,13 +1053,14 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
|
835
1053
|
text_clean = HTMLDocumentBackend._clean_unicode(
|
|
836
1054
|
annotated_text.text.strip()
|
|
837
1055
|
)
|
|
838
|
-
doc.add_code(
|
|
1056
|
+
docling_code2 = doc.add_code(
|
|
839
1057
|
parent=self.parents[self.level],
|
|
840
1058
|
text=text_clean,
|
|
841
1059
|
content_layer=self.content_layer,
|
|
842
1060
|
formatting=annotated_text.formatting,
|
|
843
1061
|
hyperlink=annotated_text.hyperlink,
|
|
844
1062
|
)
|
|
1063
|
+
added_refs.append(docling_code2.get_ref())
|
|
845
1064
|
|
|
846
1065
|
elif tag_name == "footer":
|
|
847
1066
|
with self._use_footer(tag, doc):
|
|
@@ -850,8 +1069,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
|
850
1069
|
elif tag_name == "details":
|
|
851
1070
|
with self._use_details(tag, doc):
|
|
852
1071
|
self._walk(tag, doc)
|
|
1072
|
+
return added_refs
|
|
853
1073
|
|
|
854
|
-
def _emit_image(self, img_tag: Tag, doc: DoclingDocument) ->
|
|
1074
|
+
def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> RefItem:
|
|
855
1075
|
figure = img_tag.find_parent("figure")
|
|
856
1076
|
caption: AnnotatedTextList = AnnotatedTextList()
|
|
857
1077
|
|
|
@@ -894,11 +1114,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
|
894
1114
|
hyperlink=caption_anno_text.hyperlink,
|
|
895
1115
|
)
|
|
896
1116
|
|
|
897
|
-
doc.add_picture(
|
|
1117
|
+
docling_pic = doc.add_picture(
|
|
898
1118
|
caption=caption_item,
|
|
899
1119
|
parent=self.parents[self.level],
|
|
900
1120
|
content_layer=self.content_layer,
|
|
901
1121
|
)
|
|
1122
|
+
return docling_pic.get_ref()
|
|
902
1123
|
|
|
903
1124
|
@staticmethod
|
|
904
1125
|
def get_text(item: PageElement) -> str:
|
|
@@ -996,106 +1217,3 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
|
996
1217
|
)
|
|
997
1218
|
|
|
998
1219
|
return int_spans
|
|
999
|
-
|
|
1000
|
-
@staticmethod
|
|
1001
|
-
def parse_table_data(element: Tag) -> Optional[TableData]: # noqa: C901
|
|
1002
|
-
nested_tables = element.find("table")
|
|
1003
|
-
if nested_tables is not None:
|
|
1004
|
-
_log.debug("Skipping nested table.")
|
|
1005
|
-
return None
|
|
1006
|
-
|
|
1007
|
-
# Find the number of rows and columns (taking into account spans)
|
|
1008
|
-
num_rows = 0
|
|
1009
|
-
num_cols = 0
|
|
1010
|
-
for row in element("tr"):
|
|
1011
|
-
col_count = 0
|
|
1012
|
-
is_row_header = True
|
|
1013
|
-
if not isinstance(row, Tag):
|
|
1014
|
-
continue
|
|
1015
|
-
for cell in row(["td", "th"]):
|
|
1016
|
-
if not isinstance(row, Tag):
|
|
1017
|
-
continue
|
|
1018
|
-
cell_tag = cast(Tag, cell)
|
|
1019
|
-
col_span, row_span = HTMLDocumentBackend._get_cell_spans(cell_tag)
|
|
1020
|
-
col_count += col_span
|
|
1021
|
-
if cell_tag.name == "td" or row_span == 1:
|
|
1022
|
-
is_row_header = False
|
|
1023
|
-
num_cols = max(num_cols, col_count)
|
|
1024
|
-
if not is_row_header:
|
|
1025
|
-
num_rows += 1
|
|
1026
|
-
|
|
1027
|
-
_log.debug(f"The table has {num_rows} rows and {num_cols} cols.")
|
|
1028
|
-
|
|
1029
|
-
grid: list = [[None for _ in range(num_cols)] for _ in range(num_rows)]
|
|
1030
|
-
|
|
1031
|
-
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
|
|
1032
|
-
|
|
1033
|
-
# Iterate over the rows in the table
|
|
1034
|
-
start_row_span = 0
|
|
1035
|
-
row_idx = -1
|
|
1036
|
-
for row in element("tr"):
|
|
1037
|
-
if not isinstance(row, Tag):
|
|
1038
|
-
continue
|
|
1039
|
-
|
|
1040
|
-
# For each row, find all the column cells (both <td> and <th>)
|
|
1041
|
-
cells = row(["td", "th"])
|
|
1042
|
-
|
|
1043
|
-
# Check if cell is in a column header or row header
|
|
1044
|
-
col_header = True
|
|
1045
|
-
row_header = True
|
|
1046
|
-
for html_cell in cells:
|
|
1047
|
-
if isinstance(html_cell, Tag):
|
|
1048
|
-
_, row_span = HTMLDocumentBackend._get_cell_spans(html_cell)
|
|
1049
|
-
if html_cell.name == "td":
|
|
1050
|
-
col_header = False
|
|
1051
|
-
row_header = False
|
|
1052
|
-
elif row_span == 1:
|
|
1053
|
-
row_header = False
|
|
1054
|
-
if not row_header:
|
|
1055
|
-
row_idx += 1
|
|
1056
|
-
start_row_span = 0
|
|
1057
|
-
else:
|
|
1058
|
-
start_row_span += 1
|
|
1059
|
-
|
|
1060
|
-
# Extract the text content of each cell
|
|
1061
|
-
col_idx = 0
|
|
1062
|
-
for html_cell in cells:
|
|
1063
|
-
if not isinstance(html_cell, Tag):
|
|
1064
|
-
continue
|
|
1065
|
-
|
|
1066
|
-
# extract inline formulas
|
|
1067
|
-
for formula in html_cell("inline-formula"):
|
|
1068
|
-
math_parts = formula.text.split("$$")
|
|
1069
|
-
if len(math_parts) == 3:
|
|
1070
|
-
math_formula = f"$${math_parts[1]}$$"
|
|
1071
|
-
formula.replace_with(NavigableString(math_formula))
|
|
1072
|
-
|
|
1073
|
-
# TODO: extract content correctly from table-cells with lists
|
|
1074
|
-
text = HTMLDocumentBackend.get_text(html_cell).strip()
|
|
1075
|
-
col_span, row_span = HTMLDocumentBackend._get_cell_spans(html_cell)
|
|
1076
|
-
if row_header:
|
|
1077
|
-
row_span -= 1
|
|
1078
|
-
while (
|
|
1079
|
-
col_idx < num_cols
|
|
1080
|
-
and grid[row_idx + start_row_span][col_idx] is not None
|
|
1081
|
-
):
|
|
1082
|
-
col_idx += 1
|
|
1083
|
-
for r in range(start_row_span, start_row_span + row_span):
|
|
1084
|
-
for c in range(col_span):
|
|
1085
|
-
if row_idx + r < num_rows and col_idx + c < num_cols:
|
|
1086
|
-
grid[row_idx + r][col_idx + c] = text
|
|
1087
|
-
|
|
1088
|
-
table_cell = TableCell(
|
|
1089
|
-
text=text,
|
|
1090
|
-
row_span=row_span,
|
|
1091
|
-
col_span=col_span,
|
|
1092
|
-
start_row_offset_idx=start_row_span + row_idx,
|
|
1093
|
-
end_row_offset_idx=start_row_span + row_idx + row_span,
|
|
1094
|
-
start_col_offset_idx=col_idx,
|
|
1095
|
-
end_col_offset_idx=col_idx + col_span,
|
|
1096
|
-
column_header=col_header,
|
|
1097
|
-
row_header=((not col_header) and html_cell.name == "th"),
|
|
1098
|
-
)
|
|
1099
|
-
data.table_cells.append(table_cell)
|
|
1100
|
-
|
|
1101
|
-
return data
|
docling/backend/md_backend.py
CHANGED
|
@@ -3,6 +3,7 @@ import re
|
|
|
3
3
|
import warnings
|
|
4
4
|
from copy import deepcopy
|
|
5
5
|
from enum import Enum
|
|
6
|
+
from html import unescape
|
|
6
7
|
from io import BytesIO
|
|
7
8
|
from pathlib import Path
|
|
8
9
|
from typing import Literal, Optional, Union, cast
|
|
@@ -321,9 +322,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
|
321
322
|
|
|
322
323
|
fig_caption: Optional[TextItem] = None
|
|
323
324
|
if element.title is not None and element.title != "":
|
|
325
|
+
title = unescape(element.title)
|
|
324
326
|
fig_caption = doc.add_text(
|
|
325
327
|
label=DocItemLabel.CAPTION,
|
|
326
|
-
text=
|
|
328
|
+
text=title,
|
|
327
329
|
formatting=formatting,
|
|
328
330
|
hyperlink=hyperlink,
|
|
329
331
|
)
|
|
@@ -351,6 +353,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
|
351
353
|
snippet_text = (
|
|
352
354
|
element.children.strip() if isinstance(element.children, str) else ""
|
|
353
355
|
)
|
|
356
|
+
snippet_text = unescape(snippet_text)
|
|
354
357
|
# Detect start of the table:
|
|
355
358
|
if "|" in snippet_text or self.in_table:
|
|
356
359
|
# most likely part of the markdown table
|