docling-core 1.6.1__py3-none-any.whl → 1.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

@@ -477,6 +477,26 @@ class ExportedCCSDocument(
477
477
  md_texts: list[str] = []
478
478
 
479
479
  if self.main_text is not None:
480
+ # collect all captions embedded in table and figure objects
481
+ # to avoid repeating them
482
+ embedded_captions = set()
483
+ for orig_item in self.main_text[main_text_start:main_text_stop]:
484
+ item = (
485
+ self._resolve_ref(orig_item)
486
+ if isinstance(orig_item, Ref)
487
+ else orig_item
488
+ )
489
+ if item is None:
490
+ continue
491
+
492
+ if (
493
+ isinstance(item, (Table, Figure))
494
+ and item.text
495
+ and item.obj_type in main_text_labels
496
+ ):
497
+ embedded_captions.add(item.text)
498
+
499
+ # serialize document to markdown
480
500
  for orig_item in self.main_text[main_text_start:main_text_stop]:
481
501
  markdown_text = ""
482
502
 
@@ -492,6 +512,11 @@ class ExportedCCSDocument(
492
512
  if isinstance(item, BaseText) and item_type in main_text_labels:
493
513
  text = item.text
494
514
 
515
+ # skip captions if they are embedded in the actual
516
+ # floating object
517
+ if item_type == "caption" and text in embedded_captions:
518
+ continue
519
+
495
520
  # ignore repeated text
496
521
  if prev_text == text or text is None:
497
522
  continue
@@ -523,8 +548,9 @@ class ExportedCCSDocument(
523
548
  isinstance(item, Table)
524
549
  and item.data
525
550
  and item_type in main_text_labels
526
- and not strict_text
527
551
  ):
552
+
553
+ md_table = ""
528
554
  table = []
529
555
  for row in item.data:
530
556
  tmp = []
@@ -545,15 +571,19 @@ class ExportedCCSDocument(
545
571
  disable_numparse=True,
546
572
  )
547
573
 
548
- markdown_text = md_table
574
+ markdown_text = ""
575
+ if item.text:
576
+ markdown_text = item.text
577
+ if not strict_text:
578
+ markdown_text += "\n" + md_table
549
579
 
550
580
  elif isinstance(item, Figure) and item_type in main_text_labels:
551
581
 
552
582
  markdown_text = ""
553
- if not strict_text:
554
- markdown_text = f"{image_placeholder}"
555
583
  if item.text:
556
- markdown_text += "\n" + item.text
584
+ markdown_text = item.text
585
+ if not strict_text:
586
+ markdown_text += f"\n{image_placeholder}"
557
587
 
558
588
  if markdown_text:
559
589
  md_texts.append(markdown_text)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 1.6.1
3
+ Version: 1.6.2
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://ds4sd.github.io/
6
6
  License: MIT
@@ -24,7 +24,7 @@ docling_core/types/doc/base.py,sha256=QQC8KzQeYWnHFPY2_BNGcbTp6J2_rPbnLjsnbehICn
24
24
  docling_core/types/doc/doc_ann.py,sha256=8pV2efUglw19jxl4_oqB__mSxjWvtGIcllyCdqA-b2s,1196
25
25
  docling_core/types/doc/doc_ocr.py,sha256=6PC0C-OczF-MyfgRxEI1xs3PWgNOzi7i2yEQbTqZz0I,1387
26
26
  docling_core/types/doc/doc_raw.py,sha256=Y69G6IiauNDaoT-5el4xo1ypWpnBJQ75akGGkCMTZSc,3888
27
- docling_core/types/doc/document.py,sha256=qTVgmDX1geSRucJJeSV1G_sNjs6hL-kqns8rulbpyGw,22973
27
+ docling_core/types/doc/document.py,sha256=AKp1kOo0tncf9FX3q7qRWQ2Jz_hZE44smZpyrtsRzY4,24104
28
28
  docling_core/types/doc/tokens.py,sha256=uU_MYW_p7ypf7eYICFBvxdnVaPZ7CQnvZmbJ6oPrtEA,6134
29
29
  docling_core/types/gen/__init__.py,sha256=C6TuCfvpSnSL5XDOFMcYHUY2-i08vvfOGRcdu6Af0pI,124
30
30
  docling_core/types/gen/generic.py,sha256=l4CZ4_Lb8ONG36WNJWbKX5hGKvTh_yU-hXp5hsm7uVU,844
@@ -45,8 +45,8 @@ docling_core/utils/ds_generate_jsonschema.py,sha256=EhNQutqWJFWuN-yl9UUPFZ7DJTvG
45
45
  docling_core/utils/file.py,sha256=VQgzjyvmJnAIHB6ex7ikcmbDAR4GA1ALreuO7Ubrp50,1895
46
46
  docling_core/utils/validate.py,sha256=3FmnxnKTDZC5J9OGxCL3U3DGRl0t0bBV1NcySXswdas,2031
47
47
  docling_core/utils/validators.py,sha256=fBdyWX4PvFh7o_d25ZTs4iwmeo75QTbrxsvXv2kXkTg,2777
48
- docling_core-1.6.1.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
49
- docling_core-1.6.1.dist-info/METADATA,sha256=JjVY-bxco9p4I2Rq-PY5LJePO5RBCVIIv-ZNK-F8EKA,5383
50
- docling_core-1.6.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
51
- docling_core-1.6.1.dist-info/entry_points.txt,sha256=XHhtJEkdUuLxXSNxLdFIzx_siQ3z2UFQEKp-P8VYAE4,189
52
- docling_core-1.6.1.dist-info/RECORD,,
48
+ docling_core-1.6.2.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
49
+ docling_core-1.6.2.dist-info/METADATA,sha256=tYHPA7hs0aFYMr3W7FkkzsuwSySO5RbWrGN34neN2Mc,5383
50
+ docling_core-1.6.2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
51
+ docling_core-1.6.2.dist-info/entry_points.txt,sha256=XHhtJEkdUuLxXSNxLdFIzx_siQ3z2UFQEKp-P8VYAE4,189
52
+ docling_core-1.6.2.dist-info/RECORD,,