docling-core 1.6.1__py3-none-any.whl → 1.6.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/types/doc/document.py +35 -5
- {docling_core-1.6.1.dist-info → docling_core-1.6.2.dist-info}/METADATA +1 -1
- {docling_core-1.6.1.dist-info → docling_core-1.6.2.dist-info}/RECORD +6 -6
- {docling_core-1.6.1.dist-info → docling_core-1.6.2.dist-info}/LICENSE +0 -0
- {docling_core-1.6.1.dist-info → docling_core-1.6.2.dist-info}/WHEEL +0 -0
- {docling_core-1.6.1.dist-info → docling_core-1.6.2.dist-info}/entry_points.txt +0 -0
|
@@ -477,6 +477,26 @@ class ExportedCCSDocument(
|
|
|
477
477
|
md_texts: list[str] = []
|
|
478
478
|
|
|
479
479
|
if self.main_text is not None:
|
|
480
|
+
# collect all captions embedded in table and figure objects
|
|
481
|
+
# to avoid repeating them
|
|
482
|
+
embedded_captions = set()
|
|
483
|
+
for orig_item in self.main_text[main_text_start:main_text_stop]:
|
|
484
|
+
item = (
|
|
485
|
+
self._resolve_ref(orig_item)
|
|
486
|
+
if isinstance(orig_item, Ref)
|
|
487
|
+
else orig_item
|
|
488
|
+
)
|
|
489
|
+
if item is None:
|
|
490
|
+
continue
|
|
491
|
+
|
|
492
|
+
if (
|
|
493
|
+
isinstance(item, (Table, Figure))
|
|
494
|
+
and item.text
|
|
495
|
+
and item.obj_type in main_text_labels
|
|
496
|
+
):
|
|
497
|
+
embedded_captions.add(item.text)
|
|
498
|
+
|
|
499
|
+
# serialize document to markdown
|
|
480
500
|
for orig_item in self.main_text[main_text_start:main_text_stop]:
|
|
481
501
|
markdown_text = ""
|
|
482
502
|
|
|
@@ -492,6 +512,11 @@ class ExportedCCSDocument(
|
|
|
492
512
|
if isinstance(item, BaseText) and item_type in main_text_labels:
|
|
493
513
|
text = item.text
|
|
494
514
|
|
|
515
|
+
# skip captions if they are embedded in the actual
|
|
516
|
+
# floating object
|
|
517
|
+
if item_type == "caption" and text in embedded_captions:
|
|
518
|
+
continue
|
|
519
|
+
|
|
495
520
|
# ignore repeated text
|
|
496
521
|
if prev_text == text or text is None:
|
|
497
522
|
continue
|
|
@@ -523,8 +548,9 @@ class ExportedCCSDocument(
|
|
|
523
548
|
isinstance(item, Table)
|
|
524
549
|
and item.data
|
|
525
550
|
and item_type in main_text_labels
|
|
526
|
-
and not strict_text
|
|
527
551
|
):
|
|
552
|
+
|
|
553
|
+
md_table = ""
|
|
528
554
|
table = []
|
|
529
555
|
for row in item.data:
|
|
530
556
|
tmp = []
|
|
@@ -545,15 +571,19 @@ class ExportedCCSDocument(
|
|
|
545
571
|
disable_numparse=True,
|
|
546
572
|
)
|
|
547
573
|
|
|
548
|
-
|
|
574
|
+
markdown_text = ""
|
|
575
|
+
if item.text:
|
|
576
|
+
markdown_text = item.text
|
|
577
|
+
if not strict_text:
|
|
578
|
+
markdown_text += "\n" + md_table
|
|
549
579
|
|
|
550
580
|
elif isinstance(item, Figure) and item_type in main_text_labels:
|
|
551
581
|
|
|
552
582
|
markdown_text = ""
|
|
553
|
-
if not strict_text:
|
|
554
|
-
markdown_text = f"{image_placeholder}"
|
|
555
583
|
if item.text:
|
|
556
|
-
markdown_text
|
|
584
|
+
markdown_text = item.text
|
|
585
|
+
if not strict_text:
|
|
586
|
+
markdown_text += f"\n{image_placeholder}"
|
|
557
587
|
|
|
558
588
|
if markdown_text:
|
|
559
589
|
md_texts.append(markdown_text)
|
|
@@ -24,7 +24,7 @@ docling_core/types/doc/base.py,sha256=QQC8KzQeYWnHFPY2_BNGcbTp6J2_rPbnLjsnbehICn
|
|
|
24
24
|
docling_core/types/doc/doc_ann.py,sha256=8pV2efUglw19jxl4_oqB__mSxjWvtGIcllyCdqA-b2s,1196
|
|
25
25
|
docling_core/types/doc/doc_ocr.py,sha256=6PC0C-OczF-MyfgRxEI1xs3PWgNOzi7i2yEQbTqZz0I,1387
|
|
26
26
|
docling_core/types/doc/doc_raw.py,sha256=Y69G6IiauNDaoT-5el4xo1ypWpnBJQ75akGGkCMTZSc,3888
|
|
27
|
-
docling_core/types/doc/document.py,sha256=
|
|
27
|
+
docling_core/types/doc/document.py,sha256=AKp1kOo0tncf9FX3q7qRWQ2Jz_hZE44smZpyrtsRzY4,24104
|
|
28
28
|
docling_core/types/doc/tokens.py,sha256=uU_MYW_p7ypf7eYICFBvxdnVaPZ7CQnvZmbJ6oPrtEA,6134
|
|
29
29
|
docling_core/types/gen/__init__.py,sha256=C6TuCfvpSnSL5XDOFMcYHUY2-i08vvfOGRcdu6Af0pI,124
|
|
30
30
|
docling_core/types/gen/generic.py,sha256=l4CZ4_Lb8ONG36WNJWbKX5hGKvTh_yU-hXp5hsm7uVU,844
|
|
@@ -45,8 +45,8 @@ docling_core/utils/ds_generate_jsonschema.py,sha256=EhNQutqWJFWuN-yl9UUPFZ7DJTvG
|
|
|
45
45
|
docling_core/utils/file.py,sha256=VQgzjyvmJnAIHB6ex7ikcmbDAR4GA1ALreuO7Ubrp50,1895
|
|
46
46
|
docling_core/utils/validate.py,sha256=3FmnxnKTDZC5J9OGxCL3U3DGRl0t0bBV1NcySXswdas,2031
|
|
47
47
|
docling_core/utils/validators.py,sha256=fBdyWX4PvFh7o_d25ZTs4iwmeo75QTbrxsvXv2kXkTg,2777
|
|
48
|
-
docling_core-1.6.
|
|
49
|
-
docling_core-1.6.
|
|
50
|
-
docling_core-1.6.
|
|
51
|
-
docling_core-1.6.
|
|
52
|
-
docling_core-1.6.
|
|
48
|
+
docling_core-1.6.2.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
|
|
49
|
+
docling_core-1.6.2.dist-info/METADATA,sha256=tYHPA7hs0aFYMr3W7FkkzsuwSySO5RbWrGN34neN2Mc,5383
|
|
50
|
+
docling_core-1.6.2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
51
|
+
docling_core-1.6.2.dist-info/entry_points.txt,sha256=XHhtJEkdUuLxXSNxLdFIzx_siQ3z2UFQEKp-P8VYAE4,189
|
|
52
|
+
docling_core-1.6.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|