docling-core 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

@@ -16,6 +16,7 @@ from pydantic import (
16
16
  StrictStr,
17
17
  model_validator,
18
18
  )
19
+ from tabulate import tabulate
19
20
 
20
21
  from docling_core.search.mapping import es_field
21
22
  from docling_core.types.base import (
@@ -391,3 +392,111 @@ class ExportedCCSDocument(
391
392
  item["$ref"] = ref
392
393
 
393
394
  return data
395
+
396
def _resolve_ref(self, item: Ref) -> Optional[Table]:
    """Return the table a reference points to, or None if it cannot be resolved.

    Only table references are handled. The table index is read from the third
    "/"-separated segment of ``item.ref``.
    NOTE(review): refs presumably look like "#/tables/<index>" -- confirm.

    Args:
        item: The reference to resolve.

    Returns:
        The matching entry of ``self.tables``. None is returned when the
        reference is not a table reference, when the document has no tables,
        or when the ref does not carry a valid, in-range, non-negative index.
    """
    # NOTE: currently only resolves table refs & makes assumptions on ref parts
    if item.obj_type != "table" or not self.tables:
        return None

    parts = item.ref.split("/")
    try:
        index = int(parts[2])
    except (IndexError, ValueError):
        # Too few segments or a non-integer index: treat as unresolvable
        # rather than failing the caller.
        return None

    # Reject negative indices explicitly; Python would otherwise wrap around
    # and hand back an unrelated table.
    if not 0 <= index < len(self.tables):
        return None

    return self.tables[index]
406
+
407
def export_to_markdown(
    self,
    delim: str = "\n\n",
    main_text_start: int = 0,
    main_text_stop: Optional[int] = None,
) -> str:
    r"""Serialize to Markdown.

    Operates on a slice of the document's main_text as defined through arguments
    main_text_start and main_text_stop; defaulting to the whole main_text.

    Text items of type "title", "subtitle-level-1", "paragraph" and "caption"
    are exported; the first "title" becomes a level-1 heading, any later
    "title" and every "subtitle-level-1" become level-2 headings, and the rest
    is emitted verbatim. A text item identical to the previously exported text
    is skipped. Tables (direct or resolved from a reference) are rendered with
    their first row as header; tables with fewer than two rows or an empty
    first row are skipped.

    Args:
        delim (str, optional): Delimiter to use when concatenating the various
            Markdown parts. Defaults to "\n\n".
        main_text_start (int, optional): Main-text slicing start index (inclusive).
            Defaults to 0.
        main_text_stop (Optional[int], optional): Main-text slicing stop index
            (exclusive). Defaults to None.

    Returns:
        str: The exported Markdown representation; an empty string when the
            document has no main_text.
    """
    if self.main_text is None:
        return ""

    heading_types = {"title", "subtitle-level-1"}
    text_types = heading_types | {"paragraph", "caption"}

    has_title = False
    prev_text = ""
    md_texts: list[str] = []

    for orig_item in self.main_text[main_text_start:main_text_stop]:
        item = (
            self._resolve_ref(orig_item)
            if isinstance(orig_item, Ref)
            else orig_item
        )
        if item is None:
            # Unresolvable reference: nothing to export.
            continue

        item_type = item.obj_type
        markdown_text = ""

        if isinstance(item, BaseText) and item_type in text_types:
            text = item.text

            # ignore repeated text
            if text == prev_text:
                continue
            prev_text = text

            if item_type == "title" and not has_title:
                # first title match
                markdown_text = f"# {text}"
                has_title = True
            elif item_type in heading_types:
                # secondary titles (any further "title" lands here as well)
                markdown_text = f"## {text}"
            else:
                # normal text
                markdown_text = text

        elif isinstance(item, Table) and item.data:
            table = [[col.text for col in row] for row in item.data]

            # The first row serves as header, so at least one more row is
            # needed for a table body.
            if len(table) > 1 and len(table[0]) > 0:
                try:
                    markdown_text = tabulate(
                        table[1:], headers=table[0], tablefmt="github"
                    )
                except ValueError:
                    # Retry with tabulate's number parsing switched off.
                    markdown_text = tabulate(
                        table[1:],
                        headers=table[0],
                        tablefmt="github",
                        disable_numparse=True,
                    )

        if markdown_text:
            md_texts.append(markdown_text)

    return delim.join(md_texts)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 1.0.0
3
+ Version: 1.1.0
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://ds4sd.github.io/
6
6
  License: MIT
@@ -31,6 +31,7 @@ Requires-Dist: jsonschema (>=4.16.0,<5.0.0)
31
31
  Requires-Dist: poetry (>=1.8.3,<2.0.0)
32
32
  Requires-Dist: pydantic (>=2.6.0,<3.0.0)
33
33
  Requires-Dist: pyproject-toml (>=0.0.10,<0.0.11)
34
+ Requires-Dist: tabulate (>=0.9.0,<0.10.0)
34
35
  Project-URL: Repository, https://github.com/DS4SD/docling-core
35
36
  Description-Content-Type: text/markdown
36
37
 
@@ -20,7 +20,7 @@ docling_core/types/doc/base.py,sha256=-j4vVs3JZuaUjm0fHIkLU9TD_4IZXQuGouLrddEAwP
20
20
  docling_core/types/doc/doc_ann.py,sha256=8pV2efUglw19jxl4_oqB__mSxjWvtGIcllyCdqA-b2s,1196
21
21
  docling_core/types/doc/doc_ocr.py,sha256=6PC0C-OczF-MyfgRxEI1xs3PWgNOzi7i2yEQbTqZz0I,1387
22
22
  docling_core/types/doc/doc_raw.py,sha256=Y69G6IiauNDaoT-5el4xo1ypWpnBJQ75akGGkCMTZSc,3888
23
- docling_core/types/doc/document.py,sha256=cMduCiFkPVCmXQehvNkXqXtDiXJJtB72o7_LZXz_S6I,12549
23
+ docling_core/types/doc/document.py,sha256=8L7F53Nr6U-z93zNJDit6nXtjiddysEmJ8KuWFO_iOA,16436
24
24
  docling_core/types/gen/__init__.py,sha256=C6TuCfvpSnSL5XDOFMcYHUY2-i08vvfOGRcdu6Af0pI,124
25
25
  docling_core/types/gen/generic.py,sha256=l4CZ4_Lb8ONG36WNJWbKX5hGKvTh_yU-hXp5hsm7uVU,844
26
26
  docling_core/types/nlp/__init__.py,sha256=hGcztAeVK7xkRBqRRvc4zbY4PGeJ0r0QrEsetnSx9nI,119
@@ -39,8 +39,8 @@ docling_core/utils/ds_generate_docs.py,sha256=0xGBagdC_PGjyeHXYZo90VnVrSTMZgHb0S
39
39
  docling_core/utils/ds_generate_jsonschema.py,sha256=EhNQutqWJFWuN-yl9UUPFZ7DJTvGqg54qBIvUMHTHdA,1647
40
40
  docling_core/utils/validate.py,sha256=3FmnxnKTDZC5J9OGxCL3U3DGRl0t0bBV1NcySXswdas,2031
41
41
  docling_core/utils/validators.py,sha256=fBdyWX4PvFh7o_d25ZTs4iwmeo75QTbrxsvXv2kXkTg,2777
42
- docling_core-1.0.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
43
- docling_core-1.0.0.dist-info/METADATA,sha256=RPsZbjVEs0aIfMYDmK25CxR1b77iRCqbu8WbodN4l9g,5174
44
- docling_core-1.0.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
45
- docling_core-1.0.0.dist-info/entry_points.txt,sha256=XHhtJEkdUuLxXSNxLdFIzx_siQ3z2UFQEKp-P8VYAE4,189
46
- docling_core-1.0.0.dist-info/RECORD,,
42
+ docling_core-1.1.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
43
+ docling_core-1.1.0.dist-info/METADATA,sha256=GMKoJSJxOTqMKGqLy6dLdZMkLkLia3tN-elzHJFbWRA,5216
44
+ docling_core-1.1.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
45
+ docling_core-1.1.0.dist-info/entry_points.txt,sha256=XHhtJEkdUuLxXSNxLdFIzx_siQ3z2UFQEKp-P8VYAE4,189
46
+ docling_core-1.1.0.dist-info/RECORD,,