docling-core 2.20.0__py3-none-any.whl → 2.21.1__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

@@ -61,7 +61,7 @@ _logger = logging.getLogger(__name__)
61
61
 
62
62
  Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
63
63
  LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
64
- CURRENT_VERSION: Final = "1.1.0"
64
+ CURRENT_VERSION: Final = "1.2.0"
65
65
 
66
66
  DEFAULT_EXPORT_LABELS = {
67
67
  DocItemLabel.TITLE,
@@ -861,11 +861,11 @@ class PictureItem(FloatingItem):
861
861
  image_placeholder: str = "<!-- image -->",
862
862
  ) -> str:
863
863
  """Export picture to Markdown format."""
864
- default_response = "\n" + image_placeholder + "\n"
864
+ default_response = image_placeholder
865
865
  error_response = (
866
- "\n<!-- 🖼️❌ Image not available. "
866
+ "<!-- 🖼️❌ Image not available. "
867
867
  "Please use `PdfPipelineOptions(generate_picture_images=True)`"
868
- " --> \n"
868
+ " -->"
869
869
  )
870
870
 
871
871
  if image_mode == ImageRefMode.PLACEHOLDER:
@@ -879,7 +879,7 @@ class PictureItem(FloatingItem):
879
879
  and isinstance(self.image.uri, AnyUrl)
880
880
  and self.image.uri.scheme == "data"
881
881
  ):
882
- text = f"\n![Image]({self.image.uri})\n"
882
+ text = f"![Image]({self.image.uri})"
883
883
  return text
884
884
 
885
885
  # get the self.image._pil or crop it out of the page-image
@@ -887,7 +887,7 @@ class PictureItem(FloatingItem):
887
887
 
888
888
  if img is not None:
889
889
  imgb64 = self._image_to_base64(img)
890
- text = f"\n![Image](data:image/png;base64,{imgb64})\n"
890
+ text = f"![Image](data:image/png;base64,{imgb64})"
891
891
 
892
892
  return text
893
893
  else:
@@ -899,7 +899,7 @@ class PictureItem(FloatingItem):
899
899
  ):
900
900
  return default_response
901
901
 
902
- text = f"\n![Image]({quote(str(self.image.uri))})\n"
902
+ text = f"![Image]({quote(str(self.image.uri))})"
903
903
  return text
904
904
 
905
905
  else:
@@ -1397,6 +1397,10 @@ class KeyValueItem(FloatingItem):
1397
1397
 
1398
1398
  graph: GraphData
1399
1399
 
1400
+ def _export_to_markdown(self) -> str:
1401
+ # TODO add actual implementation
1402
+ return "<!-- missing-key-value-item -->"
1403
+
1400
1404
 
1401
1405
  class FormItem(FloatingItem):
1402
1406
  """FormItem."""
@@ -1405,6 +1409,10 @@ class FormItem(FloatingItem):
1405
1409
 
1406
1410
  graph: GraphData
1407
1411
 
1412
+ def _export_to_markdown(self) -> str:
1413
+ # TODO add actual implementation
1414
+ return "<!-- missing-form-item -->"
1415
+
1408
1416
 
1409
1417
  ContentItem = Annotated[
1410
1418
  Union[
@@ -2239,6 +2247,20 @@ class DoclingDocument(BaseModel):
2239
2247
  with open(filename, "w", encoding="utf-8") as fw:
2240
2248
  yaml.dump(out, fw, default_flow_style=default_flow_style)
2241
2249
 
2250
+ @classmethod
2251
+ def load_from_yaml(cls, filename: Path) -> "DoclingDocument":
2252
+ """load_from_yaml.
2253
+
2254
+ Args:
2255
+ filename: The filename to load a YAML-serialized DoclingDocument from.
2256
+
2257
+ Returns:
2258
+ DoclingDocument: the loaded DoclingDocument
2259
+ """
2260
+ with open(filename, encoding="utf-8") as f:
2261
+ data = yaml.load(f, Loader=yaml.FullLoader)
2262
+ return DoclingDocument.model_validate(data)
2263
+
2242
2264
  def export_to_dict(
2243
2265
  self,
2244
2266
  mode: str = "json",
@@ -2254,7 +2276,7 @@ class DoclingDocument(BaseModel):
2254
2276
  self,
2255
2277
  filename: Path,
2256
2278
  artifacts_dir: Optional[Path] = None,
2257
- delim: str = "\n",
2279
+ delim: str = "\n\n", # TODO: deprecate
2258
2280
  from_element: int = 0,
2259
2281
  to_element: int = sys.maxsize,
2260
2282
  labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
@@ -2297,7 +2319,7 @@ class DoclingDocument(BaseModel):
2297
2319
 
2298
2320
  def export_to_markdown( # noqa: C901
2299
2321
  self,
2300
- delim: str = "\n",
2322
+ delim: str = "\n\n", # TODO deprecate
2301
2323
  from_element: int = 0,
2302
2324
  to_element: int = sys.maxsize,
2303
2325
  labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
@@ -2344,10 +2366,44 @@ class DoclingDocument(BaseModel):
2344
2366
  :returns: The exported Markdown representation.
2345
2367
  :rtype: str
2346
2368
  """
2347
- mdtexts: list[str] = []
2348
- list_nesting_level = 0 # Track the current list nesting level
2349
- previous_level = 0 # Track the previous item's level
2350
- in_list = False # Track if we're currently processing list items
2369
+ comps = self._get_markdown_components(
2370
+ node=self.body,
2371
+ from_element=from_element,
2372
+ to_element=to_element,
2373
+ labels=labels,
2374
+ strict_text=strict_text,
2375
+ escaping_underscores=escaping_underscores,
2376
+ image_placeholder=image_placeholder,
2377
+ image_mode=image_mode,
2378
+ indent=indent,
2379
+ text_width=text_width,
2380
+ page_no=page_no,
2381
+ included_content_layers=included_content_layers,
2382
+ list_level=0,
2383
+ is_inline_scope=False,
2384
+ visited=set(),
2385
+ )
2386
+ return delim.join(comps)
2387
+
2388
+ def _get_markdown_components( # noqa: C901
2389
+ self,
2390
+ node: NodeItem,
2391
+ from_element: int,
2392
+ to_element: int,
2393
+ labels: set[DocItemLabel],
2394
+ strict_text: bool,
2395
+ escaping_underscores: bool,
2396
+ image_placeholder: str,
2397
+ image_mode: ImageRefMode,
2398
+ indent: int,
2399
+ text_width: int,
2400
+ page_no: Optional[int],
2401
+ included_content_layers: set[ContentLayer],
2402
+ list_level: int,
2403
+ is_inline_scope: bool,
2404
+ visited: set[str], # refs of visited items
2405
+ ) -> list[str]:
2406
+ components: list[str] = [] # components to concatenate
2351
2407
 
2352
2408
  # Our export markdown doesn't contain any emphasis styling:
2353
2409
  # Bold, Italic, or Bold-Italic
@@ -2382,137 +2438,143 @@ class DoclingDocument(BaseModel):
2382
2438
 
2383
2439
  return "".join(parts)
2384
2440
 
2385
- def _append_text(text: str, do_escape_html=True, do_escape_underscores=True):
2441
+ def _ingest_text(text: str, do_escape_html=True, do_escape_underscores=True):
2386
2442
  if do_escape_underscores and escaping_underscores:
2387
2443
  text = _escape_underscores(text)
2388
2444
  if do_escape_html:
2389
2445
  text = html.escape(text, quote=False)
2390
- mdtexts.append(text)
2446
+ if text:
2447
+ components.append(text)
2391
2448
 
2392
2449
  for ix, (item, level) in enumerate(
2393
2450
  self.iterate_items(
2394
- self.body,
2451
+ node,
2395
2452
  with_groups=True,
2396
2453
  page_no=page_no,
2397
2454
  included_content_layers=included_content_layers,
2398
2455
  )
2399
2456
  ):
2400
- # If we've moved to a lower level, we're exiting one or more groups
2401
- if level < previous_level:
2402
- # Calculate how many levels we've exited
2403
- level_difference = previous_level - level
2404
- # Decrement list_nesting_level for each list group we've exited
2405
- list_nesting_level = max(0, list_nesting_level - level_difference)
2406
-
2407
- previous_level = level # Update previous_level for next iteration
2457
+ if item.self_ref in visited:
2458
+ continue
2459
+ else:
2460
+ visited.add(item.self_ref)
2408
2461
 
2409
2462
  if ix < from_element or to_element <= ix:
2410
2463
  continue # skip as many items as you want
2411
2464
 
2412
- if (isinstance(item, DocItem)) and (item.label not in labels):
2465
+ elif (isinstance(item, DocItem)) and (item.label not in labels):
2413
2466
  continue # skip any label that is not whitelisted
2414
2467
 
2415
- # Handle newlines between different types of content
2416
- if (
2417
- len(mdtexts) > 0
2418
- and not isinstance(item, (ListItem, GroupItem))
2419
- and in_list
2420
- ):
2421
- mdtexts[-1] += "\n"
2422
- in_list = False
2423
-
2424
- if isinstance(item, GroupItem) and item.label in [
2425
- GroupLabel.LIST,
2426
- GroupLabel.ORDERED_LIST,
2427
- ]:
2428
-
2429
- if list_nesting_level == 0: # Check if we're on the top level.
2430
- # In that case a new list starts directly after another list.
2431
- mdtexts.append("\n") # Add a blank line
2432
-
2433
- # Increment list nesting level when entering a new list
2434
- list_nesting_level += 1
2435
- in_list = True
2436
- continue
2437
-
2438
2468
  elif isinstance(item, GroupItem):
2439
- continue
2469
+ if item.label in [
2470
+ GroupLabel.LIST,
2471
+ GroupLabel.ORDERED_LIST,
2472
+ ]:
2473
+ comps = self._get_markdown_components(
2474
+ node=item,
2475
+ from_element=from_element,
2476
+ to_element=to_element,
2477
+ labels=labels,
2478
+ strict_text=strict_text,
2479
+ escaping_underscores=escaping_underscores,
2480
+ image_placeholder=image_placeholder,
2481
+ image_mode=image_mode,
2482
+ indent=indent,
2483
+ text_width=text_width,
2484
+ page_no=page_no,
2485
+ included_content_layers=included_content_layers,
2486
+ list_level=list_level + 1,
2487
+ is_inline_scope=is_inline_scope,
2488
+ visited=visited,
2489
+ )
2490
+ # NOTE: assumes unordered (flag & marker currently in ListItem)
2491
+ indent_str = list_level * indent * " "
2492
+ is_ol = item.label == GroupLabel.ORDERED_LIST
2493
+ text = "\n".join(
2494
+ [
2495
+ # avoid additional marker on already evaled sublists
2496
+ (
2497
+ c
2498
+ if c and c[0] == " "
2499
+ else f"{indent_str}{f'{i + 1}.' if is_ol else '-'} {c}"
2500
+ )
2501
+ for i, c in enumerate(comps)
2502
+ ]
2503
+ )
2504
+ _ingest_text(text=text)
2505
+ elif item.label == GroupLabel.INLINE:
2506
+ comps = self._get_markdown_components(
2507
+ node=item,
2508
+ from_element=from_element,
2509
+ to_element=to_element,
2510
+ labels=labels,
2511
+ strict_text=strict_text,
2512
+ escaping_underscores=escaping_underscores,
2513
+ image_placeholder=image_placeholder,
2514
+ image_mode=image_mode,
2515
+ indent=indent,
2516
+ text_width=text_width,
2517
+ page_no=page_no,
2518
+ included_content_layers=included_content_layers,
2519
+ list_level=list_level,
2520
+ is_inline_scope=True,
2521
+ visited=visited,
2522
+ )
2523
+ _ingest_text(" ".join(comps))
2524
+ else:
2525
+ continue
2440
2526
 
2441
2527
  elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:
2442
- in_list = False
2443
2528
  marker = "" if strict_text else "#"
2444
2529
  text = f"{marker} {item.text}"
2445
- _append_text(text.strip() + "\n")
2530
+ _ingest_text(text.strip())
2446
2531
 
2447
2532
  elif (
2448
2533
  isinstance(item, TextItem)
2449
2534
  and item.label in [DocItemLabel.SECTION_HEADER]
2450
2535
  ) or isinstance(item, SectionHeaderItem):
2451
- in_list = False
2452
2536
  marker = ""
2453
2537
  if not strict_text:
2454
2538
  marker = "#" * level
2455
2539
  if len(marker) < 2:
2456
2540
  marker = "##"
2457
- text = f"{marker} {item.text}\n"
2458
- _append_text(text.strip() + "\n")
2459
-
2460
- elif isinstance(item, CodeItem) and item.label in labels:
2461
- in_list = False
2462
- text = f"```\n{item.text}\n```\n"
2463
- _append_text(text, do_escape_underscores=False, do_escape_html=False)
2464
-
2465
- elif isinstance(item, ListItem) and item.label in [DocItemLabel.LIST_ITEM]:
2466
- in_list = True
2467
- # Calculate indent based on list_nesting_level
2468
- # -1 because level 1 needs no indent
2469
- list_indent = " " * (indent * (list_nesting_level - 1))
2470
-
2471
- marker = ""
2472
- if strict_text:
2473
- marker = ""
2474
- elif item.enumerated:
2475
- marker = item.marker
2476
- else:
2477
- marker = "-" # Markdown needs only dash as item marker.
2541
+ text = f"{marker} {item.text}"
2542
+ _ingest_text(text.strip())
2478
2543
 
2479
- text = f"{list_indent}{marker} {item.text}"
2480
- _append_text(text)
2544
+ elif isinstance(item, CodeItem):
2545
+ text = f"`{item.text}`" if is_inline_scope else f"```\n{item.text}\n```"
2546
+ _ingest_text(text, do_escape_underscores=False, do_escape_html=False)
2481
2547
 
2482
2548
  elif isinstance(item, TextItem) and item.label in [DocItemLabel.FORMULA]:
2483
- in_list = False
2484
2549
  if item.text != "":
2485
- _append_text(
2486
- f"$${item.text}$$\n",
2550
+ _ingest_text(
2551
+ f"${item.text}$" if is_inline_scope else f"$${item.text}$$",
2487
2552
  do_escape_underscores=False,
2488
2553
  do_escape_html=False,
2489
2554
  )
2490
2555
  elif item.orig != "":
2491
- _append_text(
2492
- "<!-- formula-not-decoded -->\n",
2556
+ _ingest_text(
2557
+ "<!-- formula-not-decoded -->",
2493
2558
  do_escape_underscores=False,
2494
2559
  do_escape_html=False,
2495
2560
  )
2496
2561
 
2497
- elif isinstance(item, TextItem) and item.label in labels:
2498
- in_list = False
2562
+ elif isinstance(item, TextItem):
2499
2563
  if len(item.text) and text_width > 0:
2500
2564
  text = item.text
2501
2565
  wrapped_text = textwrap.fill(text, width=text_width)
2502
- _append_text(wrapped_text + "\n")
2566
+ _ingest_text(wrapped_text)
2503
2567
  elif len(item.text):
2504
- text = f"{item.text}\n"
2505
- _append_text(text)
2568
+ _ingest_text(item.text)
2506
2569
 
2507
2570
  elif isinstance(item, TableItem) and not strict_text:
2508
- in_list = False
2509
- _append_text(item.caption_text(self))
2571
+ if caption_text := item.caption_text(self):
2572
+ _ingest_text(caption_text)
2510
2573
  md_table = item.export_to_markdown()
2511
- _append_text("\n" + md_table + "\n")
2574
+ _ingest_text(md_table)
2512
2575
 
2513
2576
  elif isinstance(item, PictureItem) and not strict_text:
2514
- in_list = False
2515
- _append_text(item.caption_text(self))
2577
+ _ingest_text(item.caption_text(self))
2516
2578
 
2517
2579
  line = item.export_to_markdown(
2518
2580
  doc=self,
@@ -2520,19 +2582,17 @@ class DoclingDocument(BaseModel):
2520
2582
  image_mode=image_mode,
2521
2583
  )
2522
2584
 
2523
- _append_text(line, do_escape_html=False, do_escape_underscores=False)
2585
+ _ingest_text(line, do_escape_html=False, do_escape_underscores=False)
2524
2586
 
2525
- elif isinstance(item, DocItem) and item.label in labels:
2526
- in_list = False
2527
- text = "<!-- missing-text -->"
2528
- _append_text(text, do_escape_html=False, do_escape_underscores=False)
2587
+ elif isinstance(item, (KeyValueItem, FormItem)):
2588
+ text = item._export_to_markdown()
2589
+ _ingest_text(text, do_escape_html=False, do_escape_underscores=False)
2529
2590
 
2530
- mdtext = (delim.join(mdtexts)).strip()
2531
- mdtext = re.sub(
2532
- r"\n\n\n+", "\n\n", mdtext
2533
- ) # remove cases of double or more empty lines.
2591
+ elif isinstance(item, DocItem):
2592
+ text = "<!-- missing-text -->"
2593
+ _ingest_text(text, do_escape_html=False, do_escape_underscores=False)
2534
2594
 
2535
- return mdtext
2595
+ return components
2536
2596
 
2537
2597
  def export_to_text( # noqa: C901
2538
2598
  self,
@@ -2764,14 +2824,17 @@ class DoclingDocument(BaseModel):
2764
2824
  "</figure>"
2765
2825
  )
2766
2826
 
2827
+ img_fallback = _image_fallback(item)
2828
+
2767
2829
  # If the formula is not processed correcty, use its image
2768
2830
  if (
2769
2831
  item.text == ""
2770
2832
  and item.orig != ""
2771
2833
  and image_mode == ImageRefMode.EMBEDDED
2772
2834
  and len(item.prov) > 0
2835
+ and img_fallback is not None
2773
2836
  ):
2774
- text = _image_fallback(item)
2837
+ text = img_fallback
2775
2838
 
2776
2839
  # Building a math equation in MathML format
2777
2840
  # ref https://www.w3.org/TR/wai-aria-1.1/#math
@@ -2791,9 +2854,13 @@ class DoclingDocument(BaseModel):
2791
2854
  "Malformed formula cannot be rendered. "
2792
2855
  f"Error {err.__class__.__name__}, formula={math_formula}"
2793
2856
  )
2794
- if image_mode == ImageRefMode.EMBEDDED and len(item.prov) > 0:
2795
- text = _image_fallback(item)
2796
- else:
2857
+ if (
2858
+ image_mode == ImageRefMode.EMBEDDED
2859
+ and len(item.prov) > 0
2860
+ and img_fallback is not None
2861
+ ):
2862
+ text = img_fallback
2863
+ elif len(math_formula) > 0:
2797
2864
  text = f"<pre>{math_formula}</pre>"
2798
2865
 
2799
2866
  elif math_formula != "":
@@ -75,6 +75,7 @@ class GroupLabel(str, Enum):
75
75
  FORM_AREA = "form_area"
76
76
  KEY_VALUE_AREA = "key_value_area"
77
77
  COMMENT_SECTION = "comment_section"
78
+ INLINE = "inline"
78
79
 
79
80
  def __str__(self):
80
81
  """Get string value."""
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 2.20.0
3
+ Version: 2.21.1
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://ds4sd.github.io/
6
6
  License: MIT
@@ -24,8 +24,8 @@ docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HX
24
24
  docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
25
25
  docling_core/types/doc/__init__.py,sha256=bysJn2iwjAHwThSWDPXEdVUUij7p_ax12_nx2_0CMdg,653
26
26
  docling_core/types/doc/base.py,sha256=22U1qDlD-2ICmgzbdZrjNayoPHnq4S1ks1GRoqB7y1Q,12542
27
- docling_core/types/doc/document.py,sha256=1tL321QdbE5ljnZjaat0yEbLcdmnHzy1EBsEAnXMj3o,107897
28
- docling_core/types/doc/labels.py,sha256=aJ-vcCNzAEFj3NxVKKiGUCit-2ra43st8xlpeWkSOqc,5662
27
+ docling_core/types/doc/document.py,sha256=P8dx5lP3oVrdlrXJx-Y-nk-UM7llDF6ZwOqs046HAM4,110451
28
+ docling_core/types/doc/labels.py,sha256=0J9Gsqz-jQ4FP2yxs9wOxoTr3qg97BniFX7MJVziUmk,5684
29
29
  docling_core/types/doc/tokens.py,sha256=i73PXkmqXCLsQ5SddnJX8L9e_Ub2_K_DYSE-VE8NDq0,3925
30
30
  docling_core/types/doc/utils.py,sha256=SaiQD-WMMooFm1bMqwatU-IGhtG048iKJb-ppnJit_k,2250
31
31
  docling_core/types/gen/__init__.py,sha256=C6TuCfvpSnSL5XDOFMcYHUY2-i08vvfOGRcdu6Af0pI,124
@@ -56,8 +56,8 @@ docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2ty
56
56
  docling_core/utils/legacy.py,sha256=SqNQAxl97aHfoJEsC9vZcMJg5FNkmqKPFi-wdSrnfI0,24442
57
57
  docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
58
58
  docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
59
- docling_core-2.20.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
60
- docling_core-2.20.0.dist-info/METADATA,sha256=KCJ0MWOUOYFy-JP_sBk2wa_qmqLnvWokiuRP436c0fQ,5803
61
- docling_core-2.20.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
62
- docling_core-2.20.0.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
63
- docling_core-2.20.0.dist-info/RECORD,,
59
+ docling_core-2.21.1.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
60
+ docling_core-2.21.1.dist-info/METADATA,sha256=qz2AeXj0vfiBu24oWyMDiQSPvKM0yUn1Rj85JaUd7Yg,5803
61
+ docling_core-2.21.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
62
+ docling_core-2.21.1.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
63
+ docling_core-2.21.1.dist-info/RECORD,,