docling-core 2.20.0__tar.gz → 2.21.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

Files changed (62) hide show
  1. {docling_core-2.20.0 → docling_core-2.21.0}/PKG-INFO +1 -1
  2. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/types/doc/document.py +166 -104
  3. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/types/doc/labels.py +1 -0
  4. {docling_core-2.20.0 → docling_core-2.21.0}/pyproject.toml +1 -1
  5. {docling_core-2.20.0 → docling_core-2.21.0}/LICENSE +0 -0
  6. {docling_core-2.20.0 → docling_core-2.21.0}/README.md +0 -0
  7. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/__init__.py +0 -0
  8. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/cli/__init__.py +0 -0
  9. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/cli/view.py +0 -0
  10. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/py.typed +0 -0
  11. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
  12. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
  13. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
  14. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
  15. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
  16. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
  17. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
  18. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
  19. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/search/__init__.py +0 -0
  20. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
  21. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/search/mapping.py +0 -0
  22. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/search/meta.py +0 -0
  23. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/search/package.py +0 -0
  24. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/transforms/__init__.py +0 -0
  25. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/transforms/chunker/__init__.py +0 -0
  26. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/transforms/chunker/base.py +0 -0
  27. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
  28. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
  29. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/types/__init__.py +0 -0
  30. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/types/base.py +0 -0
  31. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/types/doc/__init__.py +0 -0
  32. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/types/doc/base.py +0 -0
  33. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/types/doc/tokens.py +0 -0
  34. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/types/doc/utils.py +0 -0
  35. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/types/gen/__init__.py +0 -0
  36. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/types/gen/generic.py +0 -0
  37. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/types/io/__init__.py +0 -0
  38. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/types/legacy_doc/__init__.py +0 -0
  39. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/types/legacy_doc/base.py +0 -0
  40. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
  41. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
  42. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
  43. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/types/legacy_doc/document.py +0 -0
  44. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/types/legacy_doc/tokens.py +0 -0
  45. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/types/nlp/__init__.py +0 -0
  46. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/types/nlp/qa.py +0 -0
  47. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/types/nlp/qa_labels.py +0 -0
  48. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/types/rec/__init__.py +0 -0
  49. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/types/rec/attribute.py +0 -0
  50. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/types/rec/base.py +0 -0
  51. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/types/rec/predicate.py +0 -0
  52. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/types/rec/record.py +0 -0
  53. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/types/rec/statement.py +0 -0
  54. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/types/rec/subject.py +0 -0
  55. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/utils/__init__.py +0 -0
  56. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/utils/alias.py +0 -0
  57. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/utils/file.py +0 -0
  58. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/utils/generate_docs.py +0 -0
  59. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/utils/generate_jsonschema.py +0 -0
  60. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/utils/legacy.py +0 -0
  61. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/utils/validate.py +0 -0
  62. {docling_core-2.20.0 → docling_core-2.21.0}/docling_core/utils/validators.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 2.20.0
3
+ Version: 2.21.0
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://ds4sd.github.io/
6
6
  License: MIT
@@ -61,7 +61,7 @@ _logger = logging.getLogger(__name__)
61
61
 
62
62
  Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
63
63
  LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
64
- CURRENT_VERSION: Final = "1.1.0"
64
+ CURRENT_VERSION: Final = "1.2.0"
65
65
 
66
66
  DEFAULT_EXPORT_LABELS = {
67
67
  DocItemLabel.TITLE,
@@ -861,11 +861,11 @@ class PictureItem(FloatingItem):
861
861
  image_placeholder: str = "<!-- image -->",
862
862
  ) -> str:
863
863
  """Export picture to Markdown format."""
864
- default_response = "\n" + image_placeholder + "\n"
864
+ default_response = image_placeholder
865
865
  error_response = (
866
- "\n<!-- 🖼️❌ Image not available. "
866
+ "<!-- 🖼️❌ Image not available. "
867
867
  "Please use `PdfPipelineOptions(generate_picture_images=True)`"
868
- " --> \n"
868
+ " -->"
869
869
  )
870
870
 
871
871
  if image_mode == ImageRefMode.PLACEHOLDER:
@@ -879,7 +879,7 @@ class PictureItem(FloatingItem):
879
879
  and isinstance(self.image.uri, AnyUrl)
880
880
  and self.image.uri.scheme == "data"
881
881
  ):
882
- text = f"\n![Image]({self.image.uri})\n"
882
+ text = f"![Image]({self.image.uri})"
883
883
  return text
884
884
 
885
885
  # get the self.image._pil or crop it out of the page-image
@@ -887,7 +887,7 @@ class PictureItem(FloatingItem):
887
887
 
888
888
  if img is not None:
889
889
  imgb64 = self._image_to_base64(img)
890
- text = f"\n![Image](data:image/png;base64,{imgb64})\n"
890
+ text = f"![Image](data:image/png;base64,{imgb64})"
891
891
 
892
892
  return text
893
893
  else:
@@ -899,7 +899,7 @@ class PictureItem(FloatingItem):
899
899
  ):
900
900
  return default_response
901
901
 
902
- text = f"\n![Image]({quote(str(self.image.uri))})\n"
902
+ text = f"![Image]({quote(str(self.image.uri))})"
903
903
  return text
904
904
 
905
905
  else:
@@ -1397,6 +1397,10 @@ class KeyValueItem(FloatingItem):
1397
1397
 
1398
1398
  graph: GraphData
1399
1399
 
1400
+ def _export_to_markdown(self) -> str:
1401
+ # TODO add actual implementation
1402
+ return "<!-- missing-key-value-item -->"
1403
+
1400
1404
 
1401
1405
  class FormItem(FloatingItem):
1402
1406
  """FormItem."""
@@ -1405,6 +1409,10 @@ class FormItem(FloatingItem):
1405
1409
 
1406
1410
  graph: GraphData
1407
1411
 
1412
+ def _export_to_markdown(self) -> str:
1413
+ # TODO add actual implementation
1414
+ return "<!-- missing-form-item -->"
1415
+
1408
1416
 
1409
1417
  ContentItem = Annotated[
1410
1418
  Union[
@@ -2239,6 +2247,20 @@ class DoclingDocument(BaseModel):
2239
2247
  with open(filename, "w", encoding="utf-8") as fw:
2240
2248
  yaml.dump(out, fw, default_flow_style=default_flow_style)
2241
2249
 
2250
+ @classmethod
2251
+ def load_from_yaml(cls, filename: Path) -> "DoclingDocument":
2252
+ """load_from_yaml.
2253
+
2254
+ Args:
2255
+ filename: The filename to load a YAML-serialized DoclingDocument from.
2256
+
2257
+ Returns:
2258
+ DoclingDocument: the loaded DoclingDocument
2259
+ """
2260
+ with open(filename, encoding="utf-8") as f:
2261
+ data = yaml.load(f, Loader=yaml.FullLoader)
2262
+ return DoclingDocument.model_validate(data)
2263
+
2242
2264
  def export_to_dict(
2243
2265
  self,
2244
2266
  mode: str = "json",
@@ -2254,7 +2276,7 @@ class DoclingDocument(BaseModel):
2254
2276
  self,
2255
2277
  filename: Path,
2256
2278
  artifacts_dir: Optional[Path] = None,
2257
- delim: str = "\n",
2279
+ delim: str = "\n\n", # TODO: deprecate
2258
2280
  from_element: int = 0,
2259
2281
  to_element: int = sys.maxsize,
2260
2282
  labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
@@ -2297,7 +2319,7 @@ class DoclingDocument(BaseModel):
2297
2319
 
2298
2320
  def export_to_markdown( # noqa: C901
2299
2321
  self,
2300
- delim: str = "\n",
2322
+ delim: str = "\n\n", # TODO deprecate
2301
2323
  from_element: int = 0,
2302
2324
  to_element: int = sys.maxsize,
2303
2325
  labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
@@ -2344,10 +2366,44 @@ class DoclingDocument(BaseModel):
2344
2366
  :returns: The exported Markdown representation.
2345
2367
  :rtype: str
2346
2368
  """
2347
- mdtexts: list[str] = []
2348
- list_nesting_level = 0 # Track the current list nesting level
2349
- previous_level = 0 # Track the previous item's level
2350
- in_list = False # Track if we're currently processing list items
2369
+ comps = self._get_markdown_components(
2370
+ node=self.body,
2371
+ from_element=from_element,
2372
+ to_element=to_element,
2373
+ labels=labels,
2374
+ strict_text=strict_text,
2375
+ escaping_underscores=escaping_underscores,
2376
+ image_placeholder=image_placeholder,
2377
+ image_mode=image_mode,
2378
+ indent=indent,
2379
+ text_width=text_width,
2380
+ page_no=page_no,
2381
+ included_content_layers=included_content_layers,
2382
+ list_level=0,
2383
+ is_inline_scope=False,
2384
+ visited=set(),
2385
+ )
2386
+ return delim.join(comps)
2387
+
2388
+ def _get_markdown_components( # noqa: C901
2389
+ self,
2390
+ node: NodeItem,
2391
+ from_element: int,
2392
+ to_element: int,
2393
+ labels: set[DocItemLabel],
2394
+ strict_text: bool,
2395
+ escaping_underscores: bool,
2396
+ image_placeholder: str,
2397
+ image_mode: ImageRefMode,
2398
+ indent: int,
2399
+ text_width: int,
2400
+ page_no: Optional[int],
2401
+ included_content_layers: set[ContentLayer],
2402
+ list_level: int,
2403
+ is_inline_scope: bool,
2404
+ visited: set[str], # refs of visited items
2405
+ ) -> list[str]:
2406
+ components: list[str] = [] # components to concatenate
2351
2407
 
2352
2408
  # Our export markdown doesn't contain any emphasis styling:
2353
2409
  # Bold, Italic, or Bold-Italic
@@ -2382,137 +2438,138 @@ class DoclingDocument(BaseModel):
2382
2438
 
2383
2439
  return "".join(parts)
2384
2440
 
2385
- def _append_text(text: str, do_escape_html=True, do_escape_underscores=True):
2441
+ def _ingest_text(text: str, do_escape_html=True, do_escape_underscores=True):
2386
2442
  if do_escape_underscores and escaping_underscores:
2387
2443
  text = _escape_underscores(text)
2388
2444
  if do_escape_html:
2389
2445
  text = html.escape(text, quote=False)
2390
- mdtexts.append(text)
2446
+ if text:
2447
+ components.append(text)
2391
2448
 
2392
2449
  for ix, (item, level) in enumerate(
2393
2450
  self.iterate_items(
2394
- self.body,
2451
+ node,
2395
2452
  with_groups=True,
2396
2453
  page_no=page_no,
2397
2454
  included_content_layers=included_content_layers,
2398
2455
  )
2399
2456
  ):
2400
- # If we've moved to a lower level, we're exiting one or more groups
2401
- if level < previous_level:
2402
- # Calculate how many levels we've exited
2403
- level_difference = previous_level - level
2404
- # Decrement list_nesting_level for each list group we've exited
2405
- list_nesting_level = max(0, list_nesting_level - level_difference)
2406
-
2407
- previous_level = level # Update previous_level for next iteration
2457
+ if item.self_ref in visited:
2458
+ continue
2459
+ else:
2460
+ visited.add(item.self_ref)
2408
2461
 
2409
2462
  if ix < from_element or to_element <= ix:
2410
2463
  continue # skip as many items as you want
2411
2464
 
2412
- if (isinstance(item, DocItem)) and (item.label not in labels):
2465
+ elif (isinstance(item, DocItem)) and (item.label not in labels):
2413
2466
  continue # skip any label that is not whitelisted
2414
2467
 
2415
- # Handle newlines between different types of content
2416
- if (
2417
- len(mdtexts) > 0
2418
- and not isinstance(item, (ListItem, GroupItem))
2419
- and in_list
2420
- ):
2421
- mdtexts[-1] += "\n"
2422
- in_list = False
2423
-
2424
- if isinstance(item, GroupItem) and item.label in [
2425
- GroupLabel.LIST,
2426
- GroupLabel.ORDERED_LIST,
2427
- ]:
2428
-
2429
- if list_nesting_level == 0: # Check if we're on the top level.
2430
- # In that case a new list starts directly after another list.
2431
- mdtexts.append("\n") # Add a blank line
2432
-
2433
- # Increment list nesting level when entering a new list
2434
- list_nesting_level += 1
2435
- in_list = True
2436
- continue
2437
-
2438
2468
  elif isinstance(item, GroupItem):
2439
- continue
2469
+ if item.label in [
2470
+ GroupLabel.LIST,
2471
+ GroupLabel.ORDERED_LIST,
2472
+ ]:
2473
+ comps = self._get_markdown_components(
2474
+ node=item,
2475
+ from_element=from_element,
2476
+ to_element=to_element,
2477
+ labels=labels,
2478
+ strict_text=strict_text,
2479
+ escaping_underscores=escaping_underscores,
2480
+ image_placeholder=image_placeholder,
2481
+ image_mode=image_mode,
2482
+ indent=indent,
2483
+ text_width=text_width,
2484
+ page_no=page_no,
2485
+ included_content_layers=included_content_layers,
2486
+ list_level=list_level + 1,
2487
+ is_inline_scope=is_inline_scope,
2488
+ visited=visited,
2489
+ )
2490
+ # NOTE: assumes unordered (flag & marker currently in ListItem)
2491
+ indent_str = list_level * indent * " "
2492
+ text = "\n".join(
2493
+ [
2494
+ # avoid additional marker on already evaled sublists
2495
+ cpt if cpt and cpt[0] == " " else f"{indent_str}- {cpt}"
2496
+ for cpt in comps
2497
+ ]
2498
+ )
2499
+ _ingest_text(text=text)
2500
+ elif item.label == GroupLabel.INLINE:
2501
+ comps = self._get_markdown_components(
2502
+ node=item,
2503
+ from_element=from_element,
2504
+ to_element=to_element,
2505
+ labels=labels,
2506
+ strict_text=strict_text,
2507
+ escaping_underscores=escaping_underscores,
2508
+ image_placeholder=image_placeholder,
2509
+ image_mode=image_mode,
2510
+ indent=indent,
2511
+ text_width=text_width,
2512
+ page_no=page_no,
2513
+ included_content_layers=included_content_layers,
2514
+ list_level=list_level,
2515
+ is_inline_scope=True,
2516
+ visited=visited,
2517
+ )
2518
+ _ingest_text(" ".join(comps))
2519
+ else:
2520
+ continue
2440
2521
 
2441
2522
  elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:
2442
- in_list = False
2443
2523
  marker = "" if strict_text else "#"
2444
2524
  text = f"{marker} {item.text}"
2445
- _append_text(text.strip() + "\n")
2525
+ _ingest_text(text.strip())
2446
2526
 
2447
2527
  elif (
2448
2528
  isinstance(item, TextItem)
2449
2529
  and item.label in [DocItemLabel.SECTION_HEADER]
2450
2530
  ) or isinstance(item, SectionHeaderItem):
2451
- in_list = False
2452
2531
  marker = ""
2453
2532
  if not strict_text:
2454
2533
  marker = "#" * level
2455
2534
  if len(marker) < 2:
2456
2535
  marker = "##"
2457
- text = f"{marker} {item.text}\n"
2458
- _append_text(text.strip() + "\n")
2459
-
2460
- elif isinstance(item, CodeItem) and item.label in labels:
2461
- in_list = False
2462
- text = f"```\n{item.text}\n```\n"
2463
- _append_text(text, do_escape_underscores=False, do_escape_html=False)
2464
-
2465
- elif isinstance(item, ListItem) and item.label in [DocItemLabel.LIST_ITEM]:
2466
- in_list = True
2467
- # Calculate indent based on list_nesting_level
2468
- # -1 because level 1 needs no indent
2469
- list_indent = " " * (indent * (list_nesting_level - 1))
2470
-
2471
- marker = ""
2472
- if strict_text:
2473
- marker = ""
2474
- elif item.enumerated:
2475
- marker = item.marker
2476
- else:
2477
- marker = "-" # Markdown needs only dash as item marker.
2536
+ text = f"{marker} {item.text}"
2537
+ _ingest_text(text.strip())
2478
2538
 
2479
- text = f"{list_indent}{marker} {item.text}"
2480
- _append_text(text)
2539
+ elif isinstance(item, CodeItem):
2540
+ text = f"`{item.text}`" if is_inline_scope else f"```\n{item.text}\n```"
2541
+ _ingest_text(text, do_escape_underscores=False, do_escape_html=False)
2481
2542
 
2482
2543
  elif isinstance(item, TextItem) and item.label in [DocItemLabel.FORMULA]:
2483
- in_list = False
2484
2544
  if item.text != "":
2485
- _append_text(
2486
- f"$${item.text}$$\n",
2545
+ _ingest_text(
2546
+ f"${item.text}$" if is_inline_scope else f"$${item.text}$$",
2487
2547
  do_escape_underscores=False,
2488
2548
  do_escape_html=False,
2489
2549
  )
2490
2550
  elif item.orig != "":
2491
- _append_text(
2492
- "<!-- formula-not-decoded -->\n",
2551
+ _ingest_text(
2552
+ "<!-- formula-not-decoded -->",
2493
2553
  do_escape_underscores=False,
2494
2554
  do_escape_html=False,
2495
2555
  )
2496
2556
 
2497
- elif isinstance(item, TextItem) and item.label in labels:
2498
- in_list = False
2557
+ elif isinstance(item, TextItem):
2499
2558
  if len(item.text) and text_width > 0:
2500
2559
  text = item.text
2501
2560
  wrapped_text = textwrap.fill(text, width=text_width)
2502
- _append_text(wrapped_text + "\n")
2561
+ _ingest_text(wrapped_text)
2503
2562
  elif len(item.text):
2504
- text = f"{item.text}\n"
2505
- _append_text(text)
2563
+ _ingest_text(item.text)
2506
2564
 
2507
2565
  elif isinstance(item, TableItem) and not strict_text:
2508
- in_list = False
2509
- _append_text(item.caption_text(self))
2566
+ if caption_text := item.caption_text(self):
2567
+ _ingest_text(caption_text)
2510
2568
  md_table = item.export_to_markdown()
2511
- _append_text("\n" + md_table + "\n")
2569
+ _ingest_text(md_table)
2512
2570
 
2513
2571
  elif isinstance(item, PictureItem) and not strict_text:
2514
- in_list = False
2515
- _append_text(item.caption_text(self))
2572
+ _ingest_text(item.caption_text(self))
2516
2573
 
2517
2574
  line = item.export_to_markdown(
2518
2575
  doc=self,
@@ -2520,19 +2577,17 @@ class DoclingDocument(BaseModel):
2520
2577
  image_mode=image_mode,
2521
2578
  )
2522
2579
 
2523
- _append_text(line, do_escape_html=False, do_escape_underscores=False)
2580
+ _ingest_text(line, do_escape_html=False, do_escape_underscores=False)
2524
2581
 
2525
- elif isinstance(item, DocItem) and item.label in labels:
2526
- in_list = False
2527
- text = "<!-- missing-text -->"
2528
- _append_text(text, do_escape_html=False, do_escape_underscores=False)
2582
+ elif isinstance(item, (KeyValueItem, FormItem)):
2583
+ text = item._export_to_markdown()
2584
+ _ingest_text(text, do_escape_html=False, do_escape_underscores=False)
2529
2585
 
2530
- mdtext = (delim.join(mdtexts)).strip()
2531
- mdtext = re.sub(
2532
- r"\n\n\n+", "\n\n", mdtext
2533
- ) # remove cases of double or more empty lines.
2586
+ elif isinstance(item, DocItem):
2587
+ text = "<!-- missing-text -->"
2588
+ _ingest_text(text, do_escape_html=False, do_escape_underscores=False)
2534
2589
 
2535
- return mdtext
2590
+ return components
2536
2591
 
2537
2592
  def export_to_text( # noqa: C901
2538
2593
  self,
@@ -2764,14 +2819,17 @@ class DoclingDocument(BaseModel):
2764
2819
  "</figure>"
2765
2820
  )
2766
2821
 
2822
+ img_fallback = _image_fallback(item)
2823
+
2767
2824
  # If the formula is not processed correctly, use its image
2768
2825
  if (
2769
2826
  item.text == ""
2770
2827
  and item.orig != ""
2771
2828
  and image_mode == ImageRefMode.EMBEDDED
2772
2829
  and len(item.prov) > 0
2830
+ and img_fallback is not None
2773
2831
  ):
2774
- text = _image_fallback(item)
2832
+ text = img_fallback
2775
2833
 
2776
2834
  # Building a math equation in MathML format
2777
2835
  # ref https://www.w3.org/TR/wai-aria-1.1/#math
@@ -2791,9 +2849,13 @@ class DoclingDocument(BaseModel):
2791
2849
  "Malformed formula cannot be rendered. "
2792
2850
  f"Error {err.__class__.__name__}, formula={math_formula}"
2793
2851
  )
2794
- if image_mode == ImageRefMode.EMBEDDED and len(item.prov) > 0:
2795
- text = _image_fallback(item)
2796
- else:
2852
+ if (
2853
+ image_mode == ImageRefMode.EMBEDDED
2854
+ and len(item.prov) > 0
2855
+ and img_fallback is not None
2856
+ ):
2857
+ text = img_fallback
2858
+ elif len(math_formula) > 0:
2797
2859
  text = f"<pre>{math_formula}</pre>"
2798
2860
 
2799
2861
  elif math_formula != "":
@@ -75,6 +75,7 @@ class GroupLabel(str, Enum):
75
75
  FORM_AREA = "form_area"
76
76
  KEY_VALUE_AREA = "key_value_area"
77
77
  COMMENT_SECTION = "comment_section"
78
+ INLINE = "inline"
78
79
 
79
80
  def __str__(self):
80
81
  """Get string value."""
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling-core"
3
- version = "2.20.0"
3
+ version = "2.21.0"
4
4
  description = "A python library to define and validate data types in Docling."
5
5
  license = "MIT"
6
6
  authors = [
File without changes
File without changes