docling-core 2.21.1__py3-none-any.whl → 2.21.2__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

@@ -73,7 +73,7 @@ class HybridChunker(BaseChunker):
73
73
  for t in text:
74
74
  total += self._count_text_tokens(t)
75
75
  return total
76
- return len(self._tokenizer.tokenize(text, max_length=None))
76
+ return len(self._tokenizer.tokenize(text))
77
77
 
78
78
  class _ChunkLengthInfo(BaseModel):
79
79
  total_len: int
@@ -82,7 +82,7 @@ class HybridChunker(BaseChunker):
82
82
 
83
83
  def _count_chunk_tokens(self, doc_chunk: DocChunk):
84
84
  ser_txt = self.serialize(chunk=doc_chunk)
85
- return len(self._tokenizer.tokenize(text=ser_txt, max_length=None))
85
+ return len(self._tokenizer.tokenize(text=ser_txt))
86
86
 
87
87
  def _doc_chunk_length(self, doc_chunk: DocChunk):
88
88
  text_length = self._count_text_tokens(doc_chunk.text)
@@ -800,7 +800,7 @@ class CodeItem(FloatingItem, TextItem):
800
800
  :param add_content: bool: (Default value = True)
801
801
 
802
802
  """
803
- body = f"<{self.label.value}{new_line}"
803
+ body = f"<{self.label.value}>{new_line}"
804
804
 
805
805
  if add_location:
806
806
  body += self.get_location_tokens(
@@ -813,7 +813,7 @@ class CodeItem(FloatingItem, TextItem):
813
813
  if add_content and self.text is not None:
814
814
  body += f"<_{self.code_language.value}_>{self.text}{new_line}"
815
815
 
816
- body += f"</{self.label.value}\n"
816
+ body += f"</{self.label.value}>\n"
817
817
 
818
818
  return body
819
819
 
@@ -2487,7 +2487,6 @@ class DoclingDocument(BaseModel):
2487
2487
  is_inline_scope=is_inline_scope,
2488
2488
  visited=visited,
2489
2489
  )
2490
- # NOTE: assumes unordered (flag & marker currently in ListItem)
2491
2490
  indent_str = list_level * indent * " "
2492
2491
  is_ol = item.label == GroupLabel.ORDERED_LIST
2493
2492
  text = "\n".join(
@@ -2501,7 +2500,12 @@ class DoclingDocument(BaseModel):
2501
2500
  for i, c in enumerate(comps)
2502
2501
  ]
2503
2502
  )
2504
- _ingest_text(text=text)
2503
+ _ingest_text(
2504
+ text=text,
2505
+ # special chars have already been escaped as needed
2506
+ do_escape_html=False,
2507
+ do_escape_underscores=False,
2508
+ )
2505
2509
  elif item.label == GroupLabel.INLINE:
2506
2510
  comps = self._get_markdown_components(
2507
2511
  node=item,
@@ -2520,7 +2524,13 @@ class DoclingDocument(BaseModel):
2520
2524
  is_inline_scope=True,
2521
2525
  visited=visited,
2522
2526
  )
2523
- _ingest_text(" ".join(comps))
2527
+ text = " ".join(comps)
2528
+ _ingest_text(
2529
+ text=text,
2530
+ # special chars have already been escaped as needed
2531
+ do_escape_html=False,
2532
+ do_escape_underscores=False,
2533
+ )
2524
2534
  else:
2525
2535
  continue
2526
2536
 
@@ -2838,7 +2848,7 @@ class DoclingDocument(BaseModel):
2838
2848
 
2839
2849
  # Building a math equation in MathML format
2840
2850
  # ref https://www.w3.org/TR/wai-aria-1.1/#math
2841
- elif formula_to_mathml:
2851
+ elif formula_to_mathml and len(math_formula) > 0:
2842
2852
  try:
2843
2853
  mathml_element = latex2mathml.converter.convert_to_element(
2844
2854
  math_formula, display="block"
@@ -2860,7 +2870,7 @@ class DoclingDocument(BaseModel):
2860
2870
  and img_fallback is not None
2861
2871
  ):
2862
2872
  text = img_fallback
2863
- elif len(math_formula) > 0:
2873
+ else:
2864
2874
  text = f"<pre>{math_formula}</pre>"
2865
2875
 
2866
2876
  elif math_formula != "":
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 2.21.1
3
+ Version: 2.21.2
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://ds4sd.github.io/
6
6
  License: MIT
@@ -19,12 +19,12 @@ docling_core/transforms/__init__.py,sha256=P81y_oqkiTN4Ld5crh1gQ6BbHqqR6C6nBt9AC
19
19
  docling_core/transforms/chunker/__init__.py,sha256=YdizSKXLmmK9eyYBsarHWr8Mx_AoA0PT7c0absibZMk,306
20
20
  docling_core/transforms/chunker/base.py,sha256=BSWTiFOsF5YaZaZJZY8nwIdOXb9uufJMRIds7LxRNh8,2546
21
21
  docling_core/transforms/chunker/hierarchical_chunker.py,sha256=MStDUDtzFGc6j8v9AkcAnnSHTDxdoiVrp8FTmRdGqU8,8138
22
- docling_core/transforms/chunker/hybrid_chunker.py,sha256=kokjDdxjc_gygOokQwYFVnHv2NjWTgf9uex8o0ole7w,9876
22
+ docling_core/transforms/chunker/hybrid_chunker.py,sha256=v-HpFg-HvQLi0gQtHm-6KlMfcMWupkBjwr5qF-rfr4E,9842
23
23
  docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HXo,260
24
24
  docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
25
25
  docling_core/types/doc/__init__.py,sha256=bysJn2iwjAHwThSWDPXEdVUUij7p_ax12_nx2_0CMdg,653
26
26
  docling_core/types/doc/base.py,sha256=22U1qDlD-2ICmgzbdZrjNayoPHnq4S1ks1GRoqB7y1Q,12542
27
- docling_core/types/doc/document.py,sha256=P8dx5lP3oVrdlrXJx-Y-nk-UM7llDF6ZwOqs046HAM4,110451
27
+ docling_core/types/doc/document.py,sha256=ajb-E6ZNcOWEY0ngEDhlUDx5oHBjvV9aQELJ616nU94,110854
28
28
  docling_core/types/doc/labels.py,sha256=0J9Gsqz-jQ4FP2yxs9wOxoTr3qg97BniFX7MJVziUmk,5684
29
29
  docling_core/types/doc/tokens.py,sha256=i73PXkmqXCLsQ5SddnJX8L9e_Ub2_K_DYSE-VE8NDq0,3925
30
30
  docling_core/types/doc/utils.py,sha256=SaiQD-WMMooFm1bMqwatU-IGhtG048iKJb-ppnJit_k,2250
@@ -56,8 +56,8 @@ docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2ty
56
56
  docling_core/utils/legacy.py,sha256=SqNQAxl97aHfoJEsC9vZcMJg5FNkmqKPFi-wdSrnfI0,24442
57
57
  docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
58
58
  docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
59
- docling_core-2.21.1.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
60
- docling_core-2.21.1.dist-info/METADATA,sha256=qz2AeXj0vfiBu24oWyMDiQSPvKM0yUn1Rj85JaUd7Yg,5803
61
- docling_core-2.21.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
62
- docling_core-2.21.1.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
63
- docling_core-2.21.1.dist-info/RECORD,,
59
+ docling_core-2.21.2.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
60
+ docling_core-2.21.2.dist-info/METADATA,sha256=qIh8v5l84ULFf25DQ70FqwxHdMDu_06oKYR9ZFr1HJg,5803
61
+ docling_core-2.21.2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
62
+ docling_core-2.21.2.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
63
+ docling_core-2.21.2.dist-info/RECORD,,