docling-core 2.21.0__py3-none-any.whl → 2.21.2__py3-none-any.whl
This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/transforms/chunker/hybrid_chunker.py +2 -2
- docling_core/types/doc/document.py +24 -9
- {docling_core-2.21.0.dist-info → docling_core-2.21.2.dist-info}/METADATA +1 -1
- {docling_core-2.21.0.dist-info → docling_core-2.21.2.dist-info}/RECORD +7 -7
- {docling_core-2.21.0.dist-info → docling_core-2.21.2.dist-info}/LICENSE +0 -0
- {docling_core-2.21.0.dist-info → docling_core-2.21.2.dist-info}/WHEEL +0 -0
- {docling_core-2.21.0.dist-info → docling_core-2.21.2.dist-info}/entry_points.txt +0 -0
|
@@ -73,7 +73,7 @@ class HybridChunker(BaseChunker):
|
|
|
73
73
|
for t in text:
|
|
74
74
|
total += self._count_text_tokens(t)
|
|
75
75
|
return total
|
|
76
|
-
return len(self._tokenizer.tokenize(text
|
|
76
|
+
return len(self._tokenizer.tokenize(text))
|
|
77
77
|
|
|
78
78
|
class _ChunkLengthInfo(BaseModel):
|
|
79
79
|
total_len: int
|
|
@@ -82,7 +82,7 @@ class HybridChunker(BaseChunker):
|
|
|
82
82
|
|
|
83
83
|
def _count_chunk_tokens(self, doc_chunk: DocChunk):
|
|
84
84
|
ser_txt = self.serialize(chunk=doc_chunk)
|
|
85
|
-
return len(self._tokenizer.tokenize(text=ser_txt
|
|
85
|
+
return len(self._tokenizer.tokenize(text=ser_txt))
|
|
86
86
|
|
|
87
87
|
def _doc_chunk_length(self, doc_chunk: DocChunk):
|
|
88
88
|
text_length = self._count_text_tokens(doc_chunk.text)
|
|
@@ -800,7 +800,7 @@ class CodeItem(FloatingItem, TextItem):
|
|
|
800
800
|
:param add_content: bool: (Default value = True)
|
|
801
801
|
|
|
802
802
|
"""
|
|
803
|
-
body = f"<{self.label.value}{new_line}"
|
|
803
|
+
body = f"<{self.label.value}>{new_line}"
|
|
804
804
|
|
|
805
805
|
if add_location:
|
|
806
806
|
body += self.get_location_tokens(
|
|
@@ -813,7 +813,7 @@ class CodeItem(FloatingItem, TextItem):
|
|
|
813
813
|
if add_content and self.text is not None:
|
|
814
814
|
body += f"<_{self.code_language.value}_>{self.text}{new_line}"
|
|
815
815
|
|
|
816
|
-
body += f"</{self.label.value}
|
|
816
|
+
body += f"</{self.label.value}>\n"
|
|
817
817
|
|
|
818
818
|
return body
|
|
819
819
|
|
|
@@ -2487,16 +2487,25 @@ class DoclingDocument(BaseModel):
|
|
|
2487
2487
|
is_inline_scope=is_inline_scope,
|
|
2488
2488
|
visited=visited,
|
|
2489
2489
|
)
|
|
2490
|
-
# NOTE: assumes unordered (flag & marker currently in ListItem)
|
|
2491
2490
|
indent_str = list_level * indent * " "
|
|
2491
|
+
is_ol = item.label == GroupLabel.ORDERED_LIST
|
|
2492
2492
|
text = "\n".join(
|
|
2493
2493
|
[
|
|
2494
2494
|
# avoid additional marker on already evaled sublists
|
|
2495
|
-
|
|
2496
|
-
|
|
2495
|
+
(
|
|
2496
|
+
c
|
|
2497
|
+
if c and c[0] == " "
|
|
2498
|
+
else f"{indent_str}{f'{i + 1}.' if is_ol else '-'} {c}"
|
|
2499
|
+
)
|
|
2500
|
+
for i, c in enumerate(comps)
|
|
2497
2501
|
]
|
|
2498
2502
|
)
|
|
2499
|
-
_ingest_text(
|
|
2503
|
+
_ingest_text(
|
|
2504
|
+
text=text,
|
|
2505
|
+
# special chars have already been escaped as needed
|
|
2506
|
+
do_escape_html=False,
|
|
2507
|
+
do_escape_underscores=False,
|
|
2508
|
+
)
|
|
2500
2509
|
elif item.label == GroupLabel.INLINE:
|
|
2501
2510
|
comps = self._get_markdown_components(
|
|
2502
2511
|
node=item,
|
|
@@ -2515,7 +2524,13 @@ class DoclingDocument(BaseModel):
|
|
|
2515
2524
|
is_inline_scope=True,
|
|
2516
2525
|
visited=visited,
|
|
2517
2526
|
)
|
|
2518
|
-
|
|
2527
|
+
text = " ".join(comps)
|
|
2528
|
+
_ingest_text(
|
|
2529
|
+
text=text,
|
|
2530
|
+
# special chars have already been escaped as needed
|
|
2531
|
+
do_escape_html=False,
|
|
2532
|
+
do_escape_underscores=False,
|
|
2533
|
+
)
|
|
2519
2534
|
else:
|
|
2520
2535
|
continue
|
|
2521
2536
|
|
|
@@ -2833,7 +2848,7 @@ class DoclingDocument(BaseModel):
|
|
|
2833
2848
|
|
|
2834
2849
|
# Building a math equation in MathML format
|
|
2835
2850
|
# ref https://www.w3.org/TR/wai-aria-1.1/#math
|
|
2836
|
-
elif formula_to_mathml:
|
|
2851
|
+
elif formula_to_mathml and len(math_formula) > 0:
|
|
2837
2852
|
try:
|
|
2838
2853
|
mathml_element = latex2mathml.converter.convert_to_element(
|
|
2839
2854
|
math_formula, display="block"
|
|
@@ -2855,7 +2870,7 @@ class DoclingDocument(BaseModel):
|
|
|
2855
2870
|
and img_fallback is not None
|
|
2856
2871
|
):
|
|
2857
2872
|
text = img_fallback
|
|
2858
|
-
|
|
2873
|
+
else:
|
|
2859
2874
|
text = f"<pre>{math_formula}</pre>"
|
|
2860
2875
|
|
|
2861
2876
|
elif math_formula != "":
|
|
@@ -19,12 +19,12 @@ docling_core/transforms/__init__.py,sha256=P81y_oqkiTN4Ld5crh1gQ6BbHqqR6C6nBt9AC
|
|
|
19
19
|
docling_core/transforms/chunker/__init__.py,sha256=YdizSKXLmmK9eyYBsarHWr8Mx_AoA0PT7c0absibZMk,306
|
|
20
20
|
docling_core/transforms/chunker/base.py,sha256=BSWTiFOsF5YaZaZJZY8nwIdOXb9uufJMRIds7LxRNh8,2546
|
|
21
21
|
docling_core/transforms/chunker/hierarchical_chunker.py,sha256=MStDUDtzFGc6j8v9AkcAnnSHTDxdoiVrp8FTmRdGqU8,8138
|
|
22
|
-
docling_core/transforms/chunker/hybrid_chunker.py,sha256=
|
|
22
|
+
docling_core/transforms/chunker/hybrid_chunker.py,sha256=v-HpFg-HvQLi0gQtHm-6KlMfcMWupkBjwr5qF-rfr4E,9842
|
|
23
23
|
docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HXo,260
|
|
24
24
|
docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
|
|
25
25
|
docling_core/types/doc/__init__.py,sha256=bysJn2iwjAHwThSWDPXEdVUUij7p_ax12_nx2_0CMdg,653
|
|
26
26
|
docling_core/types/doc/base.py,sha256=22U1qDlD-2ICmgzbdZrjNayoPHnq4S1ks1GRoqB7y1Q,12542
|
|
27
|
-
docling_core/types/doc/document.py,sha256=
|
|
27
|
+
docling_core/types/doc/document.py,sha256=ajb-E6ZNcOWEY0ngEDhlUDx5oHBjvV9aQELJ616nU94,110854
|
|
28
28
|
docling_core/types/doc/labels.py,sha256=0J9Gsqz-jQ4FP2yxs9wOxoTr3qg97BniFX7MJVziUmk,5684
|
|
29
29
|
docling_core/types/doc/tokens.py,sha256=i73PXkmqXCLsQ5SddnJX8L9e_Ub2_K_DYSE-VE8NDq0,3925
|
|
30
30
|
docling_core/types/doc/utils.py,sha256=SaiQD-WMMooFm1bMqwatU-IGhtG048iKJb-ppnJit_k,2250
|
|
@@ -56,8 +56,8 @@ docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2ty
|
|
|
56
56
|
docling_core/utils/legacy.py,sha256=SqNQAxl97aHfoJEsC9vZcMJg5FNkmqKPFi-wdSrnfI0,24442
|
|
57
57
|
docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
|
|
58
58
|
docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
|
|
59
|
-
docling_core-2.21.
|
|
60
|
-
docling_core-2.21.
|
|
61
|
-
docling_core-2.21.
|
|
62
|
-
docling_core-2.21.
|
|
63
|
-
docling_core-2.21.
|
|
59
|
+
docling_core-2.21.2.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
|
|
60
|
+
docling_core-2.21.2.dist-info/METADATA,sha256=qIh8v5l84ULFf25DQ70FqwxHdMDu_06oKYR9ZFr1HJg,5803
|
|
61
|
+
docling_core-2.21.2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
62
|
+
docling_core-2.21.2.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
|
|
63
|
+
docling_core-2.21.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|