docling-core: docling_core-2.21.1-py3-none-any.whl → docling_core-2.21.2-py3-none-any.whl
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/transforms/chunker/hybrid_chunker.py +2 -2
- docling_core/types/doc/document.py +17 -7
- {docling_core-2.21.1.dist-info → docling_core-2.21.2.dist-info}/METADATA +1 -1
- {docling_core-2.21.1.dist-info → docling_core-2.21.2.dist-info}/RECORD +7 -7
- {docling_core-2.21.1.dist-info → docling_core-2.21.2.dist-info}/LICENSE +0 -0
- {docling_core-2.21.1.dist-info → docling_core-2.21.2.dist-info}/WHEEL +0 -0
- {docling_core-2.21.1.dist-info → docling_core-2.21.2.dist-info}/entry_points.txt +0 -0
|
@@ -73,7 +73,7 @@ class HybridChunker(BaseChunker):
|
|
|
73
73
|
for t in text:
|
|
74
74
|
total += self._count_text_tokens(t)
|
|
75
75
|
return total
|
|
76
|
-
return len(self._tokenizer.tokenize(text
|
|
76
|
+
return len(self._tokenizer.tokenize(text))
|
|
77
77
|
|
|
78
78
|
class _ChunkLengthInfo(BaseModel):
|
|
79
79
|
total_len: int
|
|
@@ -82,7 +82,7 @@ class HybridChunker(BaseChunker):
|
|
|
82
82
|
|
|
83
83
|
def _count_chunk_tokens(self, doc_chunk: DocChunk):
|
|
84
84
|
ser_txt = self.serialize(chunk=doc_chunk)
|
|
85
|
-
return len(self._tokenizer.tokenize(text=ser_txt
|
|
85
|
+
return len(self._tokenizer.tokenize(text=ser_txt))
|
|
86
86
|
|
|
87
87
|
def _doc_chunk_length(self, doc_chunk: DocChunk):
|
|
88
88
|
text_length = self._count_text_tokens(doc_chunk.text)
|
|
@@ -800,7 +800,7 @@ class CodeItem(FloatingItem, TextItem):
|
|
|
800
800
|
:param add_content: bool: (Default value = True)
|
|
801
801
|
|
|
802
802
|
"""
|
|
803
|
-
body = f"<{self.label.value}{new_line}"
|
|
803
|
+
body = f"<{self.label.value}>{new_line}"
|
|
804
804
|
|
|
805
805
|
if add_location:
|
|
806
806
|
body += self.get_location_tokens(
|
|
@@ -813,7 +813,7 @@ class CodeItem(FloatingItem, TextItem):
|
|
|
813
813
|
if add_content and self.text is not None:
|
|
814
814
|
body += f"<_{self.code_language.value}_>{self.text}{new_line}"
|
|
815
815
|
|
|
816
|
-
body += f"</{self.label.value}
|
|
816
|
+
body += f"</{self.label.value}>\n"
|
|
817
817
|
|
|
818
818
|
return body
|
|
819
819
|
|
|
@@ -2487,7 +2487,6 @@ class DoclingDocument(BaseModel):
|
|
|
2487
2487
|
is_inline_scope=is_inline_scope,
|
|
2488
2488
|
visited=visited,
|
|
2489
2489
|
)
|
|
2490
|
-
# NOTE: assumes unordered (flag & marker currently in ListItem)
|
|
2491
2490
|
indent_str = list_level * indent * " "
|
|
2492
2491
|
is_ol = item.label == GroupLabel.ORDERED_LIST
|
|
2493
2492
|
text = "\n".join(
|
|
@@ -2501,7 +2500,12 @@ class DoclingDocument(BaseModel):
|
|
|
2501
2500
|
for i, c in enumerate(comps)
|
|
2502
2501
|
]
|
|
2503
2502
|
)
|
|
2504
|
-
_ingest_text(
|
|
2503
|
+
_ingest_text(
|
|
2504
|
+
text=text,
|
|
2505
|
+
# special chars have already been escaped as needed
|
|
2506
|
+
do_escape_html=False,
|
|
2507
|
+
do_escape_underscores=False,
|
|
2508
|
+
)
|
|
2505
2509
|
elif item.label == GroupLabel.INLINE:
|
|
2506
2510
|
comps = self._get_markdown_components(
|
|
2507
2511
|
node=item,
|
|
@@ -2520,7 +2524,13 @@ class DoclingDocument(BaseModel):
|
|
|
2520
2524
|
is_inline_scope=True,
|
|
2521
2525
|
visited=visited,
|
|
2522
2526
|
)
|
|
2523
|
-
|
|
2527
|
+
text = " ".join(comps)
|
|
2528
|
+
_ingest_text(
|
|
2529
|
+
text=text,
|
|
2530
|
+
# special chars have already been escaped as needed
|
|
2531
|
+
do_escape_html=False,
|
|
2532
|
+
do_escape_underscores=False,
|
|
2533
|
+
)
|
|
2524
2534
|
else:
|
|
2525
2535
|
continue
|
|
2526
2536
|
|
|
@@ -2838,7 +2848,7 @@ class DoclingDocument(BaseModel):
|
|
|
2838
2848
|
|
|
2839
2849
|
# Building a math equation in MathML format
|
|
2840
2850
|
# ref https://www.w3.org/TR/wai-aria-1.1/#math
|
|
2841
|
-
elif formula_to_mathml:
|
|
2851
|
+
elif formula_to_mathml and len(math_formula) > 0:
|
|
2842
2852
|
try:
|
|
2843
2853
|
mathml_element = latex2mathml.converter.convert_to_element(
|
|
2844
2854
|
math_formula, display="block"
|
|
@@ -2860,7 +2870,7 @@ class DoclingDocument(BaseModel):
|
|
|
2860
2870
|
and img_fallback is not None
|
|
2861
2871
|
):
|
|
2862
2872
|
text = img_fallback
|
|
2863
|
-
|
|
2873
|
+
else:
|
|
2864
2874
|
text = f"<pre>{math_formula}</pre>"
|
|
2865
2875
|
|
|
2866
2876
|
elif math_formula != "":
|
|
@@ -19,12 +19,12 @@ docling_core/transforms/__init__.py,sha256=P81y_oqkiTN4Ld5crh1gQ6BbHqqR6C6nBt9AC
|
|
|
19
19
|
docling_core/transforms/chunker/__init__.py,sha256=YdizSKXLmmK9eyYBsarHWr8Mx_AoA0PT7c0absibZMk,306
|
|
20
20
|
docling_core/transforms/chunker/base.py,sha256=BSWTiFOsF5YaZaZJZY8nwIdOXb9uufJMRIds7LxRNh8,2546
|
|
21
21
|
docling_core/transforms/chunker/hierarchical_chunker.py,sha256=MStDUDtzFGc6j8v9AkcAnnSHTDxdoiVrp8FTmRdGqU8,8138
|
|
22
|
-
docling_core/transforms/chunker/hybrid_chunker.py,sha256=
|
|
22
|
+
docling_core/transforms/chunker/hybrid_chunker.py,sha256=v-HpFg-HvQLi0gQtHm-6KlMfcMWupkBjwr5qF-rfr4E,9842
|
|
23
23
|
docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HXo,260
|
|
24
24
|
docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
|
|
25
25
|
docling_core/types/doc/__init__.py,sha256=bysJn2iwjAHwThSWDPXEdVUUij7p_ax12_nx2_0CMdg,653
|
|
26
26
|
docling_core/types/doc/base.py,sha256=22U1qDlD-2ICmgzbdZrjNayoPHnq4S1ks1GRoqB7y1Q,12542
|
|
27
|
-
docling_core/types/doc/document.py,sha256=
|
|
27
|
+
docling_core/types/doc/document.py,sha256=ajb-E6ZNcOWEY0ngEDhlUDx5oHBjvV9aQELJ616nU94,110854
|
|
28
28
|
docling_core/types/doc/labels.py,sha256=0J9Gsqz-jQ4FP2yxs9wOxoTr3qg97BniFX7MJVziUmk,5684
|
|
29
29
|
docling_core/types/doc/tokens.py,sha256=i73PXkmqXCLsQ5SddnJX8L9e_Ub2_K_DYSE-VE8NDq0,3925
|
|
30
30
|
docling_core/types/doc/utils.py,sha256=SaiQD-WMMooFm1bMqwatU-IGhtG048iKJb-ppnJit_k,2250
|
|
@@ -56,8 +56,8 @@ docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2ty
|
|
|
56
56
|
docling_core/utils/legacy.py,sha256=SqNQAxl97aHfoJEsC9vZcMJg5FNkmqKPFi-wdSrnfI0,24442
|
|
57
57
|
docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
|
|
58
58
|
docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
|
|
59
|
-
docling_core-2.21.
|
|
60
|
-
docling_core-2.21.
|
|
61
|
-
docling_core-2.21.
|
|
62
|
-
docling_core-2.21.
|
|
63
|
-
docling_core-2.21.
|
|
59
|
+
docling_core-2.21.2.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
|
|
60
|
+
docling_core-2.21.2.dist-info/METADATA,sha256=qIh8v5l84ULFf25DQ70FqwxHdMDu_06oKYR9ZFr1HJg,5803
|
|
61
|
+
docling_core-2.21.2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
62
|
+
docling_core-2.21.2.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
|
|
63
|
+
docling_core-2.21.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|