docling-core 2.38.2__py3-none-any.whl → 2.40.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/transforms/chunker/hierarchical_chunker.py +2 -3
- docling_core/transforms/serializer/base.py +2 -3
- docling_core/transforms/serializer/common.py +3 -4
- docling_core/transforms/serializer/doctags.py +4 -5
- docling_core/transforms/serializer/html.py +57 -10
- docling_core/transforms/serializer/markdown.py +75 -21
- docling_core/types/doc/__init__.py +1 -0
- docling_core/types/doc/document.py +78 -65
- docling_core/types/doc/labels.py +1 -1
- docling_core/types/doc/page.py +3 -2
- docling_core/types/doc/utils.py +18 -7
- docling_core/utils/file.py +27 -0
- docling_core/utils/legacy.py +1 -2
- {docling_core-2.38.2.dist-info → docling_core-2.40.0.dist-info}/METADATA +1 -1
- {docling_core-2.38.2.dist-info → docling_core-2.40.0.dist-info}/RECORD +19 -19
- {docling_core-2.38.2.dist-info → docling_core-2.40.0.dist-info}/WHEEL +0 -0
- {docling_core-2.38.2.dist-info → docling_core-2.40.0.dist-info}/entry_points.txt +0 -0
- {docling_core-2.38.2.dist-info → docling_core-2.40.0.dist-info}/licenses/LICENSE +0 -0
- {docling_core-2.38.2.dist-info → docling_core-2.40.0.dist-info}/top_level.txt +0 -0
|
@@ -35,11 +35,10 @@ from docling_core.types.doc.document import (
|
|
|
35
35
|
DocumentOrigin,
|
|
36
36
|
InlineGroup,
|
|
37
37
|
LevelNumber,
|
|
38
|
-
|
|
38
|
+
ListGroup,
|
|
39
39
|
SectionHeaderItem,
|
|
40
40
|
TableItem,
|
|
41
41
|
TitleItem,
|
|
42
|
-
UnorderedList,
|
|
43
42
|
)
|
|
44
43
|
|
|
45
44
|
_VERSION: Final = "1.0.0"
|
|
@@ -240,7 +239,7 @@ class HierarchicalChunker(BaseChunker):
|
|
|
240
239
|
heading_by_level.pop(k, None)
|
|
241
240
|
continue
|
|
242
241
|
elif (
|
|
243
|
-
isinstance(item, (
|
|
242
|
+
isinstance(item, (ListGroup, InlineGroup, DocItem))
|
|
244
243
|
and item.self_ref not in visited
|
|
245
244
|
):
|
|
246
245
|
ser_res = my_doc_ser.serialize(item=item, visited=visited)
|
|
@@ -17,12 +17,11 @@ from docling_core.types.doc.document import (
|
|
|
17
17
|
FormItem,
|
|
18
18
|
InlineGroup,
|
|
19
19
|
KeyValueItem,
|
|
20
|
+
ListGroup,
|
|
20
21
|
NodeItem,
|
|
21
|
-
OrderedList,
|
|
22
22
|
PictureItem,
|
|
23
23
|
TableItem,
|
|
24
24
|
TextItem,
|
|
25
|
-
UnorderedList,
|
|
26
25
|
)
|
|
27
26
|
|
|
28
27
|
|
|
@@ -128,7 +127,7 @@ class BaseListSerializer(ABC):
|
|
|
128
127
|
def serialize(
|
|
129
128
|
self,
|
|
130
129
|
*,
|
|
131
|
-
item:
|
|
130
|
+
item: ListGroup,
|
|
132
131
|
doc_serializer: "BaseDocSerializer",
|
|
133
132
|
doc: DoclingDocument,
|
|
134
133
|
**kwargs: Any,
|
|
@@ -39,8 +39,8 @@ from docling_core.types.doc.document import (
|
|
|
39
39
|
FormItem,
|
|
40
40
|
InlineGroup,
|
|
41
41
|
KeyValueItem,
|
|
42
|
+
ListGroup,
|
|
42
43
|
NodeItem,
|
|
43
|
-
OrderedList,
|
|
44
44
|
PictureClassificationData,
|
|
45
45
|
PictureDataType,
|
|
46
46
|
PictureItem,
|
|
@@ -49,7 +49,6 @@ from docling_core.types.doc.document import (
|
|
|
49
49
|
TableAnnotationType,
|
|
50
50
|
TableItem,
|
|
51
51
|
TextItem,
|
|
52
|
-
UnorderedList,
|
|
53
52
|
)
|
|
54
53
|
from docling_core.types.doc.labels import DocItemLabel
|
|
55
54
|
|
|
@@ -89,7 +88,7 @@ def _iterate_items(
|
|
|
89
88
|
):
|
|
90
89
|
if add_page_breaks:
|
|
91
90
|
if (
|
|
92
|
-
isinstance(item, (
|
|
91
|
+
isinstance(item, (ListGroup, InlineGroup))
|
|
93
92
|
and item.self_ref not in my_visited
|
|
94
93
|
):
|
|
95
94
|
# if group starts with new page, yield page break before group node
|
|
@@ -316,7 +315,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
316
315
|
########
|
|
317
316
|
# groups
|
|
318
317
|
########
|
|
319
|
-
if isinstance(item,
|
|
318
|
+
if isinstance(item, ListGroup):
|
|
320
319
|
part = self.list_serializer.serialize(
|
|
321
320
|
item=item,
|
|
322
321
|
doc_serializer=self,
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
"""Define classes for Doctags serialization."""
|
|
2
2
|
|
|
3
3
|
from enum import Enum
|
|
4
|
-
from typing import Any, Dict, List, Optional
|
|
4
|
+
from typing import Any, Dict, List, Optional
|
|
5
5
|
|
|
6
6
|
from pydantic import BaseModel
|
|
7
7
|
from typing_extensions import override
|
|
@@ -34,9 +34,9 @@ from docling_core.types.doc.document import (
|
|
|
34
34
|
FormItem,
|
|
35
35
|
InlineGroup,
|
|
36
36
|
KeyValueItem,
|
|
37
|
+
ListGroup,
|
|
37
38
|
ListItem,
|
|
38
39
|
NodeItem,
|
|
39
|
-
OrderedList,
|
|
40
40
|
PictureClassificationData,
|
|
41
41
|
PictureItem,
|
|
42
42
|
PictureMoleculeData,
|
|
@@ -44,7 +44,6 @@ from docling_core.types.doc.document import (
|
|
|
44
44
|
ProvenanceItem,
|
|
45
45
|
TableItem,
|
|
46
46
|
TextItem,
|
|
47
|
-
UnorderedList,
|
|
48
47
|
)
|
|
49
48
|
from docling_core.types.doc.labels import DocItemLabel, PictureClassificationLabel
|
|
50
49
|
from docling_core.types.doc.tokens import DocumentToken
|
|
@@ -376,7 +375,7 @@ class DocTagsListSerializer(BaseModel, BaseListSerializer):
|
|
|
376
375
|
def serialize(
|
|
377
376
|
self,
|
|
378
377
|
*,
|
|
379
|
-
item:
|
|
378
|
+
item: ListGroup,
|
|
380
379
|
doc_serializer: "BaseDocSerializer",
|
|
381
380
|
doc: DoclingDocument,
|
|
382
381
|
list_level: int = 0,
|
|
@@ -406,7 +405,7 @@ class DocTagsListSerializer(BaseModel, BaseListSerializer):
|
|
|
406
405
|
text_res = f"{text_res}{delim}"
|
|
407
406
|
wrap_tag = (
|
|
408
407
|
DocumentToken.ORDERED_LIST.value
|
|
409
|
-
if
|
|
408
|
+
if item.first_item_is_enumerated(doc)
|
|
410
409
|
else DocumentToken.UNORDERED_LIST.value
|
|
411
410
|
)
|
|
412
411
|
text_res = _wrap(text=text_res, wrap_tag=wrap_tag)
|
|
@@ -58,9 +58,9 @@ from docling_core.types.doc.document import (
|
|
|
58
58
|
ImageRef,
|
|
59
59
|
InlineGroup,
|
|
60
60
|
KeyValueItem,
|
|
61
|
+
ListGroup,
|
|
61
62
|
ListItem,
|
|
62
63
|
NodeItem,
|
|
63
|
-
OrderedList,
|
|
64
64
|
PictureClassificationData,
|
|
65
65
|
PictureItem,
|
|
66
66
|
PictureMoleculeData,
|
|
@@ -70,7 +70,6 @@ from docling_core.types.doc.document import (
|
|
|
70
70
|
TableItem,
|
|
71
71
|
TextItem,
|
|
72
72
|
TitleItem,
|
|
73
|
-
UnorderedList,
|
|
74
73
|
)
|
|
75
74
|
from docling_core.types.doc.labels import DocItemLabel
|
|
76
75
|
from docling_core.types.doc.utils import (
|
|
@@ -117,6 +116,8 @@ class HTMLParams(CommonParams):
|
|
|
117
116
|
|
|
118
117
|
include_annotations: bool = True
|
|
119
118
|
|
|
119
|
+
show_original_list_item_marker: bool = True
|
|
120
|
+
|
|
120
121
|
|
|
121
122
|
class HTMLTextSerializer(BaseModel, BaseTextSerializer):
|
|
122
123
|
"""HTML-specific text item serializer."""
|
|
@@ -162,7 +163,19 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
|
|
|
162
163
|
elif isinstance(item, ListItem):
|
|
163
164
|
# List items are handled by list serializer
|
|
164
165
|
text_inner = self._prepare_content(item.text)
|
|
165
|
-
text =
|
|
166
|
+
text = (
|
|
167
|
+
get_html_tag_with_text_direction(
|
|
168
|
+
html_tag="li",
|
|
169
|
+
text=text_inner,
|
|
170
|
+
attrs=(
|
|
171
|
+
{"style": f"list-style-type: '{item.marker} ';"}
|
|
172
|
+
if params.show_original_list_item_marker and item.marker
|
|
173
|
+
else {}
|
|
174
|
+
),
|
|
175
|
+
)
|
|
176
|
+
if text_inner
|
|
177
|
+
else ""
|
|
178
|
+
)
|
|
166
179
|
|
|
167
180
|
elif is_inline_scope:
|
|
168
181
|
text = self._prepare_content(item.text)
|
|
@@ -680,7 +693,7 @@ class HTMLListSerializer(BaseModel, BaseListSerializer):
|
|
|
680
693
|
def serialize(
|
|
681
694
|
self,
|
|
682
695
|
*,
|
|
683
|
-
item:
|
|
696
|
+
item: ListGroup,
|
|
684
697
|
doc_serializer: "BaseDocSerializer",
|
|
685
698
|
doc: DoclingDocument,
|
|
686
699
|
list_level: int = 0,
|
|
@@ -690,7 +703,7 @@ class HTMLListSerializer(BaseModel, BaseListSerializer):
|
|
|
690
703
|
) -> SerializationResult:
|
|
691
704
|
"""Serializes a list to HTML."""
|
|
692
705
|
my_visited: set[str] = visited if visited is not None else set()
|
|
693
|
-
|
|
706
|
+
params = HTMLParams(**kwargs)
|
|
694
707
|
# Get all child parts
|
|
695
708
|
parts = doc_serializer.get_parts(
|
|
696
709
|
item=item,
|
|
@@ -706,17 +719,51 @@ class HTMLListSerializer(BaseModel, BaseListSerializer):
|
|
|
706
719
|
(
|
|
707
720
|
p.text
|
|
708
721
|
if (
|
|
709
|
-
(
|
|
710
|
-
|
|
711
|
-
|
|
722
|
+
(
|
|
723
|
+
p.text.startswith(("<li>", "<li "))
|
|
724
|
+
and p.text.endswith("</li>")
|
|
725
|
+
)
|
|
726
|
+
or (
|
|
727
|
+
p.text.startswith(("<ol>", "<ol "))
|
|
728
|
+
and p.text.endswith("</ol>")
|
|
729
|
+
)
|
|
730
|
+
or (
|
|
731
|
+
p.text.startswith(("<ul>", "<ul "))
|
|
732
|
+
and p.text.endswith("</ul>")
|
|
733
|
+
)
|
|
734
|
+
)
|
|
735
|
+
else (
|
|
736
|
+
get_html_tag_with_text_direction(
|
|
737
|
+
html_tag="li",
|
|
738
|
+
text=p.text,
|
|
739
|
+
attrs=(
|
|
740
|
+
{
|
|
741
|
+
"style": f"list-style-type: '{grandparent_item.marker} ';"
|
|
742
|
+
}
|
|
743
|
+
if params.show_original_list_item_marker
|
|
744
|
+
and grandparent_item.marker
|
|
745
|
+
else {}
|
|
746
|
+
),
|
|
747
|
+
)
|
|
748
|
+
if p.spans
|
|
749
|
+
and p.spans[0].item.parent
|
|
750
|
+
and isinstance(
|
|
751
|
+
(parent_item := p.spans[0].item.parent.resolve(doc)),
|
|
752
|
+
InlineGroup,
|
|
753
|
+
)
|
|
754
|
+
and parent_item.parent
|
|
755
|
+
and isinstance(
|
|
756
|
+
(grandparent_item := parent_item.parent.resolve(doc)),
|
|
757
|
+
ListItem,
|
|
758
|
+
)
|
|
759
|
+
else f"<li>{p.text}</li>"
|
|
712
760
|
)
|
|
713
|
-
else f"<li>{p.text}</li>"
|
|
714
761
|
)
|
|
715
762
|
for p in parts
|
|
716
763
|
]
|
|
717
764
|
)
|
|
718
765
|
if text_res:
|
|
719
|
-
tag = "ol" if
|
|
766
|
+
tag = "ol" if item.first_item_is_enumerated(doc) else "ul"
|
|
720
767
|
text_res = f"<{tag}>\n{text_res}\n</{tag}>"
|
|
721
768
|
|
|
722
769
|
return create_ser_result(text=text_res, span_source=parts)
|
|
@@ -7,6 +7,7 @@
|
|
|
7
7
|
import html
|
|
8
8
|
import re
|
|
9
9
|
import textwrap
|
|
10
|
+
from enum import Enum
|
|
10
11
|
from pathlib import Path
|
|
11
12
|
from typing import Any, Optional, Union
|
|
12
13
|
|
|
@@ -31,7 +32,6 @@ from docling_core.transforms.serializer.common import (
|
|
|
31
32
|
CommonParams,
|
|
32
33
|
DocSerializer,
|
|
33
34
|
_get_annotation_text,
|
|
34
|
-
_PageBreakSerResult,
|
|
35
35
|
create_ser_result,
|
|
36
36
|
)
|
|
37
37
|
from docling_core.types.doc.base import ImageRefMode
|
|
@@ -48,8 +48,9 @@ from docling_core.types.doc.document import (
|
|
|
48
48
|
ImageRef,
|
|
49
49
|
InlineGroup,
|
|
50
50
|
KeyValueItem,
|
|
51
|
+
ListGroup,
|
|
52
|
+
ListItem,
|
|
51
53
|
NodeItem,
|
|
52
|
-
OrderedList,
|
|
53
54
|
PictureClassificationData,
|
|
54
55
|
PictureItem,
|
|
55
56
|
PictureMoleculeData,
|
|
@@ -58,7 +59,6 @@ from docling_core.types.doc.document import (
|
|
|
58
59
|
TableItem,
|
|
59
60
|
TextItem,
|
|
60
61
|
TitleItem,
|
|
61
|
-
UnorderedList,
|
|
62
62
|
)
|
|
63
63
|
|
|
64
64
|
|
|
@@ -79,6 +79,14 @@ def _get_annotation_ser_result(
|
|
|
79
79
|
)
|
|
80
80
|
|
|
81
81
|
|
|
82
|
+
class OrigListItemMarkerMode(str, Enum):
|
|
83
|
+
"""Display mode for original list item marker."""
|
|
84
|
+
|
|
85
|
+
NEVER = "never"
|
|
86
|
+
ALWAYS = "always"
|
|
87
|
+
AUTO = "auto"
|
|
88
|
+
|
|
89
|
+
|
|
82
90
|
class MarkdownParams(CommonParams):
|
|
83
91
|
"""Markdown-specific serialization parameters."""
|
|
84
92
|
|
|
@@ -93,6 +101,8 @@ class MarkdownParams(CommonParams):
|
|
|
93
101
|
escape_html: bool = True
|
|
94
102
|
include_annotations: bool = True
|
|
95
103
|
mark_annotations: bool = False
|
|
104
|
+
orig_list_item_marker_mode: OrigListItemMarkerMode = OrigListItemMarkerMode.AUTO
|
|
105
|
+
ensure_valid_list_item_marker: bool = True
|
|
96
106
|
|
|
97
107
|
|
|
98
108
|
class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
|
|
@@ -117,7 +127,7 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
|
|
|
117
127
|
escape_html = True
|
|
118
128
|
escape_underscores = True
|
|
119
129
|
processing_pending = True
|
|
120
|
-
if isinstance(item, (TitleItem, SectionHeaderItem)):
|
|
130
|
+
if isinstance(item, (ListItem, TitleItem, SectionHeaderItem)):
|
|
121
131
|
# case where processing/formatting should be applied first (in inner scope)
|
|
122
132
|
processing_pending = False
|
|
123
133
|
if (
|
|
@@ -127,7 +137,7 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
|
|
|
127
137
|
(child_group := item.children[0].resolve(doc)), InlineGroup
|
|
128
138
|
)
|
|
129
139
|
):
|
|
130
|
-
# case of heading
|
|
140
|
+
# case of inline within heading / list item
|
|
131
141
|
ser_res = doc_serializer.serialize(item=child_group)
|
|
132
142
|
text = ser_res.text
|
|
133
143
|
for span in ser_res.spans:
|
|
@@ -140,8 +150,55 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
|
|
|
140
150
|
formatting=item.formatting,
|
|
141
151
|
hyperlink=item.hyperlink,
|
|
142
152
|
)
|
|
143
|
-
|
|
144
|
-
|
|
153
|
+
|
|
154
|
+
if isinstance(item, ListItem):
|
|
155
|
+
pieces: list[str] = []
|
|
156
|
+
case_auto = (
|
|
157
|
+
params.orig_list_item_marker_mode == OrigListItemMarkerMode.AUTO
|
|
158
|
+
and bool(re.search(r"[a-zA-Z0-9]", item.marker))
|
|
159
|
+
)
|
|
160
|
+
case_already_valid = (
|
|
161
|
+
params.ensure_valid_list_item_marker
|
|
162
|
+
and params.orig_list_item_marker_mode
|
|
163
|
+
!= OrigListItemMarkerMode.NEVER
|
|
164
|
+
and (
|
|
165
|
+
item.marker in ["-", "*", "+"]
|
|
166
|
+
or re.fullmatch(r"\d+\.", item.marker)
|
|
167
|
+
)
|
|
168
|
+
)
|
|
169
|
+
|
|
170
|
+
# wrap with outer marker (if applicable)
|
|
171
|
+
if params.ensure_valid_list_item_marker and not case_already_valid:
|
|
172
|
+
assert item.parent and isinstance(
|
|
173
|
+
(list_group := item.parent.resolve(doc)), ListGroup
|
|
174
|
+
)
|
|
175
|
+
if list_group.first_item_is_enumerated(doc) and (
|
|
176
|
+
params.orig_list_item_marker_mode != OrigListItemMarkerMode.AUTO
|
|
177
|
+
or not item.marker
|
|
178
|
+
):
|
|
179
|
+
pos = -1
|
|
180
|
+
for i, child in enumerate(list_group.children):
|
|
181
|
+
if child.resolve(doc) == item:
|
|
182
|
+
pos = i
|
|
183
|
+
break
|
|
184
|
+
md_marker = f"{pos + 1}."
|
|
185
|
+
else:
|
|
186
|
+
md_marker = "-"
|
|
187
|
+
pieces.append(md_marker)
|
|
188
|
+
|
|
189
|
+
# include original marker (if applicable)
|
|
190
|
+
if item.marker and (
|
|
191
|
+
params.orig_list_item_marker_mode == OrigListItemMarkerMode.ALWAYS
|
|
192
|
+
or case_auto
|
|
193
|
+
or case_already_valid
|
|
194
|
+
):
|
|
195
|
+
pieces.append(item.marker)
|
|
196
|
+
|
|
197
|
+
pieces.append(text)
|
|
198
|
+
text_part = " ".join(pieces)
|
|
199
|
+
else:
|
|
200
|
+
num_hashes = 1 if isinstance(item, TitleItem) else item.level + 1
|
|
201
|
+
text_part = f"{num_hashes * '#'} {text}"
|
|
145
202
|
elif isinstance(item, CodeItem):
|
|
146
203
|
text_part = f"`{text}`" if is_inline_scope else f"```\n{text}\n```"
|
|
147
204
|
escape_html = False
|
|
@@ -452,7 +509,7 @@ class MarkdownListSerializer(BaseModel, BaseListSerializer):
|
|
|
452
509
|
def serialize(
|
|
453
510
|
self,
|
|
454
511
|
*,
|
|
455
|
-
item:
|
|
512
|
+
item: ListGroup,
|
|
456
513
|
doc_serializer: "BaseDocSerializer",
|
|
457
514
|
doc: DoclingDocument,
|
|
458
515
|
list_level: int = 0,
|
|
@@ -473,27 +530,24 @@ class MarkdownListSerializer(BaseModel, BaseListSerializer):
|
|
|
473
530
|
sep = "\n"
|
|
474
531
|
my_parts: list[SerializationResult] = []
|
|
475
532
|
for p in parts:
|
|
476
|
-
if
|
|
477
|
-
my_parts
|
|
533
|
+
if (
|
|
534
|
+
my_parts
|
|
535
|
+
and p.text
|
|
536
|
+
and p.spans
|
|
537
|
+
and p.spans[0].item.parent
|
|
538
|
+
and isinstance(p.spans[0].item.parent.resolve(doc), InlineGroup)
|
|
539
|
+
):
|
|
540
|
+
my_parts[-1].text = f"{my_parts[-1].text}{p.text}" # append to last
|
|
478
541
|
my_parts[-1].spans.extend(p.spans)
|
|
479
542
|
else:
|
|
480
543
|
my_parts.append(p)
|
|
481
544
|
|
|
482
545
|
indent_str = list_level * params.indent * " "
|
|
483
|
-
is_ol = isinstance(item, OrderedList)
|
|
484
546
|
text_res = sep.join(
|
|
485
547
|
[
|
|
486
548
|
# avoid additional marker on already evaled sublists
|
|
487
|
-
(
|
|
488
|
-
|
|
489
|
-
if c.text and c.text[0] == " "
|
|
490
|
-
else (
|
|
491
|
-
f"{indent_str}"
|
|
492
|
-
f"{'' if isinstance(c, _PageBreakSerResult) else (f'{i + 1}. ' if is_ol else '- ')}" # noqa: E501
|
|
493
|
-
f"{c.text}"
|
|
494
|
-
)
|
|
495
|
-
)
|
|
496
|
-
for i, c in enumerate(my_parts)
|
|
549
|
+
(c.text if c.text and c.text[0] == " " else f"{indent_str}{c.text}")
|
|
550
|
+
for c in my_parts
|
|
497
551
|
]
|
|
498
552
|
)
|
|
499
553
|
return create_ser_result(text=text_res, span_source=my_parts)
|
|
@@ -54,7 +54,7 @@ _logger = logging.getLogger(__name__)
|
|
|
54
54
|
|
|
55
55
|
Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
|
|
56
56
|
LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
|
|
57
|
-
CURRENT_VERSION: Final = "1.
|
|
57
|
+
CURRENT_VERSION: Final = "1.5.0"
|
|
58
58
|
|
|
59
59
|
DEFAULT_EXPORT_LABELS = {
|
|
60
60
|
DocItemLabel.TITLE,
|
|
@@ -133,12 +133,6 @@ class MiscAnnotation(BaseAnnotation):
|
|
|
133
133
|
content: Dict[str, Any]
|
|
134
134
|
|
|
135
135
|
|
|
136
|
-
# deprecated aliases:
|
|
137
|
-
BasePictureData = BaseAnnotation
|
|
138
|
-
PictureDescriptionData = DescriptionAnnotation
|
|
139
|
-
PictureMiscData = MiscAnnotation
|
|
140
|
-
|
|
141
|
-
|
|
142
136
|
class ChartLine(BaseModel):
|
|
143
137
|
"""Represents a line in a line chart.
|
|
144
138
|
|
|
@@ -737,9 +731,11 @@ class ProvenanceItem(BaseModel):
|
|
|
737
731
|
class ContentLayer(str, Enum):
|
|
738
732
|
"""ContentLayer."""
|
|
739
733
|
|
|
740
|
-
BODY = "body"
|
|
741
|
-
FURNITURE = "furniture"
|
|
742
|
-
BACKGROUND = "background"
|
|
734
|
+
BODY = "body" # main content of the document
|
|
735
|
+
FURNITURE = "furniture" # eg page-headers/footers
|
|
736
|
+
BACKGROUND = "background" # eg watermarks
|
|
737
|
+
INVISIBLE = "invisible" # hidden or invisible text
|
|
738
|
+
NOTES = "notes" # author/speaker notes, corrections, etc
|
|
743
739
|
|
|
744
740
|
|
|
745
741
|
DEFAULT_CONTENT_LAYERS = {ContentLayer.BODY}
|
|
@@ -860,12 +856,27 @@ class GroupItem(NodeItem): # Container type, can't be a leaf node
|
|
|
860
856
|
label: GroupLabel = GroupLabel.UNSPECIFIED
|
|
861
857
|
|
|
862
858
|
|
|
863
|
-
class
|
|
864
|
-
"""
|
|
859
|
+
class ListGroup(GroupItem):
|
|
860
|
+
"""ListGroup."""
|
|
865
861
|
|
|
866
862
|
label: typing.Literal[GroupLabel.LIST] = GroupLabel.LIST # type: ignore[assignment]
|
|
867
863
|
|
|
864
|
+
@field_validator("label", mode="before")
|
|
865
|
+
@classmethod
|
|
866
|
+
def patch_ordered(cls, value):
|
|
867
|
+
"""patch_ordered."""
|
|
868
|
+
return GroupLabel.LIST if value == GroupLabel.ORDERED_LIST else value
|
|
869
|
+
|
|
870
|
+
def first_item_is_enumerated(self, doc: "DoclingDocument"):
|
|
871
|
+
"""Whether the first list item is enumerated."""
|
|
872
|
+
return (
|
|
873
|
+
len(self.children) > 0
|
|
874
|
+
and isinstance(first_child := self.children[0].resolve(doc), ListItem)
|
|
875
|
+
and first_child.enumerated
|
|
876
|
+
)
|
|
877
|
+
|
|
868
878
|
|
|
879
|
+
@deprecated("Use ListGroup instead.")
|
|
869
880
|
class OrderedList(GroupItem):
|
|
870
881
|
"""OrderedList."""
|
|
871
882
|
|
|
@@ -1752,7 +1763,7 @@ class DoclingDocument(BaseModel):
|
|
|
1752
1763
|
) # List[RefItem] = []
|
|
1753
1764
|
body: GroupItem = GroupItem(name="_root_", self_ref="#/body") # List[RefItem] = []
|
|
1754
1765
|
|
|
1755
|
-
groups: List[Union[
|
|
1766
|
+
groups: List[Union[ListGroup, InlineGroup, GroupItem]] = []
|
|
1756
1767
|
texts: List[
|
|
1757
1768
|
Union[TitleItem, SectionHeaderItem, ListItem, CodeItem, FormulaItem, TextItem]
|
|
1758
1769
|
] = []
|
|
@@ -1938,7 +1949,7 @@ class DoclingDocument(BaseModel):
|
|
|
1938
1949
|
|
|
1939
1950
|
self.form_items.append(item)
|
|
1940
1951
|
|
|
1941
|
-
elif isinstance(item, (
|
|
1952
|
+
elif isinstance(item, (ListGroup, InlineGroup)):
|
|
1942
1953
|
item_label = "groups"
|
|
1943
1954
|
item_index = len(self.groups)
|
|
1944
1955
|
|
|
@@ -2160,16 +2171,16 @@ class DoclingDocument(BaseModel):
|
|
|
2160
2171
|
# TODO: refactor add* methods below
|
|
2161
2172
|
###################################
|
|
2162
2173
|
|
|
2163
|
-
def
|
|
2174
|
+
def add_list_group(
|
|
2164
2175
|
self,
|
|
2165
2176
|
name: Optional[str] = None,
|
|
2166
2177
|
parent: Optional[NodeItem] = None,
|
|
2167
2178
|
content_layer: Optional[ContentLayer] = None,
|
|
2168
|
-
) ->
|
|
2169
|
-
"""
|
|
2179
|
+
) -> ListGroup:
|
|
2180
|
+
"""add_list_group."""
|
|
2170
2181
|
_parent = parent or self.body
|
|
2171
2182
|
cref = f"#/groups/{len(self.groups)}"
|
|
2172
|
-
group =
|
|
2183
|
+
group = ListGroup(self_ref=cref, parent=_parent.get_ref())
|
|
2173
2184
|
if name is not None:
|
|
2174
2185
|
group.name = name
|
|
2175
2186
|
if content_layer:
|
|
@@ -2179,6 +2190,21 @@ class DoclingDocument(BaseModel):
|
|
|
2179
2190
|
_parent.children.append(RefItem(cref=cref))
|
|
2180
2191
|
return group
|
|
2181
2192
|
|
|
2193
|
+
@deprecated("Use add_list_group() instead.")
|
|
2194
|
+
def add_ordered_list(
|
|
2195
|
+
self,
|
|
2196
|
+
name: Optional[str] = None,
|
|
2197
|
+
parent: Optional[NodeItem] = None,
|
|
2198
|
+
content_layer: Optional[ContentLayer] = None,
|
|
2199
|
+
) -> GroupItem:
|
|
2200
|
+
"""add_ordered_list."""
|
|
2201
|
+
return self.add_list_group(
|
|
2202
|
+
name=name,
|
|
2203
|
+
parent=parent,
|
|
2204
|
+
content_layer=content_layer,
|
|
2205
|
+
)
|
|
2206
|
+
|
|
2207
|
+
@deprecated("Use add_list_group() instead.")
|
|
2182
2208
|
def add_unordered_list(
|
|
2183
2209
|
self,
|
|
2184
2210
|
name: Optional[str] = None,
|
|
@@ -2186,25 +2212,18 @@ class DoclingDocument(BaseModel):
|
|
|
2186
2212
|
content_layer: Optional[ContentLayer] = None,
|
|
2187
2213
|
) -> GroupItem:
|
|
2188
2214
|
"""add_unordered_list."""
|
|
2189
|
-
|
|
2190
|
-
|
|
2191
|
-
|
|
2192
|
-
|
|
2193
|
-
|
|
2194
|
-
if content_layer:
|
|
2195
|
-
group.content_layer = content_layer
|
|
2196
|
-
|
|
2197
|
-
self.groups.append(group)
|
|
2198
|
-
_parent.children.append(RefItem(cref=cref))
|
|
2199
|
-
return group
|
|
2215
|
+
return self.add_list_group(
|
|
2216
|
+
name=name,
|
|
2217
|
+
parent=parent,
|
|
2218
|
+
content_layer=content_layer,
|
|
2219
|
+
)
|
|
2200
2220
|
|
|
2201
2221
|
def add_inline_group(
|
|
2202
2222
|
self,
|
|
2203
2223
|
name: Optional[str] = None,
|
|
2204
2224
|
parent: Optional[NodeItem] = None,
|
|
2205
2225
|
content_layer: Optional[ContentLayer] = None,
|
|
2206
|
-
|
|
2207
|
-
) -> GroupItem:
|
|
2226
|
+
) -> InlineGroup:
|
|
2208
2227
|
"""add_inline_group."""
|
|
2209
2228
|
_parent = parent or self.body
|
|
2210
2229
|
cref = f"#/groups/{len(self.groups)}"
|
|
@@ -2232,14 +2251,8 @@ class DoclingDocument(BaseModel):
|
|
|
2232
2251
|
:param parent: Optional[NodeItem]: (Default value = None)
|
|
2233
2252
|
|
|
2234
2253
|
"""
|
|
2235
|
-
if label
|
|
2236
|
-
return self.
|
|
2237
|
-
name=name,
|
|
2238
|
-
parent=parent,
|
|
2239
|
-
content_layer=content_layer,
|
|
2240
|
-
)
|
|
2241
|
-
elif label == GroupLabel.ORDERED_LIST:
|
|
2242
|
-
return self.add_ordered_list(
|
|
2254
|
+
if label in [GroupLabel.LIST, GroupLabel.ORDERED_LIST]:
|
|
2255
|
+
return self.add_list_group(
|
|
2243
2256
|
name=name,
|
|
2244
2257
|
parent=parent,
|
|
2245
2258
|
content_layer=content_layer,
|
|
@@ -2291,17 +2304,16 @@ class DoclingDocument(BaseModel):
|
|
|
2291
2304
|
:param parent: Optional[NodeItem]: (Default value = None)
|
|
2292
2305
|
|
|
2293
2306
|
"""
|
|
2294
|
-
if not isinstance(parent,
|
|
2295
|
-
warnings.warn(
|
|
2296
|
-
|
|
2297
|
-
|
|
2298
|
-
|
|
2307
|
+
if not isinstance(parent, ListGroup):
|
|
2308
|
+
warnings.warn(
|
|
2309
|
+
"ListItem parent must be a list group, creating one on the fly.",
|
|
2310
|
+
DeprecationWarning,
|
|
2311
|
+
)
|
|
2312
|
+
parent = self.add_list_group(parent=parent)
|
|
2299
2313
|
|
|
2300
2314
|
if not orig:
|
|
2301
2315
|
orig = text
|
|
2302
2316
|
|
|
2303
|
-
marker = marker or "-"
|
|
2304
|
-
|
|
2305
2317
|
text_index = len(self.texts)
|
|
2306
2318
|
cref = f"#/texts/{text_index}"
|
|
2307
2319
|
list_item = ListItem(
|
|
@@ -2310,7 +2322,7 @@ class DoclingDocument(BaseModel):
|
|
|
2310
2322
|
self_ref=cref,
|
|
2311
2323
|
parent=parent.get_ref(),
|
|
2312
2324
|
enumerated=enumerated,
|
|
2313
|
-
marker=marker,
|
|
2325
|
+
marker=marker or "",
|
|
2314
2326
|
formatting=formatting,
|
|
2315
2327
|
hyperlink=hyperlink,
|
|
2316
2328
|
)
|
|
@@ -2864,7 +2876,7 @@ class DoclingDocument(BaseModel):
|
|
|
2864
2876
|
if (
|
|
2865
2877
|
root_is_picture
|
|
2866
2878
|
and not traverse_pictures
|
|
2867
|
-
and isinstance(child,
|
|
2879
|
+
and isinstance(child, NodeItem)
|
|
2868
2880
|
and child.self_ref not in allowed_pic_refs
|
|
2869
2881
|
):
|
|
2870
2882
|
continue
|
|
@@ -4056,18 +4068,18 @@ class DoclingDocument(BaseModel):
|
|
|
4056
4068
|
DocumentToken.ORDERED_LIST.value,
|
|
4057
4069
|
DocumentToken.UNORDERED_LIST.value,
|
|
4058
4070
|
]:
|
|
4059
|
-
|
|
4071
|
+
GroupLabel.LIST
|
|
4060
4072
|
enum_marker = ""
|
|
4061
4073
|
enum_value = 0
|
|
4062
4074
|
if tag_name == DocumentToken.ORDERED_LIST.value:
|
|
4063
|
-
|
|
4075
|
+
GroupLabel.ORDERED_LIST
|
|
4064
4076
|
|
|
4065
4077
|
list_item_pattern = (
|
|
4066
4078
|
rf"<(?P<tag>{DocItemLabel.LIST_ITEM})>.*?</(?P=tag)>"
|
|
4067
4079
|
)
|
|
4068
4080
|
li_pattern = re.compile(list_item_pattern, re.DOTALL)
|
|
4069
4081
|
# Add list group:
|
|
4070
|
-
new_list = doc.
|
|
4082
|
+
new_list = doc.add_list_group(name="list")
|
|
4071
4083
|
# Process list items
|
|
4072
4084
|
for li_match in li_pattern.finditer(full_chunk):
|
|
4073
4085
|
enum_value += 1
|
|
@@ -4385,17 +4397,17 @@ class DoclingDocument(BaseModel):
|
|
|
4385
4397
|
@field_validator("version")
|
|
4386
4398
|
@classmethod
|
|
4387
4399
|
def check_version_is_compatible(cls, v: str) -> str:
|
|
4388
|
-
"""Check if this document version is compatible with
|
|
4389
|
-
|
|
4400
|
+
"""Check if this document version is compatible with SDK schema version."""
|
|
4401
|
+
sdk_match = re.match(VERSION_PATTERN, CURRENT_VERSION)
|
|
4390
4402
|
doc_match = re.match(VERSION_PATTERN, v)
|
|
4391
4403
|
if (
|
|
4392
4404
|
doc_match is None
|
|
4393
|
-
or
|
|
4394
|
-
or doc_match["major"] !=
|
|
4395
|
-
or doc_match["minor"] >
|
|
4405
|
+
or sdk_match is None
|
|
4406
|
+
or doc_match["major"] != sdk_match["major"]
|
|
4407
|
+
or doc_match["minor"] > sdk_match["minor"]
|
|
4396
4408
|
):
|
|
4397
4409
|
raise ValueError(
|
|
4398
|
-
f"
|
|
4410
|
+
f"Doc version {v} incompatible with SDK schema version {CURRENT_VERSION}"
|
|
4399
4411
|
)
|
|
4400
4412
|
else:
|
|
4401
4413
|
return CURRENT_VERSION
|
|
@@ -4425,9 +4437,7 @@ class DoclingDocument(BaseModel):
|
|
|
4425
4437
|
):
|
|
4426
4438
|
if isinstance(item, ListItem) and (
|
|
4427
4439
|
item.parent is None
|
|
4428
|
-
or not isinstance(
|
|
4429
|
-
item.parent.resolve(doc=self), (OrderedList, UnorderedList)
|
|
4430
|
-
)
|
|
4440
|
+
or not isinstance(item.parent.resolve(doc=self), ListGroup)
|
|
4431
4441
|
):
|
|
4432
4442
|
if isinstance(prev, ListItem) and (
|
|
4433
4443
|
prev.parent is None or prev.parent.resolve(self) == self.body
|
|
@@ -4440,11 +4450,7 @@ class DoclingDocument(BaseModel):
|
|
|
4440
4450
|
for curr_list_items in reversed(misplaced_list_items):
|
|
4441
4451
|
|
|
4442
4452
|
# add group
|
|
4443
|
-
new_group = (
|
|
4444
|
-
OrderedList(self_ref="#")
|
|
4445
|
-
if curr_list_items[0].enumerated
|
|
4446
|
-
else UnorderedList(self_ref="#")
|
|
4447
|
-
)
|
|
4453
|
+
new_group = ListGroup(self_ref="#")
|
|
4448
4454
|
self.insert_item_before_sibling(
|
|
4449
4455
|
new_item=new_group,
|
|
4450
4456
|
sibling=curr_list_items[0],
|
|
@@ -4531,3 +4537,10 @@ class DoclingDocument(BaseModel):
|
|
|
4531
4537
|
self.key_value_items = item_lists["key_value_items"] # type: ignore
|
|
4532
4538
|
self.form_items = item_lists["form_items"] # type: ignore
|
|
4533
4539
|
self.body = new_body
|
|
4540
|
+
|
|
4541
|
+
|
|
4542
|
+
# deprecated aliases (kept for backwards compatibility):
|
|
4543
|
+
BasePictureData = BaseAnnotation
|
|
4544
|
+
PictureDescriptionData = DescriptionAnnotation
|
|
4545
|
+
PictureMiscData = MiscAnnotation
|
|
4546
|
+
UnorderedList = ListGroup
|
docling_core/types/doc/labels.py
CHANGED
|
@@ -77,7 +77,7 @@ class GroupLabel(str, Enum):
|
|
|
77
77
|
LIST = (
|
|
78
78
|
"list" # group label for list container (not the list-items) (e.g. HTML <ul/>)
|
|
79
79
|
)
|
|
80
|
-
ORDERED_LIST = "ordered_list" #
|
|
80
|
+
ORDERED_LIST = "ordered_list" # deprecated
|
|
81
81
|
CHAPTER = "chapter"
|
|
82
82
|
SECTION = "section"
|
|
83
83
|
SHEET = "sheet"
|
docling_core/types/doc/page.py
CHANGED
|
@@ -122,6 +122,8 @@ class BoundingRectangle(BaseModel):
|
|
|
122
122
|
p_1 = ((self.r_x1 + self.r_x2) / 2.0, (self.r_y1 + self.r_y2) / 2.0)
|
|
123
123
|
|
|
124
124
|
delta_x, delta_y = p_1[0] - p_0[0], p_1[1] - p_0[1]
|
|
125
|
+
if self.coord_origin == CoordOrigin.TOPLEFT:
|
|
126
|
+
delta_y = -delta_y
|
|
125
127
|
|
|
126
128
|
if abs(delta_y) < 1.0e-3:
|
|
127
129
|
angle = 0.0
|
|
@@ -131,8 +133,7 @@ class BoundingRectangle(BaseModel):
|
|
|
131
133
|
angle = math.atan(delta_y / delta_x)
|
|
132
134
|
if delta_x < 0:
|
|
133
135
|
angle += np.pi
|
|
134
|
-
|
|
135
|
-
angle += 2 * np.pi
|
|
136
|
+
angle = angle % (2 * np.pi)
|
|
136
137
|
return angle
|
|
137
138
|
|
|
138
139
|
@property
|
docling_core/types/doc/utils.py
CHANGED
|
@@ -5,8 +5,10 @@
|
|
|
5
5
|
|
|
6
6
|
"""Utils for document types."""
|
|
7
7
|
|
|
8
|
+
import html
|
|
8
9
|
import unicodedata
|
|
9
10
|
from pathlib import Path
|
|
11
|
+
from typing import Optional
|
|
10
12
|
|
|
11
13
|
|
|
12
14
|
def relative_path(src: Path, target: Path) -> Path:
|
|
@@ -49,14 +51,23 @@ def relative_path(src: Path, target: Path) -> Path:
|
|
|
49
51
|
return Path(*up_segments, *down_segments)
|
|
50
52
|
|
|
51
53
|
|
|
52
|
-
def get_html_tag_with_text_direction(
|
|
54
|
+
def get_html_tag_with_text_direction(
|
|
55
|
+
html_tag: str, text: str, attrs: Optional[dict] = None
|
|
56
|
+
) -> str:
|
|
53
57
|
"""Form the HTML element with tag, text, and optional dir attribute."""
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
58
|
+
my_attrs = attrs or {}
|
|
59
|
+
if (dir := my_attrs.get("dir")) is not None and dir != "ltr":
|
|
60
|
+
my_attrs["dir"] = get_text_direction(text)
|
|
61
|
+
pieces: list[str] = [html_tag]
|
|
62
|
+
if my_attrs:
|
|
63
|
+
attrs_str = " ".join(
|
|
64
|
+
[
|
|
65
|
+
f'{html.escape(k, quote=False)}="{html.escape(my_attrs[k], quote=False)}"'
|
|
66
|
+
for k in my_attrs
|
|
67
|
+
]
|
|
68
|
+
)
|
|
69
|
+
pieces.append(attrs_str)
|
|
70
|
+
return f"<{' '.join(pieces)}>{text}</{html_tag}>"
|
|
60
71
|
|
|
61
72
|
|
|
62
73
|
def get_text_direction(text: str) -> str:
|
docling_core/utils/file.py
CHANGED
|
@@ -6,6 +6,7 @@
|
|
|
6
6
|
"""File-related utilities."""
|
|
7
7
|
|
|
8
8
|
import importlib
|
|
9
|
+
import re
|
|
9
10
|
import tempfile
|
|
10
11
|
from io import BytesIO
|
|
11
12
|
from pathlib import Path
|
|
@@ -76,6 +77,32 @@ def resolve_source_to_stream(
|
|
|
76
77
|
agent_name = f"docling-core/{importlib.metadata.version('docling-core')}"
|
|
77
78
|
req_headers["user-agent"] = agent_name
|
|
78
79
|
|
|
80
|
+
# Google Docs, Files, PDF URLs, Spreadsheets, Presentations: convert to export URL
|
|
81
|
+
google_doc_id = re.search(
|
|
82
|
+
r"google\.com\/(file|document|spreadsheets|presentation)\/d\/([\w-]+)",
|
|
83
|
+
str(http_url),
|
|
84
|
+
)
|
|
85
|
+
if google_doc_id:
|
|
86
|
+
doc_type = google_doc_id.group(1)
|
|
87
|
+
doc_id = google_doc_id.group(2)
|
|
88
|
+
|
|
89
|
+
if doc_type == "file":
|
|
90
|
+
http_url = TypeAdapter(AnyHttpUrl).validate_python(
|
|
91
|
+
f"https://drive.google.com/uc?export=download&id={doc_id}"
|
|
92
|
+
)
|
|
93
|
+
elif doc_type == "document":
|
|
94
|
+
http_url = TypeAdapter(AnyHttpUrl).validate_python(
|
|
95
|
+
f"https://docs.google.com/document/d/{doc_id}/export?format=docx"
|
|
96
|
+
)
|
|
97
|
+
elif doc_type == "spreadsheets":
|
|
98
|
+
http_url = TypeAdapter(AnyHttpUrl).validate_python(
|
|
99
|
+
f"https://docs.google.com/spreadsheets/d/{doc_id}/export?format=xlsx"
|
|
100
|
+
)
|
|
101
|
+
elif doc_type == "presentation":
|
|
102
|
+
http_url = TypeAdapter(AnyHttpUrl).validate_python(
|
|
103
|
+
f"https://docs.google.com/presentation/d/{doc_id}/export?format=pptx"
|
|
104
|
+
)
|
|
105
|
+
|
|
79
106
|
# fetch the page
|
|
80
107
|
res = requests.get(http_url, stream=True, headers=req_headers)
|
|
81
108
|
res.raise_for_status()
|
docling_core/utils/legacy.py
CHANGED
|
@@ -26,7 +26,6 @@ from docling_core.types.doc import (
|
|
|
26
26
|
TextItem,
|
|
27
27
|
)
|
|
28
28
|
from docling_core.types.doc.document import ContentLayer, GroupItem, ListItem, TableData
|
|
29
|
-
from docling_core.types.doc.labels import GroupLabel
|
|
30
29
|
from docling_core.types.legacy_doc.base import (
|
|
31
30
|
BaseCell,
|
|
32
31
|
BaseText,
|
|
@@ -486,7 +485,7 @@ def legacy_to_docling_document(legacy_doc: DsDocument) -> DoclingDocument: # no
|
|
|
486
485
|
item_type in "list-item-level-1" or item.name in {"list", "list-item"}
|
|
487
486
|
):
|
|
488
487
|
if current_list is None:
|
|
489
|
-
current_list = doc.
|
|
488
|
+
current_list = doc.add_list_group(name="list")
|
|
490
489
|
else:
|
|
491
490
|
current_list = None
|
|
492
491
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 2.
|
|
3
|
+
Version: 2.40.0
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
|
6
6
|
Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
|
|
@@ -19,19 +19,19 @@ docling_core/search/package.py,sha256=Lz2ml2eDy5t0ZimnGTq-DXHAn-f18w0bn4H5xrhs75
|
|
|
19
19
|
docling_core/transforms/__init__.py,sha256=P81y_oqkiTN4Ld5crh1gQ6BbHqqR6C6nBt9ACDd57ds,106
|
|
20
20
|
docling_core/transforms/chunker/__init__.py,sha256=YdizSKXLmmK9eyYBsarHWr8Mx_AoA0PT7c0absibZMk,306
|
|
21
21
|
docling_core/transforms/chunker/base.py,sha256=kJaRrGQynglG9wpy0IaAYTf4MKheWH5BAPzx4LE9yIg,2824
|
|
22
|
-
docling_core/transforms/chunker/hierarchical_chunker.py,sha256=
|
|
22
|
+
docling_core/transforms/chunker/hierarchical_chunker.py,sha256=uDf-qGiIT_4JUEg9NOdzvDqAPOTqycKJ-jEpDkV3jJU,8243
|
|
23
23
|
docling_core/transforms/chunker/hybrid_chunker.py,sha256=xjkz8hy3tXXzkJzf7QMFOEq_v8V7Jcs9tCY0Mxjge74,12548
|
|
24
24
|
docling_core/transforms/chunker/tokenizer/__init__.py,sha256=-bhXOTpoI7SYk7vn47z8Ek-RZFjJk4TfZawxsFuNHnE,34
|
|
25
25
|
docling_core/transforms/chunker/tokenizer/base.py,sha256=2gOBQPYJYC0iWXOgMG3DiNP7xEBtii7DYcib0iECq5o,575
|
|
26
26
|
docling_core/transforms/chunker/tokenizer/huggingface.py,sha256=aZ_RNQIzcNkAHGHZw3SBCoqJHM2Ihb65eiM29O9BR6o,2506
|
|
27
27
|
docling_core/transforms/chunker/tokenizer/openai.py,sha256=zt2kwcC-r8MafeEG0CESab8E4RIC9aaFXxxnxOGyTMA,918
|
|
28
28
|
docling_core/transforms/serializer/__init__.py,sha256=CECQlMoCDUxkg4RAUdC3itA3I3qFhKhe2HcYghN6_xw,105
|
|
29
|
-
docling_core/transforms/serializer/base.py,sha256=
|
|
30
|
-
docling_core/transforms/serializer/common.py,sha256=
|
|
31
|
-
docling_core/transforms/serializer/doctags.py,sha256=
|
|
32
|
-
docling_core/transforms/serializer/html.py,sha256=
|
|
29
|
+
docling_core/transforms/serializer/base.py,sha256=s3Anl_3-QJM1t29Bz-iOgLhAcfG3BZuwZqdYTi5Xfr0,6846
|
|
30
|
+
docling_core/transforms/serializer/common.py,sha256=Dkw9axJqU2qlZuEFRDa6Av11PIL2ejOOOCAahtoK9sA,19106
|
|
31
|
+
docling_core/transforms/serializer/doctags.py,sha256=TD0yAm1qSVy-GsE6svpUAI-Yqjcf2rrTZ3ac9YU3gbE,19858
|
|
32
|
+
docling_core/transforms/serializer/html.py,sha256=oxnUhszRPBINiK1tq2dwf5QjTCrIV_q15vsrPVqBeME,38988
|
|
33
33
|
docling_core/transforms/serializer/html_styles.py,sha256=-jBwS4EU7yfKoz0GSoxhwx90OmIKieO6TwPw57IuxcA,4692
|
|
34
|
-
docling_core/transforms/serializer/markdown.py,sha256=
|
|
34
|
+
docling_core/transforms/serializer/markdown.py,sha256=VwonuAkuOPmQM7ibDIGvQBHOqhTcTJ_t187fLQQiNPo,23951
|
|
35
35
|
docling_core/transforms/visualizer/__init__.py,sha256=gUfF25yiJ_KO46ZIUNqZQOZGy2PLx6gnnr6AZYxKHXI,35
|
|
36
36
|
docling_core/transforms/visualizer/base.py,sha256=aEF7b3rHq6DVdX8zDYEPoq55BHDYe4Hh_97lBdcW4lY,555
|
|
37
37
|
docling_core/transforms/visualizer/layout_visualizer.py,sha256=zHzQTWcy-z1J2BcsjvakLkrp8pgStgnxhDl8YqIAotY,8035
|
|
@@ -39,13 +39,13 @@ docling_core/transforms/visualizer/reading_order_visualizer.py,sha256=muqmaxOBao
|
|
|
39
39
|
docling_core/transforms/visualizer/table_visualizer.py,sha256=iJPjk-XQSSCH3oujcjPMz-redAwNNHseZ41lFyd-u3k,8097
|
|
40
40
|
docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HXo,260
|
|
41
41
|
docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
|
|
42
|
-
docling_core/types/doc/__init__.py,sha256=
|
|
42
|
+
docling_core/types/doc/__init__.py,sha256=8hOhm5W9mArf3zwgfoMxDs1pHizhLFSAZlLu1tPBBRk,1641
|
|
43
43
|
docling_core/types/doc/base.py,sha256=ndXquBrOKTFQApIJ5s2-zstj3xlVKRbJDSId0KOQnUg,14817
|
|
44
|
-
docling_core/types/doc/document.py,sha256=
|
|
45
|
-
docling_core/types/doc/labels.py,sha256
|
|
46
|
-
docling_core/types/doc/page.py,sha256=
|
|
44
|
+
docling_core/types/doc/document.py,sha256=9-n0tngXLTRVAkqGHe3bDSh1OJbBt87EW2nV8GdOGME,157406
|
|
45
|
+
docling_core/types/doc/labels.py,sha256=-W1-LW6z0J9F9ExJqR0Wd1WeqWTaY3Unm-j1UkQGlC4,7330
|
|
46
|
+
docling_core/types/doc/page.py,sha256=J_4ThNhrdhrfPtNMBTDHi-CQBvraejAwUaqVjyDeeeI,41288
|
|
47
47
|
docling_core/types/doc/tokens.py,sha256=z22l9J81_sg9CYMvOuLmPuLsNT7h_s7wao2UT89DvI8,9278
|
|
48
|
-
docling_core/types/doc/utils.py,sha256=
|
|
48
|
+
docling_core/types/doc/utils.py,sha256=JpAi7x9DHksFlIj_gRJPcSZOHa8AHvVPEO_K9aSnw4c,2608
|
|
49
49
|
docling_core/types/gen/__init__.py,sha256=C6TuCfvpSnSL5XDOFMcYHUY2-i08vvfOGRcdu6Af0pI,124
|
|
50
50
|
docling_core/types/gen/generic.py,sha256=l4CZ4_Lb8ONG36WNJWbKX5hGKvTh_yU-hXp5hsm7uVU,844
|
|
51
51
|
docling_core/types/io/__init__.py,sha256=7QYvFRaDE0AzBg8e7tvsVNlLBbCbAbQ9rP2TU8aXR1k,350
|
|
@@ -68,15 +68,15 @@ docling_core/types/rec/statement.py,sha256=YwcV4CbVaAbzNwh14yJ_6Py3Ww0XnUJrEEUiK
|
|
|
68
68
|
docling_core/types/rec/subject.py,sha256=PRCERGTMs4YhR3_Ne6jogkm41zYg8uUWb1yFpM7atm4,2572
|
|
69
69
|
docling_core/utils/__init__.py,sha256=VauNNpWRHG0_ISKrsy5-gTxicrdQZSau6qMfuMl3iqk,120
|
|
70
70
|
docling_core/utils/alias.py,sha256=B6Lqvss8CbaNARHLR4qSmNh9OkB6LvqTpxfsFmkLAFo,874
|
|
71
|
-
docling_core/utils/file.py,sha256=
|
|
71
|
+
docling_core/utils/file.py,sha256=CSNclJGL2OwLIc8DQFdoLxr22FUc4_UC7zS6pNrFfkQ,6858
|
|
72
72
|
docling_core/utils/generate_docs.py,sha256=BdKAoduWXOc7YMvcmlhjoJOFlUxij1ybxglj6LZDtC8,2290
|
|
73
73
|
docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2tyi_OhHepHYtZg,1654
|
|
74
|
-
docling_core/utils/legacy.py,sha256=
|
|
74
|
+
docling_core/utils/legacy.py,sha256=5lghO48OEcV9V51tRnH3YSKgLtdqhr-Q5C_OcJZ8TOs,24392
|
|
75
75
|
docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
|
|
76
76
|
docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
|
|
77
|
-
docling_core-2.
|
|
78
|
-
docling_core-2.
|
|
79
|
-
docling_core-2.
|
|
80
|
-
docling_core-2.
|
|
81
|
-
docling_core-2.
|
|
82
|
-
docling_core-2.
|
|
77
|
+
docling_core-2.40.0.dist-info/licenses/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
|
|
78
|
+
docling_core-2.40.0.dist-info/METADATA,sha256=A6_Wz_CJzmHa20USMUgQPDMpN5-S3f8VpNrx7ns1SXo,6453
|
|
79
|
+
docling_core-2.40.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
80
|
+
docling_core-2.40.0.dist-info/entry_points.txt,sha256=ER4zROQWkFMHIrY-oqY5E4HeCcCIg8dLkNztYGxdb7c,59
|
|
81
|
+
docling_core-2.40.0.dist-info/top_level.txt,sha256=O-tcXpGiurlud-1ZxMq1b-OmrfAVA4sajcgWU32RtfA,13
|
|
82
|
+
docling_core-2.40.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|