docling-core 2.28.1__tar.gz → 2.29.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- {docling_core-2.28.1 → docling_core-2.29.0}/PKG-INFO +1 -1
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/transforms/chunker/hierarchical_chunker.py +5 -5
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/transforms/chunker/hybrid_chunker.py +4 -4
- {docling_core-2.28.1/docling_core/experimental → docling_core-2.29.0/docling_core/transforms}/serializer/common.py +1 -1
- {docling_core-2.28.1/docling_core/experimental → docling_core-2.29.0/docling_core/transforms}/serializer/doctags.py +2 -2
- {docling_core-2.28.1/docling_core/experimental → docling_core-2.29.0/docling_core/transforms}/serializer/html.py +3 -3
- {docling_core-2.28.1/docling_core/experimental → docling_core-2.29.0/docling_core/transforms}/serializer/markdown.py +2 -2
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/transforms/visualizer/layout_visualizer.py +33 -30
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/transforms/visualizer/reading_order_visualizer.py +52 -50
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/types/doc/document.py +28 -26
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/types/doc/page.py +6 -4
- {docling_core-2.28.1 → docling_core-2.29.0}/pyproject.toml +1 -1
- {docling_core-2.28.1 → docling_core-2.29.0}/LICENSE +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/README.md +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/__init__.py +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/cli/__init__.py +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/cli/view.py +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/experimental/__init__.py +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/py.typed +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/search/__init__.py +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/search/mapping.py +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/search/meta.py +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/search/package.py +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/transforms/__init__.py +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/transforms/chunker/__init__.py +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/transforms/chunker/base.py +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/transforms/chunker/tokenizer/__init__.py +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/transforms/chunker/tokenizer/base.py +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/transforms/chunker/tokenizer/huggingface.py +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/transforms/chunker/tokenizer/openai.py +0 -0
- {docling_core-2.28.1/docling_core/experimental → docling_core-2.29.0/docling_core/transforms}/serializer/__init__.py +0 -0
- {docling_core-2.28.1/docling_core/experimental → docling_core-2.29.0/docling_core/transforms}/serializer/base.py +0 -0
- {docling_core-2.28.1/docling_core/experimental → docling_core-2.29.0/docling_core/transforms}/serializer/html_styles.py +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/transforms/visualizer/__init__.py +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/transforms/visualizer/base.py +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/types/__init__.py +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/types/base.py +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/types/doc/__init__.py +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/types/doc/base.py +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/types/doc/labels.py +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/types/doc/tokens.py +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/types/doc/utils.py +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/types/gen/__init__.py +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/types/gen/generic.py +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/types/io/__init__.py +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/types/legacy_doc/__init__.py +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/types/legacy_doc/base.py +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/types/legacy_doc/document.py +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/types/legacy_doc/tokens.py +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/types/nlp/__init__.py +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/types/nlp/qa.py +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/types/nlp/qa_labels.py +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/types/rec/__init__.py +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/types/rec/attribute.py +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/types/rec/base.py +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/types/rec/predicate.py +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/types/rec/record.py +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/types/rec/statement.py +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/types/rec/subject.py +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/utils/__init__.py +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/utils/alias.py +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/utils/file.py +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/utils/generate_docs.py +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/utils/generate_jsonschema.py +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/utils/legacy.py +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/utils/validate.py +0 -0
- {docling_core-2.28.1 → docling_core-2.29.0}/docling_core/utils/validators.py +0 -0
{docling_core-2.28.1 → docling_core-2.29.0}/docling_core/transforms/chunker/hierarchical_chunker.py
RENAMED
|
@@ -14,19 +14,19 @@ from typing import Any, ClassVar, Final, Iterator, Literal, Optional
|
|
|
14
14
|
from pydantic import ConfigDict, Field, StringConstraints, field_validator
|
|
15
15
|
from typing_extensions import Annotated, override
|
|
16
16
|
|
|
17
|
-
from docling_core.experimental.serializer.base import (
|
|
17
|
+
from docling_core.search.package import VERSION_PATTERN
|
|
18
|
+
from docling_core.transforms.chunker import BaseChunk, BaseChunker, BaseMeta
|
|
19
|
+
from docling_core.transforms.serializer.base import (
|
|
18
20
|
BaseDocSerializer,
|
|
19
21
|
BaseSerializerProvider,
|
|
20
22
|
BaseTableSerializer,
|
|
21
23
|
SerializationResult,
|
|
22
24
|
)
|
|
23
|
-
from docling_core.experimental.serializer.common import create_ser_result
|
|
24
|
-
from docling_core.experimental.serializer.markdown import (
|
|
25
|
+
from docling_core.transforms.serializer.common import create_ser_result
|
|
26
|
+
from docling_core.transforms.serializer.markdown import (
|
|
25
27
|
MarkdownDocSerializer,
|
|
26
28
|
MarkdownParams,
|
|
27
29
|
)
|
|
28
|
-
from docling_core.search.package import VERSION_PATTERN
|
|
29
|
-
from docling_core.transforms.chunker import BaseChunk, BaseChunker, BaseMeta
|
|
30
30
|
from docling_core.types import DoclingDocument as DLDocument
|
|
31
31
|
from docling_core.types.doc.base import ImageRefMode
|
|
32
32
|
from docling_core.types.doc.document import (
|
{docling_core-2.28.1 → docling_core-2.29.0}/docling_core/transforms/chunker/hybrid_chunker.py
RENAMED
|
@@ -25,10 +25,6 @@ except ImportError:
|
|
|
25
25
|
"`pip install 'docling-core[chunking-openai]'`"
|
|
26
26
|
)
|
|
27
27
|
|
|
28
|
-
from docling_core.experimental.serializer.base import (
|
|
29
|
-
BaseDocSerializer,
|
|
30
|
-
BaseSerializerProvider,
|
|
31
|
-
)
|
|
32
28
|
from docling_core.transforms.chunker import (
|
|
33
29
|
BaseChunk,
|
|
34
30
|
BaseChunker,
|
|
@@ -36,6 +32,10 @@ from docling_core.transforms.chunker import (
|
|
|
36
32
|
DocMeta,
|
|
37
33
|
HierarchicalChunker,
|
|
38
34
|
)
|
|
35
|
+
from docling_core.transforms.serializer.base import (
|
|
36
|
+
BaseDocSerializer,
|
|
37
|
+
BaseSerializerProvider,
|
|
38
|
+
)
|
|
39
39
|
from docling_core.types import DoclingDocument
|
|
40
40
|
|
|
41
41
|
|
|
@@ -14,7 +14,7 @@ from typing import Any, Iterable, Optional, Tuple, Union
|
|
|
14
14
|
from pydantic import AnyUrl, BaseModel, NonNegativeInt, computed_field
|
|
15
15
|
from typing_extensions import Self, override
|
|
16
16
|
|
|
17
|
-
from docling_core.experimental.serializer.base import (
|
|
17
|
+
from docling_core.transforms.serializer.base import (
|
|
18
18
|
BaseDocSerializer,
|
|
19
19
|
BaseFallbackSerializer,
|
|
20
20
|
BaseFormSerializer,
|
|
@@ -6,7 +6,7 @@ from typing import Any, Dict, List, Optional, Union
|
|
|
6
6
|
from pydantic import BaseModel
|
|
7
7
|
from typing_extensions import override
|
|
8
8
|
|
|
9
|
-
from docling_core.experimental.serializer.base import (
|
|
9
|
+
from docling_core.transforms.serializer.base import (
|
|
10
10
|
BaseDocSerializer,
|
|
11
11
|
BaseFallbackSerializer,
|
|
12
12
|
BaseFormSerializer,
|
|
@@ -18,7 +18,7 @@ from docling_core.experimental.serializer.base import (
|
|
|
18
18
|
BaseTextSerializer,
|
|
19
19
|
SerializationResult,
|
|
20
20
|
)
|
|
21
|
-
from docling_core.experimental.serializer.common import (
|
|
21
|
+
from docling_core.transforms.serializer.common import (
|
|
22
22
|
CommonParams,
|
|
23
23
|
DocSerializer,
|
|
24
24
|
create_ser_result,
|
|
@@ -19,7 +19,7 @@ import latex2mathml.converter
|
|
|
19
19
|
from pydantic import AnyUrl, BaseModel
|
|
20
20
|
from typing_extensions import override
|
|
21
21
|
|
|
22
|
-
from docling_core.experimental.serializer.base import (
|
|
22
|
+
from docling_core.transforms.serializer.base import (
|
|
23
23
|
BaseDocSerializer,
|
|
24
24
|
BaseFallbackSerializer,
|
|
25
25
|
BaseFormSerializer,
|
|
@@ -31,12 +31,12 @@ from docling_core.experimental.serializer.base import (
|
|
|
31
31
|
BaseTextSerializer,
|
|
32
32
|
SerializationResult,
|
|
33
33
|
)
|
|
34
|
-
from docling_core.experimental.serializer.common import (
|
|
34
|
+
from docling_core.transforms.serializer.common import (
|
|
35
35
|
CommonParams,
|
|
36
36
|
DocSerializer,
|
|
37
37
|
create_ser_result,
|
|
38
38
|
)
|
|
39
|
-
from docling_core.experimental.serializer.html_styles import (
|
|
39
|
+
from docling_core.transforms.serializer.html_styles import (
|
|
40
40
|
_get_css_for_single_column,
|
|
41
41
|
_get_css_for_split_page,
|
|
42
42
|
)
|
|
@@ -14,7 +14,7 @@ from pydantic import AnyUrl, BaseModel, PositiveInt
|
|
|
14
14
|
from tabulate import tabulate
|
|
15
15
|
from typing_extensions import override
|
|
16
16
|
|
|
17
|
-
from docling_core.experimental.serializer.base import (
|
|
17
|
+
from docling_core.transforms.serializer.base import (
|
|
18
18
|
BaseDocSerializer,
|
|
19
19
|
BaseFallbackSerializer,
|
|
20
20
|
BaseFormSerializer,
|
|
@@ -26,7 +26,7 @@ from docling_core.experimental.serializer.base import (
|
|
|
26
26
|
BaseTextSerializer,
|
|
27
27
|
SerializationResult,
|
|
28
28
|
)
|
|
29
|
-
from docling_core.experimental.serializer.common import (
|
|
29
|
+
from docling_core.transforms.serializer.common import (
|
|
30
30
|
CommonParams,
|
|
31
31
|
DocSerializer,
|
|
32
32
|
_PageBreakSerResult,
|
{docling_core-2.28.1 → docling_core-2.29.0}/docling_core/transforms/visualizer/layout_visualizer.py
RENAMED
|
@@ -149,38 +149,41 @@ class LayoutVisualizer(BaseVisualizer):
|
|
|
149
149
|
continue
|
|
150
150
|
if len(elem.prov) == 0:
|
|
151
151
|
continue # Skip elements without provenances
|
|
152
|
-
prov = elem.prov[0]
|
|
153
|
-
page_nr = prov.page_no
|
|
154
|
-
|
|
155
|
-
if page_nr in my_images:
|
|
156
|
-
image = my_images[page_nr]
|
|
157
|
-
else:
|
|
158
|
-
raise RuntimeError(f"Cannot visualize page-image for {page_nr}")
|
|
159
|
-
|
|
160
|
-
if prev_page_nr is None or page_nr > prev_page_nr: # new page begins
|
|
161
|
-
# complete previous drawing
|
|
162
|
-
if prev_page_nr is not None and prev_image and clusters:
|
|
163
|
-
self._draw_clusters(
|
|
164
|
-
image=prev_image,
|
|
165
|
-
clusters=clusters,
|
|
166
|
-
scale_x=prev_image.width / doc.pages[prev_page_nr].size.width,
|
|
167
|
-
scale_y=prev_image.height / doc.pages[prev_page_nr].size.height,
|
|
168
|
-
)
|
|
169
|
-
clusters = []
|
|
170
152
|
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
153
|
+
for prov in elem.prov:
|
|
154
|
+
page_nr = prov.page_no
|
|
155
|
+
|
|
156
|
+
if page_nr in my_images:
|
|
157
|
+
image = my_images[page_nr]
|
|
158
|
+
else:
|
|
159
|
+
raise RuntimeError(f"Cannot visualize page-image for {page_nr}")
|
|
160
|
+
|
|
161
|
+
if prev_page_nr is None or page_nr > prev_page_nr: # new page begins
|
|
162
|
+
# complete previous drawing
|
|
163
|
+
if prev_page_nr is not None and prev_image and clusters:
|
|
164
|
+
self._draw_clusters(
|
|
165
|
+
image=prev_image,
|
|
166
|
+
clusters=clusters,
|
|
167
|
+
scale_x=prev_image.width
|
|
168
|
+
/ doc.pages[prev_page_nr].size.width,
|
|
169
|
+
scale_y=prev_image.height
|
|
170
|
+
/ doc.pages[prev_page_nr].size.height,
|
|
171
|
+
)
|
|
172
|
+
clusters = []
|
|
173
|
+
|
|
174
|
+
tlo_bbox = prov.bbox.to_top_left_origin(
|
|
175
|
+
page_height=doc.pages[prov.page_no].size.height
|
|
176
|
+
)
|
|
177
|
+
cluster = _TLCluster(
|
|
178
|
+
id=idx,
|
|
179
|
+
label=elem.label,
|
|
180
|
+
brec=_TLBoundingRectangle.from_bounding_box(bbox=tlo_bbox),
|
|
181
|
+
cells=[],
|
|
182
|
+
)
|
|
183
|
+
clusters.append(cluster)
|
|
181
184
|
|
|
182
|
-
|
|
183
|
-
|
|
185
|
+
prev_page_nr = page_nr
|
|
186
|
+
prev_image = image
|
|
184
187
|
|
|
185
188
|
# complete last drawing
|
|
186
189
|
if prev_page_nr is not None and prev_image and clusters:
|
|
@@ -77,57 +77,59 @@ class ReadingOrderVisualizer(BaseVisualizer):
|
|
|
77
77
|
continue
|
|
78
78
|
if len(elem.prov) == 0:
|
|
79
79
|
continue # Skip elements without provenances
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
ro_bbox.l = round(ro_bbox.l * image.width) # noqa: E741
|
|
107
|
-
ro_bbox.r = round(ro_bbox.r * image.width)
|
|
108
|
-
ro_bbox.t = round(ro_bbox.t * image.height)
|
|
109
|
-
ro_bbox.b = round(ro_bbox.b * image.height)
|
|
110
|
-
|
|
111
|
-
if ro_bbox.b > ro_bbox.t:
|
|
112
|
-
ro_bbox.b, ro_bbox.t = ro_bbox.t, ro_bbox.b
|
|
113
|
-
|
|
114
|
-
if x0 is None and y0 is None:
|
|
115
|
-
x0 = (ro_bbox.l + ro_bbox.r) / 2.0
|
|
116
|
-
y0 = (ro_bbox.b + ro_bbox.t) / 2.0
|
|
117
|
-
else:
|
|
118
|
-
assert x0 is not None
|
|
119
|
-
assert y0 is not None
|
|
120
|
-
|
|
121
|
-
x1 = (ro_bbox.l + ro_bbox.r) / 2.0
|
|
122
|
-
y1 = (ro_bbox.b + ro_bbox.t) / 2.0
|
|
123
|
-
|
|
124
|
-
draw = self._draw_arrow(
|
|
125
|
-
draw=draw,
|
|
126
|
-
arrow_coords=(x0, y0, x1, y1),
|
|
127
|
-
line_width=2,
|
|
128
|
-
color="red",
|
|
80
|
+
|
|
81
|
+
for prov in elem.prov:
|
|
82
|
+
page_no = prov.page_no
|
|
83
|
+
image = my_images.get(page_no)
|
|
84
|
+
|
|
85
|
+
if image is None or prev_page is None or page_no > prev_page:
|
|
86
|
+
# new page begins
|
|
87
|
+
prev_page = page_no
|
|
88
|
+
x0 = y0 = None
|
|
89
|
+
|
|
90
|
+
if image is None:
|
|
91
|
+
page_image = doc.pages[page_no].image
|
|
92
|
+
if (
|
|
93
|
+
page_image is None
|
|
94
|
+
or (pil_img := page_image.pil_image) is None
|
|
95
|
+
):
|
|
96
|
+
raise RuntimeError(
|
|
97
|
+
"Cannot visualize document without images"
|
|
98
|
+
)
|
|
99
|
+
else:
|
|
100
|
+
image = deepcopy(pil_img)
|
|
101
|
+
my_images[page_no] = image
|
|
102
|
+
draw = ImageDraw.Draw(image)
|
|
103
|
+
|
|
104
|
+
tlo_bbox = prov.bbox.to_top_left_origin(
|
|
105
|
+
page_height=doc.pages[prov.page_no].size.height
|
|
129
106
|
)
|
|
130
|
-
|
|
107
|
+
ro_bbox = tlo_bbox.normalized(doc.pages[prov.page_no].size)
|
|
108
|
+
ro_bbox.l = round(ro_bbox.l * image.width) # noqa: E741
|
|
109
|
+
ro_bbox.r = round(ro_bbox.r * image.width)
|
|
110
|
+
ro_bbox.t = round(ro_bbox.t * image.height)
|
|
111
|
+
ro_bbox.b = round(ro_bbox.b * image.height)
|
|
112
|
+
|
|
113
|
+
if ro_bbox.b > ro_bbox.t:
|
|
114
|
+
ro_bbox.b, ro_bbox.t = ro_bbox.t, ro_bbox.b
|
|
115
|
+
|
|
116
|
+
if x0 is None and y0 is None:
|
|
117
|
+
x0 = (ro_bbox.l + ro_bbox.r) / 2.0
|
|
118
|
+
y0 = (ro_bbox.b + ro_bbox.t) / 2.0
|
|
119
|
+
else:
|
|
120
|
+
assert x0 is not None
|
|
121
|
+
assert y0 is not None
|
|
122
|
+
|
|
123
|
+
x1 = (ro_bbox.l + ro_bbox.r) / 2.0
|
|
124
|
+
y1 = (ro_bbox.b + ro_bbox.t) / 2.0
|
|
125
|
+
|
|
126
|
+
draw = self._draw_arrow(
|
|
127
|
+
draw=draw,
|
|
128
|
+
arrow_coords=(x0, y0, x1, y1),
|
|
129
|
+
line_width=2,
|
|
130
|
+
color="red",
|
|
131
|
+
)
|
|
132
|
+
x0, y0 = x1, y1
|
|
131
133
|
return my_images
|
|
132
134
|
|
|
133
135
|
@override
|
|
@@ -872,7 +872,7 @@ class TextItem(DocItem):
|
|
|
872
872
|
:param add_content: bool: (Default value = True)
|
|
873
873
|
|
|
874
874
|
"""
|
|
875
|
-
from docling_core.experimental.serializer.doctags import (
|
|
875
|
+
from docling_core.transforms.serializer.doctags import (
|
|
876
876
|
DocTagsDocSerializer,
|
|
877
877
|
DocTagsParams,
|
|
878
878
|
)
|
|
@@ -930,7 +930,7 @@ class SectionHeaderItem(TextItem):
|
|
|
930
930
|
:param add_content: bool: (Default value = True)
|
|
931
931
|
|
|
932
932
|
"""
|
|
933
|
-
from docling_core.experimental.serializer.doctags import (
|
|
933
|
+
from docling_core.transforms.serializer.doctags import (
|
|
934
934
|
DocTagsDocSerializer,
|
|
935
935
|
DocTagsParams,
|
|
936
936
|
)
|
|
@@ -1020,7 +1020,7 @@ class CodeItem(FloatingItem, TextItem):
|
|
|
1020
1020
|
:param add_content: bool: (Default value = True)
|
|
1021
1021
|
|
|
1022
1022
|
"""
|
|
1023
|
-
from docling_core.experimental.serializer.doctags import (
|
|
1023
|
+
from docling_core.transforms.serializer.doctags import (
|
|
1024
1024
|
DocTagsDocSerializer,
|
|
1025
1025
|
DocTagsParams,
|
|
1026
1026
|
)
|
|
@@ -1091,7 +1091,7 @@ class PictureItem(FloatingItem):
|
|
|
1091
1091
|
image_placeholder: str = "<!-- image -->",
|
|
1092
1092
|
) -> str:
|
|
1093
1093
|
"""Export picture to Markdown format."""
|
|
1094
|
-
from docling_core.experimental.serializer.markdown import (
|
|
1094
|
+
from docling_core.transforms.serializer.markdown import (
|
|
1095
1095
|
MarkdownDocSerializer,
|
|
1096
1096
|
MarkdownParams,
|
|
1097
1097
|
)
|
|
@@ -1118,7 +1118,7 @@ class PictureItem(FloatingItem):
|
|
|
1118
1118
|
image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
|
|
1119
1119
|
) -> str:
|
|
1120
1120
|
"""Export picture to HTML format."""
|
|
1121
|
-
from docling_core.experimental.serializer.html import (
|
|
1121
|
+
from docling_core.transforms.serializer.html import (
|
|
1122
1122
|
HTMLDocSerializer,
|
|
1123
1123
|
HTMLParams,
|
|
1124
1124
|
)
|
|
@@ -1159,7 +1159,7 @@ class PictureItem(FloatingItem):
|
|
|
1159
1159
|
:param # not used at the moment
|
|
1160
1160
|
|
|
1161
1161
|
"""
|
|
1162
|
-
from docling_core.experimental.serializer.doctags import (
|
|
1162
|
+
from docling_core.transforms.serializer.doctags import (
|
|
1163
1163
|
DocTagsDocSerializer,
|
|
1164
1164
|
DocTagsParams,
|
|
1165
1165
|
)
|
|
@@ -1235,7 +1235,7 @@ class TableItem(FloatingItem):
|
|
|
1235
1235
|
def export_to_markdown(self, doc: Optional["DoclingDocument"] = None) -> str:
|
|
1236
1236
|
"""Export the table as markdown."""
|
|
1237
1237
|
if doc is not None:
|
|
1238
|
-
from docling_core.experimental.serializer.markdown import (
|
|
1238
|
+
from docling_core.transforms.serializer.markdown import (
|
|
1239
1239
|
MarkdownDocSerializer,
|
|
1240
1240
|
)
|
|
1241
1241
|
|
|
@@ -1282,7 +1282,7 @@ class TableItem(FloatingItem):
|
|
|
1282
1282
|
) -> str:
|
|
1283
1283
|
"""Export the table as html."""
|
|
1284
1284
|
if doc is not None:
|
|
1285
|
-
from docling_core.experimental.serializer.html import HTMLDocSerializer
|
|
1285
|
+
from docling_core.transforms.serializer.html import HTMLDocSerializer
|
|
1286
1286
|
|
|
1287
1287
|
serializer = HTMLDocSerializer(doc=doc)
|
|
1288
1288
|
text = serializer.serialize(item=self).text
|
|
@@ -1414,7 +1414,7 @@ class TableItem(FloatingItem):
|
|
|
1414
1414
|
:param add_caption: bool: (Default value = True)
|
|
1415
1415
|
|
|
1416
1416
|
"""
|
|
1417
|
-
from docling_core.experimental.serializer.doctags import (
|
|
1417
|
+
from docling_core.transforms.serializer.doctags import (
|
|
1418
1418
|
DocTagsDocSerializer,
|
|
1419
1419
|
DocTagsParams,
|
|
1420
1420
|
)
|
|
@@ -1512,7 +1512,7 @@ class KeyValueItem(FloatingItem):
|
|
|
1512
1512
|
:param add_content: bool: (Default value = True)
|
|
1513
1513
|
|
|
1514
1514
|
"""
|
|
1515
|
-
from docling_core.experimental.serializer.doctags import (
|
|
1515
|
+
from docling_core.transforms.serializer.doctags import (
|
|
1516
1516
|
DocTagsDocSerializer,
|
|
1517
1517
|
DocTagsParams,
|
|
1518
1518
|
)
|
|
@@ -2999,7 +2999,7 @@ class DoclingDocument(BaseModel):
|
|
|
2999
2999
|
:returns: The exported Markdown representation.
|
|
3000
3000
|
:rtype: str
|
|
3001
3001
|
"""
|
|
3002
|
-
from docling_core.experimental.serializer.markdown import (
|
|
3002
|
+
from docling_core.transforms.serializer.markdown import (
|
|
3003
3003
|
MarkdownDocSerializer,
|
|
3004
3004
|
MarkdownParams,
|
|
3005
3005
|
)
|
|
@@ -3153,7 +3153,7 @@ class DoclingDocument(BaseModel):
|
|
|
3153
3153
|
split_page_view: bool = False,
|
|
3154
3154
|
) -> str:
|
|
3155
3155
|
r"""Serialize to HTML."""
|
|
3156
|
-
from docling_core.experimental.serializer.html import (
|
|
3156
|
+
from docling_core.transforms.serializer.html import (
|
|
3157
3157
|
HTMLDocSerializer,
|
|
3158
3158
|
HTMLOutputStyle,
|
|
3159
3159
|
HTMLParams,
|
|
@@ -3195,9 +3195,9 @@ class DoclingDocument(BaseModel):
|
|
|
3195
3195
|
|
|
3196
3196
|
return ser_res.text
|
|
3197
3197
|
|
|
3198
|
+
@staticmethod
|
|
3198
3199
|
def load_from_doctags( # noqa: C901
|
|
3199
|
-
|
|
3200
|
-
doctag_document: DocTagsDocument,
|
|
3200
|
+
doctag_document: DocTagsDocument, document_name: str = "Document"
|
|
3201
3201
|
) -> "DoclingDocument":
|
|
3202
3202
|
r"""Load Docling document from lists of DocTags and Images."""
|
|
3203
3203
|
# Maps the recognized tag to a Docling label.
|
|
@@ -3221,6 +3221,8 @@ class DoclingDocument(BaseModel):
|
|
|
3221
3221
|
"key_value_region": DocItemLabel.KEY_VALUE_REGION,
|
|
3222
3222
|
}
|
|
3223
3223
|
|
|
3224
|
+
doc = DoclingDocument(name=document_name)
|
|
3225
|
+
|
|
3224
3226
|
def extract_bounding_box(text_chunk: str) -> Optional[BoundingBox]:
|
|
3225
3227
|
"""Extract <loc_...> coords from the chunk, normalized by / 500."""
|
|
3226
3228
|
coords = re.findall(r"<loc_(\d+)>", text_chunk)
|
|
@@ -3244,7 +3246,7 @@ class DoclingDocument(BaseModel):
|
|
|
3244
3246
|
caption_content = caption.group(1)
|
|
3245
3247
|
bbox = extract_bounding_box(caption_content)
|
|
3246
3248
|
caption_text = extract_inner_text(caption_content)
|
|
3247
|
-
caption_item =
|
|
3249
|
+
caption_item = doc.add_text(
|
|
3248
3250
|
label=DocItemLabel.CAPTION,
|
|
3249
3251
|
text=caption_text,
|
|
3250
3252
|
parent=None,
|
|
@@ -3567,7 +3569,7 @@ class DoclingDocument(BaseModel):
|
|
|
3567
3569
|
pg_width = 1
|
|
3568
3570
|
pg_height = 1
|
|
3569
3571
|
|
|
3570
|
-
|
|
3572
|
+
doc.add_page(
|
|
3571
3573
|
page_no=page_no,
|
|
3572
3574
|
size=Size(width=pg_width, height=pg_height),
|
|
3573
3575
|
image=ImageRef.from_pil(image=image, dpi=72) if image else None,
|
|
@@ -3624,9 +3626,9 @@ class DoclingDocument(BaseModel):
|
|
|
3624
3626
|
charspan=(0, 0),
|
|
3625
3627
|
page_no=page_no,
|
|
3626
3628
|
)
|
|
3627
|
-
|
|
3629
|
+
doc.add_table(data=table_data, prov=prov, caption=caption)
|
|
3628
3630
|
else:
|
|
3629
|
-
|
|
3631
|
+
doc.add_table(data=table_data, caption=caption)
|
|
3630
3632
|
|
|
3631
3633
|
elif tag_name in [DocItemLabel.PICTURE, DocItemLabel.CHART]:
|
|
3632
3634
|
caption, caption_bbox = extract_caption(full_chunk)
|
|
@@ -3646,7 +3648,7 @@ class DoclingDocument(BaseModel):
|
|
|
3646
3648
|
int(bbox.b * im_height),
|
|
3647
3649
|
)
|
|
3648
3650
|
cropped_image = image.crop(crop_box)
|
|
3649
|
-
pic =
|
|
3651
|
+
pic = doc.add_picture(
|
|
3650
3652
|
parent=None,
|
|
3651
3653
|
image=ImageRef.from_pil(image=cropped_image, dpi=72),
|
|
3652
3654
|
prov=(
|
|
@@ -3692,7 +3694,7 @@ class DoclingDocument(BaseModel):
|
|
|
3692
3694
|
else:
|
|
3693
3695
|
if bbox:
|
|
3694
3696
|
# In case we don't have access to a binary of the image
|
|
3695
|
-
pic =
|
|
3697
|
+
pic = doc.add_picture(
|
|
3696
3698
|
parent=None,
|
|
3697
3699
|
prov=ProvenanceItem(
|
|
3698
3700
|
bbox=bbox, charspan=(0, 0), page_no=page_no
|
|
@@ -3733,7 +3735,7 @@ class DoclingDocument(BaseModel):
|
|
|
3733
3735
|
key_value_data, kv_item_prov = parse_key_value_item(
|
|
3734
3736
|
full_chunk, image
|
|
3735
3737
|
)
|
|
3736
|
-
|
|
3738
|
+
doc.add_key_values(graph=key_value_data, prov=kv_item_prov)
|
|
3737
3739
|
elif tag_name in [
|
|
3738
3740
|
DocumentToken.ORDERED_LIST.value,
|
|
3739
3741
|
DocumentToken.UNORDERED_LIST.value,
|
|
@@ -3749,7 +3751,7 @@ class DoclingDocument(BaseModel):
|
|
|
3749
3751
|
)
|
|
3750
3752
|
li_pattern = re.compile(list_item_pattern, re.DOTALL)
|
|
3751
3753
|
# Add list group:
|
|
3752
|
-
new_list =
|
|
3754
|
+
new_list = doc.add_group(label=list_label, name="list")
|
|
3753
3755
|
# Process list items
|
|
3754
3756
|
for li_match in li_pattern.finditer(full_chunk):
|
|
3755
3757
|
enum_value += 1
|
|
@@ -3760,7 +3762,7 @@ class DoclingDocument(BaseModel):
|
|
|
3760
3762
|
li_bbox = extract_bounding_box(li_full_chunk) if image else None
|
|
3761
3763
|
text_content = extract_inner_text(li_full_chunk)
|
|
3762
3764
|
# Add list item
|
|
3763
|
-
|
|
3765
|
+
doc.add_list_item(
|
|
3764
3766
|
marker=enum_marker,
|
|
3765
3767
|
enumerated=(tag_name == DocumentToken.ORDERED_LIST.value),
|
|
3766
3768
|
parent=new_list,
|
|
@@ -3792,13 +3794,13 @@ class DoclingDocument(BaseModel):
|
|
|
3792
3794
|
if tag_name in [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]:
|
|
3793
3795
|
content_layer = ContentLayer.FURNITURE
|
|
3794
3796
|
|
|
3795
|
-
|
|
3797
|
+
doc.add_text(
|
|
3796
3798
|
label=doc_label,
|
|
3797
3799
|
text=text_content,
|
|
3798
3800
|
prov=element_prov,
|
|
3799
3801
|
content_layer=content_layer,
|
|
3800
3802
|
)
|
|
3801
|
-
return
|
|
3803
|
+
return doc
|
|
3802
3804
|
|
|
3803
3805
|
@deprecated("Use save_as_doctags instead.")
|
|
3804
3806
|
def save_as_document_tokens(self, *args, **kwargs):
|
|
@@ -3885,7 +3887,7 @@ class DoclingDocument(BaseModel):
|
|
|
3885
3887
|
:returns: The content of the document formatted as a DocTags string.
|
|
3886
3888
|
:rtype: str
|
|
3887
3889
|
"""
|
|
3888
|
-
from docling_core.experimental.serializer.doctags import (
|
|
3890
|
+
from docling_core.transforms.serializer.doctags import (
|
|
3889
3891
|
DocTagsDocSerializer,
|
|
3890
3892
|
DocTagsParams,
|
|
3891
3893
|
)
|
|
@@ -579,13 +579,17 @@ class SegmentedPdfPage(SegmentedPage):
|
|
|
579
579
|
with open(filename, "r", encoding="utf-8") as f:
|
|
580
580
|
return cls.model_validate_json(f.read())
|
|
581
581
|
|
|
582
|
-
def crop_text(
|
|
582
|
+
def crop_text(
|
|
583
|
+
self, cell_unit: TextCellUnit, bbox: BoundingBox, eps: float = 1.0
|
|
584
|
+
) -> str:
|
|
583
585
|
"""Extract text from cells within the specified bounding box.
|
|
584
586
|
|
|
585
587
|
Args:
|
|
586
588
|
cell_unit: Type of text unit to extract
|
|
587
589
|
bbox: Bounding box to extract from
|
|
588
590
|
eps: Epsilon value for position comparison
|
|
591
|
+
Returns:
|
|
592
|
+
Extracted text from the cells
|
|
589
593
|
"""
|
|
590
594
|
selection = []
|
|
591
595
|
for page_cell in self.iterate_cells(cell_unit):
|
|
@@ -605,7 +609,6 @@ class SegmentedPdfPage(SegmentedPage):
|
|
|
605
609
|
|
|
606
610
|
text = ""
|
|
607
611
|
for i, cell in enumerate(selection):
|
|
608
|
-
|
|
609
612
|
if i == 0:
|
|
610
613
|
text += cell.text
|
|
611
614
|
else:
|
|
@@ -619,6 +622,7 @@ class SegmentedPdfPage(SegmentedPage):
|
|
|
619
622
|
else:
|
|
620
623
|
text += " "
|
|
621
624
|
text += cell.text
|
|
625
|
+
return text
|
|
622
626
|
|
|
623
627
|
def export_to_textlines(
|
|
624
628
|
self,
|
|
@@ -640,7 +644,6 @@ class SegmentedPdfPage(SegmentedPage):
|
|
|
640
644
|
"""
|
|
641
645
|
lines: List[str] = []
|
|
642
646
|
for cell in self.iterate_cells(cell_unit):
|
|
643
|
-
|
|
644
647
|
line = ""
|
|
645
648
|
if add_location:
|
|
646
649
|
line += f"({cell.rect.r_x0:06.02f}, {cell.rect.r_y0:06.02f}) "
|
|
@@ -1104,7 +1107,6 @@ class SegmentedPdfPage(SegmentedPage):
|
|
|
1104
1107
|
|
|
1105
1108
|
# Draw each rectangle by connecting its four points
|
|
1106
1109
|
for line in self.lines:
|
|
1107
|
-
|
|
1108
1110
|
line.to_top_left_origin(page_height=page_height)
|
|
1109
1111
|
for segment in line.iterate_segments():
|
|
1110
1112
|
draw.line(
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.28.1 → docling_core-2.29.0}/docling_core/resources/schemas/doc/OCR-output.json
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.28.1 → docling_core-2.29.0}/docling_core/search/json_schema_to_search_mapper.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.28.1 → docling_core-2.29.0}/docling_core/transforms/chunker/tokenizer/__init__.py
RENAMED
|
File without changes
|
{docling_core-2.28.1 → docling_core-2.29.0}/docling_core/transforms/chunker/tokenizer/base.py
RENAMED
|
File without changes
|
{docling_core-2.28.1 → docling_core-2.29.0}/docling_core/transforms/chunker/tokenizer/huggingface.py
RENAMED
|
File without changes
|
{docling_core-2.28.1 → docling_core-2.29.0}/docling_core/transforms/chunker/tokenizer/openai.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|