docling-core 2.28.1__tar.gz → 2.30.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of docling-core might be problematic.
- {docling_core-2.28.1 → docling_core-2.30.0}/PKG-INFO +1 -1
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/transforms/chunker/hierarchical_chunker.py +5 -5
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/transforms/chunker/hybrid_chunker.py +4 -4
- {docling_core-2.28.1/docling_core/experimental → docling_core-2.30.0/docling_core/transforms}/serializer/common.py +1 -1
- {docling_core-2.28.1/docling_core/experimental → docling_core-2.30.0/docling_core/transforms}/serializer/doctags.py +2 -2
- {docling_core-2.28.1/docling_core/experimental → docling_core-2.30.0/docling_core/transforms}/serializer/html.py +29 -3
- {docling_core-2.28.1/docling_core/experimental → docling_core-2.30.0/docling_core/transforms}/serializer/markdown.py +2 -2
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/transforms/visualizer/layout_visualizer.py +33 -30
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/transforms/visualizer/reading_order_visualizer.py +52 -50
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/doc/document.py +59 -38
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/doc/labels.py +1 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/doc/page.py +25 -4
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/utils/legacy.py +1 -1
- {docling_core-2.28.1 → docling_core-2.30.0}/pyproject.toml +1 -1
- {docling_core-2.28.1 → docling_core-2.30.0}/LICENSE +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/README.md +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/__init__.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/cli/__init__.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/cli/view.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/experimental/__init__.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/py.typed +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/search/__init__.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/search/mapping.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/search/meta.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/search/package.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/transforms/__init__.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/transforms/chunker/__init__.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/transforms/chunker/base.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/transforms/chunker/tokenizer/__init__.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/transforms/chunker/tokenizer/base.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/transforms/chunker/tokenizer/huggingface.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/transforms/chunker/tokenizer/openai.py +0 -0
- {docling_core-2.28.1/docling_core/experimental → docling_core-2.30.0/docling_core/transforms}/serializer/__init__.py +0 -0
- {docling_core-2.28.1/docling_core/experimental → docling_core-2.30.0/docling_core/transforms}/serializer/base.py +0 -0
- {docling_core-2.28.1/docling_core/experimental → docling_core-2.30.0/docling_core/transforms}/serializer/html_styles.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/transforms/visualizer/__init__.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/transforms/visualizer/base.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/__init__.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/base.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/doc/__init__.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/doc/base.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/doc/tokens.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/doc/utils.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/gen/__init__.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/gen/generic.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/io/__init__.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/legacy_doc/__init__.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/legacy_doc/base.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/legacy_doc/document.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/legacy_doc/tokens.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/nlp/__init__.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/nlp/qa.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/nlp/qa_labels.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/rec/__init__.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/rec/attribute.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/rec/base.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/rec/predicate.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/rec/record.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/rec/statement.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/rec/subject.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/utils/__init__.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/utils/alias.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/utils/file.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/utils/generate_docs.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/utils/generate_jsonschema.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/utils/validate.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/utils/validators.py +0 -0
{docling_core-2.28.1 → docling_core-2.30.0}/docling_core/transforms/chunker/hierarchical_chunker.py
RENAMED
@@ -14,19 +14,19 @@ from typing import Any, ClassVar, Final, Iterator, Literal, Optional
 from pydantic import ConfigDict, Field, StringConstraints, field_validator
 from typing_extensions import Annotated, override

-from docling_core.
+from docling_core.search.package import VERSION_PATTERN
+from docling_core.transforms.chunker import BaseChunk, BaseChunker, BaseMeta
+from docling_core.transforms.serializer.base import (
     BaseDocSerializer,
     BaseSerializerProvider,
     BaseTableSerializer,
     SerializationResult,
 )
-from docling_core.
-from docling_core.
+from docling_core.transforms.serializer.common import create_ser_result
+from docling_core.transforms.serializer.markdown import (
     MarkdownDocSerializer,
     MarkdownParams,
 )
-from docling_core.search.package import VERSION_PATTERN
-from docling_core.transforms.chunker import BaseChunk, BaseChunker, BaseMeta
 from docling_core.types import DoclingDocument as DLDocument
 from docling_core.types.doc.base import ImageRefMode
 from docling_core.types.doc.document import (
{docling_core-2.28.1 → docling_core-2.30.0}/docling_core/transforms/chunker/hybrid_chunker.py
RENAMED
@@ -25,10 +25,6 @@ except ImportError:
         "`pip install 'docling-core[chunking-openai]'`"
     )

-from docling_core.experimental.serializer.base import (
-    BaseDocSerializer,
-    BaseSerializerProvider,
-)
 from docling_core.transforms.chunker import (
     BaseChunk,
     BaseChunker,
@@ -36,6 +32,10 @@ from docling_core.transforms.chunker import (
     DocMeta,
     HierarchicalChunker,
 )
+from docling_core.transforms.serializer.base import (
+    BaseDocSerializer,
+    BaseSerializerProvider,
+)
 from docling_core.types import DoclingDocument


{docling_core-2.28.1/docling_core/experimental → docling_core-2.30.0/docling_core/transforms}/serializer/common.py
RENAMED
@@ -14,7 +14,7 @@ from typing import Any, Iterable, Optional, Tuple, Union
 from pydantic import AnyUrl, BaseModel, NonNegativeInt, computed_field
 from typing_extensions import Self, override

-from docling_core.
+from docling_core.transforms.serializer.base import (
     BaseDocSerializer,
     BaseFallbackSerializer,
     BaseFormSerializer,
{docling_core-2.28.1/docling_core/experimental → docling_core-2.30.0/docling_core/transforms}/serializer/doctags.py
RENAMED
@@ -6,7 +6,7 @@ from typing import Any, Dict, List, Optional, Union
 from pydantic import BaseModel
 from typing_extensions import override

-from docling_core.
+from docling_core.transforms.serializer.base import (
     BaseDocSerializer,
     BaseFallbackSerializer,
     BaseFormSerializer,
@@ -18,7 +18,7 @@ from docling_core.experimental.serializer.base import (
     BaseTextSerializer,
     SerializationResult,
 )
-from docling_core.
+from docling_core.transforms.serializer.common import (
     CommonParams,
     DocSerializer,
     create_ser_result,
{docling_core-2.28.1/docling_core/experimental → docling_core-2.30.0/docling_core/transforms}/serializer/html.py
RENAMED
@@ -19,7 +19,7 @@ import latex2mathml.converter
 from pydantic import AnyUrl, BaseModel
 from typing_extensions import override

-from docling_core.
+from docling_core.transforms.serializer.base import (
     BaseDocSerializer,
     BaseFallbackSerializer,
     BaseFormSerializer,
@@ -31,12 +31,12 @@ from docling_core.experimental.serializer.base import (
     BaseTextSerializer,
     SerializationResult,
 )
-from docling_core.
+from docling_core.transforms.serializer.common import (
     CommonParams,
     DocSerializer,
     create_ser_result,
 )
-from docling_core.
+from docling_core.transforms.serializer.html_styles import (
     _get_css_for_single_column,
     _get_css_for_split_page,
 )
@@ -370,6 +370,13 @@ class HTMLPictureSerializer(BasePictureSerializer):
         **kwargs: Any,
     ) -> SerializationResult:
         """Export picture to HTML format."""
+
+        def get_img_row(imgb64: str, ind: int) -> str:
+            row = '<tr><td style="border: 2px solid black; padding: 8px;">'
+            row += f'<img src="data:image/png;base64,{imgb64}" alt="image {ind}">'
+            row += "</td></tr>\n"
+            return row
+
         params = HTMLParams(**kwargs)

         res_parts: list[SerializationResult] = []
@@ -393,6 +400,22 @@ class HTMLPictureSerializer(BasePictureSerializer):
                 and item.image.uri.scheme == "data"
             ):
                 img_text = f'<img src="{item.image.uri}">'
+            elif len(item.prov) > 1:  # more than 1 provenance
+
+                img_text = (
+                    '<table style="border-collapse: collapse; width: 100%;">\n'
+                )
+                for ind, prov in enumerate(item.prov):
+                    img = item.get_image(doc, prov_index=ind)
+
+                    if img is not None:
+                        imgb64 = item._image_to_base64(img)
+                        img_text += get_img_row(imgb64=imgb64, ind=ind)
+                    else:
+                        _logger.warning("Could not get image")
+
+                img_text += "</table>\n"
+
             else:
                 # get the item.image._pil or crop it out of the page-image
                 img = item.get_image(doc)
@@ -400,6 +423,9 @@ class HTMLPictureSerializer(BasePictureSerializer):
                 if img is not None:
                     imgb64 = item._image_to_base64(img)
                     img_text = f'<img src="data:image/png;base64,{imgb64}">'
+                else:
+                    _logger.warning("Could not get image")
+
             elif params.image_mode == ImageRefMode.REFERENCED:
                 if isinstance(item.image, ImageRef) and not (
                     isinstance(item.image.uri, AnyUrl)
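Net effect of the new branch: a picture with more than one provenance is serialized, under embedded image mode, as an HTML table holding one base64-encoded image row per provenance crop. A minimal sketch of exercising it through the document-level export; the input file name is illustrative and the sketch assumes the document carries page images:

    from docling_core.types import DoclingDocument
    from docling_core.types.doc.base import ImageRefMode

    # hypothetical input: a DoclingDocument JSON that includes page images
    doc = DoclingDocument.load_from_json("sample_with_page_images.json")

    # pictures with len(item.prov) > 1 are now emitted as a bordered table of
    # embedded <img> rows, one per provenance region, instead of a single crop
    html = doc.export_to_html(image_mode=ImageRefMode.EMBEDDED)
    print(html[:500])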
{docling_core-2.28.1/docling_core/experimental → docling_core-2.30.0/docling_core/transforms}/serializer/markdown.py
RENAMED
@@ -14,7 +14,7 @@ from pydantic import AnyUrl, BaseModel, PositiveInt
 from tabulate import tabulate
 from typing_extensions import override

-from docling_core.
+from docling_core.transforms.serializer.base import (
     BaseDocSerializer,
     BaseFallbackSerializer,
     BaseFormSerializer,
@@ -26,7 +26,7 @@ from docling_core.experimental.serializer.base import (
     BaseTextSerializer,
     SerializationResult,
 )
-from docling_core.
+from docling_core.transforms.serializer.common import (
     CommonParams,
     DocSerializer,
     _PageBreakSerResult,
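Taken together, these renames move the serializer package out of docling_core.experimental and into docling_core.transforms; the class names themselves are unchanged. A minimal sketch of the downstream import update and of whole-document Markdown serialization, assuming the serializer constructor and serialize() behave as in the docling-core documentation:

    # docling-core 2.28.1 (old location, inferred from the renamed paths above):
    #   from docling_core.experimental.serializer.markdown import MarkdownDocSerializer
    # docling-core 2.30.0 (new location):
    from docling_core.transforms.serializer.markdown import (
        MarkdownDocSerializer,
        MarkdownParams,
    )
    from docling_core.types import DoclingDocument

    doc = DoclingDocument.load_from_json("sample.json")  # hypothetical input
    serializer = MarkdownDocSerializer(doc=doc, params=MarkdownParams())
    print(serializer.serialize().text)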
{docling_core-2.28.1 → docling_core-2.30.0}/docling_core/transforms/visualizer/layout_visualizer.py
RENAMED
@@ -149,38 +149,41 @@ class LayoutVisualizer(BaseVisualizer):
                 continue
             if len(elem.prov) == 0:
                 continue  # Skip elements without provenances
-            prov = elem.prov[0]
-            page_nr = prov.page_no
-
-            if page_nr in my_images:
-                image = my_images[page_nr]
-            else:
-                raise RuntimeError(f"Cannot visualize page-image for {page_nr}")
-
-            if prev_page_nr is None or page_nr > prev_page_nr:  # new page begins
-                # complete previous drawing
-                if prev_page_nr is not None and prev_image and clusters:
-                    self._draw_clusters(
-                        image=prev_image,
-                        clusters=clusters,
-                        scale_x=prev_image.width / doc.pages[prev_page_nr].size.width,
-                        scale_y=prev_image.height / doc.pages[prev_page_nr].size.height,
-                    )
-                clusters = []

- [old lines 171-180: content not preserved in this view]
+            for prov in elem.prov:
+                page_nr = prov.page_no
+
+                if page_nr in my_images:
+                    image = my_images[page_nr]
+                else:
+                    raise RuntimeError(f"Cannot visualize page-image for {page_nr}")
+
+                if prev_page_nr is None or page_nr > prev_page_nr:  # new page begins
+                    # complete previous drawing
+                    if prev_page_nr is not None and prev_image and clusters:
+                        self._draw_clusters(
+                            image=prev_image,
+                            clusters=clusters,
+                            scale_x=prev_image.width
+                            / doc.pages[prev_page_nr].size.width,
+                            scale_y=prev_image.height
+                            / doc.pages[prev_page_nr].size.height,
+                        )
+                    clusters = []
+
+                tlo_bbox = prov.bbox.to_top_left_origin(
+                    page_height=doc.pages[prov.page_no].size.height
+                )
+                cluster = _TLCluster(
+                    id=idx,
+                    label=elem.label,
+                    brec=_TLBoundingRectangle.from_bounding_box(bbox=tlo_bbox),
+                    cells=[],
+                )
+                clusters.append(cluster)

- [old lines 182-183: content not preserved in this view]
+                prev_page_nr = page_nr
+                prev_image = image

             # complete last drawing
             if prev_page_nr is not None and prev_image and clusters:
{docling_core-2.28.1 → docling_core-2.30.0}/docling_core/transforms/visualizer/reading_order_visualizer.py
RENAMED
@@ -77,57 +77,59 @@ class ReadingOrderVisualizer(BaseVisualizer):
                 continue
             if len(elem.prov) == 0:
                 continue  # Skip elements without provenances
- [old lines 80-105: content not preserved in this view]
-            ro_bbox.l = round(ro_bbox.l * image.width)  # noqa: E741
-            ro_bbox.r = round(ro_bbox.r * image.width)
-            ro_bbox.t = round(ro_bbox.t * image.height)
-            ro_bbox.b = round(ro_bbox.b * image.height)
-
-            if ro_bbox.b > ro_bbox.t:
-                ro_bbox.b, ro_bbox.t = ro_bbox.t, ro_bbox.b
-
-            if x0 is None and y0 is None:
-                x0 = (ro_bbox.l + ro_bbox.r) / 2.0
-                y0 = (ro_bbox.b + ro_bbox.t) / 2.0
-            else:
-                assert x0 is not None
-                assert y0 is not None
-
-                x1 = (ro_bbox.l + ro_bbox.r) / 2.0
-                y1 = (ro_bbox.b + ro_bbox.t) / 2.0
-
-                draw = self._draw_arrow(
-                    draw=draw,
-                    arrow_coords=(x0, y0, x1, y1),
-                    line_width=2,
-                    color="red",
+
+            for prov in elem.prov:
+                page_no = prov.page_no
+                image = my_images.get(page_no)
+
+                if image is None or prev_page is None or page_no > prev_page:
+                    # new page begins
+                    prev_page = page_no
+                    x0 = y0 = None
+
+                    if image is None:
+                        page_image = doc.pages[page_no].image
+                        if (
+                            page_image is None
+                            or (pil_img := page_image.pil_image) is None
+                        ):
+                            raise RuntimeError(
+                                "Cannot visualize document without images"
+                            )
+                        else:
+                            image = deepcopy(pil_img)
+                            my_images[page_no] = image
+                    draw = ImageDraw.Draw(image)
+
+                tlo_bbox = prov.bbox.to_top_left_origin(
+                    page_height=doc.pages[prov.page_no].size.height
                 )
-
+                ro_bbox = tlo_bbox.normalized(doc.pages[prov.page_no].size)
+                ro_bbox.l = round(ro_bbox.l * image.width)  # noqa: E741
+                ro_bbox.r = round(ro_bbox.r * image.width)
+                ro_bbox.t = round(ro_bbox.t * image.height)
+                ro_bbox.b = round(ro_bbox.b * image.height)
+
+                if ro_bbox.b > ro_bbox.t:
+                    ro_bbox.b, ro_bbox.t = ro_bbox.t, ro_bbox.b
+
+                if x0 is None and y0 is None:
+                    x0 = (ro_bbox.l + ro_bbox.r) / 2.0
+                    y0 = (ro_bbox.b + ro_bbox.t) / 2.0
+                else:
+                    assert x0 is not None
+                    assert y0 is not None
+
+                    x1 = (ro_bbox.l + ro_bbox.r) / 2.0
+                    y1 = (ro_bbox.b + ro_bbox.t) / 2.0
+
+                    draw = self._draw_arrow(
+                        draw=draw,
+                        arrow_coords=(x0, y0, x1, y1),
+                        line_width=2,
+                        color="red",
+                    )
+                    x0, y0 = x1, y1
         return my_images

     @override
{docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/doc/document.py
RENAMED
@@ -790,7 +790,9 @@ class DocItem(

         return location

-    def get_image(
+    def get_image(
+        self, doc: "DoclingDocument", prov_index: int = 0
+    ) -> Optional[PILImage.Image]:
         """Returns the image of this DocItem.

         The function returns None if this DocItem has no valid provenance or
@@ -800,7 +802,7 @@ class DocItem(
         if not len(self.prov):
             return None

-        page = doc.pages.get(self.prov[
+        page = doc.pages.get(self.prov[prov_index].page_no)
         if page is None or page.size is None or page.image is None:
             return None

@@ -808,7 +810,7 @@ class DocItem(
         if not page_image:
             return None
         crop_bbox = (
-            self.prov[
+            self.prov[prov_index]
             .bbox.to_top_left_origin(page_height=page.size.height)
             .scale_to_size(old_size=page.size, new_size=page.image.size)
             # .scaled(scale=page_image.height / page.size.height)
@@ -872,7 +874,7 @@ class TextItem(DocItem):
         :param add_content: bool: (Default value = True)

         """
-        from docling_core.
+        from docling_core.transforms.serializer.doctags import (
             DocTagsDocSerializer,
             DocTagsParams,
         )
@@ -930,7 +932,7 @@ class SectionHeaderItem(TextItem):
         :param add_content: bool: (Default value = True)

         """
-        from docling_core.
+        from docling_core.transforms.serializer.doctags import (
             DocTagsDocSerializer,
             DocTagsParams,
         )
@@ -973,7 +975,9 @@ class FloatingItem(DocItem):
             text += cap.resolve(doc).text
         return text

-    def get_image(
+    def get_image(
+        self, doc: "DoclingDocument", prov_index: int = 0
+    ) -> Optional[PILImage.Image]:
         """Returns the image corresponding to this FloatingItem.

         This function returns the PIL image from self.image if one is available.
@@ -985,7 +989,7 @@ class FloatingItem(DocItem):
         """
         if self.image is not None:
             return self.image.pil_image
-        return super().get_image(doc=doc)
+        return super().get_image(doc=doc, prov_index=prov_index)


 class CodeItem(FloatingItem, TextItem):
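The new prov_index parameter lets callers crop a separate image for each provenance of an item, which is exactly what the HTML picture serializer above relies on. A minimal sketch; the input file name and output naming are illustrative, and the document is assumed to carry page images:

    from docling_core.types import DoclingDocument
    from docling_core.types.doc.document import DocItem

    doc = DoclingDocument.load_from_json("sample_with_page_images.json")  # hypothetical input

    for i, (item, _level) in enumerate(doc.iterate_items()):
        if not isinstance(item, DocItem):
            continue
        # items spanning several regions or pages carry one ProvenanceItem per region
        for ind, prov in enumerate(item.prov):
            img = item.get_image(doc, prov_index=ind)  # prov_index defaults to 0, as before
            if img is not None:
                img.save(f"item{i}_prov{ind}.png")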
@@ -1020,7 +1024,7 @@ class CodeItem(FloatingItem, TextItem):
         :param add_content: bool: (Default value = True)

         """
-        from docling_core.
+        from docling_core.transforms.serializer.doctags import (
             DocTagsDocSerializer,
             DocTagsParams,
         )
@@ -1073,7 +1077,7 @@ class PictureItem(FloatingItem):
             image_bytes = self.image._pil.tobytes()

             # Create a hash object (e.g., SHA-256)
-            hasher = hashlib.sha256()
+            hasher = hashlib.sha256(usedforsecurity=False)

             # Feed the image bytes into the hash object
             hasher.update(image_bytes)
@@ -1091,7 +1095,7 @@ class PictureItem(FloatingItem):
         image_placeholder: str = "<!-- image -->",
     ) -> str:
         """Export picture to Markdown format."""
-        from docling_core.
+        from docling_core.transforms.serializer.markdown import (
             MarkdownDocSerializer,
             MarkdownParams,
         )
@@ -1118,7 +1122,7 @@ class PictureItem(FloatingItem):
         image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
     ) -> str:
         """Export picture to HTML format."""
-        from docling_core.
+        from docling_core.transforms.serializer.html import (
             HTMLDocSerializer,
             HTMLParams,
         )
@@ -1159,7 +1163,7 @@ class PictureItem(FloatingItem):
         :param # not used at the moment

         """
-        from docling_core.
+        from docling_core.transforms.serializer.doctags import (
             DocTagsDocSerializer,
             DocTagsParams,
         )
@@ -1235,7 +1239,7 @@ class TableItem(FloatingItem):
     def export_to_markdown(self, doc: Optional["DoclingDocument"] = None) -> str:
         """Export the table as markdown."""
         if doc is not None:
-            from docling_core.
+            from docling_core.transforms.serializer.markdown import (
                 MarkdownDocSerializer,
             )

@@ -1282,7 +1286,7 @@ class TableItem(FloatingItem):
     ) -> str:
         """Export the table as html."""
         if doc is not None:
-            from docling_core.
+            from docling_core.transforms.serializer.html import HTMLDocSerializer

             serializer = HTMLDocSerializer(doc=doc)
             text = serializer.serialize(item=self).text
@@ -1414,7 +1418,7 @@ class TableItem(FloatingItem):
         :param add_caption: bool: (Default value = True)

         """
-        from docling_core.
+        from docling_core.transforms.serializer.doctags import (
             DocTagsDocSerializer,
             DocTagsParams,
         )
@@ -1512,7 +1516,7 @@ class KeyValueItem(FloatingItem):
         :param add_content: bool: (Default value = True)

         """
-        from docling_core.
+        from docling_core.transforms.serializer.doctags import (
             DocTagsDocSerializer,
             DocTagsParams,
         )
@@ -2657,16 +2661,25 @@ class DoclingDocument(BaseModel):
             if should_yield:
                 yield root, my_stack

-            # Handle picture traversal - only traverse children if requested
-            if isinstance(root, PictureItem) and not traverse_pictures:
-                return
-
             my_stack.append(-1)

+            allowed_pic_refs: set[str] = (
+                {r.cref for r in root.captions}
+                if (root_is_picture := isinstance(root, PictureItem))
+                else set()
+            )
+
             # Traverse children
             for child_ind, child_ref in enumerate(root.children):
-                my_stack[-1] = child_ind
                 child = child_ref.resolve(self)
+                if (
+                    root_is_picture
+                    and not traverse_pictures
+                    and isinstance(child, DocItem)
+                    and child.self_ref not in allowed_pic_refs
+                ):
+                    continue
+                my_stack[-1] = child_ind

                 if isinstance(child, NodeItem):
                     yield from self._iterate_items_with_stack(
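With this change, skipping picture sub-trees no longer drops everything below a PictureItem: caption items referenced by the picture are still yielded, while other items nested under it are skipped. A minimal sketch, assuming the public iterate_items method exposes the same traverse_pictures flag as the internal helper shown above (an assumption, not confirmed by this diff):

    from docling_core.types import DoclingDocument
    from docling_core.types.doc.document import TextItem

    doc = DoclingDocument.load_from_json("sample.json")  # hypothetical input

    # pictures are not traversed, yet their caption items still appear in the stream
    for item, level in doc.iterate_items(traverse_pictures=False):
        if isinstance(item, TextItem):
            print(level, item.label, item.text[:40])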
@@ -2999,7 +3012,7 @@ class DoclingDocument(BaseModel):
         :returns: The exported Markdown representation.
         :rtype: str
         """
-        from docling_core.
+        from docling_core.transforms.serializer.markdown import (
             MarkdownDocSerializer,
             MarkdownParams,
         )
@@ -3153,7 +3166,7 @@ class DoclingDocument(BaseModel):
         split_page_view: bool = False,
     ) -> str:
         r"""Serialize to HTML."""
-        from docling_core.
+        from docling_core.transforms.serializer.html import (
             HTMLDocSerializer,
             HTMLOutputStyle,
             HTMLParams,
@@ -3195,9 +3208,9 @@ class DoclingDocument(BaseModel):

         return ser_res.text

+    @staticmethod
     def load_from_doctags(  # noqa: C901
-
-        doctag_document: DocTagsDocument,
+        doctag_document: DocTagsDocument, document_name: str = "Document"
     ) -> "DoclingDocument":
         r"""Load Docling document from lists of DocTags and Images."""
         # Maps the recognized tag to a Docling label.
@@ -3221,6 +3234,8 @@ class DoclingDocument(BaseModel):
             "key_value_region": DocItemLabel.KEY_VALUE_REGION,
         }

+        doc = DoclingDocument(name=document_name)
+
         def extract_bounding_box(text_chunk: str) -> Optional[BoundingBox]:
             """Extract <loc_...> coords from the chunk, normalized by / 500."""
             coords = re.findall(r"<loc_(\d+)>", text_chunk)
@@ -3244,7 +3259,7 @@ class DoclingDocument(BaseModel):
                 caption_content = caption.group(1)
                 bbox = extract_bounding_box(caption_content)
                 caption_text = extract_inner_text(caption_content)
-                caption_item =
+                caption_item = doc.add_text(
                     label=DocItemLabel.CAPTION,
                     text=caption_text,
                     parent=None,
@@ -3567,7 +3582,7 @@ class DoclingDocument(BaseModel):
                 pg_width = 1
                 pg_height = 1

-
+            doc.add_page(
                 page_no=page_no,
                 size=Size(width=pg_width, height=pg_height),
                 image=ImageRef.from_pil(image=image, dpi=72) if image else None,
@@ -3595,7 +3610,9 @@ class DoclingDocument(BaseModel):
                 rf"{DocumentToken.UNORDERED_LIST.value}|"
                 rf"{DocItemLabel.KEY_VALUE_REGION}|"
                 rf"{DocumentToken.CHART.value}|"
-                rf"{DocumentToken.OTSL.value})
+                rf"{DocumentToken.OTSL.value})>"
+                rf"(?P<content>.*?)"
+                rf"(?:(?P<closed></(?P=tag)>)|(?P<eof>$))"
             )
             pattern = re.compile(tag_pattern, re.DOTALL)

@@ -3605,6 +3622,10 @@ class DoclingDocument(BaseModel):
                 tag_name = match.group("tag")

                 bbox = extract_bounding_box(full_chunk)  # Extracts first bbox
+                if not match.group("closed"):
+                    # no closing tag; only the existence of the item is recovered
+                    full_chunk = f"<{tag_name}></{tag_name}>"
+
                 doc_label = tag_to_doclabel.get(tag_name, DocItemLabel.PARAGRAPH)

                 if tag_name == DocumentToken.OTSL.value:
@@ -3624,9 +3645,9 @@ class DoclingDocument(BaseModel):
                             charspan=(0, 0),
                             page_no=page_no,
                         )
-
+                        doc.add_table(data=table_data, prov=prov, caption=caption)
                     else:
-
+                        doc.add_table(data=table_data, caption=caption)

                 elif tag_name in [DocItemLabel.PICTURE, DocItemLabel.CHART]:
                     caption, caption_bbox = extract_caption(full_chunk)
@@ -3646,7 +3667,7 @@ class DoclingDocument(BaseModel):
                             int(bbox.b * im_height),
                         )
                         cropped_image = image.crop(crop_box)
-                        pic =
+                        pic = doc.add_picture(
                             parent=None,
                             image=ImageRef.from_pil(image=cropped_image, dpi=72),
                             prov=(
@@ -3692,7 +3713,7 @@ class DoclingDocument(BaseModel):
                     else:
                         if bbox:
                             # In case we don't have access to an binary of an image
-                            pic =
+                            pic = doc.add_picture(
                                 parent=None,
                                 prov=ProvenanceItem(
                                     bbox=bbox, charspan=(0, 0), page_no=page_no
@@ -3733,7 +3754,7 @@ class DoclingDocument(BaseModel):
                     key_value_data, kv_item_prov = parse_key_value_item(
                         full_chunk, image
                     )
-
+                    doc.add_key_values(graph=key_value_data, prov=kv_item_prov)
                 elif tag_name in [
                     DocumentToken.ORDERED_LIST.value,
                     DocumentToken.UNORDERED_LIST.value,
@@ -3749,7 +3770,7 @@ class DoclingDocument(BaseModel):
                     )
                     li_pattern = re.compile(list_item_pattern, re.DOTALL)
                     # Add list group:
-                    new_list =
+                    new_list = doc.add_group(label=list_label, name="list")
                     # Pricess list items
                     for li_match in li_pattern.finditer(full_chunk):
                         enum_value += 1
@@ -3760,7 +3781,7 @@ class DoclingDocument(BaseModel):
                         li_bbox = extract_bounding_box(li_full_chunk) if image else None
                         text_content = extract_inner_text(li_full_chunk)
                         # Add list item
-
+                        doc.add_list_item(
                             marker=enum_marker,
                             enumerated=(tag_name == DocumentToken.ORDERED_LIST.value),
                             parent=new_list,
@@ -3792,13 +3813,13 @@ class DoclingDocument(BaseModel):
                     if tag_name in [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]:
                         content_layer = ContentLayer.FURNITURE

-
+                    doc.add_text(
                         label=doc_label,
                         text=text_content,
                         prov=element_prov,
                         content_layer=content_layer,
                     )
-        return
+        return doc

     @deprecated("Use save_as_doctags instead.")
     def save_as_document_tokens(self, *args, **kwargs):
@@ -3885,7 +3906,7 @@ class DoclingDocument(BaseModel):
         :returns: The content of the document formatted as a DocTags string.
         :rtype: str
         """
-        from docling_core.
+        from docling_core.transforms.serializer.doctags import (
             DocTagsDocSerializer,
             DocTagsParams,
         )
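load_from_doctags is now a staticmethod that builds and returns a fresh DoclingDocument, taking an optional document_name instead of mutating an existing instance. A minimal usage sketch; the DocTags string is illustrative, and the DocTagsDocument construction follows the docling-core API as commonly documented (treat it as an assumption):

    from PIL import Image

    from docling_core.types.doc import DoclingDocument
    from docling_core.types.doc.document import DocTagsDocument

    # illustrative DocTags snippet and a placeholder page image
    doctags = "<doctag><text><loc_10><loc_10><loc_100><loc_20>Hello</text></doctag>"
    page_image = Image.new("RGB", (500, 500), "white")

    # pair each DocTags page string with its page image
    dt_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [page_image])

    # new in 2.30.0: staticmethod that returns the constructed document
    doc = DoclingDocument.load_from_doctags(dt_doc, document_name="demo")
    print(doc.export_to_markdown())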
{docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/doc/page.py
RENAMED
@@ -472,8 +472,27 @@ class SegmentedPage(BaseModel):
     word_cells: List[TextCell] = []
     textline_cells: List[TextCell] = []

+    # These flags are set to differentiate if above lists of this SegmentedPage
+    # are empty (page had no content) or if they have not been computed (i.e. textline_cells may be present
+    # but word_cells are not)
+    has_chars: bool = False
+    has_words: bool = False
+    has_lines: bool = False
+
     image: Optional[ImageRef] = None

+    @model_validator(mode="after")
+    def validate_page(self) -> "SegmentedPage":
+        """Validate page."""
+        if len(self.textline_cells) > 0:
+            self.has_lines = True
+        if len(self.word_cells) > 0:
+            self.has_words = True
+        if len(self.char_cells) > 0:
+            self.has_chars = True
+
+        return self
+
     def iterate_cells(self, unit_type: TextCellUnit) -> Iterator[TextCell]:
         """Iterate through text cells of the specified unit type.

@@ -579,13 +598,17 @@ class SegmentedPdfPage(SegmentedPage):
         with open(filename, "r", encoding="utf-8") as f:
             return cls.model_validate_json(f.read())

-    def crop_text(
+    def crop_text(
+        self, cell_unit: TextCellUnit, bbox: BoundingBox, eps: float = 1.0
+    ) -> str:
         """Extract text from cells within the specified bounding box.

         Args:
             cell_unit: Type of text unit to extract
             bbox: Bounding box to extract from
             eps: Epsilon value for position comparison
+        Returns:
+            Extracted text from the cells
         """
         selection = []
         for page_cell in self.iterate_cells(cell_unit):
@@ -605,7 +628,6 @@ class SegmentedPdfPage(SegmentedPage):

         text = ""
         for i, cell in enumerate(selection):
-
             if i == 0:
                 text += cell.text
             else:
@@ -619,6 +641,7 @@ class SegmentedPdfPage(SegmentedPage):
             else:
                 text += " "
             text += cell.text
+        return text

     def export_to_textlines(
         self,
@@ -640,7 +663,6 @@ class SegmentedPdfPage(SegmentedPage):
         """
         lines: List[str] = []
         for cell in self.iterate_cells(cell_unit):
-
             line = ""
             if add_location:
                 line += f"({cell.rect.r_x0:06.02f}, {cell.rect.r_y0:06.02f}) "
@@ -1104,7 +1126,6 @@ class SegmentedPdfPage(SegmentedPage):

         # Draw each rectangle by connecting its four points
         for line in self.lines:
-
             line.to_top_left_origin(page_height=page_height)
             for segment in line.iterate_segments():
                 draw.line(
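The new has_chars/has_words/has_lines flags distinguish "this granularity was never extracted" from "the page genuinely has no content at that granularity"; the model validator sets them from whichever cell lists are populated at construction time. crop_text also now explicitly returns the extracted string. A minimal sketch of reading the flags back from a serialized page; the file name is illustrative and the WORD enum member is an assumption:

    from pathlib import Path

    from docling_core.types.doc.page import SegmentedPdfPage, TextCellUnit

    # hypothetical file: a SegmentedPdfPage previously dumped as JSON by a PDF backend
    page = SegmentedPdfPage.model_validate_json(Path("page_0.json").read_text())

    if page.has_words:
        words = list(page.iterate_cells(TextCellUnit.WORD))  # WORD member assumed
        print(f"{len(words)} word cells on this page")
    elif page.has_lines:
        print("words were not extracted; only text lines are available")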
{docling_core-2.28.1 → docling_core-2.30.0}/docling_core/utils/legacy.py
RENAMED
@@ -47,7 +47,7 @@ from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocu


 def _create_hash(string: str):
-    hasher = hashlib.sha256()
+    hasher = hashlib.sha256(usedforsecurity=False)
     hasher.update(string.encode("utf-8"))

     return hasher.hexdigest()
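Both this helper and the PictureItem image hashing above now pass usedforsecurity=False, which marks the digest as non-cryptographic so it keeps working on FIPS-restricted Python builds (the flag exists since Python 3.9). A minimal sketch of the same pattern:

    import hashlib

    def content_fingerprint(data: bytes) -> str:
        # usedforsecurity=False: the hash is only a content identifier, not a
        # security primitive, so FIPS-enforcing OpenSSL builds will not reject it
        hasher = hashlib.sha256(data, usedforsecurity=False)
        return hasher.hexdigest()

    print(content_fingerprint(b"docling"))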