docling 2.1.0__py3-none-any.whl → 2.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/abstract_backend.py +1 -0
- docling/backend/asciidoc_backend.py +435 -0
- docling/backend/docling_parse_backend.py +3 -3
- docling/backend/docling_parse_v2_backend.py +11 -3
- docling/backend/html_backend.py +43 -39
- docling/backend/md_backend.py +346 -0
- docling/backend/mspowerpoint_backend.py +62 -39
- docling/backend/msword_backend.py +12 -25
- docling/datamodel/base_models.py +15 -9
- docling/datamodel/document.py +33 -5
- docling/document_converter.py +18 -0
- {docling-2.1.0.dist-info → docling-2.2.1.dist-info}/METADATA +8 -7
- {docling-2.1.0.dist-info → docling-2.2.1.dist-info}/RECORD +16 -14
- {docling-2.1.0.dist-info → docling-2.2.1.dist-info}/LICENSE +0 -0
- {docling-2.1.0.dist-info → docling-2.2.1.dist-info}/WHEEL +0 -0
- {docling-2.1.0.dist-info → docling-2.2.1.dist-info}/entry_points.txt +0 -0
docling/datamodel/document.py
CHANGED
@@ -45,6 +45,8 @@ from docling.datamodel.base_models import (
|
|
45
45
|
ConversionStatus,
|
46
46
|
DocumentStream,
|
47
47
|
ErrorItem,
|
48
|
+
FormatToExtensions,
|
49
|
+
FormatToMimeType,
|
48
50
|
InputFormat,
|
49
51
|
MimeTypeToFormat,
|
50
52
|
Page,
|
@@ -143,11 +145,13 @@ class InputDocument(BaseModel):
|
|
143
145
|
self.valid = False
|
144
146
|
|
145
147
|
except (FileNotFoundError, OSError) as e:
|
148
|
+
self.valid = False
|
146
149
|
_log.exception(
|
147
150
|
f"File {self.file.name} not found or cannot be opened.", exc_info=e
|
148
151
|
)
|
149
152
|
# raise
|
150
153
|
except RuntimeError as e:
|
154
|
+
self.valid = False
|
151
155
|
_log.exception(
|
152
156
|
f"An unexpected error occurred while opening the document {self.file.name}",
|
153
157
|
exc_info=e,
|
@@ -166,6 +170,8 @@ class InputDocument(BaseModel):
|
|
166
170
|
)
|
167
171
|
|
168
172
|
self._backend = backend(self, path_or_stream=path_or_stream)
|
173
|
+
if not self._backend.is_valid():
|
174
|
+
self.valid = False
|
169
175
|
|
170
176
|
|
171
177
|
class DocumentFormat(str, Enum):
|
@@ -480,26 +486,48 @@ class _DocumentConversionInput(BaseModel):
|
|
480
486
|
else:
|
481
487
|
raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}")
|
482
488
|
|
483
|
-
def _guess_format(self, obj):
|
484
|
-
content =
|
489
|
+
def _guess_format(self, obj: Union[Path, DocumentStream]):
|
490
|
+
content = b"" # empty binary blob
|
491
|
+
format = None
|
492
|
+
|
485
493
|
if isinstance(obj, Path):
|
486
494
|
mime = filetype.guess_mime(str(obj))
|
487
495
|
if mime is None:
|
496
|
+
ext = obj.suffix[1:]
|
497
|
+
mime = self._mime_from_extension(ext)
|
498
|
+
if mime is None: # must guess from
|
488
499
|
with obj.open("rb") as f:
|
489
500
|
content = f.read(1024) # Read first 1KB
|
490
501
|
|
491
502
|
elif isinstance(obj, DocumentStream):
|
492
|
-
obj.stream.seek(0)
|
493
503
|
content = obj.stream.read(8192)
|
494
504
|
obj.stream.seek(0)
|
495
505
|
mime = filetype.guess_mime(content)
|
506
|
+
if mime is None:
|
507
|
+
ext = (
|
508
|
+
obj.name.rsplit(".", 1)[-1]
|
509
|
+
if ("." in obj.name and not obj.name.startswith("."))
|
510
|
+
else ""
|
511
|
+
)
|
512
|
+
mime = self._mime_from_extension(ext)
|
496
513
|
|
497
|
-
|
498
|
-
|
514
|
+
mime = mime or self._detect_html_xhtml(content)
|
515
|
+
mime = mime or "text/plain"
|
499
516
|
|
500
517
|
format = MimeTypeToFormat.get(mime)
|
501
518
|
return format
|
502
519
|
|
520
|
+
def _mime_from_extension(self, ext):
|
521
|
+
mime = None
|
522
|
+
if ext in FormatToExtensions[InputFormat.ASCIIDOC]:
|
523
|
+
mime = FormatToMimeType[InputFormat.ASCIIDOC][0]
|
524
|
+
elif ext in FormatToExtensions[InputFormat.HTML]:
|
525
|
+
mime = FormatToMimeType[InputFormat.HTML][0]
|
526
|
+
elif ext in FormatToExtensions[InputFormat.MD]:
|
527
|
+
mime = FormatToMimeType[InputFormat.MD][0]
|
528
|
+
|
529
|
+
return mime
|
530
|
+
|
503
531
|
def _detect_html_xhtml(self, content):
|
504
532
|
content_str = content.decode("ascii", errors="ignore").lower()
|
505
533
|
# Remove XML comments
|
docling/document_converter.py
CHANGED
@@ -8,8 +8,10 @@ from typing import Dict, Iterable, Iterator, List, Optional, Type
|
|
8
8
|
from pydantic import BaseModel, ConfigDict, model_validator, validate_call
|
9
9
|
|
10
10
|
from docling.backend.abstract_backend import AbstractDocumentBackend
|
11
|
+
from docling.backend.asciidoc_backend import AsciiDocBackend
|
11
12
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
12
13
|
from docling.backend.html_backend import HTMLDocumentBackend
|
14
|
+
from docling.backend.md_backend import MarkdownDocumentBackend
|
13
15
|
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
14
16
|
from docling.backend.msword_backend import MsWordDocumentBackend
|
15
17
|
from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
|
@@ -52,6 +54,16 @@ class PowerpointFormatOption(FormatOption):
|
|
52
54
|
backend: Type[AbstractDocumentBackend] = MsPowerpointDocumentBackend
|
53
55
|
|
54
56
|
|
57
|
+
class MarkdownFormatOption(FormatOption):
|
58
|
+
pipeline_cls: Type = SimplePipeline
|
59
|
+
backend: Type[AbstractDocumentBackend] = MarkdownDocumentBackend
|
60
|
+
|
61
|
+
|
62
|
+
class AsciiDocFormatOption(FormatOption):
|
63
|
+
pipeline_cls: Type = SimplePipeline
|
64
|
+
backend: Type[AbstractDocumentBackend] = AsciiDocBackend
|
65
|
+
|
66
|
+
|
55
67
|
class HTMLFormatOption(FormatOption):
|
56
68
|
pipeline_cls: Type = SimplePipeline
|
57
69
|
backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
|
@@ -74,6 +86,12 @@ _format_to_default_options = {
|
|
74
86
|
InputFormat.PPTX: FormatOption(
|
75
87
|
pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
|
76
88
|
),
|
89
|
+
InputFormat.MD: FormatOption(
|
90
|
+
pipeline_cls=SimplePipeline, backend=MarkdownDocumentBackend
|
91
|
+
),
|
92
|
+
InputFormat.ASCIIDOC: FormatOption(
|
93
|
+
pipeline_cls=SimplePipeline, backend=AsciiDocBackend
|
94
|
+
),
|
77
95
|
InputFormat.HTML: FormatOption(
|
78
96
|
pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
|
79
97
|
),
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.1
|
3
|
+
Version: 2.2.1
|
4
4
|
Summary: Docling PDF conversion package
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -22,13 +22,14 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
22
22
|
Provides-Extra: tesserocr
|
23
23
|
Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
|
24
24
|
Requires-Dist: certifi (>=2024.7.4)
|
25
|
-
Requires-Dist: deepsearch-glm (>=0.
|
26
|
-
Requires-Dist: docling-core (>=2.
|
25
|
+
Requires-Dist: deepsearch-glm (>=0.26.1,<0.27.0)
|
26
|
+
Requires-Dist: docling-core (>=2.2.1,<3.0.0)
|
27
27
|
Requires-Dist: docling-ibm-models (>=2.0.1,<3.0.0)
|
28
|
-
Requires-Dist: docling-parse (>=
|
28
|
+
Requires-Dist: docling-parse (>=2.0.0,<3.0.0)
|
29
29
|
Requires-Dist: easyocr (>=1.7,<2.0)
|
30
30
|
Requires-Dist: filetype (>=1.2.0,<2.0.0)
|
31
31
|
Requires-Dist: huggingface_hub (>=0.23,<1)
|
32
|
+
Requires-Dist: marko (>=2.1.2,<3.0.0)
|
32
33
|
Requires-Dist: pandas (>=2.1.4,<3.0.0)
|
33
34
|
Requires-Dist: pyarrow (>=16.1.0,<17.0.0)
|
34
35
|
Requires-Dist: pydantic (>=2.0.0,<3.0.0)
|
@@ -50,7 +51,7 @@ Description-Content-Type: text/markdown
|
|
50
51
|
|
51
52
|
<p align="center">
|
52
53
|
<a href="https://github.com/ds4sd/docling">
|
53
|
-
<img loading="lazy" alt="Docling" src="docs/assets/docling_processing.png" width="100%"/>
|
54
|
+
<img loading="lazy" alt="Docling" src="https://github.com/DS4SD/docling/raw/main/docs/assets/docling_processing.png" width="100%"/>
|
54
55
|
</a>
|
55
56
|
</p>
|
56
57
|
|
@@ -100,7 +101,7 @@ To convert individual documents, use `convert()`, for example:
|
|
100
101
|
```python
|
101
102
|
from docling.document_converter import DocumentConverter
|
102
103
|
|
103
|
-
source = "https://arxiv.org/pdf/2408.09869" #
|
104
|
+
source = "https://arxiv.org/pdf/2408.09869" # document per local path or URL
|
104
105
|
converter = DocumentConverter()
|
105
106
|
result = converter.convert(source)
|
106
107
|
print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]"
|
@@ -144,6 +145,6 @@ If you use Docling in your projects, please consider citing the following:
|
|
144
145
|
|
145
146
|
## License
|
146
147
|
|
147
|
-
The Docling codebase is under MIT license.
|
148
|
+
The Docling codebase is under MIT license.
|
148
149
|
For individual model usage, please refer to the model licenses found in the original packages.
|
149
150
|
|
@@ -1,21 +1,23 @@
|
|
1
1
|
docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
2
|
docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
|
-
docling/backend/abstract_backend.py,sha256
|
4
|
-
docling/backend/
|
5
|
-
docling/backend/
|
6
|
-
docling/backend/
|
7
|
-
docling/backend/
|
8
|
-
docling/backend/
|
3
|
+
docling/backend/abstract_backend.py,sha256=-or6kWVV7egQeyIuN-vI0Tr7Q1htalBZSlhgq_G2RdU,1678
|
4
|
+
docling/backend/asciidoc_backend.py,sha256=WW0eIanPIObcg5ci9YcnqFxwipmqRFsRY8zjZDdKvJA,14116
|
5
|
+
docling/backend/docling_parse_backend.py,sha256=TaIMli9vePd3fz9L6S4t75JPYZDpgYBLRGfWjbc9Hbk,7632
|
6
|
+
docling/backend/docling_parse_v2_backend.py,sha256=QlVU8NgqKvVCa99E8oDa2Xvy__kq30C-myGY3o9Qoq4,8588
|
7
|
+
docling/backend/html_backend.py,sha256=TUY5EVv3bo28A_w5CvBgNW4ZqL1d-VxOQPh1_taPHgU,15070
|
8
|
+
docling/backend/md_backend.py,sha256=tmuSCghjor9PqKIiVieCuZ4_t5JEjZMy3cq7u3yTgyU,14032
|
9
|
+
docling/backend/mspowerpoint_backend.py,sha256=J472AIH_IXvGg3D0FDmXhue1At_VSBD6n15c64Kxttw,15446
|
10
|
+
docling/backend/msword_backend.py,sha256=FAUdP74QxGKo2xMZQ4WQGYwtpIBCTJ_FG17PBpRwhxI,17230
|
9
11
|
docling/backend/pdf_backend.py,sha256=unnw7QiRE1VXg6Pj-eYrtnFGrp5SSYiI324OlFxyv6c,2050
|
10
12
|
docling/backend/pypdfium2_backend.py,sha256=MJX6fQqwK3r967fyAAs-RA_YIkeQvhgsLkQAgaBTgaE,8995
|
11
13
|
docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
12
14
|
docling/cli/main.py,sha256=NRVGz0z-3EBwYNMJGVnLtDBcfOeutaUyYdkM0ymRnGA,8008
|
13
15
|
docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
14
|
-
docling/datamodel/base_models.py,sha256=
|
15
|
-
docling/datamodel/document.py,sha256=
|
16
|
+
docling/datamodel/base_models.py,sha256=Mx0xR6YmRP8thu8CjOxjbGHLUJctqIvFwRZQ-8tQowY,5380
|
17
|
+
docling/datamodel/document.py,sha256=mkPXDms9jtPFY1pfBSicNaVRZwbbfzYFUj0dJDbMgG8,20612
|
16
18
|
docling/datamodel/pipeline_options.py,sha256=WNjluKC-Ww63ifkGMHwws8zIDHnOS1z5Hw7_j3S0qao,2446
|
17
19
|
docling/datamodel/settings.py,sha256=KBFVeQviR1hoCFjA1ZwuLuQ6EAAYR7saIa6EUYiOkHI,767
|
18
|
-
docling/document_converter.py,sha256=
|
20
|
+
docling/document_converter.py,sha256=T-Y2pWwbCIofW209XJ3wlc5TiGeQqMbDqgzcVWyZ_0Y,10227
|
19
21
|
docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
20
22
|
docling/models/base_model.py,sha256=wSBGAIAbLqrqP_SMtkzXMuyFvvzjVU6iCqgSNnGIR4Y,603
|
21
23
|
docling/models/base_ocr_model.py,sha256=SYelQRValiUo6M_p_9-J7CqNIOFO-EkK58j90SMsKQY,5028
|
@@ -35,8 +37,8 @@ docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
35
37
|
docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
|
36
38
|
docling/utils/layout_utils.py,sha256=vlN0rc8i0ayRGn3WnaG-pdmqEL00KKGl2zez3Gj-hrk,32074
|
37
39
|
docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
|
38
|
-
docling-2.1.
|
39
|
-
docling-2.1.
|
40
|
-
docling-2.1.
|
41
|
-
docling-2.1.
|
42
|
-
docling-2.1.
|
40
|
+
docling-2.2.1.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
41
|
+
docling-2.2.1.dist-info/METADATA,sha256=BOYg-5kaA2Fjxc2bwaJOuAd9LmrQerOzQLHCyaiQ1aE,6205
|
42
|
+
docling-2.2.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
43
|
+
docling-2.2.1.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
|
44
|
+
docling-2.2.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|