docling 2.1.0__py3-none-any.whl → 2.2.1__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -45,6 +45,8 @@ from docling.datamodel.base_models import (
45
45
  ConversionStatus,
46
46
  DocumentStream,
47
47
  ErrorItem,
48
+ FormatToExtensions,
49
+ FormatToMimeType,
48
50
  InputFormat,
49
51
  MimeTypeToFormat,
50
52
  Page,
@@ -143,11 +145,13 @@ class InputDocument(BaseModel):
143
145
  self.valid = False
144
146
 
145
147
  except (FileNotFoundError, OSError) as e:
148
+ self.valid = False
146
149
  _log.exception(
147
150
  f"File {self.file.name} not found or cannot be opened.", exc_info=e
148
151
  )
149
152
  # raise
150
153
  except RuntimeError as e:
154
+ self.valid = False
151
155
  _log.exception(
152
156
  f"An unexpected error occurred while opening the document {self.file.name}",
153
157
  exc_info=e,
@@ -166,6 +170,8 @@ class InputDocument(BaseModel):
166
170
  )
167
171
 
168
172
  self._backend = backend(self, path_or_stream=path_or_stream)
173
+ if not self._backend.is_valid():
174
+ self.valid = False
169
175
 
170
176
 
171
177
  class DocumentFormat(str, Enum):
@@ -480,26 +486,48 @@ class _DocumentConversionInput(BaseModel):
480
486
  else:
481
487
  raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}")
482
488
 
483
- def _guess_format(self, obj):
484
- content = None
489
+ def _guess_format(self, obj: Union[Path, DocumentStream]):
490
+ content = b"" # empty binary blob
491
+ format = None
492
+
485
493
  if isinstance(obj, Path):
486
494
  mime = filetype.guess_mime(str(obj))
487
495
  if mime is None:
496
+ ext = obj.suffix[1:]
497
+ mime = self._mime_from_extension(ext)
498
+ if mime is None: # must guess from the file content
488
499
  with obj.open("rb") as f:
489
500
  content = f.read(1024) # Read first 1KB
490
501
 
491
502
  elif isinstance(obj, DocumentStream):
492
- obj.stream.seek(0)
493
503
  content = obj.stream.read(8192)
494
504
  obj.stream.seek(0)
495
505
  mime = filetype.guess_mime(content)
506
+ if mime is None:
507
+ ext = (
508
+ obj.name.rsplit(".", 1)[-1]
509
+ if ("." in obj.name and not obj.name.startswith("."))
510
+ else ""
511
+ )
512
+ mime = self._mime_from_extension(ext)
496
513
 
497
- if mime is None:
498
- mime = self._detect_html_xhtml(content)
514
+ mime = mime or self._detect_html_xhtml(content)
515
+ mime = mime or "text/plain"
499
516
 
500
517
  format = MimeTypeToFormat.get(mime)
501
518
  return format
502
519
 
520
+ def _mime_from_extension(self, ext):
521
+ mime = None
522
+ if ext in FormatToExtensions[InputFormat.ASCIIDOC]:
523
+ mime = FormatToMimeType[InputFormat.ASCIIDOC][0]
524
+ elif ext in FormatToExtensions[InputFormat.HTML]:
525
+ mime = FormatToMimeType[InputFormat.HTML][0]
526
+ elif ext in FormatToExtensions[InputFormat.MD]:
527
+ mime = FormatToMimeType[InputFormat.MD][0]
528
+
529
+ return mime
530
+
503
531
  def _detect_html_xhtml(self, content):
504
532
  content_str = content.decode("ascii", errors="ignore").lower()
505
533
  # Remove XML comments
@@ -8,8 +8,10 @@ from typing import Dict, Iterable, Iterator, List, Optional, Type
8
8
  from pydantic import BaseModel, ConfigDict, model_validator, validate_call
9
9
 
10
10
  from docling.backend.abstract_backend import AbstractDocumentBackend
11
+ from docling.backend.asciidoc_backend import AsciiDocBackend
11
12
  from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
12
13
  from docling.backend.html_backend import HTMLDocumentBackend
14
+ from docling.backend.md_backend import MarkdownDocumentBackend
13
15
  from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
14
16
  from docling.backend.msword_backend import MsWordDocumentBackend
15
17
  from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
@@ -52,6 +54,16 @@ class PowerpointFormatOption(FormatOption):
52
54
  backend: Type[AbstractDocumentBackend] = MsPowerpointDocumentBackend
53
55
 
54
56
 
57
+ class MarkdownFormatOption(FormatOption):
58
+ pipeline_cls: Type = SimplePipeline
59
+ backend: Type[AbstractDocumentBackend] = MarkdownDocumentBackend
60
+
61
+
62
+ class AsciiDocFormatOption(FormatOption):
63
+ pipeline_cls: Type = SimplePipeline
64
+ backend: Type[AbstractDocumentBackend] = AsciiDocBackend
65
+
66
+
55
67
  class HTMLFormatOption(FormatOption):
56
68
  pipeline_cls: Type = SimplePipeline
57
69
  backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
@@ -74,6 +86,12 @@ _format_to_default_options = {
74
86
  InputFormat.PPTX: FormatOption(
75
87
  pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
76
88
  ),
89
+ InputFormat.MD: FormatOption(
90
+ pipeline_cls=SimplePipeline, backend=MarkdownDocumentBackend
91
+ ),
92
+ InputFormat.ASCIIDOC: FormatOption(
93
+ pipeline_cls=SimplePipeline, backend=AsciiDocBackend
94
+ ),
77
95
  InputFormat.HTML: FormatOption(
78
96
  pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
79
97
  ),
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.1.0
3
+ Version: 2.2.1
4
4
  Summary: Docling PDF conversion package
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -22,13 +22,14 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
22
  Provides-Extra: tesserocr
23
23
  Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
24
24
  Requires-Dist: certifi (>=2024.7.4)
25
- Requires-Dist: deepsearch-glm (>=0.25.0,<0.26.0)
26
- Requires-Dist: docling-core (>=2.0.0,<3.0.0)
25
+ Requires-Dist: deepsearch-glm (>=0.26.1,<0.27.0)
26
+ Requires-Dist: docling-core (>=2.2.1,<3.0.0)
27
27
  Requires-Dist: docling-ibm-models (>=2.0.1,<3.0.0)
28
- Requires-Dist: docling-parse (>=1.6.0,<2.0.0)
28
+ Requires-Dist: docling-parse (>=2.0.0,<3.0.0)
29
29
  Requires-Dist: easyocr (>=1.7,<2.0)
30
30
  Requires-Dist: filetype (>=1.2.0,<2.0.0)
31
31
  Requires-Dist: huggingface_hub (>=0.23,<1)
32
+ Requires-Dist: marko (>=2.1.2,<3.0.0)
32
33
  Requires-Dist: pandas (>=2.1.4,<3.0.0)
33
34
  Requires-Dist: pyarrow (>=16.1.0,<17.0.0)
34
35
  Requires-Dist: pydantic (>=2.0.0,<3.0.0)
@@ -50,7 +51,7 @@ Description-Content-Type: text/markdown
50
51
 
51
52
  <p align="center">
52
53
  <a href="https://github.com/ds4sd/docling">
53
- <img loading="lazy" alt="Docling" src="docs/assets/docling_processing.png" width="100%"/>
54
+ <img loading="lazy" alt="Docling" src="https://github.com/DS4SD/docling/raw/main/docs/assets/docling_processing.png" width="100%"/>
54
55
  </a>
55
56
  </p>
56
57
 
@@ -100,7 +101,7 @@ To convert individual documents, use `convert()`, for example:
100
101
  ```python
101
102
  from docling.document_converter import DocumentConverter
102
103
 
103
- source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
104
+ source = "https://arxiv.org/pdf/2408.09869" # document given as a local path or URL
104
105
  converter = DocumentConverter()
105
106
  result = converter.convert(source)
106
107
  print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]"
@@ -144,6 +145,6 @@ If you use Docling in your projects, please consider citing the following:
144
145
 
145
146
  ## License
146
147
 
147
- The Docling codebase is under MIT license.
148
+ The Docling codebase is under MIT license.
148
149
  For individual model usage, please refer to the model licenses found in the original packages.
149
150
 
@@ -1,21 +1,23 @@
1
1
  docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
- docling/backend/abstract_backend.py,sha256=8Lh1gf1P9AnzlwB989OVBgLmokTpfI0LxYRfuvYTqoo,1646
4
- docling/backend/docling_parse_backend.py,sha256=UgBpopZIP5YkhwhybiqDnqVsSqv9DAAPFkafhfL0pPo,7623
5
- docling/backend/docling_parse_v2_backend.py,sha256=VY7MsiyqjN3Vl0UkyezriiVJMLbLRrQVuKjWaTgIUwY,8336
6
- docling/backend/html_backend.py,sha256=MlhEXaA0tgX_tLuQLnkex43gsKqpqHWnbkssxY4n_kc,14753
7
- docling/backend/mspowerpoint_backend.py,sha256=2UYfMMeWwgDtvIKQELCA-bYv5Z-rGvbMiBNcidNL_uE,14332
8
- docling/backend/msword_backend.py,sha256=4SDqZAZxLr6VV50OU3MRBAV8SwZMCyJCUbNVMVUpitc,17659
3
+ docling/backend/abstract_backend.py,sha256=-or6kWVV7egQeyIuN-vI0Tr7Q1htalBZSlhgq_G2RdU,1678
4
+ docling/backend/asciidoc_backend.py,sha256=WW0eIanPIObcg5ci9YcnqFxwipmqRFsRY8zjZDdKvJA,14116
5
+ docling/backend/docling_parse_backend.py,sha256=TaIMli9vePd3fz9L6S4t75JPYZDpgYBLRGfWjbc9Hbk,7632
6
+ docling/backend/docling_parse_v2_backend.py,sha256=QlVU8NgqKvVCa99E8oDa2Xvy__kq30C-myGY3o9Qoq4,8588
7
+ docling/backend/html_backend.py,sha256=TUY5EVv3bo28A_w5CvBgNW4ZqL1d-VxOQPh1_taPHgU,15070
8
+ docling/backend/md_backend.py,sha256=tmuSCghjor9PqKIiVieCuZ4_t5JEjZMy3cq7u3yTgyU,14032
9
+ docling/backend/mspowerpoint_backend.py,sha256=J472AIH_IXvGg3D0FDmXhue1At_VSBD6n15c64Kxttw,15446
10
+ docling/backend/msword_backend.py,sha256=FAUdP74QxGKo2xMZQ4WQGYwtpIBCTJ_FG17PBpRwhxI,17230
9
11
  docling/backend/pdf_backend.py,sha256=unnw7QiRE1VXg6Pj-eYrtnFGrp5SSYiI324OlFxyv6c,2050
10
12
  docling/backend/pypdfium2_backend.py,sha256=MJX6fQqwK3r967fyAAs-RA_YIkeQvhgsLkQAgaBTgaE,8995
11
13
  docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
14
  docling/cli/main.py,sha256=NRVGz0z-3EBwYNMJGVnLtDBcfOeutaUyYdkM0ymRnGA,8008
13
15
  docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
14
- docling/datamodel/base_models.py,sha256=Ha-DoRZoksjHSZHWqUSiQ79MTBEfY5ur8U_LVtyBRYU,5153
15
- docling/datamodel/document.py,sha256=FZQyJtHSeGBrZwFf-GGXDu-Dyp4iIl7VbVnTupmlUqk,19532
16
+ docling/datamodel/base_models.py,sha256=Mx0xR6YmRP8thu8CjOxjbGHLUJctqIvFwRZQ-8tQowY,5380
17
+ docling/datamodel/document.py,sha256=mkPXDms9jtPFY1pfBSicNaVRZwbbfzYFUj0dJDbMgG8,20612
16
18
  docling/datamodel/pipeline_options.py,sha256=WNjluKC-Ww63ifkGMHwws8zIDHnOS1z5Hw7_j3S0qao,2446
17
19
  docling/datamodel/settings.py,sha256=KBFVeQviR1hoCFjA1ZwuLuQ6EAAYR7saIa6EUYiOkHI,767
18
- docling/document_converter.py,sha256=S_t9hs2uZfXC38LC0hTaAihrSJIrCvnTiuY5SvUccgk,9587
20
+ docling/document_converter.py,sha256=T-Y2pWwbCIofW209XJ3wlc5TiGeQqMbDqgzcVWyZ_0Y,10227
19
21
  docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
20
22
  docling/models/base_model.py,sha256=wSBGAIAbLqrqP_SMtkzXMuyFvvzjVU6iCqgSNnGIR4Y,603
21
23
  docling/models/base_ocr_model.py,sha256=SYelQRValiUo6M_p_9-J7CqNIOFO-EkK58j90SMsKQY,5028
@@ -35,8 +37,8 @@ docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
35
37
  docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
36
38
  docling/utils/layout_utils.py,sha256=vlN0rc8i0ayRGn3WnaG-pdmqEL00KKGl2zez3Gj-hrk,32074
37
39
  docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
38
- docling-2.1.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
39
- docling-2.1.0.dist-info/METADATA,sha256=SorLD4OMK1dU3bX5eqnw5GHqPrPwdhQ7JfYvOyajE20,6109
40
- docling-2.1.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
41
- docling-2.1.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
42
- docling-2.1.0.dist-info/RECORD,,
40
+ docling-2.2.1.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
41
+ docling-2.2.1.dist-info/METADATA,sha256=BOYg-5kaA2Fjxc2bwaJOuAd9LmrQerOzQLHCyaiQ1aE,6205
42
+ docling-2.2.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
43
+ docling-2.2.1.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
44
+ docling-2.2.1.dist-info/RECORD,,