docling 1.20.0__tar.gz → 2.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. docling-2.0.0/PKG-INFO +149 -0
  2. docling-2.0.0/README.md +98 -0
  3. docling-2.0.0/docling/backend/abstract_backend.py +63 -0
  4. {docling-1.20.0 → docling-2.0.0}/docling/backend/docling_parse_backend.py +16 -12
  5. {docling-1.20.0 → docling-2.0.0}/docling/backend/docling_parse_v2_backend.py +15 -11
  6. docling-2.0.0/docling/backend/html_backend.py +425 -0
  7. docling-2.0.0/docling/backend/mspowerpoint_backend.py +375 -0
  8. docling-2.0.0/docling/backend/msword_backend.py +509 -0
  9. docling-2.0.0/docling/backend/pdf_backend.py +78 -0
  10. {docling-1.20.0 → docling-2.0.0}/docling/backend/pypdfium2_backend.py +15 -10
  11. {docling-1.20.0 → docling-2.0.0}/docling/cli/main.py +61 -60
  12. docling-2.0.0/docling/datamodel/base_models.py +204 -0
  13. docling-2.0.0/docling/datamodel/document.py +507 -0
  14. {docling-1.20.0 → docling-2.0.0}/docling/datamodel/pipeline_options.py +13 -0
  15. {docling-1.20.0 → docling-2.0.0}/docling/datamodel/settings.py +1 -0
  16. docling-2.0.0/docling/document_converter.py +260 -0
  17. docling-2.0.0/docling/models/base_model.py +25 -0
  18. {docling-1.20.0 → docling-2.0.0}/docling/models/base_ocr_model.py +10 -5
  19. docling-2.0.0/docling/models/ds_glm_model.py +275 -0
  20. {docling-1.20.0 → docling-2.0.0}/docling/models/easyocr_model.py +4 -1
  21. {docling-1.20.0 → docling-2.0.0}/docling/models/layout_model.py +73 -61
  22. {docling-1.20.0 → docling-2.0.0}/docling/models/page_assemble_model.py +21 -5
  23. docling-2.0.0/docling/models/page_preprocessing_model.py +57 -0
  24. {docling-1.20.0 → docling-2.0.0}/docling/models/table_structure_model.py +34 -32
  25. {docling-1.20.0 → docling-2.0.0}/docling/models/tesseract_ocr_cli_model.py +8 -5
  26. {docling-1.20.0 → docling-2.0.0}/docling/models/tesseract_ocr_model.py +8 -5
  27. docling-2.0.0/docling/pipeline/base_pipeline.py +190 -0
  28. docling-2.0.0/docling/pipeline/simple_pipeline.py +59 -0
  29. docling-2.0.0/docling/pipeline/standard_pdf_pipeline.py +198 -0
  30. {docling-1.20.0 → docling-2.0.0}/docling/utils/export.py +4 -3
  31. {docling-1.20.0 → docling-2.0.0}/docling/utils/layout_utils.py +17 -11
  32. {docling-1.20.0 → docling-2.0.0}/pyproject.toml +25 -10
  33. docling-1.20.0/PKG-INFO +0 -380
  34. docling-1.20.0/README.md +0 -332
  35. docling-1.20.0/docling/backend/abstract_backend.py +0 -68
  36. docling-1.20.0/docling/datamodel/base_models.py +0 -324
  37. docling-1.20.0/docling/datamodel/document.py +0 -461
  38. docling-1.20.0/docling/document_converter.py +0 -297
  39. docling-1.20.0/docling/models/ds_glm_model.py +0 -86
  40. docling-1.20.0/docling/pipeline/base_model_pipeline.py +0 -18
  41. docling-1.20.0/docling/pipeline/standard_model_pipeline.py +0 -66
  42. {docling-1.20.0 → docling-2.0.0}/LICENSE +0 -0
  43. {docling-1.20.0 → docling-2.0.0}/docling/__init__.py +0 -0
  44. {docling-1.20.0 → docling-2.0.0}/docling/backend/__init__.py +0 -0
  45. {docling-1.20.0 → docling-2.0.0}/docling/cli/__init__.py +0 -0
  46. {docling-1.20.0 → docling-2.0.0}/docling/datamodel/__init__.py +0 -0
  47. {docling-1.20.0 → docling-2.0.0}/docling/models/__init__.py +0 -0
  48. {docling-1.20.0 → docling-2.0.0}/docling/pipeline/__init__.py +0 -0
  49. {docling-1.20.0 → docling-2.0.0}/docling/utils/__init__.py +0 -0
  50. {docling-1.20.0 → docling-2.0.0}/docling/utils/utils.py +0 -0
docling-2.0.0/PKG-INFO ADDED
@@ -0,0 +1,149 @@
1
+ Metadata-Version: 2.1
2
+ Name: docling
3
+ Version: 2.0.0
4
+ Summary: Docling PDF conversion package
5
+ Home-page: https://github.com/DS4SD/docling
6
+ License: MIT
7
+ Keywords: docling,convert,document,pdf,layout model,segmentation,table structure,table former
8
+ Author: Christoph Auer
9
+ Author-email: cau@zurich.ibm.com
10
+ Requires-Python: >=3.10,<4.0
11
+ Classifier: Development Status :: 5 - Production/Stable
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Operating System :: MacOS :: MacOS X
16
+ Classifier: Operating System :: POSIX :: Linux
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
+ Provides-Extra: tesserocr
23
+ Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
24
+ Requires-Dist: certifi (>=2024.7.4)
25
+ Requires-Dist: deepsearch-glm (>=0.25.0,<0.26.0)
26
+ Requires-Dist: docling-core (>=2.0.0,<3.0.0)
27
+ Requires-Dist: docling-ibm-models (>=2.0.1,<3.0.0)
28
+ Requires-Dist: docling-parse (>=1.6.0,<2.0.0)
29
+ Requires-Dist: easyocr (>=1.7,<2.0)
30
+ Requires-Dist: filetype (>=1.2.0,<2.0.0)
31
+ Requires-Dist: huggingface_hub (>=0.23,<1)
32
+ Requires-Dist: pandas (>=2.1.4,<3.0.0)
33
+ Requires-Dist: pyarrow (>=16.1.0,<17.0.0)
34
+ Requires-Dist: pydantic (>=2.0.0,<3.0.0)
35
+ Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
36
+ Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
37
+ Requires-Dist: python-docx (>=1.1.2,<2.0.0)
38
+ Requires-Dist: python-pptx (>=1.0.2,<2.0.0)
39
+ Requires-Dist: requests (>=2.32.3,<3.0.0)
40
+ Requires-Dist: rtree (>=1.3.0,<2.0.0)
41
+ Requires-Dist: scipy (>=1.14.1,<2.0.0)
42
+ Requires-Dist: tesserocr (>=2.7.1,<3.0.0) ; extra == "tesserocr"
43
+ Requires-Dist: torch (>=2.2.2,<2.3.0) ; sys_platform == "darwin" and platform_machine == "x86_64"
44
+ Requires-Dist: torch (>=2.2.2,<3.0.0) ; sys_platform != "darwin" or platform_machine != "x86_64"
45
+ Requires-Dist: torchvision (>=0,<1) ; sys_platform != "darwin" or platform_machine != "x86_64"
46
+ Requires-Dist: torchvision (>=0.17.2,<0.18.0) ; sys_platform == "darwin" and platform_machine == "x86_64"
47
+ Requires-Dist: typer (>=0.12.5,<0.13.0)
48
+ Project-URL: Repository, https://github.com/DS4SD/docling
49
+ Description-Content-Type: text/markdown
50
+
51
+ <p align="center">
52
+ <a href="https://github.com/ds4sd/docling">
53
+ <img loading="lazy" alt="Docling" src="https://github.com/DS4SD/docling/raw/main/docs/assets/logo.png" width="150" />
54
+ </a>
55
+ </p>
56
+
57
+ # Docling
58
+
59
+ [![arXiv](https://img.shields.io/badge/arXiv-2408.09869-b31b1b.svg)](https://arxiv.org/abs/2408.09869)
60
+ [![Docs](https://img.shields.io/badge/docs-live-brightgreen)](https://ds4sd.github.io/docling/)
61
+ [![PyPI version](https://img.shields.io/pypi/v/docling)](https://pypi.org/project/docling/)
62
+ ![Python](https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12-blue)
63
+ [![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
64
+ [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
65
+ [![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/)
66
+ [![Pydantic v2](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/pydantic/pydantic/main/docs/badge/v2.json)](https://pydantic.dev)
67
+ [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
68
+ [![License MIT](https://img.shields.io/github/license/DS4SD/docling)](https://opensource.org/licenses/MIT)
69
+
70
+ Docling parses documents and exports them to the desired format with ease and speed.
71
+
72
+ ## Features
73
+
74
+ * 🗂️ Multi-format support for input (PDF, DOCX etc.) & output (Markdown, JSON etc.)
75
+ * 📑 Advanced PDF document understanding incl. page layout, reading order & table structures
76
+ * 📝 Metadata extraction, including title, authors, references & language
77
+ * 🤖 Seamless LlamaIndex 🦙 & LangChain 🦜🔗 integration for powerful RAG / QA applications
78
+ * 🔍 OCR support for scanned PDFs
79
+ * 💻 Simple and convenient CLI
80
+
81
+ Explore the [documentation](https://ds4sd.github.io/docling/) to discover plenty of examples and unlock the full power of Docling!
82
+
83
+
84
+ ## Installation
85
+
86
+ To use Docling, simply install `docling` from your package manager, e.g. pip:
87
+ ```bash
88
+ pip install docling
89
+ ```
90
+
91
+ Works on macOS, Linux and Windows environments, on both x86_64 and arm64 architectures.
92
+
93
+ More [detailed installation instructions](https://ds4sd.github.io/docling/installation/) are available in the docs.
94
+
95
+ ## Getting started
96
+
97
+ To convert individual documents, use `convert()`, for example:
98
+
99
+ ```python
100
+ from docling.document_converter import DocumentConverter
101
+
102
+ source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
103
+ converter = DocumentConverter()
104
+ result = converter.convert(source)
105
+ print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]"
106
+ print(result.document.export_to_document_tokens()) # output: "<document><title><page_1><loc_20>..."
107
+ ```
108
+
109
+
110
+ Check out [Getting started](https://ds4sd.github.io/docling/).
111
+ You will find lots of tuning options to leverage all the advanced capabilities.
112
+
113
+
114
+ ## Get help and support
115
+
116
+ Please feel free to connect with us using the [discussion section](https://github.com/DS4SD/docling/discussions).
117
+
118
+
119
+ ## Technical report
120
+
121
+ For more details on Docling's inner workings, check out the [Docling Technical Report](https://arxiv.org/abs/2408.09869).
122
+
123
+ ## Contributing
124
+
125
+ Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main/CONTRIBUTING.md) for details.
126
+
127
+
128
+ ## References
129
+
130
+ If you use Docling in your projects, please consider citing the following:
131
+
132
+ ```bib
133
+ @techreport{Docling,
134
+ author = {Deep Search Team},
135
+ month = {8},
136
+ title = {Docling Technical Report},
137
+ url = {https://arxiv.org/abs/2408.09869},
138
+ eprint = {2408.09869},
139
+ doi = {10.48550/arXiv.2408.09869},
140
+ version = {1.0.0},
141
+ year = {2024}
142
+ }
143
+ ```
144
+
145
+ ## License
146
+
147
+ The Docling codebase is under MIT license.
148
+ For individual model usage, please refer to the model licenses found in the original packages.
149
+
@@ -0,0 +1,98 @@
1
+ <p align="center">
2
+ <a href="https://github.com/ds4sd/docling">
3
+ <img loading="lazy" alt="Docling" src="https://github.com/DS4SD/docling/raw/main/docs/assets/logo.png" width="150" />
4
+ </a>
5
+ </p>
6
+
7
+ # Docling
8
+
9
+ [![arXiv](https://img.shields.io/badge/arXiv-2408.09869-b31b1b.svg)](https://arxiv.org/abs/2408.09869)
10
+ [![Docs](https://img.shields.io/badge/docs-live-brightgreen)](https://ds4sd.github.io/docling/)
11
+ [![PyPI version](https://img.shields.io/pypi/v/docling)](https://pypi.org/project/docling/)
12
+ ![Python](https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12-blue)
13
+ [![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
14
+ [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
15
+ [![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/)
16
+ [![Pydantic v2](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/pydantic/pydantic/main/docs/badge/v2.json)](https://pydantic.dev)
17
+ [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
18
+ [![License MIT](https://img.shields.io/github/license/DS4SD/docling)](https://opensource.org/licenses/MIT)
19
+
20
+ Docling parses documents and exports them to the desired format with ease and speed.
21
+
22
+ ## Features
23
+
24
+ * 🗂️ Multi-format support for input (PDF, DOCX etc.) & output (Markdown, JSON etc.)
25
+ * 📑 Advanced PDF document understanding incl. page layout, reading order & table structures
26
+ * 📝 Metadata extraction, including title, authors, references & language
27
+ * 🤖 Seamless LlamaIndex 🦙 & LangChain 🦜🔗 integration for powerful RAG / QA applications
28
+ * 🔍 OCR support for scanned PDFs
29
+ * 💻 Simple and convenient CLI
30
+
31
+ Explore the [documentation](https://ds4sd.github.io/docling/) to discover plenty of examples and unlock the full power of Docling!
32
+
33
+
34
+ ## Installation
35
+
36
+ To use Docling, simply install `docling` from your package manager, e.g. pip:
37
+ ```bash
38
+ pip install docling
39
+ ```
40
+
41
+ Works on macOS, Linux and Windows environments, on both x86_64 and arm64 architectures.
42
+
43
+ More [detailed installation instructions](https://ds4sd.github.io/docling/installation/) are available in the docs.
44
+
45
+ ## Getting started
46
+
47
+ To convert individual documents, use `convert()`, for example:
48
+
49
+ ```python
50
+ from docling.document_converter import DocumentConverter
51
+
52
+ source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
53
+ converter = DocumentConverter()
54
+ result = converter.convert(source)
55
+ print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]"
56
+ print(result.document.export_to_document_tokens()) # output: "<document><title><page_1><loc_20>..."
57
+ ```
58
+
59
+
60
+ Check out [Getting started](https://ds4sd.github.io/docling/).
61
+ You will find lots of tuning options to leverage all the advanced capabilities.
62
+
63
+
64
+ ## Get help and support
65
+
66
+ Please feel free to connect with us using the [discussion section](https://github.com/DS4SD/docling/discussions).
67
+
68
+
69
+ ## Technical report
70
+
71
+ For more details on Docling's inner workings, check out the [Docling Technical Report](https://arxiv.org/abs/2408.09869).
72
+
73
+ ## Contributing
74
+
75
+ Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main/CONTRIBUTING.md) for details.
76
+
77
+
78
+ ## References
79
+
80
+ If you use Docling in your projects, please consider citing the following:
81
+
82
+ ```bib
83
+ @techreport{Docling,
84
+ author = {Deep Search Team},
85
+ month = {8},
86
+ title = {Docling Technical Report},
87
+ url = {https://arxiv.org/abs/2408.09869},
88
+ eprint = {2408.09869},
89
+ doi = {10.48550/arXiv.2408.09869},
90
+ version = {1.0.0},
91
+ year = {2024}
92
+ }
93
+ ```
94
+
95
+ ## License
96
+
97
+ The Docling codebase is under MIT license.
98
+ For individual model usage, please refer to the model licenses found in the original packages.
@@ -0,0 +1,63 @@
1
+ from abc import ABC, abstractmethod
2
+ from io import BytesIO
3
+ from pathlib import Path
4
+ from typing import TYPE_CHECKING, Set, Union
5
+
6
+ from docling_core.types.doc import DoclingDocument
7
+
8
+ if TYPE_CHECKING:
9
+ from docling.datamodel.base_models import InputFormat
10
+ from docling.datamodel.document import InputDocument
11
+
12
+
13
+ class AbstractDocumentBackend(ABC):
14
+ @abstractmethod
15
+ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
16
+ self.path_or_stream = path_or_stream
17
+ self.document_hash = in_doc.document_hash
18
+ self.input_format = in_doc.format
19
+
20
+ @abstractmethod
21
+ def is_valid(self) -> bool:
22
+ pass
23
+
24
+ @classmethod
25
+ @abstractmethod
26
+ def supports_pagination(cls) -> bool:
27
+ pass
28
+
29
+ @abstractmethod
30
+ def unload(self):
31
+ if isinstance(self.path_or_stream, BytesIO):
32
+ self.path_or_stream.close()
33
+
34
+ self.path_or_stream = None
35
+
36
+ @classmethod
37
+ @abstractmethod
38
+ def supported_formats(cls) -> Set["InputFormat"]:
39
+ pass
40
+
41
+
42
+ class PaginatedDocumentBackend(AbstractDocumentBackend):
43
+ """PaginatedDocumentBackend.
44
+
45
+ A paginated document backend is a backend for documents that are divided into
46
+ pages; subclasses report the number of pages through page_count().
47
+ """
48
+
49
+ @abstractmethod
50
+ def page_count(self) -> int:
51
+ pass
52
+
53
+
54
+ class DeclarativeDocumentBackend(AbstractDocumentBackend):
55
+ """DeclarativeDocumentBackend.
56
+
57
+ A declarative document backend is a backend that can transform to DoclingDocument
58
+ straight without a recognition pipeline.
59
+ """
60
+
61
+ @abstractmethod
62
+ def convert(self) -> DoclingDocument:
63
+ pass
@@ -5,12 +5,14 @@ from pathlib import Path
5
5
  from typing import Iterable, List, Optional, Union
6
6
 
7
7
  import pypdfium2 as pdfium
8
+ from docling_core.types.doc import BoundingBox, CoordOrigin, Size
8
9
  from docling_parse.docling_parse import pdf_parser
9
10
  from PIL import Image, ImageDraw
10
11
  from pypdfium2 import PdfPage
11
12
 
12
- from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
13
- from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
13
+ from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
14
+ from docling.datamodel.base_models import Cell
15
+ from docling.datamodel.document import InputDocument
14
16
 
15
17
  _log = logging.getLogger(__name__)
16
18
 
@@ -177,8 +179,8 @@ class DoclingParsePageBackend(PdfPageBackend):
177
179
 
178
180
  return image
179
181
 
180
- def get_size(self) -> PageSize:
181
- return PageSize(width=self._ppage.get_width(), height=self._ppage.get_height())
182
+ def get_size(self) -> Size:
183
+ return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
182
184
 
183
185
  def unload(self):
184
186
  self._ppage = None
@@ -186,23 +188,25 @@ class DoclingParsePageBackend(PdfPageBackend):
186
188
 
187
189
 
188
190
  class DoclingParseDocumentBackend(PdfDocumentBackend):
189
- def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
190
- super().__init__(path_or_stream, document_hash)
191
+ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
192
+ super().__init__(in_doc, path_or_stream)
191
193
 
192
- self._pdoc = pdfium.PdfDocument(path_or_stream)
194
+ self._pdoc = pdfium.PdfDocument(self.path_or_stream)
193
195
  self.parser = pdf_parser()
194
196
 
195
197
  success = False
196
- if isinstance(path_or_stream, BytesIO):
198
+ if isinstance(self.path_or_stream, BytesIO):
197
199
  success = self.parser.load_document_from_bytesio(
198
- document_hash, path_or_stream
200
+ self.document_hash, self.path_or_stream
201
+ )
202
+ elif isinstance(self.path_or_stream, Path):
203
+ success = self.parser.load_document(
204
+ self.document_hash, str(self.path_or_stream)
199
205
  )
200
- elif isinstance(path_or_stream, Path):
201
- success = self.parser.load_document(document_hash, str(path_or_stream))
202
206
 
203
207
  if not success:
204
208
  raise RuntimeError(
205
- f"docling-parse could not load document {document_hash}."
209
+ f"docling-parse could not load document with hash {self.document_hash}."
206
210
  )
207
211
 
208
212
  def page_count(self) -> int:
@@ -2,15 +2,19 @@ import logging
2
2
  import random
3
3
  from io import BytesIO
4
4
  from pathlib import Path
5
- from typing import Iterable, List, Optional, Union
5
+ from typing import TYPE_CHECKING, Iterable, List, Optional, Union
6
6
 
7
7
  import pypdfium2 as pdfium
8
+ from docling_core.types.doc import BoundingBox, CoordOrigin
8
9
  from docling_parse.docling_parse import pdf_parser_v2
9
10
  from PIL import Image, ImageDraw
10
11
  from pypdfium2 import PdfPage
11
12
 
12
- from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
13
- from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
13
+ from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
14
+ from docling.datamodel.base_models import Cell, Size
15
+
16
+ if TYPE_CHECKING:
17
+ from docling.datamodel.document import InputDocument
14
18
 
15
19
  _log = logging.getLogger(__name__)
16
20
 
@@ -190,8 +194,8 @@ class DoclingParseV2PageBackend(PdfPageBackend):
190
194
 
191
195
  return image
192
196
 
193
- def get_size(self) -> PageSize:
194
- return PageSize(width=self._ppage.get_width(), height=self._ppage.get_height())
197
+ def get_size(self) -> Size:
198
+ return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
195
199
 
196
200
  def unload(self):
197
201
  self._ppage = None
@@ -199,23 +203,23 @@ class DoclingParseV2PageBackend(PdfPageBackend):
199
203
 
200
204
 
201
205
  class DoclingParseV2DocumentBackend(PdfDocumentBackend):
202
- def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
203
- super().__init__(path_or_stream, document_hash)
206
+ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
207
+ super().__init__(in_doc, path_or_stream)
204
208
 
205
- self._pdoc = pdfium.PdfDocument(path_or_stream)
209
+ self._pdoc = pdfium.PdfDocument(self.path_or_stream)
206
210
  self.parser = pdf_parser_v2("fatal")
207
211
 
208
212
  success = False
209
213
  if isinstance(path_or_stream, BytesIO):
210
214
  success = self.parser.load_document_from_bytesio(
211
- document_hash, path_or_stream
215
+ self.document_hash, path_or_stream
212
216
  )
213
217
  elif isinstance(path_or_stream, Path):
214
- success = self.parser.load_document(document_hash, str(path_or_stream))
218
+ success = self.parser.load_document(self.document_hash, str(path_or_stream))
215
219
 
216
220
  if not success:
217
221
  raise RuntimeError(
218
- f"docling-parse could not load document {document_hash}."
222
+ f"docling-parse v2 could not load document {self.document_hash}."
219
223
  )
220
224
 
221
225
  def page_count(self) -> int: