docling 1.20.0__tar.gz → 2.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. docling-2.1.0/PKG-INFO +149 -0
  2. docling-2.1.0/README.md +98 -0
  3. docling-2.1.0/docling/backend/abstract_backend.py +63 -0
  4. {docling-1.20.0 → docling-2.1.0}/docling/backend/docling_parse_backend.py +16 -12
  5. {docling-1.20.0 → docling-2.1.0}/docling/backend/docling_parse_v2_backend.py +15 -11
  6. docling-2.1.0/docling/backend/html_backend.py +425 -0
  7. docling-2.1.0/docling/backend/mspowerpoint_backend.py +375 -0
  8. docling-2.1.0/docling/backend/msword_backend.py +509 -0
  9. docling-2.1.0/docling/backend/pdf_backend.py +78 -0
  10. {docling-1.20.0 → docling-2.1.0}/docling/backend/pypdfium2_backend.py +15 -10
  11. {docling-1.20.0 → docling-2.1.0}/docling/cli/main.py +61 -60
  12. docling-2.1.0/docling/datamodel/base_models.py +204 -0
  13. docling-2.1.0/docling/datamodel/document.py +516 -0
  14. {docling-1.20.0 → docling-2.1.0}/docling/datamodel/pipeline_options.py +16 -0
  15. {docling-1.20.0 → docling-2.1.0}/docling/datamodel/settings.py +1 -0
  16. docling-2.1.0/docling/document_converter.py +260 -0
  17. docling-2.1.0/docling/models/base_model.py +25 -0
  18. {docling-1.20.0 → docling-2.1.0}/docling/models/base_ocr_model.py +19 -6
  19. docling-2.1.0/docling/models/ds_glm_model.py +284 -0
  20. docling-2.1.0/docling/models/easyocr_model.py +90 -0
  21. {docling-1.20.0 → docling-2.1.0}/docling/models/layout_model.py +130 -114
  22. docling-2.1.0/docling/models/page_assemble_model.py +172 -0
  23. docling-2.1.0/docling/models/page_preprocessing_model.py +61 -0
  24. docling-2.1.0/docling/models/table_structure_model.py +171 -0
  25. {docling-1.20.0 → docling-2.1.0}/docling/models/tesseract_ocr_cli_model.py +63 -56
  26. docling-2.1.0/docling/models/tesseract_ocr_model.py +130 -0
  27. docling-2.1.0/docling/pipeline/base_pipeline.py +190 -0
  28. docling-2.1.0/docling/pipeline/simple_pipeline.py +59 -0
  29. docling-2.1.0/docling/pipeline/standard_pdf_pipeline.py +198 -0
  30. {docling-1.20.0 → docling-2.1.0}/docling/utils/export.py +4 -3
  31. {docling-1.20.0 → docling-2.1.0}/docling/utils/layout_utils.py +17 -11
  32. {docling-1.20.0 → docling-2.1.0}/pyproject.toml +27 -10
  33. docling-1.20.0/PKG-INFO +0 -380
  34. docling-1.20.0/README.md +0 -332
  35. docling-1.20.0/docling/backend/abstract_backend.py +0 -68
  36. docling-1.20.0/docling/datamodel/base_models.py +0 -324
  37. docling-1.20.0/docling/datamodel/document.py +0 -461
  38. docling-1.20.0/docling/document_converter.py +0 -297
  39. docling-1.20.0/docling/models/ds_glm_model.py +0 -86
  40. docling-1.20.0/docling/models/easyocr_model.py +0 -85
  41. docling-1.20.0/docling/models/page_assemble_model.py +0 -148
  42. docling-1.20.0/docling/models/table_structure_model.py +0 -160
  43. docling-1.20.0/docling/models/tesseract_ocr_model.py +0 -122
  44. docling-1.20.0/docling/pipeline/base_model_pipeline.py +0 -18
  45. docling-1.20.0/docling/pipeline/standard_model_pipeline.py +0 -66
  46. {docling-1.20.0 → docling-2.1.0}/LICENSE +0 -0
  47. {docling-1.20.0 → docling-2.1.0}/docling/__init__.py +0 -0
  48. {docling-1.20.0 → docling-2.1.0}/docling/backend/__init__.py +0 -0
  49. {docling-1.20.0 → docling-2.1.0}/docling/cli/__init__.py +0 -0
  50. {docling-1.20.0 → docling-2.1.0}/docling/datamodel/__init__.py +0 -0
  51. {docling-1.20.0 → docling-2.1.0}/docling/models/__init__.py +0 -0
  52. {docling-1.20.0 → docling-2.1.0}/docling/pipeline/__init__.py +0 -0
  53. {docling-1.20.0 → docling-2.1.0}/docling/utils/__init__.py +0 -0
  54. {docling-1.20.0 → docling-2.1.0}/docling/utils/utils.py +0 -0
docling-2.1.0/PKG-INFO ADDED
@@ -0,0 +1,149 @@
1
+ Metadata-Version: 2.1
2
+ Name: docling
3
+ Version: 2.1.0
4
+ Summary: Docling PDF conversion package
5
+ Home-page: https://github.com/DS4SD/docling
6
+ License: MIT
7
+ Keywords: docling,convert,document,pdf,layout model,segmentation,table structure,table former
8
+ Author: Christoph Auer
9
+ Author-email: cau@zurich.ibm.com
10
+ Requires-Python: >=3.10,<4.0
11
+ Classifier: Development Status :: 5 - Production/Stable
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Operating System :: MacOS :: MacOS X
16
+ Classifier: Operating System :: POSIX :: Linux
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
+ Provides-Extra: tesserocr
23
+ Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
24
+ Requires-Dist: certifi (>=2024.7.4)
25
+ Requires-Dist: deepsearch-glm (>=0.25.0,<0.26.0)
26
+ Requires-Dist: docling-core (>=2.0.0,<3.0.0)
27
+ Requires-Dist: docling-ibm-models (>=2.0.1,<3.0.0)
28
+ Requires-Dist: docling-parse (>=1.6.0,<2.0.0)
29
+ Requires-Dist: easyocr (>=1.7,<2.0)
30
+ Requires-Dist: filetype (>=1.2.0,<2.0.0)
31
+ Requires-Dist: huggingface_hub (>=0.23,<1)
32
+ Requires-Dist: pandas (>=2.1.4,<3.0.0)
33
+ Requires-Dist: pyarrow (>=16.1.0,<17.0.0)
34
+ Requires-Dist: pydantic (>=2.0.0,<3.0.0)
35
+ Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
36
+ Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
37
+ Requires-Dist: python-docx (>=1.1.2,<2.0.0)
38
+ Requires-Dist: python-pptx (>=1.0.2,<2.0.0)
39
+ Requires-Dist: requests (>=2.32.3,<3.0.0)
40
+ Requires-Dist: rtree (>=1.3.0,<2.0.0)
41
+ Requires-Dist: scipy (>=1.14.1,<2.0.0)
42
+ Requires-Dist: tesserocr (>=2.7.1,<3.0.0) ; extra == "tesserocr"
43
+ Requires-Dist: torch (>=2.2.2,<2.3.0) ; sys_platform == "darwin" and platform_machine == "x86_64"
44
+ Requires-Dist: torch (>=2.2.2,<3.0.0) ; sys_platform != "darwin" or platform_machine != "x86_64"
45
+ Requires-Dist: torchvision (>=0,<1) ; sys_platform != "darwin" or platform_machine != "x86_64"
46
+ Requires-Dist: torchvision (>=0.17.2,<0.18.0) ; sys_platform == "darwin" and platform_machine == "x86_64"
47
+ Requires-Dist: typer (>=0.12.5,<0.13.0)
48
+ Project-URL: Repository, https://github.com/DS4SD/docling
49
+ Description-Content-Type: text/markdown
50
+
51
+ <p align="center">
52
+ <a href="https://github.com/ds4sd/docling">
53
+ <img loading="lazy" alt="Docling" src="docs/assets/docling_processing.png" width="100%"/>
54
+ </a>
55
+ </p>
56
+
57
+ # Docling
58
+
59
+ [![arXiv](https://img.shields.io/badge/arXiv-2408.09869-b31b1b.svg)](https://arxiv.org/abs/2408.09869)
60
+ [![Docs](https://img.shields.io/badge/docs-live-brightgreen)](https://ds4sd.github.io/docling/)
61
+ [![PyPI version](https://img.shields.io/pypi/v/docling)](https://pypi.org/project/docling/)
62
+ ![Python](https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12-blue)
63
+ [![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
64
+ [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
65
+ [![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/)
66
+ [![Pydantic v2](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/pydantic/pydantic/main/docs/badge/v2.json)](https://pydantic.dev)
67
+ [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
68
+ [![License MIT](https://img.shields.io/github/license/DS4SD/docling)](https://opensource.org/licenses/MIT)
69
+
70
+ Docling parses documents and exports them to the desired format with ease and speed.
71
+
72
+
73
+ ## Features
74
+
75
+ * 🗂️ Multi-format support for input (PDF, DOCX etc.) & output (Markdown, JSON etc.)
76
+ * 📑 Advanced PDF document understanding incl. page layout, reading order & table structures
77
+ * 📝 Metadata extraction, including title, authors, references & language
78
+ * 🤖 Seamless LlamaIndex 🦙 & LangChain 🦜🔗 integration for powerful RAG / QA applications
79
+ * 🔍 OCR support for scanned PDFs
80
+ * 💻 Simple and convenient CLI
81
+
82
+ Explore the [documentation](https://ds4sd.github.io/docling/) to discover plenty of examples and unlock the full power of Docling!
83
+
84
+
85
+ ## Installation
86
+
87
+ To use Docling, simply install `docling` from your package manager, e.g. pip:
88
+ ```bash
89
+ pip install docling
90
+ ```
91
+
92
+ Works on macOS, Linux and Windows environments, on both x86_64 and arm64 architectures.
93
+
94
+ More [detailed installation instructions](https://ds4sd.github.io/docling/installation/) are available in the docs.
95
+
96
+ ## Getting started
97
+
98
+ To convert individual documents, use `convert()`, for example:
99
+
100
+ ```python
101
+ from docling.document_converter import DocumentConverter
102
+
103
+ source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
104
+ converter = DocumentConverter()
105
+ result = converter.convert(source)
106
+ print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]"
107
+ ```
108
+
109
+
110
+ Check out [Getting started](https://ds4sd.github.io/docling/).
111
+ You will find lots of tuning options to leverage all the advanced capabilities.
112
+
113
+
114
+ ## Get help and support
115
+
116
+ Please feel free to connect with us using the [discussion section](https://github.com/DS4SD/docling/discussions).
117
+
118
+
119
+ ## Technical report
120
+
121
+ For more details on Docling's inner workings, check out the [Docling Technical Report](https://arxiv.org/abs/2408.09869).
122
+
123
+ ## Contributing
124
+
125
+ Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main/CONTRIBUTING.md) for details.
126
+
127
+
128
+ ## References
129
+
130
+ If you use Docling in your projects, please consider citing the following:
131
+
132
+ ```bib
133
+ @techreport{Docling,
134
+ author = {Deep Search Team},
135
+ month = {8},
136
+ title = {Docling Technical Report},
137
+ url = {https://arxiv.org/abs/2408.09869},
138
+ eprint = {2408.09869},
139
+ doi = {10.48550/arXiv.2408.09869},
140
+ version = {1.0.0},
141
+ year = {2024}
142
+ }
143
+ ```
144
+
145
+ ## License
146
+
147
+ The Docling codebase is under MIT license.
148
+ For individual model usage, please refer to the model licenses found in the original packages.
149
+
docling-2.1.0/README.md ADDED
@@ -0,0 +1,98 @@
1
+ <p align="center">
2
+ <a href="https://github.com/ds4sd/docling">
3
+ <img loading="lazy" alt="Docling" src="docs/assets/docling_processing.png" width="100%"/>
4
+ </a>
5
+ </p>
6
+
7
+ # Docling
8
+
9
+ [![arXiv](https://img.shields.io/badge/arXiv-2408.09869-b31b1b.svg)](https://arxiv.org/abs/2408.09869)
10
+ [![Docs](https://img.shields.io/badge/docs-live-brightgreen)](https://ds4sd.github.io/docling/)
11
+ [![PyPI version](https://img.shields.io/pypi/v/docling)](https://pypi.org/project/docling/)
12
+ ![Python](https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12-blue)
13
+ [![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
14
+ [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
15
+ [![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/)
16
+ [![Pydantic v2](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/pydantic/pydantic/main/docs/badge/v2.json)](https://pydantic.dev)
17
+ [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
18
+ [![License MIT](https://img.shields.io/github/license/DS4SD/docling)](https://opensource.org/licenses/MIT)
19
+
20
+ Docling parses documents and exports them to the desired format with ease and speed.
21
+
22
+
23
+ ## Features
24
+
25
+ * 🗂️ Multi-format support for input (PDF, DOCX etc.) & output (Markdown, JSON etc.)
26
+ * 📑 Advanced PDF document understanding incl. page layout, reading order & table structures
27
+ * 📝 Metadata extraction, including title, authors, references & language
28
+ * 🤖 Seamless LlamaIndex 🦙 & LangChain 🦜🔗 integration for powerful RAG / QA applications
29
+ * 🔍 OCR support for scanned PDFs
30
+ * 💻 Simple and convenient CLI
31
+
32
+ Explore the [documentation](https://ds4sd.github.io/docling/) to discover plenty of examples and unlock the full power of Docling!
33
+
34
+
35
+ ## Installation
36
+
37
+ To use Docling, simply install `docling` from your package manager, e.g. pip:
38
+ ```bash
39
+ pip install docling
40
+ ```
41
+
42
+ Works on macOS, Linux and Windows environments, on both x86_64 and arm64 architectures.
43
+
44
+ More [detailed installation instructions](https://ds4sd.github.io/docling/installation/) are available in the docs.
45
+
46
+ ## Getting started
47
+
48
+ To convert individual documents, use `convert()`, for example:
49
+
50
+ ```python
51
+ from docling.document_converter import DocumentConverter
52
+
53
+ source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
54
+ converter = DocumentConverter()
55
+ result = converter.convert(source)
56
+ print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]"
57
+ ```
58
+
59
+
60
+ Check out [Getting started](https://ds4sd.github.io/docling/).
61
+ You will find lots of tuning options to leverage all the advanced capabilities.
62
+
63
+
64
+ ## Get help and support
65
+
66
+ Please feel free to connect with us using the [discussion section](https://github.com/DS4SD/docling/discussions).
67
+
68
+
69
+ ## Technical report
70
+
71
+ For more details on Docling's inner workings, check out the [Docling Technical Report](https://arxiv.org/abs/2408.09869).
72
+
73
+ ## Contributing
74
+
75
+ Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main/CONTRIBUTING.md) for details.
76
+
77
+
78
+ ## References
79
+
80
+ If you use Docling in your projects, please consider citing the following:
81
+
82
+ ```bib
83
+ @techreport{Docling,
84
+ author = {Deep Search Team},
85
+ month = {8},
86
+ title = {Docling Technical Report},
87
+ url = {https://arxiv.org/abs/2408.09869},
88
+ eprint = {2408.09869},
89
+ doi = {10.48550/arXiv.2408.09869},
90
+ version = {1.0.0},
91
+ year = {2024}
92
+ }
93
+ ```
94
+
95
+ ## License
96
+
97
+ The Docling codebase is under MIT license.
98
+ For individual model usage, please refer to the model licenses found in the original packages.
docling-2.1.0/docling/backend/abstract_backend.py ADDED
@@ -0,0 +1,63 @@
1
+ from abc import ABC, abstractmethod
2
+ from io import BytesIO
3
+ from pathlib import Path
4
+ from typing import TYPE_CHECKING, Set, Union
5
+
6
+ from docling_core.types.doc import DoclingDocument
7
+
8
+ if TYPE_CHECKING:
9
+ from docling.datamodel.base_models import InputFormat
10
+ from docling.datamodel.document import InputDocument
11
+
12
+
13
+ class AbstractDocumentBackend(ABC):
14
+ @abstractmethod
15
+ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
16
+ self.path_or_stream = path_or_stream
17
+ self.document_hash = in_doc.document_hash
18
+ self.input_format = in_doc.format
19
+
20
+ @abstractmethod
21
+ def is_valid(self) -> bool:
22
+ pass
23
+
24
+ @classmethod
25
+ @abstractmethod
26
+ def supports_pagination(cls) -> bool:
27
+ pass
28
+
29
+ @abstractmethod
30
+ def unload(self):
31
+ if isinstance(self.path_or_stream, BytesIO):
32
+ self.path_or_stream.close()
33
+
34
+ self.path_or_stream = None
35
+
36
+ @classmethod
37
+ @abstractmethod
38
+ def supported_formats(cls) -> Set["InputFormat"]:
39
+ pass
40
+
41
+
42
+ class PaginatedDocumentBackend(AbstractDocumentBackend):
43
+ """PaginatedDocumentBackend.
44
+
45
+ A paginated document backend is a backend for documents that are organized in
46
+ pages; on top of the abstract backend interface it reports its page_count().
47
+ """
48
+
49
+ @abstractmethod
50
+ def page_count(self) -> int:
51
+ pass
52
+
53
+
54
+ class DeclarativeDocumentBackend(AbstractDocumentBackend):
55
+ """DeclarativeDocumentBackend.
56
+
57
+ A declarative document backend is a backend that can transform to DoclingDocument
58
+ straight without a recognition pipeline.
59
+ """
60
+
61
+ @abstractmethod
62
+ def convert(self) -> DoclingDocument:
63
+ pass
{docling-1.20.0 → docling-2.1.0}/docling/backend/docling_parse_backend.py
@@ -5,12 +5,14 @@ from pathlib import Path
5
5
  from typing import Iterable, List, Optional, Union
6
6
 
7
7
  import pypdfium2 as pdfium
8
+ from docling_core.types.doc import BoundingBox, CoordOrigin, Size
8
9
  from docling_parse.docling_parse import pdf_parser
9
10
  from PIL import Image, ImageDraw
10
11
  from pypdfium2 import PdfPage
11
12
 
12
- from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
13
- from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
13
+ from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
14
+ from docling.datamodel.base_models import Cell
15
+ from docling.datamodel.document import InputDocument
14
16
 
15
17
  _log = logging.getLogger(__name__)
16
18
 
@@ -177,8 +179,8 @@ class DoclingParsePageBackend(PdfPageBackend):
177
179
 
178
180
  return image
179
181
 
180
- def get_size(self) -> PageSize:
181
- return PageSize(width=self._ppage.get_width(), height=self._ppage.get_height())
182
+ def get_size(self) -> Size:
183
+ return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
182
184
 
183
185
  def unload(self):
184
186
  self._ppage = None
@@ -186,23 +188,25 @@ class DoclingParsePageBackend(PdfPageBackend):
186
188
 
187
189
 
188
190
  class DoclingParseDocumentBackend(PdfDocumentBackend):
189
- def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
190
- super().__init__(path_or_stream, document_hash)
191
+ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
192
+ super().__init__(in_doc, path_or_stream)
191
193
 
192
- self._pdoc = pdfium.PdfDocument(path_or_stream)
194
+ self._pdoc = pdfium.PdfDocument(self.path_or_stream)
193
195
  self.parser = pdf_parser()
194
196
 
195
197
  success = False
196
- if isinstance(path_or_stream, BytesIO):
198
+ if isinstance(self.path_or_stream, BytesIO):
197
199
  success = self.parser.load_document_from_bytesio(
198
- document_hash, path_or_stream
200
+ self.document_hash, self.path_or_stream
201
+ )
202
+ elif isinstance(self.path_or_stream, Path):
203
+ success = self.parser.load_document(
204
+ self.document_hash, str(self.path_or_stream)
199
205
  )
200
- elif isinstance(path_or_stream, Path):
201
- success = self.parser.load_document(document_hash, str(path_or_stream))
202
206
 
203
207
  if not success:
204
208
  raise RuntimeError(
205
- f"docling-parse could not load document {document_hash}."
209
+ f"docling-parse could not load document with hash {self.document_hash}."
206
210
  )
207
211
 
208
212
  def page_count(self) -> int:
{docling-1.20.0 → docling-2.1.0}/docling/backend/docling_parse_v2_backend.py
@@ -2,15 +2,19 @@ import logging
2
2
  import random
3
3
  from io import BytesIO
4
4
  from pathlib import Path
5
- from typing import Iterable, List, Optional, Union
5
+ from typing import TYPE_CHECKING, Iterable, List, Optional, Union
6
6
 
7
7
  import pypdfium2 as pdfium
8
+ from docling_core.types.doc import BoundingBox, CoordOrigin
8
9
  from docling_parse.docling_parse import pdf_parser_v2
9
10
  from PIL import Image, ImageDraw
10
11
  from pypdfium2 import PdfPage
11
12
 
12
- from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
13
- from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
13
+ from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
14
+ from docling.datamodel.base_models import Cell, Size
15
+
16
+ if TYPE_CHECKING:
17
+ from docling.datamodel.document import InputDocument
14
18
 
15
19
  _log = logging.getLogger(__name__)
16
20
 
@@ -190,8 +194,8 @@ class DoclingParseV2PageBackend(PdfPageBackend):
190
194
 
191
195
  return image
192
196
 
193
- def get_size(self) -> PageSize:
194
- return PageSize(width=self._ppage.get_width(), height=self._ppage.get_height())
197
+ def get_size(self) -> Size:
198
+ return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
195
199
 
196
200
  def unload(self):
197
201
  self._ppage = None
@@ -199,23 +203,23 @@ class DoclingParseV2PageBackend(PdfPageBackend):
199
203
 
200
204
 
201
205
  class DoclingParseV2DocumentBackend(PdfDocumentBackend):
202
- def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
203
- super().__init__(path_or_stream, document_hash)
206
+ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
207
+ super().__init__(in_doc, path_or_stream)
204
208
 
205
- self._pdoc = pdfium.PdfDocument(path_or_stream)
209
+ self._pdoc = pdfium.PdfDocument(self.path_or_stream)
206
210
  self.parser = pdf_parser_v2("fatal")
207
211
 
208
212
  success = False
209
213
  if isinstance(path_or_stream, BytesIO):
210
214
  success = self.parser.load_document_from_bytesio(
211
- document_hash, path_or_stream
215
+ self.document_hash, path_or_stream
212
216
  )
213
217
  elif isinstance(path_or_stream, Path):
214
- success = self.parser.load_document(document_hash, str(path_or_stream))
218
+ success = self.parser.load_document(self.document_hash, str(path_or_stream))
215
219
 
216
220
  if not success:
217
221
  raise RuntimeError(
218
- f"docling-parse could not load document {document_hash}."
222
+ f"docling-parse v2 could not load document {self.document_hash}."
219
223
  )
220
224
 
221
225
  def page_count(self) -> int: