docling 1.11.0__py3-none-any.whl → 1.12.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,10 +1,13 @@
1
1
  from abc import ABC, abstractmethod
2
2
  from io import BytesIO
3
3
  from pathlib import Path
4
- from typing import Any, Iterable, Optional, Union
4
+ from typing import TYPE_CHECKING, Any, Iterable, Optional, Union
5
5
 
6
6
  from PIL import Image
7
7
 
8
+ if TYPE_CHECKING:
9
+ from docling.datamodel.base_models import BoundingBox, Cell, PageSize
10
+
8
11
 
9
12
  class PdfPageBackend(ABC):
10
13
 
@@ -17,12 +20,12 @@ class PdfPageBackend(ABC):
17
20
  pass
18
21
 
19
22
  @abstractmethod
20
- def get_bitmap_rects(self, scale: int = 1) -> Iterable["BoundingBox"]:
23
+ def get_bitmap_rects(self, float: int = 1) -> Iterable["BoundingBox"]:
21
24
  pass
22
25
 
23
26
  @abstractmethod
24
27
  def get_page_image(
25
- self, scale: int = 1, cropbox: Optional["BoundingBox"] = None
28
+ self, scale: float = 1, cropbox: Optional["BoundingBox"] = None
26
29
  ) -> Image.Image:
27
30
  pass
28
31
 
@@ -2,7 +2,7 @@ import logging
2
2
  import random
3
3
  from io import BytesIO
4
4
  from pathlib import Path
5
- from typing import Iterable, Optional, Union
5
+ from typing import Iterable, List, Optional, Union
6
6
 
7
7
  import pypdfium2 as pdfium
8
8
  from docling_parse.docling_parse import pdf_parser
@@ -22,7 +22,6 @@ class DoclingParsePageBackend(PdfPageBackend):
22
22
  self._ppage = page_obj
23
23
  parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)
24
24
 
25
- self._dpage = None
26
25
  self.valid = "pages" in parsed_page
27
26
  if self.valid:
28
27
  self._dpage = parsed_page["pages"][0]
@@ -68,7 +67,7 @@ class DoclingParsePageBackend(PdfPageBackend):
68
67
  return text_piece
69
68
 
70
69
  def get_text_cells(self) -> Iterable[Cell]:
71
- cells = []
70
+ cells: List[Cell] = []
72
71
  cell_counter = 0
73
72
 
74
73
  if not self.valid:
@@ -130,7 +129,7 @@ class DoclingParsePageBackend(PdfPageBackend):
130
129
 
131
130
  return cells
132
131
 
133
- def get_bitmap_rects(self, scale: int = 1) -> Iterable[BoundingBox]:
132
+ def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
134
133
  AREA_THRESHOLD = 32 * 32
135
134
 
136
135
  for i in range(len(self._dpage["images"])):
@@ -145,7 +144,7 @@ class DoclingParsePageBackend(PdfPageBackend):
145
144
  yield cropbox
146
145
 
147
146
  def get_page_image(
148
- self, scale: int = 1, cropbox: Optional[BoundingBox] = None
147
+ self, scale: float = 1, cropbox: Optional[BoundingBox] = None
149
148
  ) -> Image.Image:
150
149
 
151
150
  page_size = self.get_size()
@@ -7,7 +7,7 @@ from typing import Iterable, List, Optional, Union
7
7
  import pypdfium2 as pdfium
8
8
  import pypdfium2.raw as pdfium_c
9
9
  from PIL import Image, ImageDraw
10
- from pypdfium2 import PdfPage
10
+ from pypdfium2 import PdfPage, PdfTextPage
11
11
  from pypdfium2._helpers.misc import PdfiumError
12
12
 
13
13
  from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
@@ -29,12 +29,12 @@ class PyPdfiumPageBackend(PdfPageBackend):
29
29
  exc_info=True,
30
30
  )
31
31
  self.valid = False
32
- self.text_page = None
32
+ self.text_page: Optional[PdfTextPage] = None
33
33
 
34
34
  def is_valid(self) -> bool:
      """Return the ``valid`` flag stored on this page backend."""
      return self.valid
36
36
 
37
- def get_bitmap_rects(self, scale: int = 1) -> Iterable[BoundingBox]:
37
+ def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
38
38
  AREA_THRESHOLD = 32 * 32
39
39
  for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
40
40
  pos = obj.get_pos()
@@ -189,7 +189,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
189
189
  return cells
190
190
 
191
191
  def get_page_image(
192
- self, scale: int = 1, cropbox: Optional[BoundingBox] = None
192
+ self, scale: float = 1, cropbox: Optional[BoundingBox] = None
193
193
  ) -> Image.Image:
194
194
 
195
195
  page_size = self.get_size()
File without changes
docling/cli/main.py ADDED
@@ -0,0 +1,258 @@
1
+ import importlib
2
+ import json
3
+ import logging
4
+ import time
5
+ import warnings
6
+ from enum import Enum
7
+ from pathlib import Path
8
+ from typing import Annotated, Iterable, List, Optional
9
+
10
+ import typer
11
+ from pydantic import AnyUrl
12
+
13
+ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
14
+ from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
15
+ from docling.datamodel.base_models import ConversionStatus, PipelineOptions
16
+ from docling.datamodel.document import ConversionResult, DocumentConversionInput
17
+ from docling.document_converter import DocumentConverter
18
+
19
# Silence known-noisy warnings from third-party packages. ``module`` is a
# regular expression matched against the start of the emitting module's name.
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")

# Module-level logger used by the CLI helpers below.
_log = logging.getLogger(__name__)
from rich.console import Console

# Console bound to stderr, used for user-facing error messages.
err_console = Console(stderr=True)


# The Typer application object; commands are registered on it via
# ``@app.command`` and it is invoked when this module runs as a script.
app = typer.Typer(
    name="Docling",
    no_args_is_help=True,
    add_completion=False,
    pretty_exceptions_enable=False,
)
34
+
35
+
36
def version_callback(value: bool):
    """Print the installed versions of Docling and its core packages, then exit.

    Intended as the eager callback of the ``--version`` command-line option.

    Args:
        value: Truthy when the option was given; otherwise this is a no-op.

    Raises:
        typer.Exit: Always raised after the versions have been printed.
    """
    if not value:
        return

    # Import the submodule explicitly: ``import importlib`` alone does not
    # guarantee that ``importlib.metadata`` has been loaded.
    from importlib import metadata

    packages = (
        ("Docling", "docling"),
        ("Docling Core", "docling-core"),
        ("Docling IBM Models", "docling-ibm-models"),
        ("Docling Parse", "docling-parse"),
    )
    # Resolve every version before printing so that a failed lookup does not
    # leave partial output behind.
    resolved = [(label, metadata.version(dist)) for label, dist in packages]
    for label, dist_version in resolved:
        print(f"{label} version: {dist_version}")
    raise typer.Exit()
47
+
48
+
49
class Backend(str, Enum):
    """Enumeration of the PDF backend options."""

    PYPDFIUM2 = "pypdfium2"
    DOCLING = "docling"
53
+
54
+
55
def export_documents(
    conv_results: Iterable[ConversionResult],
    output_dir: Path,
    export_json: bool,
    export_md: bool,
    export_txt: bool,
    export_doctags: bool,
):
    """Write every successfully converted document in the requested formats.

    For each result whose status is ``ConversionStatus.SUCCESS`` one file per
    enabled format is written to ``output_dir``, named after the stem of the
    input file. Failed conversions are logged and counted, and a summary is
    logged at the end.

    Files are written as UTF-8 explicitly so that non-ASCII document text does
    not depend on the platform's default encoding.

    Args:
        conv_results: Conversion results to export; consumed exactly once.
        output_dir: Directory the files are written to. It is not created here.
        export_json: Write ``<stem>.json`` from ``render_as_dict()``.
        export_md: Write ``<stem>.md`` from ``render_as_markdown()``.
        export_txt: Write ``<stem>.txt`` from ``render_as_text()``.
        export_doctags: Write ``<stem>.doctags`` from ``render_as_doctags()``.
    """
    # (enabled, file suffix, label used in the log message, renderer),
    # listed in the order the files are written.
    exporters = (
        (export_json, ".json", "JSON", lambda res: json.dumps(res.render_as_dict())),
        (export_txt, ".txt", "Text", lambda res: res.render_as_text()),
        (export_md, ".md", "Markdown", lambda res: res.render_as_markdown()),
        (export_doctags, ".doctags", "Doc Tags", lambda res: res.render_as_doctags()),
    )
    enabled = [entry[1:] for entry in exporters if entry[0]]

    success_count = 0
    failure_count = 0

    for conv_res in conv_results:
        if conv_res.status != ConversionStatus.SUCCESS:
            _log.warning(f"Document {conv_res.input.file} failed to convert.")
            failure_count += 1
            continue

        success_count += 1
        # NOTE: inputs sharing the same stem write to the same output files.
        doc_filename = conv_res.input.file.stem

        for suffix, label, render in enabled:
            fname = output_dir / f"{doc_filename}{suffix}"
            with fname.open("w", encoding="utf-8") as fp:
                _log.info(f"writing {label} output to {fname}")
                fp.write(render(conv_res))

    _log.info(
        f"Processed {success_count + failure_count} docs, of which {failure_count} failed"
    )
107
+
108
+
109
@app.command(no_args_is_help=True)
def convert(
    input_sources: Annotated[
        List[Path],
        typer.Argument(
            ...,
            metavar="source",
            help="PDF files to convert. Directories are also accepted.",
        ),
    ],
    export_json: Annotated[
        bool,
        typer.Option(
            ..., "--json/--no-json", help="If enabled the document is exported as JSON."
        ),
    ] = False,
    export_md: Annotated[
        bool,
        typer.Option(
            ..., "--md/--no-md", help="If enabled the document is exported as Markdown."
        ),
    ] = True,
    export_txt: Annotated[
        bool,
        typer.Option(
            ..., "--txt/--no-txt", help="If enabled the document is exported as Text."
        ),
    ] = False,
    export_doctags: Annotated[
        bool,
        typer.Option(
            ...,
            "--doctags/--no-doctags",
            help="If enabled the document is exported as Doc Tags.",
        ),
    ] = False,
    ocr: Annotated[
        bool,
        typer.Option(
            ..., help="If enabled, the bitmap content will be processed using OCR."
        ),
    ] = True,
    backend: Annotated[
        Backend, typer.Option(..., help="The PDF backend to use.")
    ] = Backend.DOCLING,
    output: Annotated[
        Path, typer.Option(..., help="Output directory where results are saved.")
    ] = Path("."),
    version: Annotated[
        Optional[bool],
        typer.Option(
            "--version",
            callback=version_callback,
            is_eager=True,
            help="Show version information.",
        ),
    ] = None,
):
    """Convert the given PDF sources and export the results to ``output``.

    Directories are searched recursively for files whose name ends in
    ``.pdf``, compared case-insensitively so each file is listed once.
    The selected backend and the ``ocr`` flag determine the pipeline options.

    Args:
        input_sources: Files and/or directories to convert.
        export_json: Export each document as JSON.
        export_md: Export each document as Markdown.
        export_txt: Export each document as plain text.
        export_doctags: Export each document as Doc Tags.
        ocr: Enable OCR in the conversion pipeline.
        backend: PDF backend used to read the documents.
        output: Directory the exported files are written to; created if missing.
        version: Handled entirely by the eager ``version_callback``; unused here.

    Raises:
        typer.Abort: If one of the input sources does not exist.
    """
    logging.basicConfig(level=logging.INFO)

    input_doc_paths: List[Path] = []
    for source in input_sources:
        if not source.exists():
            err_console.print(
                f"[red]Error: The input file {source} does not exist.[/red]"
            )
            raise typer.Abort()
        if source.is_dir():
            # A single case-insensitive pass replaces separate "*.pdf" and
            # "*.PDF" globs: mixed-case extensions are found and no file can
            # be collected twice.
            input_doc_paths.extend(
                path
                for path in source.rglob("*")
                if path.name.lower().endswith(".pdf")
            )
        else:
            input_doc_paths.append(source)

    # Pick the backend class and decide whether table cells are matched
    # against the backend's text cells.
    if backend == Backend.PYPDFIUM2:
        pdf_backend = PyPdfiumDocumentBackend
        # With PyPdfium, cell matching is only enabled together with OCR.
        do_cell_matching = ocr
    else:  # Backend.DOCLING, the only other member
        pdf_backend = DoclingParseDocumentBackend
        do_cell_matching = True

    pipeline_options = PipelineOptions()
    # Honour the --ocr flag for every backend (the PyPdfium + OCR combination
    # used to leave OCR switched off).
    pipeline_options.do_ocr = ocr
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = do_cell_matching

    doc_converter = DocumentConverter(
        pipeline_options=pipeline_options,
        pdf_backend=pdf_backend,
    )

    conv_input = DocumentConversionInput.from_paths(input_doc_paths)

    start_time = time.time()

    conv_results = doc_converter.convert(conv_input)

    output.mkdir(parents=True, exist_ok=True)
    export_documents(
        conv_results,
        output_dir=output,
        export_json=export_json,
        export_md=export_md,
        export_txt=export_txt,
        export_doctags=export_doctags,
    )

    # Measured after the export so that the time spent consuming
    # ``conv_results`` is included.
    elapsed = time.time() - start_time

    _log.info(f"All documents were converted in {elapsed:.2f} seconds.")
255
+
256
+
257
# Run the Typer application when this module is executed as a script.
if __name__ == "__main__":
    app()
@@ -87,7 +87,7 @@ class BoundingBox(BaseModel):
87
87
  return (self.l, self.b, self.r, self.t)
88
88
 
89
89
  @classmethod
90
- def from_tuple(cls, coord: Tuple[float], origin: CoordOrigin):
90
+ def from_tuple(cls, coord: Tuple[float, ...], origin: CoordOrigin):
91
91
  if origin == CoordOrigin.TOPLEFT:
92
92
  l, t, r, b = coord[0], coord[1], coord[2], coord[3]
93
93
  if r < l:
@@ -246,7 +246,7 @@ class EquationPrediction(BaseModel):
246
246
 
247
247
 
248
248
  class PagePredictions(BaseModel):
249
- layout: LayoutPrediction = None
249
+ layout: Optional[LayoutPrediction] = None
250
250
  tablestructure: Optional[TableStructurePrediction] = None
251
251
  figures_classification: Optional[FigureClassificationPrediction] = None
252
252
  equations_prediction: Optional[EquationPrediction] = None
@@ -267,7 +267,7 @@ class Page(BaseModel):
267
267
  page_no: int
268
268
  page_hash: Optional[str] = None
269
269
  size: Optional[PageSize] = None
270
- cells: List[Cell] = None
270
+ cells: List[Cell] = []
271
271
  predictions: PagePredictions = PagePredictions()
272
272
  assembled: Optional[AssembledUnit] = None
273
273
 
@@ -1,12 +1,12 @@
1
1
  from pathlib import Path
2
- from typing import Iterable
2
+ from typing import Callable, Iterable, List
3
3
 
4
4
  from docling.datamodel.base_models import Page, PipelineOptions
5
5
 
6
6
 
7
7
  class BaseModelPipeline:
8
8
  def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
9
- self.model_pipe = []
9
+ self.model_pipe: List[Callable] = []
10
10
  self.artifacts_path = artifacts_path
11
11
  self.pipeline_options = pipeline_options
12
12
 
docling/utils/export.py CHANGED
@@ -1,10 +1,10 @@
1
1
  import logging
2
- from typing import Any, Dict, Iterable, List, Tuple
2
+ from typing import Any, Dict, Iterable, List, Tuple, Union
3
3
 
4
- from docling_core.types.doc.base import BaseCell, Ref, Table, TableCell
4
+ from docling_core.types.doc.base import BaseCell, BaseText, Ref, Table, TableCell
5
5
 
6
6
  from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell
7
- from docling.datamodel.document import ConvertedDocument, Page
7
+ from docling.datamodel.document import ConversionResult, Page
8
8
 
9
9
  _log = logging.getLogger(__name__)
10
10
 
@@ -15,7 +15,10 @@ def _export_table_to_html(table: Table):
15
15
  # to the docling-core package.
16
16
 
17
17
  def _get_tablecell_span(cell: TableCell, ix):
18
- span = set([s[ix] for s in cell.spans])
18
+ if cell.spans is None:
19
+ span = set()
20
+ else:
21
+ span = set([s[ix] for s in cell.spans])
19
22
  if len(span) == 0:
20
23
  return 1, None, None
21
24
  return len(span), min(span), max(span)
@@ -24,6 +27,8 @@ def _export_table_to_html(table: Table):
24
27
  nrows = table.num_rows
25
28
  ncols = table.num_cols
26
29
 
30
+ if table.data is None:
31
+ return ""
27
32
  for i in range(nrows):
28
33
  body += "<tr>"
29
34
  for j in range(ncols):
@@ -66,7 +71,7 @@ def _export_table_to_html(table: Table):
66
71
 
67
72
 
68
73
  def generate_multimodal_pages(
69
- doc_result: ConvertedDocument,
74
+ doc_result: ConversionResult,
70
75
  ) -> Iterable[Tuple[str, str, List[Dict[str, Any]], List[Dict[str, Any]], Page]]:
71
76
 
72
77
  label_to_doclaynet = {
@@ -94,7 +99,7 @@ def generate_multimodal_pages(
94
99
  page_no = 0
95
100
  start_ix = 0
96
101
  end_ix = 0
97
- doc_items = []
102
+ doc_items: List[Tuple[int, Union[BaseCell, BaseText]]] = []
98
103
 
99
104
  doc = doc_result.output
100
105
 
@@ -105,11 +110,11 @@ def generate_multimodal_pages(
105
110
  item_type = item.obj_type
106
111
  label = label_to_doclaynet.get(item_type, None)
107
112
 
108
- if label is None:
113
+ if label is None or item.prov is None or page.size is None:
109
114
  continue
110
115
 
111
116
  bbox = BoundingBox.from_tuple(
112
- item.prov[0].bbox, origin=CoordOrigin.BOTTOMLEFT
117
+ tuple(item.prov[0].bbox), origin=CoordOrigin.BOTTOMLEFT
113
118
  )
114
119
  new_bbox = bbox.to_top_left_origin(page_height=page.size.height).normalized(
115
120
  page_size=page.size
@@ -137,13 +142,15 @@ def generate_multimodal_pages(
137
142
  return segments
138
143
 
139
144
  def _process_page_cells(page: Page):
140
- cells = []
145
+ cells: List[dict] = []
146
+ if page.size is None:
147
+ return cells
141
148
  for cell in page.cells:
142
149
  new_bbox = cell.bbox.to_top_left_origin(
143
150
  page_height=page.size.height
144
151
  ).normalized(page_size=page.size)
145
152
  is_ocr = isinstance(cell, OcrCell)
146
- ocr_confidence = cell.confidence if is_ocr else 1.0
153
+ ocr_confidence = cell.confidence if isinstance(cell, OcrCell) else 1.0
147
154
  cells.append(
148
155
  {
149
156
  "text": cell.text,
@@ -170,6 +177,8 @@ def generate_multimodal_pages(
170
177
 
171
178
  return content_text, content_md, content_dt, page_cells, page_segments, page
172
179
 
180
+ if doc.main_text is None:
181
+ return
173
182
  for ix, orig_item in enumerate(doc.main_text):
174
183
 
175
184
  item = doc._resolve_ref(orig_item) if isinstance(orig_item, Ref) else orig_item
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 1.11.0
3
+ Version: 1.12.1
4
4
  Summary: Docling PDF conversion package
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -19,21 +19,34 @@ Classifier: Programming Language :: Python :: 3.10
19
19
  Classifier: Programming Language :: Python :: 3.11
20
20
  Classifier: Programming Language :: Python :: 3.12
21
21
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
+ Provides-Extra: examples
22
23
  Requires-Dist: certifi (>=2024.7.4)
23
24
  Requires-Dist: deepsearch-glm (>=0.21.0,<0.22.0)
24
- Requires-Dist: docling-core (>=1.2.0,<2.0.0)
25
+ Requires-Dist: docling-core (>=1.3.0,<2.0.0)
25
26
  Requires-Dist: docling-ibm-models (>=1.1.7,<2.0.0)
26
27
  Requires-Dist: docling-parse (>=1.2.0,<2.0.0)
27
28
  Requires-Dist: easyocr (>=1.7,<2.0)
28
29
  Requires-Dist: filetype (>=1.2.0,<2.0.0)
29
30
  Requires-Dist: huggingface_hub (>=0.23,<1)
31
+ Requires-Dist: langchain-huggingface (>=0.0.3,<0.0.4) ; extra == "examples"
32
+ Requires-Dist: langchain-milvus (>=0.1.4,<0.2.0) ; extra == "examples"
33
+ Requires-Dist: langchain-text-splitters (>=0.2.4,<0.3.0) ; extra == "examples"
34
+ Requires-Dist: llama-index-embeddings-huggingface (>=0.3.1,<0.4.0) ; extra == "examples"
35
+ Requires-Dist: llama-index-llms-huggingface-api (>=0.2.0,<0.3.0) ; extra == "examples"
36
+ Requires-Dist: llama-index-vector-stores-milvus (>=0.2.1,<0.3.0) ; extra == "examples"
30
37
  Requires-Dist: pyarrow (>=16.1.0,<17.0.0)
31
38
  Requires-Dist: pydantic (>=2.0.0,<3.0.0)
32
39
  Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
33
40
  Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
41
+ Requires-Dist: python-dotenv (>=1.0.1,<2.0.0) ; extra == "examples"
34
42
  Requires-Dist: requests (>=2.32.3,<3.0.0)
35
43
  Requires-Dist: rtree (>=1.3.0,<2.0.0)
36
44
  Requires-Dist: scipy (>=1.14.1,<2.0.0)
45
+ Requires-Dist: torch (>=2.2.2,<2.3.0) ; sys_platform == "darwin" and platform_machine == "x86_64"
46
+ Requires-Dist: torch (>=2.2.2,<3.0.0) ; sys_platform != "darwin" or platform_machine != "x86_64"
47
+ Requires-Dist: torchvision (>=0,<1) ; sys_platform != "darwin" or platform_machine != "x86_64"
48
+ Requires-Dist: torchvision (>=0.17.2,<0.18.0) ; sys_platform == "darwin" and platform_machine == "x86_64"
49
+ Requires-Dist: typer (>=0.12.5,<0.13.0)
37
50
  Project-URL: Repository, https://github.com/DS4SD/docling
38
51
  Description-Content-Type: text/markdown
39
52
 
@@ -62,8 +75,7 @@ Docling bundles PDF document conversion to JSON and Markdown in an easy, self-co
62
75
  * 📑 Understands detailed page layout, reading order and recovers table structures
63
76
  * 📝 Extracts metadata from the document, such as title, authors, references and language
64
77
  * 🔍 Optionally applies OCR (use with scanned PDFs)
65
-
66
- For RAG, check out [Quackling](https://github.com/DS4SD/quackling) to get the most out of your docs, be it using LlamaIndex, LangChain or your pipeline.
78
+ * 🤖 Integrates easily with LLM app / RAG frameworks like 🦙 LlamaIndex and 🦜🔗 LangChain
67
79
 
68
80
  ## Installation
69
81
 
@@ -182,6 +194,10 @@ results = doc_converter.convert(conv_input)
182
194
 
183
195
  You can limit the CPU threads used by Docling by setting the environment variable `OMP_NUM_THREADS` accordingly. The default setting is using 4 CPU threads.
184
196
 
197
+ ### RAG
198
+ Check out the following examples showcasing RAG using Docling with standard LLM application frameworks:
199
+ - [Basic RAG pipeline with 🦙 LlamaIndex](https://github.com/DS4SD/docling/tree/main/examples/rag_llamaindex.ipynb)
200
+ - [Basic RAG pipeline with 🦜🔗 LangChain](https://github.com/DS4SD/docling/tree/main/examples/rag_langchain.ipynb)
185
201
 
186
202
  ## Technical report
187
203
 
@@ -1,10 +1,12 @@
1
1
  docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
- docling/backend/abstract_backend.py,sha256=xfNNiZKksPPa9KAiA-fHD86flg0It4n_29ccpm8fFiY,1436
4
- docling/backend/docling_parse_backend.py,sha256=r3aJwsWR7qG47ElhOa9iQJJQauHMt950FfCsf6fhlP4,7480
5
- docling/backend/pypdfium2_backend.py,sha256=FggVFitmyMMmLar6vk6XQsavGOPQx95TD14opWYRMAY,8837
3
+ docling/backend/abstract_backend.py,sha256=clJtGxLedpLriEhpx7oyxjmlwMLPorkv-1tdfZm9GdA,1546
4
+ docling/backend/docling_parse_backend.py,sha256=RUWWZbx2cUotZeeTkc-Lbg2k8MVFXFxaDjM4sPfaFZE,7475
5
+ docling/backend/pypdfium2_backend.py,sha256=bIIImVM73wmcVcKMqjl4JF8CD-Qj2W5rZbI4G7clU4s,8877
6
+ docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
+ docling/cli/main.py,sha256=VUzm4vOijPo2F2Ht20zTnMI5alJLixfC5WK2NJCbyng,8492
6
8
  docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
- docling/datamodel/base_models.py,sha256=PSJe_Qlh2VJfijg3kkXOOqZbi_uqRHCmLjX__c5Buck,9155
9
+ docling/datamodel/base_models.py,sha256=tE2Sxoe3e_fBZjq3GDo2NCughDMU5xDeAfkQgT72TRI,9168
8
10
  docling/datamodel/document.py,sha256=oXPitPRd9Gyi7ZU4kfEc4K9eMVtTJDx1T-ellTwF3Ak,15716
9
11
  docling/datamodel/settings.py,sha256=t5g6wrEJnPa9gBzMMl8ppgBRUYz-8xgopEtfMS0ZH28,733
10
12
  docling/document_converter.py,sha256=5OiNafoaVcQhZ8ATF69xRp2KyFyKeSMhmwEFUoCzP-k,10980
@@ -16,13 +18,14 @@ docling/models/layout_model.py,sha256=ZFmaLXlRWUfsT1pJCiYVxhQFrBBsiz6Aw0m9GM3UvV
16
18
  docling/models/page_assemble_model.py,sha256=8eoG2WiFxPxq9TPvM-wkngb2gkr0tdtCRVXg1JcTETo,5550
17
19
  docling/models/table_structure_model.py,sha256=0wOeiRoma6et7FtoJZw2SA3wBd9-R9ivp5uvXBQqeM4,5768
18
20
  docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
19
- docling/pipeline/base_model_pipeline.py,sha256=AC5NTR0xLy5JIZqsTINkKEHeCPqpyvJpuE_bcnZhyvI,529
21
+ docling/pipeline/base_model_pipeline.py,sha256=H5XoADpsJEZls8BI3FnppR2ubltkQwf_er4Qr74rdQ8,561
20
22
  docling/pipeline/standard_model_pipeline.py,sha256=UTjyaEXvz9htYZz-IMTkn11cZwNjgvo_Fl2dfBVnRQs,1442
21
23
  docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
- docling/utils/export.py,sha256=ltPhhruS8sulHTYW0Rtjfc1I9lW3oH6QAF0oYewkz7k,6115
24
+ docling/utils/export.py,sha256=ast5p8YgPBwaDx5ClOF1iSJHO8BFEWE3EBBsUiD9MIQ,6474
23
25
  docling/utils/layout_utils.py,sha256=FOFbL0hKzUoWXdZaeUvEtFqKv0IkPifIr4sdGW4suKs,31804
24
26
  docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
25
- docling-1.11.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
26
- docling-1.11.0.dist-info/METADATA,sha256=lDqzdtE1ohikNmN3eyPQ31Qa30x9F5XN6FUTkTNGU9s,8231
27
- docling-1.11.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
28
- docling-1.11.0.dist-info/RECORD,,
27
+ docling-1.12.1.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
28
+ docling-1.12.1.dist-info/METADATA,sha256=uOBuBvm3hx7K2IS6_iONhO4W-pAywg6kFWgwd106m9k,9544
29
+ docling-1.12.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
30
+ docling-1.12.1.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
31
+ docling-1.12.1.dist-info/RECORD,,
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ docling=docling.cli.main:app
3
+