docling 2.2.0__py3-none-any.whl → 2.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -19,6 +19,7 @@ from docling.datamodel.document import ConversionResult, InputDocument
19
19
  from docling.datamodel.pipeline_options import PipelineOptions
20
20
  from docling.datamodel.settings import settings
21
21
  from docling.models.base_model import BaseEnrichmentModel
22
+ from docling.utils.profiling import ProfilingScope, TimeRecorder
22
23
  from docling.utils.utils import chunkify
23
24
 
24
25
  _log = logging.getLogger(__name__)
@@ -35,13 +36,16 @@ class BasePipeline(ABC):
35
36
 
36
37
  _log.info(f"Processing document {in_doc.file.name}")
37
38
  try:
38
- # These steps are building and assembling the structure of the
39
- # output DoclingDocument
40
- conv_res = self._build_document(in_doc, conv_res)
41
- conv_res = self._assemble_document(in_doc, conv_res)
42
- # From this stage, all operations should rely only on conv_res.output
43
- conv_res = self._enrich_document(in_doc, conv_res)
44
- conv_res.status = self._determine_status(in_doc, conv_res)
39
+ with TimeRecorder(
40
+ conv_res, "pipeline_total", scope=ProfilingScope.DOCUMENT
41
+ ):
42
+ # These steps are building and assembling the structure of the
43
+ # output DoclingDocument
44
+ conv_res = self._build_document(conv_res)
45
+ conv_res = self._assemble_document(conv_res)
46
+ # From this stage, all operations should rely only on conv_res.output
47
+ conv_res = self._enrich_document(conv_res)
48
+ conv_res.status = self._determine_status(conv_res)
45
49
  except Exception as e:
46
50
  conv_res.status = ConversionStatus.FAILURE
47
51
  if raises_on_error:
@@ -50,19 +54,13 @@ class BasePipeline(ABC):
50
54
  return conv_res
51
55
 
52
56
  @abstractmethod
53
- def _build_document(
54
- self, in_doc: InputDocument, conv_res: ConversionResult
55
- ) -> ConversionResult:
57
+ def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
56
58
  pass
57
59
 
58
- def _assemble_document(
59
- self, in_doc: InputDocument, conv_res: ConversionResult
60
- ) -> ConversionResult:
60
+ def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
61
61
  return conv_res
62
62
 
63
- def _enrich_document(
64
- self, in_doc: InputDocument, conv_res: ConversionResult
65
- ) -> ConversionResult:
63
+ def _enrich_document(self, conv_res: ConversionResult) -> ConversionResult:
66
64
 
67
65
  def _filter_elements(
68
66
  doc: DoclingDocument, model: BaseEnrichmentModel
@@ -71,24 +69,23 @@ class BasePipeline(ABC):
71
69
  if model.is_processable(doc=doc, element=element):
72
70
  yield element
73
71
 
74
- for model in self.enrichment_pipe:
75
- for element_batch in chunkify(
76
- _filter_elements(conv_res.document, model),
77
- settings.perf.elements_batch_size,
78
- ):
79
- # TODO: currently we assume the element itself is modified, because
80
- # we don't have an interface to save the element back to the document
81
- for element in model(
82
- doc=conv_res.document, element_batch=element_batch
83
- ): # Must exhaust!
84
- pass
72
+ with TimeRecorder(conv_res, "doc_enrich", scope=ProfilingScope.DOCUMENT):
73
+ for model in self.enrichment_pipe:
74
+ for element_batch in chunkify(
75
+ _filter_elements(conv_res.document, model),
76
+ settings.perf.elements_batch_size,
77
+ ):
78
+ # TODO: currently we assume the element itself is modified, because
79
+ # we don't have an interface to save the element back to the document
80
+ for element in model(
81
+ doc=conv_res.document, element_batch=element_batch
82
+ ): # Must exhaust!
83
+ pass
85
84
 
86
85
  return conv_res
87
86
 
88
87
  @abstractmethod
89
- def _determine_status(
90
- self, in_doc: InputDocument, conv_res: ConversionResult
91
- ) -> ConversionStatus:
88
+ def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
92
89
  pass
93
90
 
94
91
  @classmethod
@@ -110,66 +107,68 @@ class BasePipeline(ABC):
110
107
 
111
108
  class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
112
109
 
113
- def _apply_on_pages(self, page_batch: Iterable[Page]) -> Iterable[Page]:
110
+ def _apply_on_pages(
111
+ self, conv_res: ConversionResult, page_batch: Iterable[Page]
112
+ ) -> Iterable[Page]:
114
113
  for model in self.build_pipe:
115
- page_batch = model(page_batch)
114
+ page_batch = model(conv_res, page_batch)
116
115
 
117
116
  yield from page_batch
118
117
 
119
- def _build_document(
120
- self, in_doc: InputDocument, conv_res: ConversionResult
121
- ) -> ConversionResult:
118
+ def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
122
119
 
123
- if not isinstance(in_doc._backend, PdfDocumentBackend):
120
+ if not isinstance(conv_res.input._backend, PdfDocumentBackend):
124
121
  raise RuntimeError(
125
- f"The selected backend {type(in_doc._backend).__name__} for {in_doc.file} is not a PDF backend. "
122
+ f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a PDF backend. "
126
123
  f"Can not convert this with a PDF pipeline. "
127
124
  f"Please check your format configuration on DocumentConverter."
128
125
  )
129
126
  # conv_res.status = ConversionStatus.FAILURE
130
127
  # return conv_res
131
128
 
132
- for i in range(0, in_doc.page_count):
133
- conv_res.pages.append(Page(page_no=i))
129
+ with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
134
130
 
135
- try:
136
- # Iterate batches of pages (page_batch_size) in the doc
137
- for page_batch in chunkify(conv_res.pages, settings.perf.page_batch_size):
138
- start_pb_time = time.time()
131
+ for i in range(0, conv_res.input.page_count):
132
+ conv_res.pages.append(Page(page_no=i))
139
133
 
140
- # 1. Initialise the page resources
141
- init_pages = map(
142
- functools.partial(self.initialize_page, in_doc), page_batch
143
- )
134
+ try:
135
+ # Iterate batches of pages (page_batch_size) in the doc
136
+ for page_batch in chunkify(
137
+ conv_res.pages, settings.perf.page_batch_size
138
+ ):
139
+ start_pb_time = time.time()
144
140
 
145
- # 2. Run pipeline stages
146
- pipeline_pages = self._apply_on_pages(init_pages)
141
+ # 1. Initialise the page resources
142
+ init_pages = map(
143
+ functools.partial(self.initialize_page, conv_res), page_batch
144
+ )
147
145
 
148
- for p in pipeline_pages: # Must exhaust!
149
- pass
146
+ # 2. Run pipeline stages
147
+ pipeline_pages = self._apply_on_pages(conv_res, init_pages)
150
148
 
151
- end_pb_time = time.time() - start_pb_time
152
- _log.debug(f"Finished converting page batch time={end_pb_time:.3f}")
149
+ for p in pipeline_pages: # Must exhaust!
150
+ pass
153
151
 
154
- except Exception as e:
155
- conv_res.status = ConversionStatus.FAILURE
156
- trace = "\n".join(traceback.format_exception(e))
157
- _log.warning(
158
- f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
159
- f"{trace}"
160
- )
161
- raise e
152
+ end_pb_time = time.time() - start_pb_time
153
+ _log.debug(f"Finished converting page batch time={end_pb_time:.3f}")
154
+
155
+ except Exception as e:
156
+ conv_res.status = ConversionStatus.FAILURE
157
+ trace = "\n".join(traceback.format_exception(e))
158
+ _log.warning(
159
+ f"Encountered an error during conversion of document {conv_res.input.document_hash}:\n"
160
+ f"{trace}"
161
+ )
162
+ raise e
162
163
 
163
- finally:
164
- # Always unload the PDF backend, even in case of failure
165
- if in_doc._backend:
166
- in_doc._backend.unload()
164
+ finally:
165
+ # Always unload the PDF backend, even in case of failure
166
+ if conv_res.input._backend:
167
+ conv_res.input._backend.unload()
167
168
 
168
169
  return conv_res
169
170
 
170
- def _determine_status(
171
- self, in_doc: InputDocument, conv_res: ConversionResult
172
- ) -> ConversionStatus:
171
+ def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
173
172
  status = ConversionStatus.SUCCESS
174
173
  for page in conv_res.pages:
175
174
  if page._backend is None or not page._backend.is_valid():
@@ -186,5 +185,5 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
186
185
 
187
186
  # Initialise and load resources for a page
188
187
  @abstractmethod
189
- def initialize_page(self, doc: InputDocument, page: Page) -> Page:
188
+ def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
190
189
  pass
@@ -5,9 +5,10 @@ from docling.backend.abstract_backend import (
5
5
  DeclarativeDocumentBackend,
6
6
  )
7
7
  from docling.datamodel.base_models import ConversionStatus
8
- from docling.datamodel.document import ConversionResult, InputDocument
8
+ from docling.datamodel.document import ConversionResult
9
9
  from docling.datamodel.pipeline_options import PipelineOptions
10
10
  from docling.pipeline.base_pipeline import BasePipeline
11
+ from docling.utils.profiling import ProfilingScope, TimeRecorder
11
12
 
12
13
  _log = logging.getLogger(__name__)
13
14
 
@@ -22,13 +23,11 @@ class SimplePipeline(BasePipeline):
22
23
  def __init__(self, pipeline_options: PipelineOptions):
23
24
  super().__init__(pipeline_options)
24
25
 
25
- def _build_document(
26
- self, in_doc: InputDocument, conv_res: ConversionResult
27
- ) -> ConversionResult:
26
+ def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
28
27
 
29
- if not isinstance(in_doc._backend, DeclarativeDocumentBackend):
28
+ if not isinstance(conv_res.input._backend, DeclarativeDocumentBackend):
30
29
  raise RuntimeError(
31
- f"The selected backend {type(in_doc._backend).__name__} for {in_doc.file} is not a declarative backend. "
30
+ f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a declarative backend. "
32
31
  f"Can not convert this with simple pipeline. "
33
32
  f"Please check your format configuration on DocumentConverter."
34
33
  )
@@ -38,13 +37,11 @@ class SimplePipeline(BasePipeline):
38
37
  # Instead of running a page-level pipeline to build up the document structure,
39
38
  # the backend is expected to be of type DeclarativeDocumentBackend, which can output
40
39
  # a DoclingDocument straight.
41
-
42
- conv_res.document = in_doc._backend.convert()
40
+ with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
41
+ conv_res.document = conv_res.input._backend.convert()
43
42
  return conv_res
44
43
 
45
- def _determine_status(
46
- self, in_doc: InputDocument, conv_res: ConversionResult
47
- ) -> ConversionStatus:
44
+ def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
48
45
  # This is called only if the previous steps didn't raise.
49
46
  # Since we don't have anything else to evaluate, we can
50
47
  # safely return SUCCESS.
@@ -7,7 +7,7 @@ from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
7
7
  from docling.backend.abstract_backend import AbstractDocumentBackend
8
8
  from docling.backend.pdf_backend import PdfDocumentBackend
9
9
  from docling.datamodel.base_models import AssembledUnit, Page
10
- from docling.datamodel.document import ConversionResult, InputDocument
10
+ from docling.datamodel.document import ConversionResult
11
11
  from docling.datamodel.pipeline_options import (
12
12
  EasyOcrOptions,
13
13
  PdfPipelineOptions,
@@ -27,6 +27,7 @@ from docling.models.table_structure_model import TableStructureModel
27
27
  from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
28
28
  from docling.models.tesseract_ocr_model import TesseractOcrModel
29
29
  from docling.pipeline.base_pipeline import PaginatedPipeline
30
+ from docling.utils.profiling import ProfilingScope, TimeRecorder
30
31
 
31
32
  _log = logging.getLogger(__name__)
32
33
 
@@ -119,73 +120,75 @@ class StandardPdfPipeline(PaginatedPipeline):
119
120
  )
120
121
  return None
121
122
 
122
- def initialize_page(self, doc: InputDocument, page: Page) -> Page:
123
- page._backend = doc._backend.load_page(page.page_no) # type: ignore
124
- if page._backend is not None and page._backend.is_valid():
125
- page.size = page._backend.get_size()
123
+ def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
124
+ with TimeRecorder(conv_res, "page_init"):
125
+ page._backend = conv_res.input._backend.load_page(page.page_no) # type: ignore
126
+ if page._backend is not None and page._backend.is_valid():
127
+ page.size = page._backend.get_size()
126
128
 
127
129
  return page
128
130
 
129
- def _assemble_document(
130
- self, in_doc: InputDocument, conv_res: ConversionResult
131
- ) -> ConversionResult:
131
+ def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
132
132
  all_elements = []
133
133
  all_headers = []
134
134
  all_body = []
135
135
 
136
- for p in conv_res.pages:
137
- if p.assembled is not None:
138
- for el in p.assembled.body:
139
- all_body.append(el)
140
- for el in p.assembled.headers:
141
- all_headers.append(el)
142
- for el in p.assembled.elements:
143
- all_elements.append(el)
144
-
145
- conv_res.assembled = AssembledUnit(
146
- elements=all_elements, headers=all_headers, body=all_body
147
- )
148
-
149
- conv_res.document = self.glm_model(conv_res)
136
+ with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):
137
+ for p in conv_res.pages:
138
+ if p.assembled is not None:
139
+ for el in p.assembled.body:
140
+ all_body.append(el)
141
+ for el in p.assembled.headers:
142
+ all_headers.append(el)
143
+ for el in p.assembled.elements:
144
+ all_elements.append(el)
145
+
146
+ conv_res.assembled = AssembledUnit(
147
+ elements=all_elements, headers=all_headers, body=all_body
148
+ )
150
149
 
151
- # Generate page images in the output
152
- if self.pipeline_options.generate_page_images:
153
- for page in conv_res.pages:
154
- assert page.image is not None
155
- page_no = page.page_no + 1
156
- conv_res.document.pages[page_no].image = ImageRef.from_pil(
157
- page.image, dpi=int(72 * self.pipeline_options.images_scale)
158
- )
150
+ conv_res.document = self.glm_model(conv_res)
159
151
 
160
- # Generate images of the requested element types
161
- if (
162
- self.pipeline_options.generate_picture_images
163
- or self.pipeline_options.generate_table_images
164
- ):
165
- scale = self.pipeline_options.images_scale
166
- for element, _level in conv_res.document.iterate_items():
167
- if not isinstance(element, DocItem) or len(element.prov) == 0:
168
- continue
169
- if (
170
- isinstance(element, PictureItem)
171
- and self.pipeline_options.generate_picture_images
172
- ) or (
173
- isinstance(element, TableItem)
174
- and self.pipeline_options.generate_table_images
175
- ):
176
- page_ix = element.prov[0].page_no - 1
177
- page = conv_res.pages[page_ix]
178
- assert page.size is not None
152
+ # Generate page images in the output
153
+ if self.pipeline_options.generate_page_images:
154
+ for page in conv_res.pages:
179
155
  assert page.image is not None
180
-
181
- crop_bbox = (
182
- element.prov[0]
183
- .bbox.scaled(scale=scale)
184
- .to_top_left_origin(page_height=page.size.height * scale)
156
+ page_no = page.page_no + 1
157
+ conv_res.document.pages[page_no].image = ImageRef.from_pil(
158
+ page.image, dpi=int(72 * self.pipeline_options.images_scale)
185
159
  )
186
160
 
187
- cropped_im = page.image.crop(crop_bbox.as_tuple())
188
- element.image = ImageRef.from_pil(cropped_im, dpi=int(72 * scale))
161
+ # Generate images of the requested element types
162
+ if (
163
+ self.pipeline_options.generate_picture_images
164
+ or self.pipeline_options.generate_table_images
165
+ ):
166
+ scale = self.pipeline_options.images_scale
167
+ for element, _level in conv_res.document.iterate_items():
168
+ if not isinstance(element, DocItem) or len(element.prov) == 0:
169
+ continue
170
+ if (
171
+ isinstance(element, PictureItem)
172
+ and self.pipeline_options.generate_picture_images
173
+ ) or (
174
+ isinstance(element, TableItem)
175
+ and self.pipeline_options.generate_table_images
176
+ ):
177
+ page_ix = element.prov[0].page_no - 1
178
+ page = conv_res.pages[page_ix]
179
+ assert page.size is not None
180
+ assert page.image is not None
181
+
182
+ crop_bbox = (
183
+ element.prov[0]
184
+ .bbox.scaled(scale=scale)
185
+ .to_top_left_origin(page_height=page.size.height * scale)
186
+ )
187
+
188
+ cropped_im = page.image.crop(crop_bbox.as_tuple())
189
+ element.image = ImageRef.from_pil(
190
+ cropped_im, dpi=int(72 * scale)
191
+ )
189
192
 
190
193
  return conv_res
191
194
 
@@ -0,0 +1,62 @@
1
+ import time
2
+ from datetime import datetime
3
+ from enum import Enum
4
+ from typing import TYPE_CHECKING, List
5
+
6
+ import numpy as np
7
+ from pydantic import BaseModel
8
+
9
+ from docling.datamodel.settings import settings
10
+
11
+ if TYPE_CHECKING:
12
+ from docling.datamodel.document import ConversionResult
13
+
14
+
15
+ class ProfilingScope(str, Enum):
16
+ PAGE = "page"
17
+ DOCUMENT = "document"
18
+
19
+
20
+ class ProfilingItem(BaseModel):
21
+ scope: ProfilingScope
22
+ count: int = 0
23
+ times: List[float] = []
24
+ start_timestamps: List[datetime] = []
25
+
26
+ def avg(self) -> float:
27
+ return np.average(self.times) # type: ignore
28
+
29
+ def std(self) -> float:
30
+ return np.std(self.times) # type: ignore
31
+
32
+ def mean(self) -> float:
33
+ return np.mean(self.times) # type: ignore
34
+
35
+ def percentile(self, perc: float) -> float:
36
+ return np.percentile(self.times, perc) # type: ignore
37
+
38
+
39
+ class TimeRecorder:
40
+ def __init__(
41
+ self,
42
+ conv_res: "ConversionResult",
43
+ key: str,
44
+ scope: ProfilingScope = ProfilingScope.PAGE,
45
+ ):
46
+ if settings.debug.profile_pipeline_timings:
47
+ if key not in conv_res.timings.keys():
48
+ conv_res.timings[key] = ProfilingItem(scope=scope)
49
+ self.conv_res = conv_res
50
+ self.key = key
51
+
52
+ def __enter__(self):
53
+ if settings.debug.profile_pipeline_timings:
54
+ self.start = time.monotonic()
55
+ self.conv_res.timings[self.key].start_timestamps.append(datetime.utcnow())
56
+ return self
57
+
58
+ def __exit__(self, *args):
59
+ if settings.debug.profile_pipeline_timings:
60
+ elapsed = time.monotonic() - self.start
61
+ self.conv_res.timings[self.key].times.append(elapsed)
62
+ self.conv_res.timings[self.key].count += 1
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.2.0
3
+ Version: 2.3.0
4
4
  Summary: Docling PDF conversion package
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -23,7 +23,7 @@ Provides-Extra: tesserocr
23
23
  Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
24
24
  Requires-Dist: certifi (>=2024.7.4)
25
25
  Requires-Dist: deepsearch-glm (>=0.26.1,<0.27.0)
26
- Requires-Dist: docling-core (>=2.1.0,<3.0.0)
26
+ Requires-Dist: docling-core (>=2.2.3,<3.0.0)
27
27
  Requires-Dist: docling-ibm-models (>=2.0.1,<3.0.0)
28
28
  Requires-Dist: docling-parse (>=2.0.0,<3.0.0)
29
29
  Requires-Dist: easyocr (>=1.7,<2.0)
@@ -73,8 +73,9 @@ Docling parses documents and exports them to the desired format with ease and sp
73
73
 
74
74
  ## Features
75
75
 
76
- * 🗂️ Multi-format support for input (PDF, DOCX etc.) & output (Markdown, JSON etc.)
77
- * 📑 Advanced PDF document understanding incl. page layout, reading order & table structures
76
+ * 🗂️ Reads popular document formats (PDF, DOCX, PPTX, Images, HTML, AsciiDoc, Markdown) and exports to Markdown and JSON
77
+ * 📑 Advanced PDF document understanding including page layout, reading order & table structures
78
+ * 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
78
79
  * 📝 Metadata extraction, including title, authors, references & language
79
80
  * 🤖 Seamless LlamaIndex 🦙 & LangChain 🦜🔗 integration for powerful RAG / QA applications
80
81
  * 🔍 OCR support for scanned PDFs
@@ -0,0 +1,45 @@
1
+ docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ docling/backend/abstract_backend.py,sha256=-or6kWVV7egQeyIuN-vI0Tr7Q1htalBZSlhgq_G2RdU,1678
4
+ docling/backend/asciidoc_backend.py,sha256=kXZxOLk_LvLFVZwnJVVwjmvc3QWZ0iiG7VnwjgtC3hI,14051
5
+ docling/backend/docling_parse_backend.py,sha256=TaIMli9vePd3fz9L6S4t75JPYZDpgYBLRGfWjbc9Hbk,7632
6
+ docling/backend/docling_parse_v2_backend.py,sha256=QlVU8NgqKvVCa99E8oDa2Xvy__kq30C-myGY3o9Qoq4,8588
7
+ docling/backend/html_backend.py,sha256=p3WlYta1f3e4osmvVR12KIUYLJimveTX8UwEkyPt7_g,15161
8
+ docling/backend/md_backend.py,sha256=tmuSCghjor9PqKIiVieCuZ4_t5JEjZMy3cq7u3yTgyU,14032
9
+ docling/backend/mspowerpoint_backend.py,sha256=J472AIH_IXvGg3D0FDmXhue1At_VSBD6n15c64Kxttw,15446
10
+ docling/backend/msword_backend.py,sha256=FAUdP74QxGKo2xMZQ4WQGYwtpIBCTJ_FG17PBpRwhxI,17230
11
+ docling/backend/pdf_backend.py,sha256=unnw7QiRE1VXg6Pj-eYrtnFGrp5SSYiI324OlFxyv6c,2050
12
+ docling/backend/pypdfium2_backend.py,sha256=MJX6fQqwK3r967fyAAs-RA_YIkeQvhgsLkQAgaBTgaE,8995
13
+ docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
14
+ docling/cli/main.py,sha256=NRVGz0z-3EBwYNMJGVnLtDBcfOeutaUyYdkM0ymRnGA,8008
15
+ docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
+ docling/datamodel/base_models.py,sha256=fmkS6iTxGZCTtNCo2zsgMmBC11Ogf2Ht-mNIlZ9GP-o,5375
17
+ docling/datamodel/document.py,sha256=9dQf_J18X_MEWs-Mg3Ed6BykFPJ79ETmkkxcssY-vYo,20698
18
+ docling/datamodel/pipeline_options.py,sha256=WNjluKC-Ww63ifkGMHwws8zIDHnOS1z5Hw7_j3S0qao,2446
19
+ docling/datamodel/settings.py,sha256=2-sYEnKLV_giGygUlBtiBd4CJYN5T9-3BdL6NpWkUYw,1155
20
+ docling/document_converter.py,sha256=Y0Tngh-seNSty7Ov71DDAJzbBgruoEdwYPunVn7DT00,10413
21
+ docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
+ docling/models/base_model.py,sha256=Yq_-FmUhqhE20vXYG3WiQXDRTIPjik1CyuEZ8iYTGAY,701
23
+ docling/models/base_ocr_model.py,sha256=Ti0glL-_DVRfmP3MpywYVmkNf5RP6qhRg_UKzJuV1Dc,5663
24
+ docling/models/ds_glm_model.py,sha256=2OpWW8MMzCIshrtP36gDSRPYOCjv1ex34FqxD2nYjP4,11986
25
+ docling/models/easyocr_model.py,sha256=23hWq484qVS3nkch6nRRWowfQamN-McFZgfbHfp5Vuo,3818
26
+ docling/models/layout_model.py,sha256=ZvbTSyxvXB5yLHNEti0Wv3trz0vwGuHySI5TCdApb0U,14011
27
+ docling/models/page_assemble_model.py,sha256=kSGNiRKhmzkpFH7xCiT3rulMsgJmUXFa6Th_eB-cLEk,7103
28
+ docling/models/page_preprocessing_model.py,sha256=1gVrZjObKxAvXkkKvXnIFApPOggzgiTFPtt1CGbMbSs,2763
29
+ docling/models/table_structure_model.py,sha256=-ANSQpiN2avt3B9sbi7dHcoULUJbMBalAR5xxlrM7To,8421
30
+ docling/models/tesseract_ocr_cli_model.py,sha256=ZflwQcD7YjhPqEB8bbgNgP14OBD4NNEJefUS8Lbr5X0,6511
31
+ docling/models/tesseract_ocr_model.py,sha256=AccCgaYNzGryiJnkwR4sv2FeOdlSgO3uspdQOmo1sNY,5569
32
+ docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
33
+ docling/pipeline/base_pipeline.py,sha256=IF1XWYgUGbdB4-teLkmM4Hvg_UNEfPrGuhExMRTUsk8,7168
34
+ docling/pipeline/simple_pipeline.py,sha256=mZqANqUtAOFAyqQEShErQnAUz6tJFOl6zVoazEDJ_wE,2254
35
+ docling/pipeline/standard_pdf_pipeline.py,sha256=h59eA0CLMYuuJoH-0SyCRkYEregNs6i0pa46Ioqf8kU,7947
36
+ docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
37
+ docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
38
+ docling/utils/layout_utils.py,sha256=vlN0rc8i0ayRGn3WnaG-pdmqEL00KKGl2zez3Gj-hrk,32074
39
+ docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
40
+ docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
41
+ docling-2.3.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
42
+ docling-2.3.0.dist-info/METADATA,sha256=e3LTQgbktuUHzQlI4qXDhIDMGOX0duC1EJWws6j6_y8,6373
43
+ docling-2.3.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
44
+ docling-2.3.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
45
+ docling-2.3.0.dist-info/RECORD,,
@@ -1,44 +0,0 @@
1
- docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
- docling/backend/abstract_backend.py,sha256=-or6kWVV7egQeyIuN-vI0Tr7Q1htalBZSlhgq_G2RdU,1678
4
- docling/backend/asciidoc_backend.py,sha256=WW0eIanPIObcg5ci9YcnqFxwipmqRFsRY8zjZDdKvJA,14116
5
- docling/backend/docling_parse_backend.py,sha256=TaIMli9vePd3fz9L6S4t75JPYZDpgYBLRGfWjbc9Hbk,7632
6
- docling/backend/docling_parse_v2_backend.py,sha256=QlVU8NgqKvVCa99E8oDa2Xvy__kq30C-myGY3o9Qoq4,8588
7
- docling/backend/html_backend.py,sha256=wfh5PWEwoqsCXxFCQbFBdJvEtlqZhXgqfPfTYETWHfE,14974
8
- docling/backend/md_backend.py,sha256=osYiNLnep9UgLq8mUH9bmwG3kP9RXxt69I8LlyeJN6g,11505
9
- docling/backend/mspowerpoint_backend.py,sha256=J472AIH_IXvGg3D0FDmXhue1At_VSBD6n15c64Kxttw,15446
10
- docling/backend/msword_backend.py,sha256=6bY0ebOaeSbpskUJY5t5pOf4a2VclWzeHeSo-vzsaO0,17470
11
- docling/backend/pdf_backend.py,sha256=unnw7QiRE1VXg6Pj-eYrtnFGrp5SSYiI324OlFxyv6c,2050
12
- docling/backend/pypdfium2_backend.py,sha256=MJX6fQqwK3r967fyAAs-RA_YIkeQvhgsLkQAgaBTgaE,8995
13
- docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
14
- docling/cli/main.py,sha256=NRVGz0z-3EBwYNMJGVnLtDBcfOeutaUyYdkM0ymRnGA,8008
15
- docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
- docling/datamodel/base_models.py,sha256=Mx0xR6YmRP8thu8CjOxjbGHLUJctqIvFwRZQ-8tQowY,5380
17
- docling/datamodel/document.py,sha256=mkPXDms9jtPFY1pfBSicNaVRZwbbfzYFUj0dJDbMgG8,20612
18
- docling/datamodel/pipeline_options.py,sha256=WNjluKC-Ww63ifkGMHwws8zIDHnOS1z5Hw7_j3S0qao,2446
19
- docling/datamodel/settings.py,sha256=KBFVeQviR1hoCFjA1ZwuLuQ6EAAYR7saIa6EUYiOkHI,767
20
- docling/document_converter.py,sha256=T-Y2pWwbCIofW209XJ3wlc5TiGeQqMbDqgzcVWyZ_0Y,10227
21
- docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
- docling/models/base_model.py,sha256=wSBGAIAbLqrqP_SMtkzXMuyFvvzjVU6iCqgSNnGIR4Y,603
23
- docling/models/base_ocr_model.py,sha256=SYelQRValiUo6M_p_9-J7CqNIOFO-EkK58j90SMsKQY,5028
24
- docling/models/ds_glm_model.py,sha256=vJLngchZonqFzGWbUr2izFSXk9DloPDhAfN2c3nkzNU,11254
25
- docling/models/easyocr_model.py,sha256=YfvdodjZ20WuOfouQXJmDyQL78QDOqWYsWSs2zSxWFc,3327
26
- docling/models/layout_model.py,sha256=zd2ULW3U6v9OJl4TnjWFEY6Q2O-lBfrIqtvrnDzF7HU,12596
27
- docling/models/page_assemble_model.py,sha256=LOKHho-r-RpeIVh8CpJ9tid_QIp5um3ukcrucZsyUlY,6645
28
- docling/models/page_preprocessing_model.py,sha256=cfhUIlGAGaX1RxILi69ZEV9Kmhhd3Y0XaSlQnGo18o4,1964
29
- docling/models/table_structure_model.py,sha256=YWSZKOz56gvicjTzVgSE-8Z_hI3NcRD5EN0yOUoM-_g,6979
30
- docling/models/tesseract_ocr_cli_model.py,sha256=fKc05V73ibMvAeuA4PForhYNtunpT5rR0k_xHZsew-E,5980
31
- docling/models/tesseract_ocr_model.py,sha256=v6td0vq8NogePuRTJRZhKF0DtZXITj70r9rKJKO5u9k,4984
32
- docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
33
- docling/pipeline/base_pipeline.py,sha256=7DTzVvM_jVHCxyY-BuuGRhmUsD_sgX4DD00oBFJWdB8,6723
34
- docling/pipeline/simple_pipeline.py,sha256=pxce0-3He5Lqa-xXT-7h173XVOSMZiMHl6HOfAJmQ7o,2162
35
- docling/pipeline/standard_pdf_pipeline.py,sha256=AVNSxGc6kPmBPDLWDc9eI8fryc25eOtiIVrOyVhZMZM,7527
36
- docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
37
- docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
38
- docling/utils/layout_utils.py,sha256=vlN0rc8i0ayRGn3WnaG-pdmqEL00KKGl2zez3Gj-hrk,32074
39
- docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
40
- docling-2.2.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
41
- docling-2.2.0.dist-info/METADATA,sha256=TkaywA2l2ImdMc9WpUYWUQy3n50zG9Y9eC7ziElBlU0,6205
42
- docling-2.2.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
43
- docling-2.2.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
44
- docling-2.2.0.dist-info/RECORD,,