docling 2.14.0__py3-none-any.whl → 2.15.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -132,7 +132,7 @@ class DoclingParsePageBackend(PdfPageBackend):
132
132
  return cells
133
133
 
134
134
  def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
135
- AREA_THRESHOLD = 32 * 32
135
+ AREA_THRESHOLD = 0 # 32 * 32
136
136
 
137
137
  for i in range(len(self._dpage["images"])):
138
138
  bitmap = self._dpage["images"][i]
@@ -140,7 +140,7 @@ class DoclingParseV2PageBackend(PdfPageBackend):
140
140
  return cells
141
141
 
142
142
  def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
143
- AREA_THRESHOLD = 32 * 32
143
+ AREA_THRESHOLD = 0 # 32 * 32
144
144
 
145
145
  images = self._dpage["sanitized"]["images"]["data"]
146
146
  images_header = self._dpage["sanitized"]["images"]["header"]
@@ -37,10 +37,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
37
37
 
38
38
  try:
39
39
  if isinstance(self.path_or_stream, BytesIO):
40
- text_stream = self.path_or_stream.getvalue().decode("utf-8")
40
+ text_stream = self.path_or_stream.getvalue()
41
41
  self.soup = BeautifulSoup(text_stream, "html.parser")
42
42
  if isinstance(self.path_or_stream, Path):
43
- with open(self.path_or_stream, "r", encoding="utf-8") as f:
43
+ with open(self.path_or_stream, "rb") as f:
44
44
  html_content = f.read()
45
45
  self.soup = BeautifulSoup(html_content, "html.parser")
46
46
  except Exception as e:
@@ -16,7 +16,7 @@ from docling_core.types.doc import (
16
16
  TableCell,
17
17
  TableData,
18
18
  )
19
- from PIL import Image
19
+ from PIL import Image, UnidentifiedImageError
20
20
  from pptx import Presentation
21
21
  from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
22
22
 
@@ -120,6 +120,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
120
120
  bullet_type = "None"
121
121
  list_text = ""
122
122
  list_label = GroupLabel.LIST
123
+ doc_label = DocItemLabel.LIST_ITEM
123
124
  prov = self.generate_prov(shape, slide_ind, shape.text.strip())
124
125
 
125
126
  # Identify if shape contains lists
@@ -276,16 +277,19 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
276
277
  im_dpi, _ = image.dpi
277
278
 
278
279
  # Open it with PIL
279
- pil_image = Image.open(BytesIO(image_bytes))
280
-
281
- # shape has picture
282
- prov = self.generate_prov(shape, slide_ind, "")
283
- doc.add_picture(
284
- parent=parent_slide,
285
- image=ImageRef.from_pil(image=pil_image, dpi=im_dpi),
286
- caption=None,
287
- prov=prov,
288
- )
280
+ try:
281
+ pil_image = Image.open(BytesIO(image_bytes))
282
+
283
+ # shape has picture
284
+ prov = self.generate_prov(shape, slide_ind, "")
285
+ doc.add_picture(
286
+ parent=parent_slide,
287
+ image=ImageRef.from_pil(image=pil_image, dpi=im_dpi),
288
+ caption=None,
289
+ prov=prov,
290
+ )
291
+ except (UnidentifiedImageError, OSError) as e:
292
+ _log.warning(f"Warning: image cannot be loaded by Pillow: {e}")
289
293
  return
290
294
 
291
295
  def handle_tables(self, shape, parent_slide, slide_ind, doc):
@@ -39,7 +39,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
39
39
  return self.valid
40
40
 
41
41
  def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
42
- AREA_THRESHOLD = 32 * 32
42
+ AREA_THRESHOLD = 0 # 32 * 32
43
43
  for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
44
44
  pos = obj.get_pos()
45
45
  cropbox = BoundingBox.from_tuple(
docling/cli/main.py CHANGED
@@ -164,6 +164,11 @@ def convert(
164
164
  to_formats: List[OutputFormat] = typer.Option(
165
165
  None, "--to", help="Specify output formats. Defaults to Markdown."
166
166
  ),
167
+ headers: str = typer.Option(
168
+ None,
169
+ "--headers",
170
+ help="Specify http request headers used when fetching url input sources in the form of a JSON string",
171
+ ),
167
172
  image_export_mode: Annotated[
168
173
  ImageRefMode,
169
174
  typer.Option(
@@ -279,12 +284,19 @@ def convert(
279
284
  if from_formats is None:
280
285
  from_formats = [e for e in InputFormat]
281
286
 
287
+ parsed_headers: Optional[Dict[str, str]] = None
288
+ if headers is not None:
289
+ headers_t = TypeAdapter(Dict[str, str])
290
+ parsed_headers = headers_t.validate_json(headers)
291
+
282
292
  with tempfile.TemporaryDirectory() as tempdir:
283
293
  input_doc_paths: List[Path] = []
284
294
  for src in input_sources:
285
295
  try:
286
296
  # check if we can fetch some remote url
287
- source = resolve_source_to_path(source=src, workdir=Path(tempdir))
297
+ source = resolve_source_to_path(
298
+ source=src, headers=parsed_headers, workdir=Path(tempdir)
299
+ )
288
300
  input_doc_paths.append(source)
289
301
  except FileNotFoundError:
290
302
  err_console.print(
@@ -390,7 +402,7 @@ def convert(
390
402
  start_time = time.time()
391
403
 
392
404
  conv_results = doc_converter.convert_all(
393
- input_doc_paths, raises_on_error=abort_on_error
405
+ input_doc_paths, headers=parsed_headers, raises_on_error=abort_on_error
394
406
  )
395
407
 
396
408
  output.mkdir(parents=True, exist_ok=True)
@@ -227,13 +227,18 @@ class _DummyBackend(AbstractDocumentBackend):
227
227
  class _DocumentConversionInput(BaseModel):
228
228
 
229
229
  path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
230
+ headers: Optional[Dict[str, str]] = None
230
231
  limits: Optional[DocumentLimits] = DocumentLimits()
231
232
 
232
233
  def docs(
233
234
  self, format_options: Dict[InputFormat, "FormatOption"]
234
235
  ) -> Iterable[InputDocument]:
235
236
  for item in self.path_or_stream_iterator:
236
- obj = resolve_source_to_stream(item) if isinstance(item, str) else item
237
+ obj = (
238
+ resolve_source_to_stream(item, self.headers)
239
+ if isinstance(item, str)
240
+ else item
241
+ )
237
242
  format = self._guess_format(obj)
238
243
  backend: Type[AbstractDocumentBackend]
239
244
  if format not in format_options.keys():
@@ -139,7 +139,7 @@ class EasyOcrOptions(OcrOptions):
139
139
 
140
140
  use_gpu: Optional[bool] = None
141
141
 
142
- confidence_threshold: float = 0.65
142
+ confidence_threshold: float = 0.5
143
143
 
144
144
  model_storage_directory: Optional[str] = None
145
145
  recog_network: Optional[str] = "standard"
@@ -176,6 +176,7 @@ class DocumentConverter:
176
176
  def convert(
177
177
  self,
178
178
  source: Union[Path, str, DocumentStream], # TODO review naming
179
+ headers: Optional[Dict[str, str]] = None,
179
180
  raises_on_error: bool = True,
180
181
  max_num_pages: int = sys.maxsize,
181
182
  max_file_size: int = sys.maxsize,
@@ -185,6 +186,7 @@ class DocumentConverter:
185
186
  raises_on_error=raises_on_error,
186
187
  max_num_pages=max_num_pages,
187
188
  max_file_size=max_file_size,
189
+ headers=headers,
188
190
  )
189
191
  return next(all_res)
190
192
 
@@ -192,6 +194,7 @@ class DocumentConverter:
192
194
  def convert_all(
193
195
  self,
194
196
  source: Iterable[Union[Path, str, DocumentStream]], # TODO review naming
197
+ headers: Optional[Dict[str, str]] = None,
195
198
  raises_on_error: bool = True, # True: raises on first conversion error; False: does not raise on conv error
196
199
  max_num_pages: int = sys.maxsize,
197
200
  max_file_size: int = sys.maxsize,
@@ -201,8 +204,7 @@ class DocumentConverter:
201
204
  max_file_size=max_file_size,
202
205
  )
203
206
  conv_input = _DocumentConversionInput(
204
- path_or_stream_iterator=source,
205
- limits=limits,
207
+ path_or_stream_iterator=source, limits=limits, headers=headers
206
208
  )
207
209
  conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)
208
210
 
@@ -8,7 +8,7 @@ import numpy as np
8
8
  from docling_core.types.doc import BoundingBox, CoordOrigin
9
9
  from PIL import Image, ImageDraw
10
10
  from rtree import index
11
- from scipy.ndimage import find_objects, label
11
+ from scipy.ndimage import binary_dilation, find_objects, label
12
12
 
13
13
  from docling.datamodel.base_models import Cell, OcrCell, Page
14
14
  from docling.datamodel.document import ConversionResult
@@ -43,6 +43,12 @@ class BaseOcrModel(BasePageModel):
43
43
 
44
44
  np_image = np.array(image)
45
45
 
46
+ # Dilate the image by 10 pixels to merge nearby bitmap rectangles
47
+ structure = np.ones(
48
+ (20, 20)
49
+ ) # Create a 20x20 structure element (10 pixels in all directions)
50
+ np_image = binary_dilation(np_image > 0, structure=structure)
51
+
46
52
  # Find the connected components
47
53
  labeled_image, num_features = label(
48
54
  np_image > 0
@@ -72,7 +78,7 @@ class BaseOcrModel(BasePageModel):
72
78
  bitmap_rects = []
73
79
  coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects)
74
80
 
75
- # return full-page rectangle if sufficiently covered with bitmaps
81
+ # return full-page rectangle if page is dominantly covered with bitmaps
76
82
  if self.options.force_full_page_ocr or coverage > max(
77
83
  BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold
78
84
  ):
@@ -85,17 +91,11 @@ class BaseOcrModel(BasePageModel):
85
91
  coord_origin=CoordOrigin.TOPLEFT,
86
92
  )
87
93
  ]
88
- # return individual rectangles if the bitmap coverage is smaller
89
- else: # coverage <= BITMAP_COVERAGE_TRESHOLD:
90
-
91
- # skip OCR if the bitmap area on the page is smaller than the options threshold
92
- ocr_rects = [
93
- rect
94
- for rect in ocr_rects
95
- if rect.area() / (page.size.width * page.size.height)
96
- > self.options.bitmap_area_threshold
97
- ]
94
+ # return individual rectangles if the bitmap coverage is above the threshold
95
+ elif coverage > self.options.bitmap_area_threshold:
98
96
  return ocr_rects
97
+ else: # overall coverage of bitmaps is too low, drop all bitmap rectangles.
98
+ return []
99
99
 
100
100
  # Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
101
101
  def _filter_ocr_cells(self, ocr_cells, programmatic_cells):
@@ -138,18 +138,34 @@ class BaseOcrModel(BasePageModel):
138
138
 
139
139
  def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
140
140
  image = copy.deepcopy(page.image)
141
+ scale_x = image.width / page.size.width
142
+ scale_y = image.height / page.size.height
143
+
141
144
  draw = ImageDraw.Draw(image, "RGBA")
142
145
 
143
146
  # Draw OCR rectangles as yellow filled rect
144
147
  for rect in ocr_rects:
145
148
  x0, y0, x1, y1 = rect.as_tuple()
149
+ y0 *= scale_x
150
+ y1 *= scale_y
151
+ x0 *= scale_x
152
+ x1 *= scale_x
153
+
146
154
  shade_color = (255, 255, 0, 40) # transparent yellow
147
155
  draw.rectangle([(x0, y0), (x1, y1)], fill=shade_color, outline=None)
148
156
 
149
157
  # Draw OCR and programmatic cells
150
158
  for tc in page.cells:
151
159
  x0, y0, x1, y1 = tc.bbox.as_tuple()
152
- color = "red"
160
+ y0 *= scale_x
161
+ y1 *= scale_y
162
+ x0 *= scale_x
163
+ x1 *= scale_x
164
+
165
+ if y1 <= y0:
166
+ y1, y0 = y0, y1
167
+
168
+ color = "gray"
153
169
  if isinstance(tc, OcrCell):
154
170
  color = "magenta"
155
171
  draw.rectangle([(x0, y0), (x1, y1)], outline=color)
@@ -67,29 +67,9 @@ class LayoutModel(BasePageModel):
67
67
  - Right: Clusters including FORM, KEY_VALUE_REGION, and PICTURE.
68
68
  Includes label names and confidence scores for each cluster.
69
69
  """
70
- label_to_color = {
71
- DocItemLabel.TEXT: (255, 255, 153), # Light Yellow
72
- DocItemLabel.CAPTION: (255, 204, 153), # Light Orange
73
- DocItemLabel.LIST_ITEM: (153, 153, 255), # Light Purple
74
- DocItemLabel.FORMULA: (192, 192, 192), # Gray
75
- DocItemLabel.TABLE: (255, 204, 204), # Light Pink
76
- DocItemLabel.PICTURE: (255, 204, 164), # Light Beige
77
- DocItemLabel.SECTION_HEADER: (255, 153, 153), # Light Red
78
- DocItemLabel.PAGE_HEADER: (204, 255, 204), # Light Green
79
- DocItemLabel.PAGE_FOOTER: (
80
- 204,
81
- 255,
82
- 204,
83
- ), # Light Green (same as Page-Header)
84
- DocItemLabel.TITLE: (255, 153, 153), # Light Red (same as Section-Header)
85
- DocItemLabel.FOOTNOTE: (200, 200, 255), # Light Blue
86
- DocItemLabel.DOCUMENT_INDEX: (220, 220, 220), # Light Gray
87
- DocItemLabel.CODE: (125, 125, 125), # Gray
88
- DocItemLabel.CHECKBOX_SELECTED: (255, 182, 193), # Pale Green
89
- DocItemLabel.CHECKBOX_UNSELECTED: (255, 182, 193), # Light Pink
90
- DocItemLabel.FORM: (200, 255, 255), # Light Cyan
91
- DocItemLabel.KEY_VALUE_REGION: (183, 65, 14), # Rusty orange
92
- }
70
+ scale_x = page.image.width / page.size.width
71
+ scale_y = page.image.height / page.size.height
72
+
93
73
  # Filter clusters for left and right images
94
74
  exclude_labels = {
95
75
  DocItemLabel.FORM,
@@ -118,6 +98,11 @@ class LayoutModel(BasePageModel):
118
98
  cell_color = (0, 0, 0, 40) # Transparent black for cells
119
99
  for tc in c.cells:
120
100
  cx0, cy0, cx1, cy1 = tc.bbox.as_tuple()
101
+ cx0 *= scale_x
102
+ cx1 *= scale_x
103
+ cy0 *= scale_x
104
+ cy1 *= scale_y
105
+
121
106
  draw.rectangle(
122
107
  [(cx0, cy0), (cx1, cy1)],
123
108
  outline=None,
@@ -125,8 +110,16 @@ class LayoutModel(BasePageModel):
125
110
  )
126
111
  # Draw cluster rectangle
127
112
  x0, y0, x1, y1 = c.bbox.as_tuple()
128
- cluster_fill_color = (*list(label_to_color.get(c.label)), 70)
129
- cluster_outline_color = (*list(label_to_color.get(c.label)), 255)
113
+ x0 *= scale_x
114
+ x1 *= scale_x
115
+ y0 *= scale_x
116
+ y1 *= scale_y
117
+
118
+ cluster_fill_color = (*list(DocItemLabel.get_color(c.label)), 70)
119
+ cluster_outline_color = (
120
+ *list(DocItemLabel.get_color(c.label)),
121
+ 255,
122
+ )
130
123
  draw.rectangle(
131
124
  [(x0, y0), (x1, y1)],
132
125
  outline=cluster_outline_color,
@@ -66,23 +66,43 @@ class TableStructureModel(BasePageModel):
66
66
  show: bool = False,
67
67
  ):
68
68
  assert page._backend is not None
69
+ assert page.size is not None
69
70
 
70
71
  image = (
71
72
  page._backend.get_page_image()
72
73
  ) # make new image to avoid drawing on the saved ones
74
+
75
+ scale_x = image.width / page.size.width
76
+ scale_y = image.height / page.size.height
77
+
73
78
  draw = ImageDraw.Draw(image)
74
79
 
75
80
  for table_element in tbl_list:
76
81
  x0, y0, x1, y1 = table_element.cluster.bbox.as_tuple()
82
+ y0 *= scale_x
83
+ y1 *= scale_y
84
+ x0 *= scale_x
85
+ x1 *= scale_x
86
+
77
87
  draw.rectangle([(x0, y0), (x1, y1)], outline="red")
78
88
 
79
89
  for cell in table_element.cluster.cells:
80
90
  x0, y0, x1, y1 = cell.bbox.as_tuple()
91
+ x0 *= scale_x
92
+ x1 *= scale_x
93
+ y0 *= scale_x
94
+ y1 *= scale_y
95
+
81
96
  draw.rectangle([(x0, y0), (x1, y1)], outline="green")
82
97
 
83
98
  for tc in table_element.table_cells:
84
99
  if tc.bbox is not None:
85
100
  x0, y0, x1, y1 = tc.bbox.as_tuple()
101
+ x0 *= scale_x
102
+ x1 *= scale_x
103
+ y0 *= scale_x
104
+ y1 *= scale_y
105
+
86
106
  if tc.column_header:
87
107
  width = 3
88
108
  else:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.14.0
3
+ Version: 2.15.1
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -26,7 +26,7 @@ Provides-Extra: tesserocr
26
26
  Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
27
27
  Requires-Dist: certifi (>=2024.7.4)
28
28
  Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
29
- Requires-Dist: docling-core[chunking] (>=2.12.1,<3.0.0)
29
+ Requires-Dist: docling-core[chunking] (>=2.13.1,<3.0.0)
30
30
  Requires-Dist: docling-ibm-models (>=3.1.0,<4.0.0)
31
31
  Requires-Dist: docling-parse (>=3.0.0,<4.0.0)
32
32
  Requires-Dist: easyocr (>=1.7,<2.0)
@@ -45,7 +45,7 @@ Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
45
45
  Requires-Dist: python-docx (>=1.1.2,<2.0.0)
46
46
  Requires-Dist: python-pptx (>=1.0.2,<2.0.0)
47
47
  Requires-Dist: rapidocr-onnxruntime (>=1.4.0,<2.0.0) ; (python_version < "3.13") and (extra == "rapidocr")
48
- Requires-Dist: requests (>=2.32.3,<3.0.0)
48
+ Requires-Dist: requests (>=2.32.2,<3.0.0)
49
49
  Requires-Dist: rtree (>=1.3.0,<2.0.0)
50
50
  Requires-Dist: scipy (>=1.6.0,<2.0.0)
51
51
  Requires-Dist: tesserocr (>=2.7.1,<3.0.0) ; extra == "tesserocr"
@@ -84,7 +84,7 @@ Docling parses documents and exports them to the desired format with ease and sp
84
84
  * 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to HTML, Markdown and JSON (with embedded and referenced images)
85
85
  * 📑 Advanced PDF document understanding including page layout, reading order & table structures
86
86
  * 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
87
- * 🤖 Easy integration with 🦙 LlamaIndex & 🦜🔗 LangChain for powerful RAG / QA applications
87
+ * 🤖 Plug-and-play [integrations](https://ds4sd.github.io/docling/integrations/) incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
88
88
  * 🔍 OCR support for scanned PDFs
89
89
  * 💻 Simple and convenient CLI
90
90
 
@@ -94,7 +94,6 @@ Explore the [documentation](https://ds4sd.github.io/docling/) to discover plenty
94
94
 
95
95
  * ♾️ Equation & code extraction
96
96
  * 📝 Metadata extraction, including title, authors, references & language
97
- * 🦜🔗 Native LangChain extension
98
97
 
99
98
  ## Installation
100
99
 
@@ -2,39 +2,39 @@ docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  docling/backend/abstract_backend.py,sha256=-or6kWVV7egQeyIuN-vI0Tr7Q1htalBZSlhgq_G2RdU,1678
4
4
  docling/backend/asciidoc_backend.py,sha256=kXZxOLk_LvLFVZwnJVVwjmvc3QWZ0iiG7VnwjgtC3hI,14051
5
- docling/backend/docling_parse_backend.py,sha256=_jY5f5-KGI3hi5pcZAY6e7tPLocSi5JUWrxraDVszqI,7631
6
- docling/backend/docling_parse_v2_backend.py,sha256=1TDUdMIp3fEjCWBNjusUHiCUmH1g6yZQ-b13scofP0Y,8637
7
- docling/backend/html_backend.py,sha256=qbu1W8xoTGnXMuZPRPLq68hDbCEj6ygnpxP5gYaodAQ,15593
5
+ docling/backend/docling_parse_backend.py,sha256=cJLkuOmfCtshRrwsv7WWayRNeMQASZv76v3nUHucqgM,7636
6
+ docling/backend/docling_parse_v2_backend.py,sha256=-lLsorxhK_Awrql_zXPen2LX0Gt9UvcDLMcmXf7_LKc,8642
7
+ docling/backend/html_backend.py,sha256=O8qXaw7MzOIdaxbBcjHieM9Ce4GEdtBj9YW0vpJspuA,15560
8
8
  docling/backend/md_backend.py,sha256=tmuSCghjor9PqKIiVieCuZ4_t5JEjZMy3cq7u3yTgyU,14032
9
9
  docling/backend/msexcel_backend.py,sha256=23qUEScqr5GhY06xiqg-eBQ_JlAqO0FkPEmX6554sVA,12040
10
- docling/backend/mspowerpoint_backend.py,sha256=QD0NaatTO8U9CIFoiipkq3X5HxLZaaahH8nlrQ6ecDA,15710
10
+ docling/backend/mspowerpoint_backend.py,sha256=kOGawhcn0BFq4M_C6kW0mY8vMIB24_6R6q6GaszbSt0,15957
11
11
  docling/backend/msword_backend.py,sha256=K1D_h0ulLA6KQsPe62327cDVkQqV1f7EetCHo66wCKw,19233
12
12
  docling/backend/pdf_backend.py,sha256=unnw7QiRE1VXg6Pj-eYrtnFGrp5SSYiI324OlFxyv6c,2050
13
- docling/backend/pypdfium2_backend.py,sha256=B4bfv-dfzlWiKTfF8LN5fto_99YBu8A2c1_XIVwRUWI,8996
13
+ docling/backend/pypdfium2_backend.py,sha256=Exb3NBp3x2YSLoNfmXq4NefShgooJXsxTXrJ4JbTzcc,9001
14
14
  docling/backend/xml/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
15
  docling/backend/xml/pubmed_backend.py,sha256=LMnpowjnxa5SydfNC00Ll840BYraL8dCJu-FfC9iSKk,20447
16
16
  docling/backend/xml/uspto_backend.py,sha256=2YsnB-WRARIAaHPL6gxHePP24GQGi-Up2_K8ZapD3k4,70974
17
17
  docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
18
18
  docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
19
- docling/cli/main.py,sha256=SdavhL0VTApK9JrKz0Pc1IYdnQhK-0OOaGT8zlTiN5c,15022
19
+ docling/cli/main.py,sha256=NR7NEt8Sf3FE9D7sHpEmABM9mFMTMO5w0VPwYIIvVsk,15481
20
20
  docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
21
21
  docling/datamodel/base_models.py,sha256=50Jf5zk9c4-zmnOzZLoPBnHQhTX0_OFQzIkKgnKK1o4,6229
22
- docling/datamodel/document.py,sha256=rnNw2tGuCZ1BDoBptlNpPllQ2osJMQHMvcbyrQZuSL4,12948
23
- docling/datamodel/pipeline_options.py,sha256=u37Q12FVfu1UTEhgBiZ2KslyBtG3z3Eobqvaqd_MYaA,7735
22
+ docling/datamodel/document.py,sha256=OHM6bm0a-62xnAZ8DFlMHzATmbgNcfMxQoQO2udaW5Q,13071
23
+ docling/datamodel/pipeline_options.py,sha256=wKFzw8sAim6emQGsjuS12n7FfpMo8HVNoMOPhkXTkVo,7734
24
24
  docling/datamodel/settings.py,sha256=Sw0rN_f8rdLV1eNvVeKiyET2Oe6oz9jtW3lJzniW9Do,1302
25
- docling/document_converter.py,sha256=PoRcL2IzGoT7ZppGk6laPmKiHOwrXl1-dLMNWumNogg,12298
25
+ docling/document_converter.py,sha256=_pk0sHuPXJ14NEutatf5bK2VyNiU5cvYsVbh1HIgrIw,12431
26
26
  docling/exceptions.py,sha256=-FoP46rFJgz_jn5uDv2V052udEEg8gckk6uhoItchXc,85
27
27
  docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
28
28
  docling/models/base_model.py,sha256=Yq_-FmUhqhE20vXYG3WiQXDRTIPjik1CyuEZ8iYTGAY,701
29
- docling/models/base_ocr_model.py,sha256=rGSpBF4dByITcsBaRIgvFKpiu0CrhmZS_PHIo686Dw0,6428
29
+ docling/models/base_ocr_model.py,sha256=YiUMvdjnHw9SHjnfJKT5INrPMoIGEf_Z2OApfl_VRTE,6919
30
30
  docling/models/ds_glm_model.py,sha256=CkhsP0cEWwm4wb1g3cLFriVGpVtELiUK3REDMkPwAMw,13028
31
31
  docling/models/easyocr_model.py,sha256=Kakb20ioBxDmNsIqoGvSSs_vbqAWN3QQNHYtEi-eErg,4990
32
- docling/models/layout_model.py,sha256=skfFdWh_NgijR4bIqyUH8zlda5mMOIIdN3yMttdmsN8,9871
32
+ docling/models/layout_model.py,sha256=Xo8sclRTOO_V8Cr4RwuxB67vSWKF0LZ5nJRYU1WI--k,9063
33
33
  docling/models/ocr_mac_model.py,sha256=bLP14UUmZcSzjDe-HLj-mtksTuBmsCTg2C1wCxUpan0,4502
34
34
  docling/models/page_assemble_model.py,sha256=qdEX0AIb76ZOqJV6O9j-7r67WmuIkUlwbb2PsL7eFK4,7608
35
35
  docling/models/page_preprocessing_model.py,sha256=1gVrZjObKxAvXkkKvXnIFApPOggzgiTFPtt1CGbMbSs,2763
36
36
  docling/models/rapid_ocr_model.py,sha256=LOIvczJs3_db2o8mtrKk-pIXgC-xqWqRLu2cjA3wvy4,4980
37
- docling/models/table_structure_model.py,sha256=3bUBeP26WwDNCb5_aAlRwVZe4xUYgnwsSHgWQYZxk9E,8892
37
+ docling/models/table_structure_model.py,sha256=fUpCHthO4Uk3BhA99a85BHBm51fmdE9kfqhAk3WjuBw,9392
38
38
  docling/models/tesseract_ocr_cli_model.py,sha256=aKQBaty4cYu6zG_C5uy6Zm3eeRQo5fxIierbKixa2kc,6622
39
39
  docling/models/tesseract_ocr_model.py,sha256=RDf6iV1q-oXaGfZXv0bW6SqjHNKQvBUDlUsOkuz0neY,6095
40
40
  docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -49,8 +49,8 @@ docling/utils/glm_utils.py,sha256=IB19wToGath97gD3jAA3G_rQSptnZKhQCWLvPUCnkww,11
49
49
  docling/utils/layout_postprocessor.py,sha256=urRzeF9PrKiMBvA6DdHHwyLxG06CMhelgJeV5B1l6l0,24258
50
50
  docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
51
51
  docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
52
- docling-2.14.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
53
- docling-2.14.0.dist-info/METADATA,sha256=FmM_aRgxeqVSKDOYc-8MEKH1ec_Z7x8cgMQoMVeaKDw,7732
54
- docling-2.14.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
55
- docling-2.14.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
56
- docling-2.14.0.dist-info/RECORD,,
52
+ docling-2.15.1.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
53
+ docling-2.15.1.dist-info/METADATA,sha256=6WRzA633us43nw7RHwhX_jwizh2JSpGWxNh0pJq2ZYs,7739
54
+ docling-2.15.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
55
+ docling-2.15.1.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
56
+ docling-2.15.1.dist-info/RECORD,,