docling 2.13.0__py3-none-any.whl → 2.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
docling/backend/html_backend.py CHANGED
@@ -37,10 +37,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
 
         try:
             if isinstance(self.path_or_stream, BytesIO):
-                text_stream = self.path_or_stream.getvalue().decode("utf-8")
+                text_stream = self.path_or_stream.getvalue()
                 self.soup = BeautifulSoup(text_stream, "html.parser")
             if isinstance(self.path_or_stream, Path):
-                with open(self.path_or_stream, "r", encoding="utf-8") as f:
+                with open(self.path_or_stream, "rb") as f:
                     html_content = f.read()
                 self.soup = BeautifulSoup(html_content, "html.parser")
         except Exception as e:
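The HTML backend now hands raw bytes to BeautifulSoup instead of force-decoding UTF-8, letting the parser detect the document's real encoding. A standalone sketch (not docling code) of why bytes input is more robust:

```python
from bs4 import BeautifulSoup

# A page declared and encoded as ISO-8859-1: byte 0xE9 is invalid UTF-8,
# so raw.decode("utf-8") would raise UnicodeDecodeError before parsing.
raw = (
    '<html><head><meta charset="iso-8859-1"></head>'
    "<body><p>caf\xe9</p></body></html>"
).encode("iso-8859-1")

# With bytes input, BeautifulSoup's encoding detection honors the declared
# charset and decodes the markup itself.
soup = BeautifulSoup(raw, "html.parser")
print(soup.p.text)  # café
```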
docling/backend/mspowerpoint_backend.py CHANGED
@@ -16,7 +16,7 @@ from docling_core.types.doc import (
     TableCell,
     TableData,
 )
-from PIL import Image
+from PIL import Image, UnidentifiedImageError
 from pptx import Presentation
 from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
 
@@ -120,6 +120,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
         bullet_type = "None"
         list_text = ""
         list_label = GroupLabel.LIST
+        doc_label = DocItemLabel.LIST_ITEM
         prov = self.generate_prov(shape, slide_ind, shape.text.strip())
 
         # Identify if shape contains lists
@@ -276,16 +277,19 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
         im_dpi, _ = image.dpi
 
         # Open it with PIL
-        pil_image = Image.open(BytesIO(image_bytes))
-
-        # shape has picture
-        prov = self.generate_prov(shape, slide_ind, "")
-        doc.add_picture(
-            parent=parent_slide,
-            image=ImageRef.from_pil(image=pil_image, dpi=im_dpi),
-            caption=None,
-            prov=prov,
-        )
+        try:
+            pil_image = Image.open(BytesIO(image_bytes))
+
+            # shape has picture
+            prov = self.generate_prov(shape, slide_ind, "")
+            doc.add_picture(
+                parent=parent_slide,
+                image=ImageRef.from_pil(image=pil_image, dpi=im_dpi),
+                caption=None,
+                prov=prov,
+            )
+        except (UnidentifiedImageError, OSError) as e:
+            _log.warning(f"Warning: image cannot be loaded by Pillow: {e}")
         return
 
     def handle_tables(self, shape, parent_slide, slide_ind, doc):
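With this try/except, a PPTX containing an image payload Pillow cannot identify (WMF/EMF blobs are a common case) no longer aborts the whole conversion. A standalone sketch of the same defensive pattern; the helper name is ours, not docling's:

```python
import logging
from io import BytesIO

from PIL import Image, UnidentifiedImageError

_log = logging.getLogger(__name__)


def load_image_or_none(image_bytes: bytes):
    """Return a PIL image, or None when Pillow cannot decode the payload."""
    try:
        return Image.open(BytesIO(image_bytes))
    except (UnidentifiedImageError, OSError) as exc:
        # Log and continue instead of failing the document.
        _log.warning("image cannot be loaded by Pillow: %s", exc)
        return None
```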
docling/backend/xml/pubmed_backend.py ADDED
@@ -0,0 +1,592 @@
+import logging
+from io import BytesIO
+from pathlib import Path
+from typing import Any, Set, Union
+
+import lxml
+from bs4 import BeautifulSoup
+from docling_core.types.doc import (
+    DocItemLabel,
+    DoclingDocument,
+    DocumentOrigin,
+    GroupLabel,
+    TableCell,
+    TableData,
+)
+from lxml import etree
+from typing_extensions import TypedDict, override
+
+from docling.backend.abstract_backend import DeclarativeDocumentBackend
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.document import InputDocument
+
+_log = logging.getLogger(__name__)
+
+
+class Paragraph(TypedDict):
+    text: str
+    headers: list[str]
+
+
+class Author(TypedDict):
+    name: str
+    affiliation_names: list[str]
+
+
+class Table(TypedDict):
+    label: str
+    caption: str
+    content: str
+
+
+class FigureCaption(TypedDict):
+    label: str
+    caption: str
+
+
+class Reference(TypedDict):
+    author_names: str
+    title: str
+    journal: str
+    year: str
+
+
+class XMLComponents(TypedDict):
+    title: str
+    authors: list[Author]
+    abstract: str
+    paragraphs: list[Paragraph]
+    tables: list[Table]
+    figure_captions: list[FigureCaption]
+    references: list[Reference]
+
+
+class PubMedDocumentBackend(DeclarativeDocumentBackend):
+    """
+    The code from this document backend has been developed by modifying parts of the PubMed Parser library (version 0.5.0, released on 12.08.2024):
+    Achakulvisut et al., (2020).
+    Pubmed Parser: A Python Parser for PubMed Open-Access XML Subset and MEDLINE XML Dataset.
+    Journal of Open Source Software, 5(46), 1979,
+    https://doi.org/10.21105/joss.01979
+    """
+
+    @override
+    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
+        super().__init__(in_doc, path_or_stream)
+        self.path_or_stream = path_or_stream
+
+        # Initialize parents for the document hierarchy
+        self.parents: dict = {}
+
+        self.valid = False
+        try:
+            if isinstance(self.path_or_stream, BytesIO):
+                self.path_or_stream.seek(0)
+            self.tree: lxml.etree._ElementTree = etree.parse(self.path_or_stream)
+            if "/NLM//DTD JATS" in self.tree.docinfo.public_id:
+                self.valid = True
+        except Exception as exc:
+            raise RuntimeError(
+                f"Could not initialize PubMed backend for file with hash {self.document_hash}."
+            ) from exc
+
+    @override
+    def is_valid(self) -> bool:
+        return self.valid
+
+    @classmethod
+    @override
+    def supports_pagination(cls) -> bool:
+        return False
+
+    @override
+    def unload(self):
+        if isinstance(self.path_or_stream, BytesIO):
+            self.path_or_stream.close()
+        self.path_or_stream = None
+
+    @classmethod
+    @override
+    def supported_formats(cls) -> Set[InputFormat]:
+        return {InputFormat.XML_PUBMED}
+
+    @override
+    def convert(self) -> DoclingDocument:
+        # Create empty document
+        origin = DocumentOrigin(
+            filename=self.file.name or "file",
+            mimetype="application/xml",
+            binary_hash=self.document_hash,
+        )
+        doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
+
+        _log.debug("Trying to convert PubMed XML document...")
+
+        # Get parsed XML components
+        xml_components: XMLComponents = self._parse()
+
+        # Add XML components to the document
+        doc = self._populate_document(doc, xml_components)
+        return doc
+
+    def _parse_title(self) -> str:
+        title: str = " ".join(
+            [
+                t.replace("\n", "")
+                for t in self.tree.xpath(".//title-group/article-title")[0].itertext()
+            ]
+        )
+        return title
+
+    def _parse_authors(self) -> list[Author]:
+        # Get mapping between affiliation ids and names
+        affiliation_names = []
+        for affiliation_node in self.tree.xpath(".//aff[@id]"):
+            affiliation_names.append(
+                ": ".join([t for t in affiliation_node.itertext() if t != "\n"])
+            )
+        affiliation_ids_names = {
+            id: name
+            for id, name in zip(self.tree.xpath(".//aff[@id]/@id"), affiliation_names)
+        }
+
+        # Get author names and affiliation names
+        authors: list[Author] = []
+        for author_node in self.tree.xpath(
+            './/contrib-group/contrib[@contrib-type="author"]'
+        ):
+            author: Author = {
+                "name": "",
+                "affiliation_names": [],
+            }
+
+            # Affiliation names
+            affiliation_ids = [
+                a.attrib["rid"] for a in author_node.xpath('xref[@ref-type="aff"]')
+            ]
+            for id in affiliation_ids:
+                if id in affiliation_ids_names:
+                    author["affiliation_names"].append(affiliation_ids_names[id])
+
+            # Name
+            author["name"] = (
+                author_node.xpath("name/surname")[0].text
+                + " "
+                + author_node.xpath("name/given-names")[0].text
+            )
+
+            authors.append(author)
+        return authors
+
+    def _parse_abstract(self) -> str:
+        texts = []
+        for abstract_node in self.tree.xpath(".//abstract"):
+            for text in abstract_node.itertext():
+                texts.append(text.replace("\n", ""))
+        abstract: str = "".join(texts)
+        return abstract
+
+    def _parse_main_text(self) -> list[Paragraph]:
+        paragraphs: list[Paragraph] = []
+        for paragraph_node in self.tree.xpath("//body//p"):
+            # Skip captions
+            if "/caption" in paragraph_node.getroottree().getpath(paragraph_node):
+                continue
+
+            paragraph: Paragraph = {"text": "", "headers": []}
+
+            # Text
+            paragraph["text"] = "".join(
+                [t.replace("\n", "") for t in paragraph_node.itertext()]
+            )
+
+            # Header
+            path = "../title"
+            while len(paragraph_node.xpath(path)) > 0:
+                paragraph["headers"].append(
+                    "".join(
+                        [
+                            t.replace("\n", "")
+                            for t in paragraph_node.xpath(path)[0].itertext()
+                        ]
+                    )
+                )
+                path = "../" + path
+
+            paragraphs.append(paragraph)
+
+        return paragraphs
+
+    def _parse_tables(self) -> list[Table]:
+        tables: list[Table] = []
+        for table_node in self.tree.xpath(".//body//table-wrap"):
+            table: Table = {"label": "", "caption": "", "content": ""}
+
+            # Content
+            if len(table_node.xpath("table")) > 0:
+                table_content_node = table_node.xpath("table")[0]
+            elif len(table_node.xpath("alternatives/table")) > 0:
+                table_content_node = table_node.xpath("alternatives/table")[0]
+            else:
+                table_content_node = None
+            if table_content_node != None:
+                table["content"] = etree.tostring(table_content_node).decode("utf-8")
+
+            # Caption
+            if len(table_node.xpath("caption/p")) > 0:
+                caption_node = table_node.xpath("caption/p")[0]
+            elif len(table_node.xpath("caption/title")) > 0:
+                caption_node = table_node.xpath("caption/title")[0]
+            else:
+                caption_node = None
+            if caption_node != None:
+                table["caption"] = "".join(
+                    [t.replace("\n", "") for t in caption_node.itertext()]
+                )
+
+            # Label
+            if len(table_node.xpath("label")) > 0:
+                table["label"] = table_node.xpath("label")[0].text
+
+            tables.append(table)
+        return tables
+
+    def _parse_figure_captions(self) -> list[FigureCaption]:
+        figure_captions: list[FigureCaption] = []
+
+        if not (self.tree.xpath(".//fig")):
+            return figure_captions
+
+        for figure_node in self.tree.xpath(".//fig"):
+            figure_caption: FigureCaption = {
+                "caption": "",
+                "label": "",
+            }
+
+            # Label
+            if figure_node.xpath("label"):
+                figure_caption["label"] = "".join(
+                    [
+                        t.replace("\n", "")
+                        for t in figure_node.xpath("label")[0].itertext()
+                    ]
+                )
+
+            # Caption
+            if figure_node.xpath("caption"):
+                caption = ""
+                for caption_node in figure_node.xpath("caption")[0].getchildren():
+                    caption += (
+                        "".join([t.replace("\n", "") for t in caption_node.itertext()])
+                        + "\n"
+                    )
+                figure_caption["caption"] = caption
+
+            figure_captions.append(figure_caption)
+
+        return figure_captions
+
+    def _parse_references(self) -> list[Reference]:
+        references: list[Reference] = []
+        for reference_node_abs in self.tree.xpath(".//ref-list/ref"):
+            reference: Reference = {
+                "author_names": "",
+                "title": "",
+                "journal": "",
+                "year": "",
+            }
+            reference_node: Any = None
+            for tag in ["mixed-citation", "element-citation", "citation"]:
+                if len(reference_node_abs.xpath(tag)) > 0:
+                    reference_node = reference_node_abs.xpath(tag)[0]
+                    break
+
+            if reference_node is None:
+                continue
+
+            if all(
+                not (ref_type in ["citation-type", "publication-type"])
+                for ref_type in reference_node.attrib.keys()
+            ):
+                continue
+
+            # Author names
+            names = []
+            if len(reference_node.xpath("name")) > 0:
+                for name_node in reference_node.xpath("name"):
+                    name_str = " ".join(
+                        [t.text for t in name_node.getchildren() if (t.text != None)]
+                    )
+                    names.append(name_str)
+            elif len(reference_node.xpath("person-group")) > 0:
+                for name_node in reference_node.xpath("person-group")[0]:
+                    name_str = (
+                        name_node.xpath("given-names")[0].text
+                        + " "
+                        + name_node.xpath("surname")[0].text
+                    )
+                    names.append(name_str)
+            reference["author_names"] = "; ".join(names)
+
+            # Title
+            if len(reference_node.xpath("article-title")) > 0:
+                reference["title"] = " ".join(
+                    [
+                        t.replace("\n", " ")
+                        for t in reference_node.xpath("article-title")[0].itertext()
+                    ]
+                )
+
+            # Journal
+            if len(reference_node.xpath("source")) > 0:
+                reference["journal"] = reference_node.xpath("source")[0].text
+
+            # Year
+            if len(reference_node.xpath("year")) > 0:
+                reference["year"] = reference_node.xpath("year")[0].text
+
+            if (
+                not (reference_node.xpath("article-title"))
+                and not (reference_node.xpath("journal"))
+                and not (reference_node.xpath("year"))
+            ):
+                reference["title"] = reference_node.text
+
+            references.append(reference)
+        return references
+
+    def _parse(self) -> XMLComponents:
+        """Parsing PubMed document."""
+        xml_components: XMLComponents = {
+            "title": self._parse_title(),
+            "authors": self._parse_authors(),
+            "abstract": self._parse_abstract(),
+            "paragraphs": self._parse_main_text(),
+            "tables": self._parse_tables(),
+            "figure_captions": self._parse_figure_captions(),
+            "references": self._parse_references(),
+        }
+        return xml_components
+
+    def _populate_document(
+        self, doc: DoclingDocument, xml_components: XMLComponents
+    ) -> DoclingDocument:
+        self._add_title(doc, xml_components)
+        self._add_authors(doc, xml_components)
+        self._add_abstract(doc, xml_components)
+        self._add_main_text(doc, xml_components)
+
+        if xml_components["tables"]:
+            self._add_tables(doc, xml_components)
+
+        if xml_components["figure_captions"]:
+            self._add_figure_captions(doc, xml_components)
+
+        self._add_references(doc, xml_components)
+        return doc
+
+    def _add_figure_captions(
+        self, doc: DoclingDocument, xml_components: XMLComponents
+    ) -> None:
+        self.parents["Figures"] = doc.add_heading(
+            parent=self.parents["Title"], text="Figures"
+        )
+        for figure_caption_xml_component in xml_components["figure_captions"]:
+            figure_caption_text = (
+                figure_caption_xml_component["label"]
+                + ": "
+                + figure_caption_xml_component["caption"].strip()
+            )
+            fig_caption = doc.add_text(
+                label=DocItemLabel.CAPTION, text=figure_caption_text
+            )
+            doc.add_picture(
+                parent=self.parents["Figures"],
+                caption=fig_caption,
+            )
+        return
+
+    def _add_title(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:
+        self.parents["Title"] = doc.add_text(
+            parent=None,
+            text=xml_components["title"],
+            label=DocItemLabel.TITLE,
+        )
+        return
+
+    def _add_authors(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:
+        authors_affiliations: list = []
+        for author in xml_components["authors"]:
+            authors_affiliations.append(author["name"])
+            authors_affiliations.append(", ".join(author["affiliation_names"]))
+        authors_affiliations_str = "; ".join(authors_affiliations)
+
+        doc.add_text(
+            parent=self.parents["Title"],
+            text=authors_affiliations_str,
+            label=DocItemLabel.PARAGRAPH,
+        )
+        return
+
+    def _add_abstract(
+        self, doc: DoclingDocument, xml_components: XMLComponents
+    ) -> None:
+        abstract_text: str = xml_components["abstract"]
+        self.parents["Abstract"] = doc.add_heading(
+            parent=self.parents["Title"], text="Abstract"
+        )
+        doc.add_text(
+            parent=self.parents["Abstract"],
+            text=abstract_text,
+            label=DocItemLabel.TEXT,
+        )
+        return
+
+    def _add_main_text(
+        self, doc: DoclingDocument, xml_components: XMLComponents
+    ) -> None:
+        added_headers: list = []
+        for paragraph in xml_components["paragraphs"]:
+            if not (paragraph["headers"]):
+                continue
+
+            # Header
+            for i, header in enumerate(reversed(paragraph["headers"])):
+                if header in added_headers:
+                    continue
+                added_headers.append(header)
+
+                if ((i - 1) >= 0) and list(reversed(paragraph["headers"]))[
+                    i - 1
+                ] in self.parents:
+                    parent = self.parents[list(reversed(paragraph["headers"]))[i - 1]]
+                else:
+                    parent = self.parents["Title"]
+
+                self.parents[header] = doc.add_heading(parent=parent, text=header)
+
+            # Paragraph text
+            if paragraph["headers"][0] in self.parents:
+                parent = self.parents[paragraph["headers"][0]]
+            else:
+                parent = self.parents["Title"]
+
+            doc.add_text(parent=parent, label=DocItemLabel.TEXT, text=paragraph["text"])
+        return
+
+    def _add_references(
+        self, doc: DoclingDocument, xml_components: XMLComponents
+    ) -> None:
+        self.parents["References"] = doc.add_heading(
+            parent=self.parents["Title"], text="References"
+        )
+        current_list = doc.add_group(
+            parent=self.parents["References"], label=GroupLabel.LIST, name="list"
+        )
+        for reference in xml_components["references"]:
+            reference_text: str = ""
+            if reference["author_names"]:
+                reference_text += reference["author_names"] + ". "
+
+            if reference["title"]:
+                reference_text += reference["title"]
+                if reference["title"][-1] != ".":
+                    reference_text += "."
+                reference_text += " "
+
+            if reference["journal"]:
+                reference_text += reference["journal"]
+
+            if reference["year"]:
+                reference_text += " (" + reference["year"] + ")"
+
+            if not (reference_text):
+                _log.debug(f"Skipping reference for: {str(self.file)}")
+                continue
+
+            doc.add_list_item(
+                text=reference_text, enumerated=False, parent=current_list
+            )
+        return
+
+    def _add_tables(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:
+        self.parents["Tables"] = doc.add_heading(
+            parent=self.parents["Title"], text="Tables"
+        )
+        for table_xml_component in xml_components["tables"]:
+            try:
+                self._add_table(doc, table_xml_component)
+            except Exception as e:
+                _log.debug(f"Skipping unsupported table for: {str(self.file)}")
+                pass
+        return
+
+    def _add_table(self, doc: DoclingDocument, table_xml_component: Table) -> None:
+        soup = BeautifulSoup(table_xml_component["content"], "html.parser")
+        table_tag = soup.find("table")
+
+        nested_tables = table_tag.find("table")
+        if nested_tables:
+            _log.debug(f"Skipping nested table for: {str(self.file)}")
+            return
+
+        # Count the number of rows (number of <tr> elements)
+        num_rows = len(table_tag.find_all("tr"))
+
+        # Find the number of columns (taking into account colspan)
+        num_cols = 0
+        for row in table_tag.find_all("tr"):
+            col_count = 0
+            for cell in row.find_all(["td", "th"]):
+                colspan = int(cell.get("colspan", 1))
+                col_count += colspan
+            num_cols = max(num_cols, col_count)
+
+        grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
+
+        data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
+
+        # Iterate over the rows in the table
+        for row_idx, row in enumerate(table_tag.find_all("tr")):
+            # For each row, find all the column cells (both <td> and <th>)
+            cells = row.find_all(["td", "th"])
+
+            # Check if each cell in the row is a header -> means it is a column header
+            col_header = True
+            for j, html_cell in enumerate(cells):
+                if html_cell.name == "td":
+                    col_header = False
+
+            # Extract and print the text content of each cell
+            col_idx = 0
+            for _, html_cell in enumerate(cells):
+                text = html_cell.text
+
+                col_span = int(html_cell.get("colspan", 1))
+                row_span = int(html_cell.get("rowspan", 1))
+
+                while grid[row_idx][col_idx] != None:
+                    col_idx += 1
+                for r in range(row_span):
+                    for c in range(col_span):
+                        grid[row_idx + r][col_idx + c] = text
+
+                cell = TableCell(
+                    text=text,
+                    row_span=row_span,
+                    col_span=col_span,
+                    start_row_offset_idx=row_idx,
+                    end_row_offset_idx=row_idx + row_span,
+                    start_col_offset_idx=col_idx,
+                    end_col_offset_idx=col_idx + col_span,
+                    col_header=col_header,
+                    row_header=((not col_header) and html_cell.name == "th"),
+                )
+                data.table_cells.append(cell)
+
+        table_caption = doc.add_text(
+            label=DocItemLabel.CAPTION,
+            text=table_xml_component["label"] + ": " + table_xml_component["caption"],
+        )
+        doc.add_table(data=data, parent=self.parents["Tables"], caption=table_caption)
+        return
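Once this backend is registered with the converter (see the format-option hunks in docling/document_converter.py further down), a PubMed/JATS XML file converts like any other input. A minimal usage sketch; `article.nxml` is a hypothetical local JATS file, recognized via the "/NLM//DTD JATS" public identifier in its doctype:

```python
from docling.document_converter import DocumentConverter

converter = DocumentConverter()
result = converter.convert("article.nxml")  # hypothetical JATS XML file
print(result.document.export_to_markdown())
```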
docling/cli/main.py CHANGED
@@ -164,6 +164,11 @@ def convert(
     to_formats: List[OutputFormat] = typer.Option(
         None, "--to", help="Specify output formats. Defaults to Markdown."
     ),
+    headers: str = typer.Option(
+        None,
+        "--headers",
+        help="Specify http request headers used when fetching url input sources in the form of a JSON string",
+    ),
     image_export_mode: Annotated[
         ImageRefMode,
         typer.Option(
@@ -279,12 +284,19 @@ def convert(
     if from_formats is None:
         from_formats = [e for e in InputFormat]
 
+    parsed_headers: Optional[Dict[str, str]] = None
+    if headers is not None:
+        headers_t = TypeAdapter(Dict[str, str])
+        parsed_headers = headers_t.validate_json(headers)
+
     with tempfile.TemporaryDirectory() as tempdir:
         input_doc_paths: List[Path] = []
         for src in input_sources:
             try:
                 # check if we can fetch some remote url
-                source = resolve_source_to_path(source=src, workdir=Path(tempdir))
+                source = resolve_source_to_path(
+                    source=src, headers=parsed_headers, workdir=Path(tempdir)
+                )
                 input_doc_paths.append(source)
             except FileNotFoundError:
                 err_console.print(
@@ -390,7 +402,7 @@ def convert(
         start_time = time.time()
 
         conv_results = doc_converter.convert_all(
-            input_doc_paths, raises_on_error=abort_on_error
+            input_doc_paths, headers=parsed_headers, raises_on_error=abort_on_error
        )
 
         output.mkdir(parents=True, exist_ok=True)
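Together these hunks let headers flow from the command line to URL fetching, e.g. `docling --headers '{"Authorization": "Bearer <token>"}' https://example.com/report.pdf` (hypothetical invocation). The JSON string is validated into a `Dict[str, str]` with pydantic's `TypeAdapter`; a standalone sketch of that validation step:

```python
from typing import Dict, Optional

from pydantic import TypeAdapter

# Same validation the CLI performs: a JSON object string becomes Dict[str, str],
# raising a ValidationError on malformed input.
raw = '{"Authorization": "Bearer <token>", "User-Agent": "docling"}'
parsed_headers: Optional[Dict[str, str]] = TypeAdapter(Dict[str, str]).validate_json(raw)
print(parsed_headers["User-Agent"])  # docling
```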
docling/datamodel/base_models.py CHANGED
@@ -33,6 +33,7 @@ class InputFormat(str, Enum):
     DOCX = "docx"
     PPTX = "pptx"
     HTML = "html"
+    XML_PUBMED = "xml_pubmed"
     IMAGE = "image"
     PDF = "pdf"
     ASCIIDOC = "asciidoc"
@@ -55,6 +56,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
     InputFormat.PDF: ["pdf"],
     InputFormat.MD: ["md"],
     InputFormat.HTML: ["html", "htm", "xhtml"],
+    InputFormat.XML_PUBMED: ["xml", "nxml"],
     InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
     InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
     InputFormat.XLSX: ["xlsx"],
@@ -72,6 +74,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
         "application/vnd.openxmlformats-officedocument.presentationml.presentation",
     ],
     InputFormat.HTML: ["text/html", "application/xhtml+xml"],
+    InputFormat.XML_PUBMED: ["application/xml"],
     InputFormat.IMAGE: [
         "image/png",
         "image/jpeg",
docling/datamodel/document.py CHANGED
@@ -227,13 +227,18 @@ class _DummyBackend(AbstractDocumentBackend):
 
 class _DocumentConversionInput(BaseModel):
 
     path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
+    headers: Optional[Dict[str, str]] = None
     limits: Optional[DocumentLimits] = DocumentLimits()
 
     def docs(
         self, format_options: Dict[InputFormat, "FormatOption"]
     ) -> Iterable[InputDocument]:
         for item in self.path_or_stream_iterator:
-            obj = resolve_source_to_stream(item) if isinstance(item, str) else item
+            obj = (
+                resolve_source_to_stream(item, self.headers)
+                if isinstance(item, str)
+                else item
+            )
             format = self._guess_format(obj)
             backend: Type[AbstractDocumentBackend]
             if format not in format_options.keys():
@@ -292,8 +297,7 @@ class _DocumentConversionInput(BaseModel):
         mime = mime or "text/plain"
         formats = MimeTypeToFormat.get(mime, [])
         if formats:
-            # TODO: remove application/xml case after adding another XML parse
-            if len(formats) == 1 and mime not in ("text/plain", "application/xml"):
+            if len(formats) == 1 and mime not in ("text/plain"):
                 return formats[0]
             else:  # ambiguity in formats
                 return _DocumentConversionInput._guess_from_content(
@@ -325,6 +329,12 @@ class _DocumentConversionInput(BaseModel):
             ):
                 input_format = InputFormat.XML_USPTO
 
+            if (
+                InputFormat.XML_PUBMED in formats
+                and "/NLM//DTD JATS" in xml_doctype
+            ):
+                input_format = InputFormat.XML_PUBMED
+
         elif mime == "text/plain":
             if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
                 input_format = InputFormat.XML_USPTO
@@ -340,7 +350,6 @@ class _DocumentConversionInput(BaseModel):
             mime = FormatToMimeType[InputFormat.HTML][0]
         elif ext in FormatToExtensions[InputFormat.MD]:
             mime = FormatToMimeType[InputFormat.MD][0]
-
         return mime
 
     @staticmethod
@@ -370,4 +379,10 @@ class _DocumentConversionInput(BaseModel):
         if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
             return "text/html"
 
+        p = re.compile(
+            r"<!doctype\s+(?P<root>[a-zA-Z_:][a-zA-Z0-9_:.-]*)\s+.*>\s*<(?P=root)\b"
+        )
+        if p.search(content_str):
+            return "application/xml"
+
         return None
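The new fallback recognizes generic XML by checking that the root element named in the doctype reappears as the first opened tag. A standalone sketch of the heuristic (it assumes `content_str` was already lowercased upstream, as the `<!doctype\s+html` branch above implies):

```python
import re

p = re.compile(
    r"<!doctype\s+(?P<root>[a-zA-Z_:][a-zA-Z0-9_:.-]*)\s+.*>\s*<(?P=root)\b"
)

# A lowercased JATS prolog: "article" is captured from the doctype and must
# reappear as the first tag for the content to be classified as XML.
sample = (
    '<!doctype article public "-//nlm//dtd jats (z39.96) '
    'journal archiving and interchange dtd v1.2 20190208//en" '
    '"jats-archivearticle1.dtd"> <article dtd-version="1.2">'
)
print(bool(p.search(sample)))  # True
```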
docling/document_converter.py CHANGED
@@ -15,6 +15,7 @@ from docling.backend.md_backend import MarkdownDocumentBackend
 from docling.backend.msexcel_backend import MsExcelDocumentBackend
 from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
 from docling.backend.msword_backend import MsWordDocumentBackend
+from docling.backend.xml.pubmed_backend import PubMedDocumentBackend
 from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
 from docling.datamodel.base_models import (
     ConversionStatus,
@@ -88,6 +89,11 @@ class PatentUsptoFormatOption(FormatOption):
     backend: Type[PatentUsptoDocumentBackend] = PatentUsptoDocumentBackend
 
 
+class XMLPubMedFormatOption(FormatOption):
+    pipeline_cls: Type = SimplePipeline
+    backend: Type[AbstractDocumentBackend] = PubMedDocumentBackend
+
+
 class ImageFormatOption(FormatOption):
     pipeline_cls: Type = StandardPdfPipeline
     backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
@@ -121,6 +127,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
         InputFormat.XML_USPTO: FormatOption(
             pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend
         ),
+        InputFormat.XML_PUBMED: FormatOption(
+            pipeline_cls=SimplePipeline, backend=PubMedDocumentBackend
+        ),
         InputFormat.IMAGE: FormatOption(
             pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
         ),
@@ -167,16 +176,17 @@ class DocumentConverter:
     def convert(
         self,
         source: Union[Path, str, DocumentStream],  # TODO review naming
+        headers: Optional[Dict[str, str]] = None,
         raises_on_error: bool = True,
         max_num_pages: int = sys.maxsize,
         max_file_size: int = sys.maxsize,
     ) -> ConversionResult:
-
         all_res = self.convert_all(
             source=[source],
             raises_on_error=raises_on_error,
             max_num_pages=max_num_pages,
             max_file_size=max_file_size,
+            headers=headers,
         )
         return next(all_res)
 
@@ -184,6 +194,7 @@ class DocumentConverter:
     def convert_all(
         self,
         source: Iterable[Union[Path, str, DocumentStream]],  # TODO review naming
+        headers: Optional[Dict[str, str]] = None,
         raises_on_error: bool = True,  # True: raises on first conversion error; False: does not raise on conv error
         max_num_pages: int = sys.maxsize,
         max_file_size: int = sys.maxsize,
@@ -193,8 +204,7 @@ class DocumentConverter:
             max_file_size=max_file_size,
         )
         conv_input = _DocumentConversionInput(
-            path_or_stream_iterator=source,
-            limits=limits,
+            path_or_stream_iterator=source, limits=limits, headers=headers
         )
         conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)
docling/models/base_ocr_model.py CHANGED
@@ -138,18 +138,31 @@ class BaseOcrModel(BasePageModel):
 
     def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
         image = copy.deepcopy(page.image)
+        scale_x = image.width / page.size.width
+        scale_y = image.height / page.size.height
+
         draw = ImageDraw.Draw(image, "RGBA")
 
         # Draw OCR rectangles as yellow filled rect
         for rect in ocr_rects:
             x0, y0, x1, y1 = rect.as_tuple()
+            y0 *= scale_x
+            y1 *= scale_y
+            x0 *= scale_x
+            x1 *= scale_x
+
             shade_color = (255, 255, 0, 40)  # transparent yellow
             draw.rectangle([(x0, y0), (x1, y1)], fill=shade_color, outline=None)
 
         # Draw OCR and programmatic cells
         for tc in page.cells:
             x0, y0, x1, y1 = tc.bbox.as_tuple()
-            color = "red"
+            y0 *= scale_x
+            y1 *= scale_y
+            x0 *= scale_x
+            x1 *= scale_x
+
+            color = "gray"
             if isinstance(tc, OcrCell):
                 color = "magenta"
             draw.rectangle([(x0, y0), (x1, y1)], outline=color)
docling/models/layout_model.py CHANGED
@@ -67,29 +67,9 @@ class LayoutModel(BasePageModel):
         - Right: Clusters including FORM, KEY_VALUE_REGION, and PICTURE.
           Includes label names and confidence scores for each cluster.
         """
-        label_to_color = {
-            DocItemLabel.TEXT: (255, 255, 153),  # Light Yellow
-            DocItemLabel.CAPTION: (255, 204, 153),  # Light Orange
-            DocItemLabel.LIST_ITEM: (153, 153, 255),  # Light Purple
-            DocItemLabel.FORMULA: (192, 192, 192),  # Gray
-            DocItemLabel.TABLE: (255, 204, 204),  # Light Pink
-            DocItemLabel.PICTURE: (255, 204, 164),  # Light Beige
-            DocItemLabel.SECTION_HEADER: (255, 153, 153),  # Light Red
-            DocItemLabel.PAGE_HEADER: (204, 255, 204),  # Light Green
-            DocItemLabel.PAGE_FOOTER: (
-                204,
-                255,
-                204,
-            ),  # Light Green (same as Page-Header)
-            DocItemLabel.TITLE: (255, 153, 153),  # Light Red (same as Section-Header)
-            DocItemLabel.FOOTNOTE: (200, 200, 255),  # Light Blue
-            DocItemLabel.DOCUMENT_INDEX: (220, 220, 220),  # Light Gray
-            DocItemLabel.CODE: (125, 125, 125),  # Gray
-            DocItemLabel.CHECKBOX_SELECTED: (255, 182, 193),  # Pale Green
-            DocItemLabel.CHECKBOX_UNSELECTED: (255, 182, 193),  # Light Pink
-            DocItemLabel.FORM: (200, 255, 255),  # Light Cyan
-            DocItemLabel.KEY_VALUE_REGION: (183, 65, 14),  # Rusty orange
-        }
+        scale_x = page.image.width / page.size.width
+        scale_y = page.image.height / page.size.height
+
         # Filter clusters for left and right images
         exclude_labels = {
             DocItemLabel.FORM,
@@ -118,6 +98,11 @@ class LayoutModel(BasePageModel):
             cell_color = (0, 0, 0, 40)  # Transparent black for cells
             for tc in c.cells:
                 cx0, cy0, cx1, cy1 = tc.bbox.as_tuple()
+                cx0 *= scale_x
+                cx1 *= scale_x
+                cy0 *= scale_x
+                cy1 *= scale_y
+
                 draw.rectangle(
                     [(cx0, cy0), (cx1, cy1)],
                     outline=None,
@@ -125,8 +110,16 @@ class LayoutModel(BasePageModel):
                 )
             # Draw cluster rectangle
             x0, y0, x1, y1 = c.bbox.as_tuple()
-            cluster_fill_color = (*list(label_to_color.get(c.label)), 70)
-            cluster_outline_color = (*list(label_to_color.get(c.label)), 255)
+            x0 *= scale_x
+            x1 *= scale_x
+            y0 *= scale_x
+            y1 *= scale_y
+
+            cluster_fill_color = (*list(DocItemLabel.get_color(c.label)), 70)
+            cluster_outline_color = (
+                *list(DocItemLabel.get_color(c.label)),
+                255,
+            )
             draw.rectangle(
                 [(x0, y0), (x1, y1)],
                 outline=cluster_outline_color,
docling/models/table_structure_model.py CHANGED
@@ -66,23 +66,43 @@ class TableStructureModel(BasePageModel):
         show: bool = False,
     ):
         assert page._backend is not None
+        assert page.size is not None
 
         image = (
             page._backend.get_page_image()
         )  # make new image to avoid drawing on the saved ones
+
+        scale_x = image.width / page.size.width
+        scale_y = image.height / page.size.height
+
         draw = ImageDraw.Draw(image)
 
         for table_element in tbl_list:
             x0, y0, x1, y1 = table_element.cluster.bbox.as_tuple()
+            y0 *= scale_x
+            y1 *= scale_y
+            x0 *= scale_x
+            x1 *= scale_x
+
             draw.rectangle([(x0, y0), (x1, y1)], outline="red")
 
             for cell in table_element.cluster.cells:
                 x0, y0, x1, y1 = cell.bbox.as_tuple()
+                x0 *= scale_x
+                x1 *= scale_x
+                y0 *= scale_x
+                y1 *= scale_y
+
                 draw.rectangle([(x0, y0), (x1, y1)], outline="green")
 
             for tc in table_element.table_cells:
                 if tc.bbox is not None:
                     x0, y0, x1, y1 = tc.bbox.as_tuple()
+                    x0 *= scale_x
+                    x1 *= scale_x
+                    y0 *= scale_x
+                    y1 *= scale_y
+
                     if tc.column_header:
                         width = 3
                     else:
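The visualization hunks in base_ocr_model.py, layout_model.py, and table_structure_model.py all apply the same idea: bounding boxes are expressed in page coordinates, while the debug image may be rendered at a different resolution, so coordinates are multiplied by per-axis scale factors before drawing. A generic, standalone sketch of the pattern (page and image sizes are made up):

```python
from PIL import Image, ImageDraw

# Stand-in values: a 612x792 pt page rendered as a 1224x1584 px image.
page_w, page_h = 612.0, 792.0
image = Image.new("RGB", (1224, 1584), "white")

scale_x = image.width / page_w
scale_y = image.height / page_h

# A bbox in page coordinates, rescaled into image pixels before drawing.
x0, y0, x1, y1 = 72.0, 72.0, 540.0, 144.0
draw = ImageDraw.Draw(image)
draw.rectangle(
    [(x0 * scale_x, y0 * scale_y), (x1 * scale_x, y1 * scale_y)],
    outline="green",
)
```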
docling-2.15.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 2.13.0
+Version: 2.15.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Home-page: https://github.com/DS4SD/docling
 License: MIT
@@ -26,7 +26,7 @@ Provides-Extra: tesserocr
 Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
 Requires-Dist: certifi (>=2024.7.4)
 Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
-Requires-Dist: docling-core[chunking] (>=2.12.1,<3.0.0)
+Requires-Dist: docling-core[chunking] (>=2.13.1,<3.0.0)
 Requires-Dist: docling-ibm-models (>=3.1.0,<4.0.0)
 Requires-Dist: docling-parse (>=3.0.0,<4.0.0)
 Requires-Dist: easyocr (>=1.7,<2.0)
docling-2.15.0.dist-info/RECORD CHANGED
@@ -4,36 +4,37 @@ docling/backend/abstract_backend.py,sha256=-or6kWVV7egQeyIuN-vI0Tr7Q1htalBZSlhgq
 docling/backend/asciidoc_backend.py,sha256=kXZxOLk_LvLFVZwnJVVwjmvc3QWZ0iiG7VnwjgtC3hI,14051
 docling/backend/docling_parse_backend.py,sha256=_jY5f5-KGI3hi5pcZAY6e7tPLocSi5JUWrxraDVszqI,7631
 docling/backend/docling_parse_v2_backend.py,sha256=1TDUdMIp3fEjCWBNjusUHiCUmH1g6yZQ-b13scofP0Y,8637
-docling/backend/html_backend.py,sha256=qbu1W8xoTGnXMuZPRPLq68hDbCEj6ygnpxP5gYaodAQ,15593
+docling/backend/html_backend.py,sha256=O8qXaw7MzOIdaxbBcjHieM9Ce4GEdtBj9YW0vpJspuA,15560
 docling/backend/md_backend.py,sha256=tmuSCghjor9PqKIiVieCuZ4_t5JEjZMy3cq7u3yTgyU,14032
 docling/backend/msexcel_backend.py,sha256=23qUEScqr5GhY06xiqg-eBQ_JlAqO0FkPEmX6554sVA,12040
-docling/backend/mspowerpoint_backend.py,sha256=QD0NaatTO8U9CIFoiipkq3X5HxLZaaahH8nlrQ6ecDA,15710
+docling/backend/mspowerpoint_backend.py,sha256=kOGawhcn0BFq4M_C6kW0mY8vMIB24_6R6q6GaszbSt0,15957
 docling/backend/msword_backend.py,sha256=K1D_h0ulLA6KQsPe62327cDVkQqV1f7EetCHo66wCKw,19233
 docling/backend/pdf_backend.py,sha256=unnw7QiRE1VXg6Pj-eYrtnFGrp5SSYiI324OlFxyv6c,2050
 docling/backend/pypdfium2_backend.py,sha256=B4bfv-dfzlWiKTfF8LN5fto_99YBu8A2c1_XIVwRUWI,8996
 docling/backend/xml/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+docling/backend/xml/pubmed_backend.py,sha256=LMnpowjnxa5SydfNC00Ll840BYraL8dCJu-FfC9iSKk,20447
 docling/backend/xml/uspto_backend.py,sha256=2YsnB-WRARIAaHPL6gxHePP24GQGi-Up2_K8ZapD3k4,70974
 docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
 docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/cli/main.py,sha256=SdavhL0VTApK9JrKz0Pc1IYdnQhK-0OOaGT8zlTiN5c,15022
+docling/cli/main.py,sha256=NR7NEt8Sf3FE9D7sHpEmABM9mFMTMO5w0VPwYIIvVsk,15481
 docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/datamodel/base_models.py,sha256=vUQkOUawcZvJz_6E8RCxxd7wIN83B56h_FOQfdJAU1w,6105
-docling/datamodel/document.py,sha256=V-K0_BZIHG1VL9YpA-TUyP23p2ZYlFH0BfFrwBswA4U,12647
+docling/datamodel/base_models.py,sha256=50Jf5zk9c4-zmnOzZLoPBnHQhTX0_OFQzIkKgnKK1o4,6229
+docling/datamodel/document.py,sha256=OHM6bm0a-62xnAZ8DFlMHzATmbgNcfMxQoQO2udaW5Q,13071
 docling/datamodel/pipeline_options.py,sha256=u37Q12FVfu1UTEhgBiZ2KslyBtG3z3Eobqvaqd_MYaA,7735
 docling/datamodel/settings.py,sha256=Sw0rN_f8rdLV1eNvVeKiyET2Oe6oz9jtW3lJzniW9Do,1302
-docling/document_converter.py,sha256=ggJ0zv7qhm-_Vol2GkLHTTArb03p6g9kIS4PX66Wi5A,11950
+docling/document_converter.py,sha256=_pk0sHuPXJ14NEutatf5bK2VyNiU5cvYsVbh1HIgrIw,12431
 docling/exceptions.py,sha256=-FoP46rFJgz_jn5uDv2V052udEEg8gckk6uhoItchXc,85
 docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/models/base_model.py,sha256=Yq_-FmUhqhE20vXYG3WiQXDRTIPjik1CyuEZ8iYTGAY,701
-docling/models/base_ocr_model.py,sha256=rGSpBF4dByITcsBaRIgvFKpiu0CrhmZS_PHIo686Dw0,6428
+docling/models/base_ocr_model.py,sha256=qILpSHaqczAd1eUQzuoLxN-TYz3zozmN0K5_7kCWkrM,6738
 docling/models/ds_glm_model.py,sha256=CkhsP0cEWwm4wb1g3cLFriVGpVtELiUK3REDMkPwAMw,13028
 docling/models/easyocr_model.py,sha256=Kakb20ioBxDmNsIqoGvSSs_vbqAWN3QQNHYtEi-eErg,4990
-docling/models/layout_model.py,sha256=skfFdWh_NgijR4bIqyUH8zlda5mMOIIdN3yMttdmsN8,9871
+docling/models/layout_model.py,sha256=Xo8sclRTOO_V8Cr4RwuxB67vSWKF0LZ5nJRYU1WI--k,9063
 docling/models/ocr_mac_model.py,sha256=bLP14UUmZcSzjDe-HLj-mtksTuBmsCTg2C1wCxUpan0,4502
 docling/models/page_assemble_model.py,sha256=qdEX0AIb76ZOqJV6O9j-7r67WmuIkUlwbb2PsL7eFK4,7608
 docling/models/page_preprocessing_model.py,sha256=1gVrZjObKxAvXkkKvXnIFApPOggzgiTFPtt1CGbMbSs,2763
 docling/models/rapid_ocr_model.py,sha256=LOIvczJs3_db2o8mtrKk-pIXgC-xqWqRLu2cjA3wvy4,4980
-docling/models/table_structure_model.py,sha256=3bUBeP26WwDNCb5_aAlRwVZe4xUYgnwsSHgWQYZxk9E,8892
+docling/models/table_structure_model.py,sha256=fUpCHthO4Uk3BhA99a85BHBm51fmdE9kfqhAk3WjuBw,9392
 docling/models/tesseract_ocr_cli_model.py,sha256=aKQBaty4cYu6zG_C5uy6Zm3eeRQo5fxIierbKixa2kc,6622
 docling/models/tesseract_ocr_model.py,sha256=RDf6iV1q-oXaGfZXv0bW6SqjHNKQvBUDlUsOkuz0neY,6095
 docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -48,8 +49,8 @@ docling/utils/glm_utils.py,sha256=IB19wToGath97gD3jAA3G_rQSptnZKhQCWLvPUCnkww,11
 docling/utils/layout_postprocessor.py,sha256=urRzeF9PrKiMBvA6DdHHwyLxG06CMhelgJeV5B1l6l0,24258
 docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
 docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
-docling-2.13.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
-docling-2.13.0.dist-info/METADATA,sha256=lvbM7MRSyjnE30dP9UPdJQPACQ8jEolnBvoqbr1kcVA,7732
-docling-2.13.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-docling-2.13.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
-docling-2.13.0.dist-info/RECORD,,
+docling-2.15.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
+docling-2.15.0.dist-info/METADATA,sha256=VglEfKqffhUESHax5WQgtOT_Fysyea5HLDFtf7yUpdM,7732
+docling-2.15.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+docling-2.15.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
+docling-2.15.0.dist-info/RECORD,,