docling 2.13.0__tar.gz → 2.14.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. {docling-2.13.0 → docling-2.14.0}/PKG-INFO +1 -1
  2. docling-2.14.0/docling/backend/xml/pubmed_backend.py +592 -0
  3. {docling-2.13.0 → docling-2.14.0}/docling/datamodel/base_models.py +3 -0
  4. {docling-2.13.0 → docling-2.14.0}/docling/datamodel/document.py +13 -3
  5. {docling-2.13.0 → docling-2.14.0}/docling/document_converter.py +9 -1
  6. {docling-2.13.0 → docling-2.14.0}/pyproject.toml +1 -1
  7. {docling-2.13.0 → docling-2.14.0}/LICENSE +0 -0
  8. {docling-2.13.0 → docling-2.14.0}/README.md +0 -0
  9. {docling-2.13.0 → docling-2.14.0}/docling/__init__.py +0 -0
  10. {docling-2.13.0 → docling-2.14.0}/docling/backend/__init__.py +0 -0
  11. {docling-2.13.0 → docling-2.14.0}/docling/backend/abstract_backend.py +0 -0
  12. {docling-2.13.0 → docling-2.14.0}/docling/backend/asciidoc_backend.py +0 -0
  13. {docling-2.13.0 → docling-2.14.0}/docling/backend/docling_parse_backend.py +0 -0
  14. {docling-2.13.0 → docling-2.14.0}/docling/backend/docling_parse_v2_backend.py +0 -0
  15. {docling-2.13.0 → docling-2.14.0}/docling/backend/html_backend.py +0 -0
  16. {docling-2.13.0 → docling-2.14.0}/docling/backend/md_backend.py +0 -0
  17. {docling-2.13.0 → docling-2.14.0}/docling/backend/msexcel_backend.py +0 -0
  18. {docling-2.13.0 → docling-2.14.0}/docling/backend/mspowerpoint_backend.py +0 -0
  19. {docling-2.13.0 → docling-2.14.0}/docling/backend/msword_backend.py +0 -0
  20. {docling-2.13.0 → docling-2.14.0}/docling/backend/pdf_backend.py +0 -0
  21. {docling-2.13.0 → docling-2.14.0}/docling/backend/pypdfium2_backend.py +0 -0
  22. {docling-2.13.0 → docling-2.14.0}/docling/backend/xml/__init__.py +0 -0
  23. {docling-2.13.0 → docling-2.14.0}/docling/backend/xml/uspto_backend.py +0 -0
  24. {docling-2.13.0 → docling-2.14.0}/docling/chunking/__init__.py +0 -0
  25. {docling-2.13.0 → docling-2.14.0}/docling/cli/__init__.py +0 -0
  26. {docling-2.13.0 → docling-2.14.0}/docling/cli/main.py +0 -0
  27. {docling-2.13.0 → docling-2.14.0}/docling/datamodel/__init__.py +0 -0
  28. {docling-2.13.0 → docling-2.14.0}/docling/datamodel/pipeline_options.py +0 -0
  29. {docling-2.13.0 → docling-2.14.0}/docling/datamodel/settings.py +0 -0
  30. {docling-2.13.0 → docling-2.14.0}/docling/exceptions.py +0 -0
  31. {docling-2.13.0 → docling-2.14.0}/docling/models/__init__.py +0 -0
  32. {docling-2.13.0 → docling-2.14.0}/docling/models/base_model.py +0 -0
  33. {docling-2.13.0 → docling-2.14.0}/docling/models/base_ocr_model.py +0 -0
  34. {docling-2.13.0 → docling-2.14.0}/docling/models/ds_glm_model.py +0 -0
  35. {docling-2.13.0 → docling-2.14.0}/docling/models/easyocr_model.py +0 -0
  36. {docling-2.13.0 → docling-2.14.0}/docling/models/layout_model.py +0 -0
  37. {docling-2.13.0 → docling-2.14.0}/docling/models/ocr_mac_model.py +0 -0
  38. {docling-2.13.0 → docling-2.14.0}/docling/models/page_assemble_model.py +0 -0
  39. {docling-2.13.0 → docling-2.14.0}/docling/models/page_preprocessing_model.py +0 -0
  40. {docling-2.13.0 → docling-2.14.0}/docling/models/rapid_ocr_model.py +0 -0
  41. {docling-2.13.0 → docling-2.14.0}/docling/models/table_structure_model.py +0 -0
  42. {docling-2.13.0 → docling-2.14.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
  43. {docling-2.13.0 → docling-2.14.0}/docling/models/tesseract_ocr_model.py +0 -0
  44. {docling-2.13.0 → docling-2.14.0}/docling/pipeline/__init__.py +0 -0
  45. {docling-2.13.0 → docling-2.14.0}/docling/pipeline/base_pipeline.py +0 -0
  46. {docling-2.13.0 → docling-2.14.0}/docling/pipeline/simple_pipeline.py +0 -0
  47. {docling-2.13.0 → docling-2.14.0}/docling/pipeline/standard_pdf_pipeline.py +0 -0
  48. {docling-2.13.0 → docling-2.14.0}/docling/py.typed +0 -0
  49. {docling-2.13.0 → docling-2.14.0}/docling/utils/__init__.py +0 -0
  50. {docling-2.13.0 → docling-2.14.0}/docling/utils/accelerator_utils.py +0 -0
  51. {docling-2.13.0 → docling-2.14.0}/docling/utils/export.py +0 -0
  52. {docling-2.13.0 → docling-2.14.0}/docling/utils/glm_utils.py +0 -0
  53. {docling-2.13.0 → docling-2.14.0}/docling/utils/layout_postprocessor.py +0 -0
  54. {docling-2.13.0 → docling-2.14.0}/docling/utils/profiling.py +0 -0
  55. {docling-2.13.0 → docling-2.14.0}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.13.0
3
+ Version: 2.14.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -0,0 +1,592 @@
1
+ import logging
2
+ from io import BytesIO
3
+ from pathlib import Path
4
+ from typing import Any, Set, Union
5
+
6
+ import lxml
7
+ from bs4 import BeautifulSoup
8
+ from docling_core.types.doc import (
9
+ DocItemLabel,
10
+ DoclingDocument,
11
+ DocumentOrigin,
12
+ GroupLabel,
13
+ TableCell,
14
+ TableData,
15
+ )
16
+ from lxml import etree
17
+ from typing_extensions import TypedDict, override
18
+
19
+ from docling.backend.abstract_backend import DeclarativeDocumentBackend
20
+ from docling.datamodel.base_models import InputFormat
21
+ from docling.datamodel.document import InputDocument
22
+
23
+ _log = logging.getLogger(__name__)
24
+
25
+
26
class Paragraph(TypedDict):
    """A body paragraph together with the titles of its enclosing sections."""

    # Concatenated text content of the paragraph node (newlines stripped).
    text: str
    # Section titles collected by walking "../title" upward: innermost first.
    headers: list[str]
29
+
30
+
31
class Author(TypedDict):
    """An article author and the affiliation names resolved for them."""

    # "<surname> <given-names>" as read from the contrib node.
    name: str
    # Affiliation strings matched via the author's <xref ref-type="aff"> rids.
    affiliation_names: list[str]
34
+
35
+
36
class Table(TypedDict):
    """A table extracted from a <table-wrap> element in the article body."""

    # Text of the <label> element (e.g. the table number), if any.
    label: str
    # Caption text from <caption>/p or <caption>/title (newlines stripped).
    caption: str
    # Raw XML serialization of the inner <table> element.
    content: str
40
+
41
+
42
class FigureCaption(TypedDict):
    """Label and caption text of a <fig> element."""

    # Text of the figure's <label> element, if any.
    label: str
    # Caption text, one line per child node of <caption>.
    caption: str
45
+
46
+
47
class Reference(TypedDict):
    """A bibliographic reference parsed from the article's <ref-list>."""

    # Author names joined with "; ".
    author_names: str
    # Title of the referenced work (<article-title>).
    title: str
    # Journal name (the citation's <source> element).
    journal: str
    # Publication year as text (<year> element).
    year: str
52
+
53
+
54
class XMLComponents(TypedDict):
    """All components parsed from a PubMed (JATS) XML article."""

    title: str
    authors: list[Author]
    abstract: str
    paragraphs: list[Paragraph]
    tables: list[Table]
    figure_captions: list[FigureCaption]
    references: list[Reference]
62
+
63
+
64
class PubMedDocumentBackend(DeclarativeDocumentBackend):
    """Declarative backend converting PubMed (JATS) XML articles to DoclingDocument.

    The code from this document backend has been developed by modifying parts of
    the PubMed Parser library (version 0.5.0, released on 12.08.2024):
    Achakulvisut et al., (2020).
    Pubmed Parser: A Python Parser for PubMed Open-Access XML Subset and MEDLINE XML Dataset.
    Journal of Open Source Software, 5(46), 1979,
    https://doi.org/10.21105/joss.01979
    """

    @override
    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        """Parse the XML source and check that it is a JATS document.

        Raises:
            RuntimeError: if the source cannot be parsed at all.
        """
        super().__init__(in_doc, path_or_stream)
        self.path_or_stream = path_or_stream

        # Initialize parents for the document hierarchy; keys are heading texts.
        self.parents: dict = {}

        self.valid = False
        try:
            if isinstance(self.path_or_stream, BytesIO):
                self.path_or_stream.seek(0)
            self.tree: lxml.etree._ElementTree = etree.parse(self.path_or_stream)
            # Only JATS documents, identified via the DTD public id, are valid.
            if "/NLM//DTD JATS" in self.tree.docinfo.public_id:
                self.valid = True
        except Exception as exc:
            raise RuntimeError(
                f"Could not initialize PubMed backend for file with hash {self.document_hash}."
            ) from exc

    @override
    def is_valid(self) -> bool:
        """Return True if the source parsed as a JATS document."""
        return self.valid

    @classmethod
    @override
    def supports_pagination(cls) -> bool:
        """PubMed XML has no page structure."""
        return False

    @override
    def unload(self):
        """Release the underlying stream, if any."""
        if isinstance(self.path_or_stream, BytesIO):
            self.path_or_stream.close()
        self.path_or_stream = None

    @classmethod
    @override
    def supported_formats(cls) -> Set[InputFormat]:
        """This backend handles only the PubMed XML input format."""
        return {InputFormat.XML_PUBMED}

    @override
    def convert(self) -> DoclingDocument:
        """Convert the parsed XML tree into a DoclingDocument."""
        # Create empty document
        origin = DocumentOrigin(
            filename=self.file.name or "file",
            mimetype="application/xml",
            binary_hash=self.document_hash,
        )
        doc = DoclingDocument(name=self.file.stem or "file", origin=origin)

        _log.debug("Trying to convert PubMed XML document...")

        # Get parsed XML components
        xml_components: XMLComponents = self._parse()

        # Add XML components to the document
        doc = self._populate_document(doc, xml_components)
        return doc

    def _parse_title(self) -> str:
        """Return the article title, flattened to a single line."""
        title: str = " ".join(
            [
                t.replace("\n", "")
                for t in self.tree.xpath(".//title-group/article-title")[0].itertext()
            ]
        )
        return title

    def _parse_authors(self) -> list[Author]:
        """Return the article authors with their resolved affiliation names."""
        # Get mapping between affiliation ids and names
        affiliation_names = []
        for affiliation_node in self.tree.xpath(".//aff[@id]"):
            affiliation_names.append(
                ": ".join([t for t in affiliation_node.itertext() if t != "\n"])
            )
        affiliation_ids_names = {
            aff_id: name
            for aff_id, name in zip(
                self.tree.xpath(".//aff[@id]/@id"), affiliation_names
            )
        }

        # Get author names and affiliation names
        authors: list[Author] = []
        for author_node in self.tree.xpath(
            './/contrib-group/contrib[@contrib-type="author"]'
        ):
            author: Author = {
                "name": "",
                "affiliation_names": [],
            }

            # Affiliation names: resolve each <xref ref-type="aff"> rid.
            affiliation_ids = [
                a.attrib["rid"] for a in author_node.xpath('xref[@ref-type="aff"]')
            ]
            for aff_id in affiliation_ids:
                if aff_id in affiliation_ids_names:
                    author["affiliation_names"].append(affiliation_ids_names[aff_id])

            # Name
            author["name"] = (
                author_node.xpath("name/surname")[0].text
                + " "
                + author_node.xpath("name/given-names")[0].text
            )

            authors.append(author)
        return authors

    def _parse_abstract(self) -> str:
        """Return the concatenated text of all <abstract> elements."""
        texts = []
        for abstract_node in self.tree.xpath(".//abstract"):
            for text in abstract_node.itertext():
                texts.append(text.replace("\n", ""))
        abstract: str = "".join(texts)
        return abstract

    def _parse_main_text(self) -> list[Paragraph]:
        """Return all body paragraphs together with their section headers."""
        paragraphs: list[Paragraph] = []
        for paragraph_node in self.tree.xpath("//body//p"):
            # Skip captions: those are handled by the table/figure parsers.
            if "/caption" in paragraph_node.getroottree().getpath(paragraph_node):
                continue

            paragraph: Paragraph = {"text": "", "headers": []}

            # Text
            paragraph["text"] = "".join(
                [t.replace("\n", "") for t in paragraph_node.itertext()]
            )

            # Header: collect the titles of all enclosing sections by walking
            # "../title", "../../title", ... — innermost section first.
            path = "../title"
            while len(paragraph_node.xpath(path)) > 0:
                paragraph["headers"].append(
                    "".join(
                        [
                            t.replace("\n", "")
                            for t in paragraph_node.xpath(path)[0].itertext()
                        ]
                    )
                )
                path = "../" + path

            paragraphs.append(paragraph)

        return paragraphs

    def _parse_tables(self) -> list[Table]:
        """Return label, caption, and raw XML content for each <table-wrap>."""
        tables: list[Table] = []
        for table_node in self.tree.xpath(".//body//table-wrap"):
            table: Table = {"label": "", "caption": "", "content": ""}

            # Content: either a direct <table> child or one under <alternatives>.
            if len(table_node.xpath("table")) > 0:
                table_content_node = table_node.xpath("table")[0]
            elif len(table_node.xpath("alternatives/table")) > 0:
                table_content_node = table_node.xpath("alternatives/table")[0]
            else:
                table_content_node = None
            if table_content_node is not None:
                table["content"] = etree.tostring(table_content_node).decode("utf-8")

            # Caption: prefer <caption>/p over <caption>/title.
            if len(table_node.xpath("caption/p")) > 0:
                caption_node = table_node.xpath("caption/p")[0]
            elif len(table_node.xpath("caption/title")) > 0:
                caption_node = table_node.xpath("caption/title")[0]
            else:
                caption_node = None
            if caption_node is not None:
                table["caption"] = "".join(
                    [t.replace("\n", "") for t in caption_node.itertext()]
                )

            # Label
            if len(table_node.xpath("label")) > 0:
                table["label"] = table_node.xpath("label")[0].text

            tables.append(table)
        return tables

    def _parse_figure_captions(self) -> list[FigureCaption]:
        """Return label and caption text for each <fig> element."""
        figure_captions: list[FigureCaption] = []

        if not (self.tree.xpath(".//fig")):
            return figure_captions

        for figure_node in self.tree.xpath(".//fig"):
            figure_caption: FigureCaption = {
                "caption": "",
                "label": "",
            }

            # Label
            if figure_node.xpath("label"):
                figure_caption["label"] = "".join(
                    [
                        t.replace("\n", "")
                        for t in figure_node.xpath("label")[0].itertext()
                    ]
                )

            # Caption: one output line per child of <caption>.
            if figure_node.xpath("caption"):
                caption = ""
                for caption_node in figure_node.xpath("caption")[0].getchildren():
                    caption += (
                        "".join([t.replace("\n", "") for t in caption_node.itertext()])
                        + "\n"
                    )
                figure_caption["caption"] = caption

            figure_captions.append(figure_caption)

        return figure_captions

    def _parse_references(self) -> list[Reference]:
        """Return structured references from the article's <ref-list>."""
        references: list[Reference] = []
        for reference_node_abs in self.tree.xpath(".//ref-list/ref"):
            reference: Reference = {
                "author_names": "",
                "title": "",
                "journal": "",
                "year": "",
            }
            # Find the citation element, trying the known tag variants in order.
            reference_node: Any = None
            for tag in ["mixed-citation", "element-citation", "citation"]:
                if len(reference_node_abs.xpath(tag)) > 0:
                    reference_node = reference_node_abs.xpath(tag)[0]
                    break

            if reference_node is None:
                continue

            # Skip citations that declare no citation/publication type attribute.
            if not any(
                ref_type in ["citation-type", "publication-type"]
                for ref_type in reference_node.attrib.keys()
            ):
                continue

            # Author names
            names = []
            if len(reference_node.xpath("name")) > 0:
                for name_node in reference_node.xpath("name"):
                    name_str = " ".join(
                        [t.text for t in name_node.getchildren() if t.text is not None]
                    )
                    names.append(name_str)
            elif len(reference_node.xpath("person-group")) > 0:
                for name_node in reference_node.xpath("person-group")[0]:
                    name_str = (
                        name_node.xpath("given-names")[0].text
                        + " "
                        + name_node.xpath("surname")[0].text
                    )
                    names.append(name_str)
            reference["author_names"] = "; ".join(names)

            # Title
            if len(reference_node.xpath("article-title")) > 0:
                reference["title"] = " ".join(
                    [
                        t.replace("\n", " ")
                        for t in reference_node.xpath("article-title")[0].itertext()
                    ]
                )

            # Journal
            if len(reference_node.xpath("source")) > 0:
                reference["journal"] = reference_node.xpath("source")[0].text

            # Year
            if len(reference_node.xpath("year")) > 0:
                reference["year"] = reference_node.xpath("year")[0].text

            # NOTE(review): "journal" here looks inconsistent with the "source"
            # tag used above — confirm whether this fallback ever triggers.
            if (
                not (reference_node.xpath("article-title"))
                and not (reference_node.xpath("journal"))
                and not (reference_node.xpath("year"))
            ):
                reference["title"] = reference_node.text

            references.append(reference)
        return references

    def _parse(self) -> XMLComponents:
        """Parsing PubMed document."""
        xml_components: XMLComponents = {
            "title": self._parse_title(),
            "authors": self._parse_authors(),
            "abstract": self._parse_abstract(),
            "paragraphs": self._parse_main_text(),
            "tables": self._parse_tables(),
            "figure_captions": self._parse_figure_captions(),
            "references": self._parse_references(),
        }
        return xml_components

    def _populate_document(
        self, doc: DoclingDocument, xml_components: XMLComponents
    ) -> DoclingDocument:
        """Add all parsed components to the document, in reading order."""
        self._add_title(doc, xml_components)
        self._add_authors(doc, xml_components)
        self._add_abstract(doc, xml_components)
        self._add_main_text(doc, xml_components)

        if xml_components["tables"]:
            self._add_tables(doc, xml_components)

        if xml_components["figure_captions"]:
            self._add_figure_captions(doc, xml_components)

        self._add_references(doc, xml_components)
        return doc

    def _add_figure_captions(
        self, doc: DoclingDocument, xml_components: XMLComponents
    ) -> None:
        """Add a "Figures" section with one captioned picture per figure."""
        self.parents["Figures"] = doc.add_heading(
            parent=self.parents["Title"], text="Figures"
        )
        for figure_caption_xml_component in xml_components["figure_captions"]:
            figure_caption_text = (
                figure_caption_xml_component["label"]
                + ": "
                + figure_caption_xml_component["caption"].strip()
            )
            fig_caption = doc.add_text(
                label=DocItemLabel.CAPTION, text=figure_caption_text
            )
            doc.add_picture(
                parent=self.parents["Figures"],
                caption=fig_caption,
            )
        return

    def _add_title(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:
        """Add the article title; it becomes the root of the hierarchy."""
        self.parents["Title"] = doc.add_text(
            parent=None,
            text=xml_components["title"],
            label=DocItemLabel.TITLE,
        )
        return

    def _add_authors(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:
        """Add a paragraph listing authors and their affiliations."""
        authors_affiliations: list = []
        for author in xml_components["authors"]:
            authors_affiliations.append(author["name"])
            authors_affiliations.append(", ".join(author["affiliation_names"]))
        authors_affiliations_str = "; ".join(authors_affiliations)

        doc.add_text(
            parent=self.parents["Title"],
            text=authors_affiliations_str,
            label=DocItemLabel.PARAGRAPH,
        )
        return

    def _add_abstract(
        self, doc: DoclingDocument, xml_components: XMLComponents
    ) -> None:
        """Add an "Abstract" heading followed by the abstract text."""
        abstract_text: str = xml_components["abstract"]
        self.parents["Abstract"] = doc.add_heading(
            parent=self.parents["Title"], text="Abstract"
        )
        doc.add_text(
            parent=self.parents["Abstract"],
            text=abstract_text,
            label=DocItemLabel.TEXT,
        )
        return

    def _add_main_text(
        self, doc: DoclingDocument, xml_components: XMLComponents
    ) -> None:
        """Add body paragraphs under their section-heading hierarchy.

        Paragraphs without any enclosing section title are skipped.
        """
        added_headers: list = []
        for paragraph in xml_components["paragraphs"]:
            if not (paragraph["headers"]):
                continue

            # Headers, outermost first; each new heading is attached to the
            # previously created heading one level up (or to the title).
            headers_outermost_first = list(reversed(paragraph["headers"]))
            for i, header in enumerate(headers_outermost_first):
                if header in added_headers:
                    continue
                added_headers.append(header)

                if ((i - 1) >= 0) and headers_outermost_first[i - 1] in self.parents:
                    parent = self.parents[headers_outermost_first[i - 1]]
                else:
                    parent = self.parents["Title"]

                self.parents[header] = doc.add_heading(parent=parent, text=header)

            # Paragraph text goes under its innermost section heading.
            if paragraph["headers"][0] in self.parents:
                parent = self.parents[paragraph["headers"][0]]
            else:
                parent = self.parents["Title"]

            doc.add_text(parent=parent, label=DocItemLabel.TEXT, text=paragraph["text"])
        return

    def _add_references(
        self, doc: DoclingDocument, xml_components: XMLComponents
    ) -> None:
        """Add a "References" section with one list item per reference."""
        self.parents["References"] = doc.add_heading(
            parent=self.parents["Title"], text="References"
        )
        current_list = doc.add_group(
            parent=self.parents["References"], label=GroupLabel.LIST, name="list"
        )
        for reference in xml_components["references"]:
            reference_text: str = ""
            if reference["author_names"]:
                reference_text += reference["author_names"] + ". "

            if reference["title"]:
                reference_text += reference["title"]
                if reference["title"][-1] != ".":
                    reference_text += "."
                reference_text += " "

            if reference["journal"]:
                reference_text += reference["journal"]

            if reference["year"]:
                reference_text += " (" + reference["year"] + ")"

            if not (reference_text):
                _log.debug(f"Skipping reference for: {str(self.file)}")
                continue

            doc.add_list_item(
                text=reference_text, enumerated=False, parent=current_list
            )
        return

    def _add_tables(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:
        """Add a "Tables" section; tables that fail to parse are skipped."""
        self.parents["Tables"] = doc.add_heading(
            parent=self.parents["Title"], text="Tables"
        )
        for table_xml_component in xml_components["tables"]:
            try:
                self._add_table(doc, table_xml_component)
            except Exception:
                _log.debug(f"Skipping unsupported table for: {str(self.file)}")
        return

    def _add_table(self, doc: DoclingDocument, table_xml_component: Table) -> None:
        """Convert one HTML-like table into TableData and add it to the document.

        Raises on malformed tables (e.g. spans outside the grid); the caller
        catches and skips those.
        """
        soup = BeautifulSoup(table_xml_component["content"], "html.parser")
        table_tag = soup.find("table")

        # Nested tables are not supported by the grid model below.
        nested_tables = table_tag.find("table")
        if nested_tables:
            _log.debug(f"Skipping nested table for: {str(self.file)}")
            return

        # Count the number of rows (number of <tr> elements)
        num_rows = len(table_tag.find_all("tr"))

        # Find the number of columns (taking into account colspan)
        num_cols = 0
        for row in table_tag.find_all("tr"):
            col_count = 0
            for cell in row.find_all(["td", "th"]):
                colspan = int(cell.get("colspan", 1))
                col_count += colspan
            num_cols = max(num_cols, col_count)

        # Occupancy grid used to place cells around row/col spans.
        grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]

        data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])

        # Iterate over the rows in the table
        for row_idx, row in enumerate(table_tag.find_all("tr")):
            # For each row, find all the column cells (both <td> and <th>)
            cells = row.find_all(["td", "th"])

            # A row made entirely of <th> cells is treated as a column header.
            col_header = True
            for html_cell in cells:
                if html_cell.name == "td":
                    col_header = False

            # Extract the text content of each cell and place it in the grid.
            col_idx = 0
            for html_cell in cells:
                text = html_cell.text

                col_span = int(html_cell.get("colspan", 1))
                row_span = int(html_cell.get("rowspan", 1))

                # Advance past positions already claimed by spanning cells.
                while grid[row_idx][col_idx] is not None:
                    col_idx += 1
                for r in range(row_span):
                    for c in range(col_span):
                        grid[row_idx + r][col_idx + c] = text

                cell = TableCell(
                    text=text,
                    row_span=row_span,
                    col_span=col_span,
                    start_row_offset_idx=row_idx,
                    end_row_offset_idx=row_idx + row_span,
                    start_col_offset_idx=col_idx,
                    end_col_offset_idx=col_idx + col_span,
                    col_header=col_header,
                    row_header=((not col_header) and html_cell.name == "th"),
                )
                data.table_cells.append(cell)

        table_caption = doc.add_text(
            label=DocItemLabel.CAPTION,
            text=table_xml_component["label"] + ": " + table_xml_component["caption"],
        )
        doc.add_table(data=data, parent=self.parents["Tables"], caption=table_caption)
        return
@@ -33,6 +33,7 @@ class InputFormat(str, Enum):
33
33
  DOCX = "docx"
34
34
  PPTX = "pptx"
35
35
  HTML = "html"
36
+ XML_PUBMED = "xml_pubmed"
36
37
  IMAGE = "image"
37
38
  PDF = "pdf"
38
39
  ASCIIDOC = "asciidoc"
@@ -55,6 +56,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
55
56
  InputFormat.PDF: ["pdf"],
56
57
  InputFormat.MD: ["md"],
57
58
  InputFormat.HTML: ["html", "htm", "xhtml"],
59
+ InputFormat.XML_PUBMED: ["xml", "nxml"],
58
60
  InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
59
61
  InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
60
62
  InputFormat.XLSX: ["xlsx"],
@@ -72,6 +74,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
72
74
  "application/vnd.openxmlformats-officedocument.presentationml.presentation",
73
75
  ],
74
76
  InputFormat.HTML: ["text/html", "application/xhtml+xml"],
77
+ InputFormat.XML_PUBMED: ["application/xml"],
75
78
  InputFormat.IMAGE: [
76
79
  "image/png",
77
80
  "image/jpeg",
@@ -292,8 +292,7 @@ class _DocumentConversionInput(BaseModel):
292
292
  mime = mime or "text/plain"
293
293
  formats = MimeTypeToFormat.get(mime, [])
294
294
  if formats:
295
- # TODO: remove application/xml case after adding another XML parse
296
- if len(formats) == 1 and mime not in ("text/plain", "application/xml"):
295
+ if len(formats) == 1 and mime not in ("text/plain"):
297
296
  return formats[0]
298
297
  else: # ambiguity in formats
299
298
  return _DocumentConversionInput._guess_from_content(
@@ -325,6 +324,12 @@ class _DocumentConversionInput(BaseModel):
325
324
  ):
326
325
  input_format = InputFormat.XML_USPTO
327
326
 
327
+ if (
328
+ InputFormat.XML_PUBMED in formats
329
+ and "/NLM//DTD JATS" in xml_doctype
330
+ ):
331
+ input_format = InputFormat.XML_PUBMED
332
+
328
333
  elif mime == "text/plain":
329
334
  if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
330
335
  input_format = InputFormat.XML_USPTO
@@ -340,7 +345,6 @@ class _DocumentConversionInput(BaseModel):
340
345
  mime = FormatToMimeType[InputFormat.HTML][0]
341
346
  elif ext in FormatToExtensions[InputFormat.MD]:
342
347
  mime = FormatToMimeType[InputFormat.MD][0]
343
-
344
348
  return mime
345
349
 
346
350
  @staticmethod
@@ -370,4 +374,10 @@ class _DocumentConversionInput(BaseModel):
370
374
  if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
371
375
  return "text/html"
372
376
 
377
+ p = re.compile(
378
+ r"<!doctype\s+(?P<root>[a-zA-Z_:][a-zA-Z0-9_:.-]*)\s+.*>\s*<(?P=root)\b"
379
+ )
380
+ if p.search(content_str):
381
+ return "application/xml"
382
+
373
383
  return None
@@ -15,6 +15,7 @@ from docling.backend.md_backend import MarkdownDocumentBackend
15
15
  from docling.backend.msexcel_backend import MsExcelDocumentBackend
16
16
  from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
17
17
  from docling.backend.msword_backend import MsWordDocumentBackend
18
+ from docling.backend.xml.pubmed_backend import PubMedDocumentBackend
18
19
  from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
19
20
  from docling.datamodel.base_models import (
20
21
  ConversionStatus,
@@ -88,6 +89,11 @@ class PatentUsptoFormatOption(FormatOption):
88
89
  backend: Type[PatentUsptoDocumentBackend] = PatentUsptoDocumentBackend
89
90
 
90
91
 
92
+ class XMLPubMedFormatOption(FormatOption):
93
+ pipeline_cls: Type = SimplePipeline
94
+ backend: Type[AbstractDocumentBackend] = PubMedDocumentBackend
95
+
96
+
91
97
  class ImageFormatOption(FormatOption):
92
98
  pipeline_cls: Type = StandardPdfPipeline
93
99
  backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
@@ -121,6 +127,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
121
127
  InputFormat.XML_USPTO: FormatOption(
122
128
  pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend
123
129
  ),
130
+ InputFormat.XML_PUBMED: FormatOption(
131
+ pipeline_cls=SimplePipeline, backend=PubMedDocumentBackend
132
+ ),
124
133
  InputFormat.IMAGE: FormatOption(
125
134
  pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
126
135
  ),
@@ -171,7 +180,6 @@ class DocumentConverter:
171
180
  max_num_pages: int = sys.maxsize,
172
181
  max_file_size: int = sys.maxsize,
173
182
  ) -> ConversionResult:
174
-
175
183
  all_res = self.convert_all(
176
184
  source=[source],
177
185
  raises_on_error=raises_on_error,
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling"
3
- version = "2.13.0" # DO NOT EDIT, updated automatically
3
+ version = "2.14.0" # DO NOT EDIT, updated automatically
4
4
  description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
5
5
  authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
6
6
  license = "MIT"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes