docling 2.69.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling might be problematic. Click here for more details.

Files changed (138) hide show
  1. docling/__init__.py +0 -0
  2. docling/backend/__init__.py +0 -0
  3. docling/backend/abstract_backend.py +84 -0
  4. docling/backend/asciidoc_backend.py +443 -0
  5. docling/backend/csv_backend.py +125 -0
  6. docling/backend/docling_parse_backend.py +237 -0
  7. docling/backend/docling_parse_v2_backend.py +276 -0
  8. docling/backend/docling_parse_v4_backend.py +260 -0
  9. docling/backend/docx/__init__.py +0 -0
  10. docling/backend/docx/drawingml/utils.py +131 -0
  11. docling/backend/docx/latex/__init__.py +0 -0
  12. docling/backend/docx/latex/latex_dict.py +274 -0
  13. docling/backend/docx/latex/omml.py +459 -0
  14. docling/backend/html_backend.py +1502 -0
  15. docling/backend/image_backend.py +188 -0
  16. docling/backend/json/__init__.py +0 -0
  17. docling/backend/json/docling_json_backend.py +58 -0
  18. docling/backend/md_backend.py +618 -0
  19. docling/backend/mets_gbs_backend.py +399 -0
  20. docling/backend/msexcel_backend.py +686 -0
  21. docling/backend/mspowerpoint_backend.py +398 -0
  22. docling/backend/msword_backend.py +1663 -0
  23. docling/backend/noop_backend.py +51 -0
  24. docling/backend/pdf_backend.py +82 -0
  25. docling/backend/pypdfium2_backend.py +417 -0
  26. docling/backend/webvtt_backend.py +572 -0
  27. docling/backend/xml/__init__.py +0 -0
  28. docling/backend/xml/jats_backend.py +819 -0
  29. docling/backend/xml/uspto_backend.py +1905 -0
  30. docling/chunking/__init__.py +12 -0
  31. docling/cli/__init__.py +0 -0
  32. docling/cli/main.py +974 -0
  33. docling/cli/models.py +196 -0
  34. docling/cli/tools.py +17 -0
  35. docling/datamodel/__init__.py +0 -0
  36. docling/datamodel/accelerator_options.py +69 -0
  37. docling/datamodel/asr_model_specs.py +494 -0
  38. docling/datamodel/backend_options.py +102 -0
  39. docling/datamodel/base_models.py +493 -0
  40. docling/datamodel/document.py +699 -0
  41. docling/datamodel/extraction.py +39 -0
  42. docling/datamodel/layout_model_specs.py +91 -0
  43. docling/datamodel/pipeline_options.py +457 -0
  44. docling/datamodel/pipeline_options_asr_model.py +78 -0
  45. docling/datamodel/pipeline_options_vlm_model.py +136 -0
  46. docling/datamodel/settings.py +65 -0
  47. docling/datamodel/vlm_model_specs.py +365 -0
  48. docling/document_converter.py +559 -0
  49. docling/document_extractor.py +327 -0
  50. docling/exceptions.py +10 -0
  51. docling/experimental/__init__.py +5 -0
  52. docling/experimental/datamodel/__init__.py +1 -0
  53. docling/experimental/datamodel/table_crops_layout_options.py +13 -0
  54. docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
  55. docling/experimental/models/__init__.py +3 -0
  56. docling/experimental/models/table_crops_layout_model.py +114 -0
  57. docling/experimental/pipeline/__init__.py +1 -0
  58. docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
  59. docling/models/__init__.py +0 -0
  60. docling/models/base_layout_model.py +39 -0
  61. docling/models/base_model.py +230 -0
  62. docling/models/base_ocr_model.py +241 -0
  63. docling/models/base_table_model.py +45 -0
  64. docling/models/extraction/__init__.py +0 -0
  65. docling/models/extraction/nuextract_transformers_model.py +305 -0
  66. docling/models/factories/__init__.py +47 -0
  67. docling/models/factories/base_factory.py +122 -0
  68. docling/models/factories/layout_factory.py +7 -0
  69. docling/models/factories/ocr_factory.py +11 -0
  70. docling/models/factories/picture_description_factory.py +11 -0
  71. docling/models/factories/table_factory.py +7 -0
  72. docling/models/picture_description_base_model.py +149 -0
  73. docling/models/plugins/__init__.py +0 -0
  74. docling/models/plugins/defaults.py +60 -0
  75. docling/models/stages/__init__.py +0 -0
  76. docling/models/stages/code_formula/__init__.py +0 -0
  77. docling/models/stages/code_formula/code_formula_model.py +342 -0
  78. docling/models/stages/layout/__init__.py +0 -0
  79. docling/models/stages/layout/layout_model.py +249 -0
  80. docling/models/stages/ocr/__init__.py +0 -0
  81. docling/models/stages/ocr/auto_ocr_model.py +132 -0
  82. docling/models/stages/ocr/easyocr_model.py +200 -0
  83. docling/models/stages/ocr/ocr_mac_model.py +145 -0
  84. docling/models/stages/ocr/rapid_ocr_model.py +328 -0
  85. docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
  86. docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
  87. docling/models/stages/page_assemble/__init__.py +0 -0
  88. docling/models/stages/page_assemble/page_assemble_model.py +156 -0
  89. docling/models/stages/page_preprocessing/__init__.py +0 -0
  90. docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
  91. docling/models/stages/picture_classifier/__init__.py +0 -0
  92. docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
  93. docling/models/stages/picture_description/__init__.py +0 -0
  94. docling/models/stages/picture_description/picture_description_api_model.py +66 -0
  95. docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
  96. docling/models/stages/reading_order/__init__.py +0 -0
  97. docling/models/stages/reading_order/readingorder_model.py +431 -0
  98. docling/models/stages/table_structure/__init__.py +0 -0
  99. docling/models/stages/table_structure/table_structure_model.py +305 -0
  100. docling/models/utils/__init__.py +0 -0
  101. docling/models/utils/generation_utils.py +157 -0
  102. docling/models/utils/hf_model_download.py +45 -0
  103. docling/models/vlm_pipeline_models/__init__.py +1 -0
  104. docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
  105. docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
  106. docling/models/vlm_pipeline_models/mlx_model.py +325 -0
  107. docling/models/vlm_pipeline_models/vllm_model.py +344 -0
  108. docling/pipeline/__init__.py +0 -0
  109. docling/pipeline/asr_pipeline.py +431 -0
  110. docling/pipeline/base_extraction_pipeline.py +72 -0
  111. docling/pipeline/base_pipeline.py +326 -0
  112. docling/pipeline/extraction_vlm_pipeline.py +207 -0
  113. docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
  114. docling/pipeline/simple_pipeline.py +55 -0
  115. docling/pipeline/standard_pdf_pipeline.py +859 -0
  116. docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
  117. docling/pipeline/vlm_pipeline.py +416 -0
  118. docling/py.typed +1 -0
  119. docling/utils/__init__.py +0 -0
  120. docling/utils/accelerator_utils.py +97 -0
  121. docling/utils/api_image_request.py +205 -0
  122. docling/utils/deepseekocr_utils.py +388 -0
  123. docling/utils/export.py +146 -0
  124. docling/utils/glm_utils.py +361 -0
  125. docling/utils/layout_postprocessor.py +683 -0
  126. docling/utils/locks.py +3 -0
  127. docling/utils/model_downloader.py +168 -0
  128. docling/utils/ocr_utils.py +69 -0
  129. docling/utils/orientation.py +65 -0
  130. docling/utils/profiling.py +65 -0
  131. docling/utils/utils.py +65 -0
  132. docling/utils/visualization.py +85 -0
  133. docling-2.69.0.dist-info/METADATA +237 -0
  134. docling-2.69.0.dist-info/RECORD +138 -0
  135. docling-2.69.0.dist-info/WHEEL +5 -0
  136. docling-2.69.0.dist-info/entry_points.txt +6 -0
  137. docling-2.69.0.dist-info/licenses/LICENSE +21 -0
  138. docling-2.69.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,819 @@
1
+ import logging
2
+ import traceback
3
+ from io import BytesIO
4
+ from pathlib import Path
5
+ from typing import Final, Optional, Union, cast
6
+
7
+ from bs4 import BeautifulSoup, NavigableString, Tag
8
+ from docling_core.types.doc import (
9
+ DocItemLabel,
10
+ DoclingDocument,
11
+ DocumentOrigin,
12
+ GroupItem,
13
+ GroupLabel,
14
+ NodeItem,
15
+ TableCell,
16
+ TableData,
17
+ TextItem,
18
+ )
19
+ from lxml import etree
20
+ from typing_extensions import TypedDict, override
21
+
22
+ from docling.backend.abstract_backend import DeclarativeDocumentBackend
23
+ from docling.backend.html_backend import HTMLDocumentBackend
24
+ from docling.datamodel.base_models import InputFormat
25
+ from docling.datamodel.document import InputDocument
26
+
27
+ _log = logging.getLogger(__name__)
28
+
29
+ JATS_DTD_URL: Final = ["JATS-journalpublishing", "JATS-archive"]
30
+ DEFAULT_HEADER_ACKNOWLEDGMENTS: Final = "Acknowledgments"
31
+ DEFAULT_HEADER_ABSTRACT: Final = "Abstract"
32
+ DEFAULT_HEADER_REFERENCES: Final = "References"
33
+ DEFAULT_TEXT_ETAL: Final = "et al."
34
+
35
+
36
class Abstract(TypedDict):
    """Heading label and flattened paragraph text of one ``<abstract>`` element."""

    # Heading text of the abstract ("" when the XML has no title/label)
    label: str
    # Text of the abstract paragraphs joined into one string
    content: str
39
+
40
+
41
class Author(TypedDict):
    """Display name of one author together with the names of their affiliations."""

    # Author name as a single string
    name: str
    # Names of the affiliations referenced by the author
    affiliation_names: list[str]
44
+
45
+
46
class Citation(TypedDict):
    """Fields of one bibliographic reference, each stored as plain text."""

    # Comma-separated author names
    author_names: str
    title: str
    source: str
    year: str
    volume: str
    # Electronic location id, first page, or page range
    page: str
    # Comma-separated "TYPE: value" publication identifiers
    pub_id: str
    publisher_name: str
    publisher_loc: str
56
+
57
+
58
class Table(TypedDict):
    """Label, caption and serialized markup of one table."""

    # Text of the table label ("" when absent)
    label: str
    # Flattened caption text ("" when absent)
    caption: str
    # Serialized markup of the <table> element ("" when absent)
    content: str
62
+
63
+
64
class XMLComponents(TypedDict):
    """Metadata extracted from a JATS document: title, authors and abstracts."""

    # Document title as a single string
    title: str
    # One entry per author
    authors: list[Author]
    # One entry per <abstract> element
    abstract: list[Abstract]
68
+
69
+
70
+ class JatsDocumentBackend(DeclarativeDocumentBackend):
71
+ """Backend to parse articles in XML format tagged according to JATS definition.
72
+
73
+ The Journal Article Tag Suite (JATS) is a definition standard for the
74
+ representation of journal articles in XML format. Several publishers and journal
75
+ archives provide content in JATS format, including PubMed Central® (PMC), bioRxiv,
76
+ medRxiv, or Springer Nature.
77
+
78
+ Refer to https://jats.nlm.nih.gov for more details on JATS.
79
+
80
+ The code from this document backend has been developed by modifying parts of the
81
+ PubMed Parser library (version 0.5.0, released on 12.08.2024):
82
+ Achakulvisut et al., (2020).
83
+ Pubmed Parser: A Python Parser for PubMed Open-Access XML Subset and MEDLINE XML
84
+ Dataset XML Dataset.
85
+ Journal of Open Source Software, 5(46), 1979,
86
+ https://doi.org/10.21105/joss.01979
87
+ """
88
+
89
@override
def __init__(
    self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
) -> None:
    """Parse the XML input and check whether it declares a JATS DTD.

    The document is flagged as valid when the DOCTYPE system URL, or the system
    URL of an entity declared in the internal DTD subset, contains one of the
    ``JATS_DTD_URL`` keywords. A document without an internal DTD subset is
    left flagged as not valid instead of raising.

    Args:
        in_doc: Input document descriptor, forwarded to the parent class.
        path_or_stream: File path or in-memory stream with the XML content.

    Raises:
        RuntimeError: If the input cannot be parsed or inspected.
    """
    super().__init__(in_doc, path_or_stream)
    self.path_or_stream = path_or_stream

    # Initialize the root of the document hierarchy
    self.root: Optional[NodeItem] = None
    self.hlevel: int = 0
    self.valid: bool = False
    try:
        if isinstance(self.path_or_stream, BytesIO):
            self.path_or_stream.seek(0)
        self.tree: etree._ElementTree = etree.parse(self.path_or_stream)

        doc_info: etree.DocInfo = self.tree.docinfo
        if doc_info.system_url and any(
            kwd in doc_info.system_url for kwd in JATS_DTD_URL
        ):
            self.valid = True
            return

        # lxml returns None here when the document has no internal DTD subset
        internal_dtd = doc_info.internalDTD
        if internal_dtd is None:
            return
        for ent in internal_dtd.iterentities():
            if ent.system_url and any(
                kwd in ent.system_url for kwd in JATS_DTD_URL
            ):
                self.valid = True
                return
    except Exception as exc:
        raise RuntimeError(
            f"Could not initialize JATS backend for file with hash {self.document_hash}."
        ) from exc
121
+
122
+ @override
123
+ def is_valid(self) -> bool:
124
+ return self.valid
125
+
126
+ @classmethod
127
+ @override
128
+ def supports_pagination(cls) -> bool:
129
+ return False
130
+
131
+ @override
132
+ def unload(self):
133
+ if isinstance(self.path_or_stream, BytesIO):
134
+ self.path_or_stream.close()
135
+ self.path_or_stream = None
136
+
137
@classmethod
@override
def supported_formats(cls) -> set[InputFormat]:
    """Return the input formats handled by this backend (JATS XML only)."""
    formats: set[InputFormat] = set()
    formats.add(InputFormat.XML_JATS)
    return formats
141
+
142
@override
def convert(self) -> DoclingDocument:
    """Convert the parsed JATS tree into a ``DoclingDocument``.

    Metadata (title, authors, abstracts) is added first, followed by the
    content of the first ``<body>`` element and then the first ``<back>``
    element. Errors raised while filling the document are logged and the
    document built so far is returned.

    Returns:
        The converted document, possibly partial if a parsing error occurred.
    """
    # Create the empty document before entering the ``try`` block so that
    # ``doc`` is always bound when it is returned below.
    origin = DocumentOrigin(
        filename=self.file.name or "file",
        mimetype="application/xml",
        binary_hash=self.document_hash,
    )
    doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
    self.hlevel = 0

    try:
        # Get metadata XML components
        xml_components: XMLComponents = self._parse_metadata()

        # Add metadata to the document
        self._add_metadata(doc, xml_components)

        # walk over the XML body
        body = self.tree.xpath("//body")
        if self.root and len(body) > 0:
            self._walk_linear(doc, self.root, body[0])

        # walk over the XML back matter
        back = self.tree.xpath("//back")
        if self.root and len(back) > 0:
            self._walk_linear(doc, self.root, back[0])
    except Exception:
        # Best effort: keep whatever was converted and report the failure
        _log.error(traceback.format_exc())

    return doc
173
+
174
+ @staticmethod
175
+ def _get_text(node: etree._Element, sep: Optional[str] = None) -> str:
176
+ skip_tags = ["term", "disp-formula", "inline-formula"]
177
+ text: str = (
178
+ node.text.replace("\n", " ")
179
+ if (node.tag not in skip_tags and node.text)
180
+ else ""
181
+ )
182
+ for child in list(node):
183
+ if child.tag not in skip_tags:
184
+ # TODO: apply styling according to child.tag when supported by docling-core
185
+ text += JatsDocumentBackend._get_text(child, sep)
186
+ if sep:
187
+ text = text.rstrip(sep) + sep
188
+ text += child.tail.replace("\n", " ") if child.tail else ""
189
+
190
+ return text
191
+
192
+ def _find_metadata(self) -> Optional[etree._Element]:
193
+ meta_names: list[str] = ["article-meta", "book-part-meta"]
194
+ meta: Optional[etree._Element] = None
195
+ for name in meta_names:
196
+ node = self.tree.xpath(f".//{name}")
197
+ if len(node) > 0:
198
+ meta = node[0]
199
+ break
200
+
201
+ return meta
202
+
203
+ def _parse_abstract(self) -> list[Abstract]:
204
+ # TODO: address cases with multiple sections
205
+ abs_list: list[Abstract] = []
206
+
207
+ for abs_node in self.tree.xpath(".//abstract"):
208
+ abstract: Abstract = dict(label="", content="")
209
+ texts = []
210
+ for abs_par in abs_node.xpath("p"):
211
+ texts.append(JatsDocumentBackend._get_text(abs_par).strip())
212
+ abstract["content"] = " ".join(texts)
213
+
214
+ label_node = abs_node.xpath("title|label")
215
+ if len(label_node) > 0:
216
+ abstract["label"] = label_node[0].text.strip()
217
+
218
+ abs_list.append(abstract)
219
+
220
+ return abs_list
221
+
222
+ def _parse_authors(self) -> list[Author]:
223
+ # Get mapping between affiliation ids and names
224
+ authors: list[Author] = []
225
+ meta: Optional[etree._Element] = self._find_metadata()
226
+ if meta is None:
227
+ return authors
228
+
229
+ affiliation_names = []
230
+ for affiliation_node in meta.xpath(".//aff[@id]"):
231
+ aff = ", ".join([t for t in affiliation_node.itertext() if t.strip()])
232
+ aff = aff.replace("\n", " ")
233
+ label = affiliation_node.xpath("label")
234
+ if label:
235
+ # TODO: once superscript is supported, add label with formatting
236
+ aff = aff.removeprefix(f"{label[0].text}, ")
237
+ affiliation_names.append(aff)
238
+ affiliation_ids_names = dict(
239
+ zip(meta.xpath(".//aff[@id]/@id"), affiliation_names)
240
+ )
241
+
242
+ # Get author names and affiliation names
243
+ for author_node in meta.xpath(
244
+ './/contrib-group/contrib[@contrib-type="author"]'
245
+ ):
246
+ author: Author = {
247
+ "name": "",
248
+ "affiliation_names": [],
249
+ }
250
+
251
+ # Affiliation names
252
+ affiliation_ids = [
253
+ a.attrib["rid"] for a in author_node.xpath('xref[@ref-type="aff"]')
254
+ ]
255
+ for id in affiliation_ids:
256
+ if id in affiliation_ids_names:
257
+ author["affiliation_names"].append(affiliation_ids_names[id])
258
+
259
+ # Name
260
+ author["name"] = (
261
+ author_node.xpath("name/given-names")[0].text
262
+ + " "
263
+ + author_node.xpath("name/surname")[0].text
264
+ )
265
+
266
+ authors.append(author)
267
+
268
+ return authors
269
+
270
+ def _parse_title(self) -> str:
271
+ meta_names: list[str] = [
272
+ "article-meta",
273
+ "collection-meta",
274
+ "book-meta",
275
+ "book-part-meta",
276
+ ]
277
+ title_names: list[str] = ["article-title", "subtitle", "title", "label"]
278
+ titles: list[str] = [
279
+ " ".join(
280
+ elem.text.replace("\n", " ").strip()
281
+ for elem in list(title_node)
282
+ if elem.tag in title_names
283
+ ).strip()
284
+ for title_node in self.tree.xpath(
285
+ "|".join([f".//{item}/title-group" for item in meta_names])
286
+ )
287
+ ]
288
+
289
+ text = " - ".join(titles)
290
+
291
+ return text
292
+
293
+ def _parse_metadata(self) -> XMLComponents:
294
+ """Parsing JATS document metadata."""
295
+ xml_components: XMLComponents = {
296
+ "title": self._parse_title(),
297
+ "authors": self._parse_authors(),
298
+ "abstract": self._parse_abstract(),
299
+ }
300
+ return xml_components
301
+
302
+ def _add_abstract(
303
+ self, doc: DoclingDocument, xml_components: XMLComponents
304
+ ) -> None:
305
+ for abstract in xml_components["abstract"]:
306
+ text: str = abstract["content"]
307
+ title: str = abstract["label"] or DEFAULT_HEADER_ABSTRACT
308
+ if not text:
309
+ continue
310
+ parent = doc.add_heading(
311
+ parent=self.root, text=title, level=self.hlevel + 1
312
+ )
313
+ doc.add_text(
314
+ parent=parent,
315
+ text=text,
316
+ label=DocItemLabel.TEXT,
317
+ )
318
+
319
+ return
320
+
321
+ def _add_authors(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:
322
+ # TODO: once docling supports text formatting, add affiliation reference to
323
+ # author names through superscripts
324
+ authors: list = [item["name"] for item in xml_components["authors"]]
325
+ authors_str = ", ".join(authors)
326
+ affiliations: list = [
327
+ item
328
+ for author in xml_components["authors"]
329
+ for item in author["affiliation_names"]
330
+ ]
331
+ affiliations_str = "; ".join(list(dict.fromkeys(affiliations)))
332
+ if authors_str:
333
+ doc.add_text(
334
+ parent=self.root,
335
+ text=authors_str,
336
+ label=DocItemLabel.PARAGRAPH,
337
+ )
338
+ if affiliations_str:
339
+ doc.add_text(
340
+ parent=self.root,
341
+ text=affiliations_str,
342
+ label=DocItemLabel.PARAGRAPH,
343
+ )
344
+
345
+ return
346
+
347
def _add_citation(self, doc: DoclingDocument, parent: NodeItem, text: str) -> None:
    """Add a citation as a list item under a list group, else as a text item."""
    parent_is_list = isinstance(parent, GroupItem) and parent.label == GroupLabel.LIST
    if not parent_is_list:
        doc.add_text(text=text, label=DocItemLabel.TEXT, parent=parent)
        return

    doc.add_list_item(text=text, enumerated=False, parent=parent)
354
+
355
def _parse_element_citation(self, node: etree._Element) -> str:
    """Flatten an ``<element-citation>`` node into one reference string.

    Author names, title, source, publisher, volume, pages, year and
    publication identifiers are read from the child elements and concatenated
    in that order. Missing elements, and elements whose text is empty or
    nested in inline markup, are tolerated.

    Args:
        node: The ``<element-citation>`` element.

    Returns:
        The citation as plain text.
    """

    def clean(element: "etree._Element") -> str:
        # Flattened, stripped text of an element ("" when it holds no text)
        return JatsDocumentBackend._get_text(element).strip()

    citation: Citation = {
        "author_names": "",
        "title": "",
        "source": "",
        "year": "",
        "volume": "",
        "page": "",
        "pub_id": "",
        "publisher_name": "",
        "publisher_loc": "",
    }

    _log.debug("Citation parsing started")

    # Author names: "<surname> <given-names>", either part being optional
    names = []
    for name_node in node.xpath(".//name"):
        name_parts = []
        for part in ("surname", "given-names"):
            part_nodes = name_node.xpath(part)
            part_text = clean(part_nodes[0]) if len(part_nodes) > 0 else ""
            if part_text:
                name_parts.append(part_text)
        if name_parts:
            names.append(" ".join(name_parts))
    etal_node = node.xpath(".//etal")
    if len(etal_node) > 0:
        etal_text = etal_node[0].text or DEFAULT_TEXT_ETAL
        names.append(etal_text)
    citation["author_names"] = ", ".join(names)

    # Title: first matching title element, else the citation's leading text
    titles: list[str] = [
        "article-title",
        "chapter-title",
        "data-title",
        "issue-title",
        "part-title",
        "trans-title",
    ]
    title_node: Optional[etree._Element] = None
    for name in titles:
        matches = node.xpath(name)
        if len(matches) > 0:
            title_node = matches[0]
            break
    citation["title"] = (
        JatsDocumentBackend._get_text(title_node)
        if title_node is not None
        else (node.text or "").replace("\n", " ").strip()
    )

    # Journal, year, publisher name, publisher location, volume, elocation
    fields: list[str] = [
        "source",
        "year",
        "publisher-name",
        "publisher-loc",
        "volume",
    ]
    for item in fields:
        item_node = node.xpath(item)
        if len(item_node) > 0:
            citation[item.replace("-", "_")] = clean(item_node[0])  # type: ignore[literal-required]

    # Publication identifier
    pub_id: list[str] = []
    for id_node in node.xpath("pub-id"):
        id_type = id_node.get("assigning-authority") or id_node.get("pub-id-type")
        id_text = id_node.text
        if id_type and id_text:
            pub_id.append(
                id_type.replace("\n", " ").strip().upper()
                + ": "
                + id_text.replace("\n", " ").strip()
            )
    if pub_id:
        citation["pub_id"] = ", ".join(pub_id)

    # Pages: electronic location takes precedence over first/last page
    elocation_node = node.xpath("elocation-id")
    fpage_node = node.xpath("fpage")
    if len(elocation_node) > 0:
        citation["page"] = clean(elocation_node[0])
    elif len(fpage_node) > 0:
        citation["page"] = clean(fpage_node[0])
        lpage_node = node.xpath("lpage")
        if len(lpage_node) > 0:
            citation["page"] += "–" + clean(lpage_node[0])  # noqa: RUF001

    # Flatten the citation to string

    text = ""
    if citation["author_names"]:
        text += citation["author_names"].rstrip(".") + ". "
    if citation["title"]:
        text += citation["title"] + ". "
    if citation["source"]:
        text += citation["source"] + ". "
    if citation["publisher_name"]:
        if citation["publisher_loc"]:
            text += f"{citation['publisher_loc']}: "
        text += citation["publisher_name"] + ". "
    if citation["volume"]:
        text = text.rstrip(". ")
        text += f" {citation['volume']}. "
    if citation["page"]:
        text = text.rstrip(". ")
        if citation["volume"]:
            text += ":"
        text += citation["page"] + ". "
    if citation["year"]:
        text = text.rstrip(". ")
        text += f" ({citation['year']})."
    if citation["pub_id"]:
        text = text.rstrip(".") + ". "
        text += citation["pub_id"]

    _log.debug("Citation flattened")

    return text
480
+
481
+ def _add_equation(
482
+ self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
483
+ ) -> None:
484
+ math_text = node.text
485
+ math_parts = math_text.split("$$")
486
+ if len(math_parts) == 3:
487
+ math_formula = math_parts[1]
488
+ doc.add_text(label=DocItemLabel.FORMULA, text=math_formula, parent=parent)
489
+
490
+ return
491
+
492
+ def _add_figure_captions(
493
+ self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
494
+ ) -> None:
495
+ label_node = node.xpath("label")
496
+ label: Optional[str] = (
497
+ JatsDocumentBackend._get_text(label_node[0]).strip() if label_node else ""
498
+ )
499
+
500
+ caption_node = node.xpath("caption")
501
+ caption: Optional[str]
502
+ if len(caption_node) > 0:
503
+ caption = ""
504
+ for caption_par in list(caption_node[0]):
505
+ if caption_par.xpath(".//supplementary-material"):
506
+ continue
507
+ caption += JatsDocumentBackend._get_text(caption_par).strip() + " "
508
+ caption = caption.strip()
509
+ else:
510
+ caption = None
511
+
512
+ # TODO: format label vs caption once styling is supported
513
+ fig_text: str = f"{label}{' ' if label and caption else ''}{caption}"
514
+ fig_caption: Optional[TextItem] = (
515
+ doc.add_text(label=DocItemLabel.CAPTION, text=fig_text)
516
+ if fig_text
517
+ else None
518
+ )
519
+
520
+ doc.add_picture(parent=parent, caption=fig_caption)
521
+
522
+ return
523
+
524
+ # TODO: add footnotes when DocItemLabel.FOOTNOTE and styling are supported
525
+ # def _add_footnote_group(self, doc: DoclingDocument, parent: NodeItem, node: etree._Element) -> None:
526
+ # new_parent = doc.add_group(label=GroupLabel.LIST, name="footnotes", parent=parent)
527
+ # for child in node.iterchildren(tag="fn"):
528
+ # text = JatsDocumentBackend._get_text(child)
529
+ # doc.add_list_item(text=text, parent=new_parent)
530
+
531
+ def _add_metadata(
532
+ self, doc: DoclingDocument, xml_components: XMLComponents
533
+ ) -> None:
534
+ self._add_title(doc, xml_components)
535
+ self._add_authors(doc, xml_components)
536
+ self._add_abstract(doc, xml_components)
537
+
538
+ return
539
+
540
@staticmethod
def parse_table_data(element: Tag) -> Optional[TableData]:
    """Convert a parsed ``<table>`` tag into table data with cell spans.

    A first pass over the ``<tr>`` rows computes the grid size: the column
    count is the largest sum of column spans in a row, and a row is counted
    unless all of its cells are ``<th>`` cells with a row span different
    from 1. A second pass creates one ``TableCell`` per ``<td>``/``<th>``
    cell, placing it at the first free column of its row.

    Args:
        element: The ``<table>`` tag.

    Returns:
        The table data, or None when the table contains a nested table.
    """
    # TODO, see how to implement proper support for rich tables from HTML backend
    nested_tables = element.find("table")
    if nested_tables is not None:
        _log.debug("Skipping nested table.")
        return None

    # Find the number of rows and columns (taking into account spans)
    num_rows = 0
    num_cols = 0
    for row in element("tr"):
        col_count = 0
        is_row_header = True
        if not isinstance(row, Tag):
            continue
        for cell in row(["td", "th"]):
            # guard the cell itself (the row was already checked above)
            if not isinstance(cell, Tag):
                continue
            cell_tag = cast(Tag, cell)
            col_span, row_span = HTMLDocumentBackend._get_cell_spans(cell_tag)
            col_count += col_span
            if cell_tag.name == "td" or row_span == 1:
                is_row_header = False
        num_cols = max(num_cols, col_count)
        if not is_row_header:
            num_rows += 1

    _log.debug(f"The table has {num_rows} rows and {num_cols} cols.")

    # Occupancy grid: a non-None entry marks a position already covered by a cell
    grid: list = [[None for _ in range(num_cols)] for _ in range(num_rows)]

    data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])

    # Iterate over the rows in the table
    start_row_span = 0
    row_idx = -1
    for row in element("tr"):
        if not isinstance(row, Tag):
            continue

        # For each row, find all the column cells (both <td> and <th>)
        cells = row(["td", "th"])

        # Check if cell is in a column header or row header
        col_header = True
        row_header = True
        for html_cell in cells:
            if isinstance(html_cell, Tag):
                _, row_span = HTMLDocumentBackend._get_cell_spans(html_cell)
                if html_cell.name == "td":
                    col_header = False
                    row_header = False
                elif row_span == 1:
                    row_header = False
        if not row_header:
            row_idx += 1
            start_row_span = 0
        else:
            # row-header rows do not advance the row index; they are offset
            # from the current row instead
            start_row_span += 1

        # Extract the text content of each cell
        col_idx = 0
        for html_cell in cells:
            if not isinstance(html_cell, Tag):
                continue

            # extract inline formulas
            for formula in html_cell("inline-formula"):
                math_parts = formula.text.split("$$")
                if len(math_parts) == 3:
                    math_formula = f"$${math_parts[1]}$$"
                    formula.replace_with(NavigableString(math_formula))

            # TODO: extract content correctly from table-cells with lists
            text = HTMLDocumentBackend.get_text(html_cell).strip()
            col_span, row_span = HTMLDocumentBackend._get_cell_spans(html_cell)
            if row_header:
                row_span -= 1
            # move to the first column not yet covered by an earlier cell
            while (
                col_idx < num_cols
                and grid[row_idx + start_row_span][col_idx] is not None
            ):
                col_idx += 1
            for r in range(start_row_span, start_row_span + row_span):
                for c in range(col_span):
                    if row_idx + r < num_rows and col_idx + c < num_cols:
                        grid[row_idx + r][col_idx + c] = text

            table_cell = TableCell(
                text=text,
                row_span=row_span,
                col_span=col_span,
                start_row_offset_idx=start_row_span + row_idx,
                end_row_offset_idx=start_row_span + row_idx + row_span,
                start_col_offset_idx=col_idx,
                end_col_offset_idx=col_idx + col_span,
                column_header=col_header,
                row_header=((not col_header) and html_cell.name == "th"),
            )
            data.table_cells.append(table_cell)

    return data
643
+
644
def _add_table(
    self, doc: DoclingDocument, parent: NodeItem, table_xml_component: Table
) -> None:
    """Parse the table markup and add the table, with its caption, to the document.

    Nothing is added when the content holds no ``<table>`` tag or when the
    table cannot be converted (``parse_table_data`` returns None). In
    particular, no caption item is created for a table that is skipped.

    Args:
        doc: The document being built.
        parent: The item under which the table is added.
        table_xml_component: Label, caption and serialized markup of the table.
    """
    soup = BeautifulSoup(table_xml_component["content"], "html.parser")
    table_tag = soup.find("table")
    if not isinstance(table_tag, Tag):
        return

    data = JatsDocumentBackend.parse_table_data(table_tag)
    if data is None:
        # skipped table: do not leave an orphan caption in the document
        return

    # TODO: format label vs caption once styling is supported
    label = table_xml_component["label"]
    caption = table_xml_component["caption"]
    table_text: str = f"{label}{' ' if label and caption else ''}{caption}"
    table_caption: Optional[TextItem] = (
        doc.add_text(label=DocItemLabel.CAPTION, text=table_text)
        if table_text
        else None
    )
    doc.add_table(data=data, parent=parent, caption=table_caption)
666
+
667
def _add_tables(
    self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
) -> None:
    """Collect label, caption and markup of a ``<table-wrap>`` and add the table.

    The table markup is taken from the first ``table`` child, else from the
    first ``alternatives/table``. Caption children that contain supplementary
    material are left out. Any error raised while adding the table is logged
    with its traceback and the table is skipped.
    """
    table: Table = {"label": "", "caption": "", "content": ""}

    # Content
    content_nodes = node.xpath("table") or node.xpath("alternatives/table")
    if len(content_nodes) > 0:
        table["content"] = etree.tostring(content_nodes[0]).decode("utf-8")

    # Caption
    caption_node = node.xpath("caption")
    if caption_node:
        table["caption"] = " ".join(
            JatsDocumentBackend._get_text(caption_par).strip()
            for caption_par in caption_node[0]
            if not caption_par.xpath(".//supplementary-material")
        ).strip()

    # Label: flattened like figure labels, so that a label made of inline
    # markup (whose own text is None) does not end up as "None"
    label_node = node.xpath("label")
    if len(label_node) > 0:
        table["label"] = JatsDocumentBackend._get_text(label_node[0]).strip()

    try:
        self._add_table(doc, parent, table)
    except Exception:
        _log.warning(f"Skipping unsupported table in {self.file!s}", exc_info=True)
707
+
708
def _add_title(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:
    """Add the title as a top-level TITLE item and keep it as the hierarchy root."""
    title_text = xml_components["title"]
    title_item = doc.add_text(
        parent=None,
        text=title_text,
        label=DocItemLabel.TITLE,
    )
    self.root = title_item
715
+
716
def _walk_linear(
    self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
) -> str:
    """Walk the children of a node in document order and add them to the document.

    Sections, lists, figures, tables, citations and formulas are turned into
    document items. Plain text is accumulated and emitted as a text item once
    a ``<p>`` node is completed, or earlier when a block-level child
    (section, list, figure, ...) interrupts the paragraph.

    Args:
        doc: The document being built.
        parent: The item under which new items are added.
        node: The XML element whose children are walked.

    Returns:
        An empty string when the node is a ``<p>`` whose text was added to the
        document, otherwise the text accumulated for the caller.
    """
    skip_tags = ["term"]
    flush_tags = ["ack", "sec", "list", "boxed-text", "disp-formula", "fig"]
    new_parent: NodeItem = parent
    node_text: str = (
        node.text.replace("\n", " ")
        if (node.tag not in skip_tags and node.text)
        else ""
    )

    # Text returned by the children of a block-level element nested in a
    # paragraph is dropped. The root element has no parent, hence the None check.
    node_parent = node.getparent()
    drop_child_text: bool = (
        node_parent is not None
        and node_parent.tag == "p"
        and node.tag in flush_tags
    )

    for child in list(node):
        stop_walk: bool = False

        # flush text into TextItem for some tags in paragraph nodes
        if node.tag == "p" and node_text.strip() and child.tag in flush_tags:
            doc.add_text(
                label=DocItemLabel.TEXT, text=node_text.strip(), parent=parent
            )
            node_text = ""

        # add elements and decide whether to stop walking
        if child.tag in ("sec", "ack"):
            header = child.xpath("title|label")
            text: Optional[str] = None
            if len(header) > 0:
                text = JatsDocumentBackend._get_text(header[0])
            elif child.tag == "ack":
                text = DEFAULT_HEADER_ACKNOWLEDGMENTS
            if text:
                self.hlevel += 1
                new_parent = doc.add_heading(
                    text=text, parent=parent, level=self.hlevel
                )
        elif child.tag == "list":
            new_parent = doc.add_group(
                label=GroupLabel.LIST, name="list", parent=parent
            )
        elif child.tag == "list-item":
            # TODO: address any type of content (another list, formula,...)
            # TODO: address list type and item label
            text = JatsDocumentBackend._get_text(child).strip()
            new_parent = doc.add_list_item(text=text, parent=parent)
            stop_walk = True
        elif child.tag == "fig":
            self._add_figure_captions(doc, parent, child)
            stop_walk = True
        elif child.tag == "table-wrap":
            self._add_tables(doc, parent, child)
            stop_walk = True
        elif child.tag == "supplementary-material":
            # supplementary material is not converted
            stop_walk = True
        elif child.tag == "fn-group":
            # TODO: add footnotes when DocItemLabel.FOOTNOTE and styling are
            # supported
            stop_walk = True
        elif child.tag == "ref-list" and node.tag != "ref-list":
            header = child.xpath("title|label")
            text = (
                JatsDocumentBackend._get_text(header[0])
                if len(header) > 0
                else DEFAULT_HEADER_REFERENCES
            )
            new_parent = doc.add_heading(text=text, parent=parent)
            new_parent = doc.add_group(
                parent=new_parent, label=GroupLabel.LIST, name="list"
            )
        elif child.tag == "element-citation":
            text = self._parse_element_citation(child)
            self._add_citation(doc, parent, text)
            stop_walk = True
        elif child.tag == "mixed-citation":
            text = JatsDocumentBackend._get_text(child).strip()
            self._add_citation(doc, parent, text)
            stop_walk = True
        elif child.tag == "tex-math":
            self._add_equation(doc, parent, child)
            stop_walk = True
        elif child.tag == "inline-formula":
            # TODO: address inline formulas when supported by docling-core
            stop_walk = True

        # step into child
        if not stop_walk:
            new_text = self._walk_linear(doc, new_parent, child)
            if not drop_child_text:
                node_text += new_text
            # leave the section: restore the heading level raised above
            if child.tag in ("sec", "ack") and text:
                self.hlevel -= 1

        # pick up the tail text
        node_text += child.tail.replace("\n", " ") if child.tail else ""

    # create paragraph
    if node.tag == "p" and node_text.strip():
        doc.add_text(label=DocItemLabel.TEXT, text=node_text.strip(), parent=parent)
        return ""
    else:
        # backpropagate the text
        return node_text