docling 2.22.0__py3-none-any.whl → 2.23.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,710 @@
1
+ import logging
2
+ import traceback
3
+ from io import BytesIO
4
+ from pathlib import Path
5
+ from typing import Final, Optional, Union
6
+
7
+ from bs4 import BeautifulSoup, Tag
8
+ from docling_core.types.doc import (
9
+ DocItemLabel,
10
+ DoclingDocument,
11
+ DocumentOrigin,
12
+ GroupItem,
13
+ GroupLabel,
14
+ NodeItem,
15
+ TextItem,
16
+ )
17
+ from lxml import etree
18
+ from typing_extensions import TypedDict, override
19
+
20
+ from docling.backend.abstract_backend import DeclarativeDocumentBackend
21
+ from docling.backend.html_backend import HTMLDocumentBackend
22
+ from docling.datamodel.base_models import InputFormat
23
+ from docling.datamodel.document import InputDocument
24
+
25
# Module-level logger, named after this module.
_log = logging.getLogger(__name__)

# Keywords looked up as substrings of DTD system URLs to recognize a JATS file
# (despite the name, these are URL fragments rather than complete URLs).
JATS_DTD_URL: Final = ["JATS-journalpublishing", "JATS-archive"]
# Fallback heading for an <ack> element that has neither <title> nor <label>.
DEFAULT_HEADER_ACKNOWLEDGMENTS: Final = "Acknowledgments"
# Fallback heading for an abstract whose label is empty.
DEFAULT_HEADER_ABSTRACT: Final = "Abstract"
# Fallback heading for a <ref-list> that has neither <title> nor <label>.
DEFAULT_HEADER_REFERENCES: Final = "References"
# Text appended to a citation's author list when its <etal> element is empty.
DEFAULT_TEXT_ETAL: Final = "et al."
32
+
33
+
34
class Abstract(TypedDict):
    """Flattened representation of one <abstract> element."""

    # Heading taken from the abstract's <title> or <label>; "" when absent.
    label: str
    # Text of the abstract's direct <p> children, joined with single spaces.
    content: str
37
+
38
+
39
class Author(TypedDict):
    """An article author together with the affiliations referenced by it."""

    # Given names followed by the surname, separated by a space.
    name: str
    # Names of the affiliations referenced through the author's
    # <xref ref-type="aff"> elements, in document order.
    affiliation_names: list[str]
42
+
43
+
44
class Citation(TypedDict):
    """Fields extracted from an <element-citation> before it is flattened.

    Every field is a plain string; a field that is missing in the XML is "".
    """

    # Comma-separated "surname given-names" entries, optionally ending in "et al.".
    author_names: str
    # Text of the first title-like child (article-title, chapter-title, ...).
    title: str
    # Content of <source>.
    source: str
    # Content of <year>.
    year: str
    # Content of <volume>.
    volume: str
    # <elocation-id> if present, otherwise <fpage> optionally followed by "–<lpage>".
    page: str
    # Comma-separated "TYPE: identifier" entries built from <pub-id> elements.
    pub_id: str
    # Content of <publisher-name>.
    publisher_name: str
    # Content of <publisher-loc>.
    publisher_loc: str
54
+
55
+
56
class Table(TypedDict):
    """Parts of a <table-wrap> element needed to add a table to the document."""

    # Content of the <label> child; "" when absent.
    label: str
    # Flattened text of the <caption> child; "" when absent.
    caption: str
    # The <table> element serialized back to an XML string; "" when absent.
    content: str
60
+
61
+
62
class XMLComponents(TypedDict):
    """Metadata components parsed from the XML before being added to the document."""

    # Article title; several title groups are joined with " - ".
    title: str
    # Authors in document order.
    authors: list[Author]
    # One entry per <abstract> element found in the tree.
    abstract: list[Abstract]
66
+
67
+
68
class JatsDocumentBackend(DeclarativeDocumentBackend):
    """Backend to parse articles in XML format tagged according to JATS definition.

    The Journal Article Tag Suite (JATS) is a definition standard for the
    representation of journal articles in XML format. Several publishers and journal
    archives provide content in JATS format, including PubMed Central® (PMC), bioRxiv,
    medRxiv, or Springer Nature.

    Refer to https://jats.nlm.nih.gov for more details on JATS.

    The code from this document backend has been developed by modifying parts of the
    PubMed Parser library (version 0.5.0, released on 12.08.2024):
    Achakulvisut et al., (2020).
    Pubmed Parser: A Python Parser for PubMed Open-Access XML Subset and MEDLINE XML
    Dataset XML Dataset.
    Journal of Open Source Software, 5(46), 1979,
    https://doi.org/10.21105/joss.01979
    """

    @override
    def __init__(
        self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
    ) -> None:
        """Parse the XML input and check whether it declares a JATS DTD.

        The backend is flagged as valid when the DOCTYPE system URL, or the system
        URL of any entity declared in the internal DTD subset, contains one of the
        keywords in ``JATS_DTD_URL``. A file without any DOCTYPE is simply left
        invalid.

        Args:
            in_doc: The input document this backend belongs to.
            path_or_stream: Path to the XML file or an in-memory stream with it.

        Raises:
            RuntimeError: If the input cannot be parsed as XML or inspected.
        """
        super().__init__(in_doc, path_or_stream)
        self.path_or_stream = path_or_stream

        # Initialize the root of the document hierarchy
        self.root: Optional[NodeItem] = None

        self.valid = False
        try:
            if isinstance(self.path_or_stream, BytesIO):
                self.path_or_stream.seek(0)
            self.tree: etree._ElementTree = etree.parse(self.path_or_stream)

            doc_info: etree.DocInfo = self.tree.docinfo
            if self._is_jats_url(doc_info.system_url):
                self.valid = True
                return

            # internalDTD is None when the file has no DOCTYPE declaration; such
            # a file is not JATS, which is not an initialization error.
            internal_dtd = doc_info.internalDTD
            if internal_dtd is None:
                return
            for ent in internal_dtd.iterentities():
                if self._is_jats_url(ent.system_url):
                    self.valid = True
                    return
        except Exception as exc:
            raise RuntimeError(
                f"Could not initialize JATS backend for file with hash {self.document_hash}."
            ) from exc

    @staticmethod
    def _is_jats_url(url: Optional[str]) -> bool:
        """Tell whether a DTD system URL contains one of the JATS keywords."""
        return bool(url) and any(kwd in url for kwd in JATS_DTD_URL)

    @override
    def is_valid(self) -> bool:
        """Return whether a JATS DTD reference was found during initialization."""
        return self.valid

    @classmethod
    @override
    def supports_pagination(cls) -> bool:
        """Return False: this backend does not support pagination."""
        return False

    @override
    def unload(self):
        """Close the input stream (if it is a stream) and drop the reference to it."""
        if isinstance(self.path_or_stream, BytesIO):
            self.path_or_stream.close()
        self.path_or_stream = None

    @classmethod
    @override
    def supported_formats(cls) -> set[InputFormat]:
        """Return the set of input formats handled by this backend."""
        return {InputFormat.XML_JATS}

    @override
    def convert(self) -> DoclingDocument:
        """Convert the parsed XML tree into a DoclingDocument.

        Metadata (title, authors, abstracts) is added first, followed by the
        content of ``<body>`` and ``<back>``. Errors raised while filling the
        document are logged and the document built so far is returned.

        Returns:
            The (possibly partially filled) document.
        """
        # Create the empty document before entering the try block, so that there
        # is always a document to return when the parsing below fails.
        origin = DocumentOrigin(
            filename=self.file.name or "file",
            mimetype="application/xml",
            binary_hash=self.document_hash,
        )
        doc = DoclingDocument(name=self.file.stem or "file", origin=origin)

        try:
            # Get metadata XML components
            xml_components: XMLComponents = self._parse_metadata()

            # Add metadata to the document
            self._add_metadata(doc, xml_components)

            # walk over the XML body
            body = self.tree.xpath("//body")
            if self.root is not None and len(body) > 0:
                self._walk_linear(doc, self.root, body[0])

            # walk over the XML back matter
            back = self.tree.xpath("//back")
            if self.root is not None and len(back) > 0:
                self._walk_linear(doc, self.root, back[0])
        except Exception:
            _log.error(traceback.format_exc())

        return doc

    @staticmethod
    def _get_text(node: etree._Element, sep: Optional[str] = None) -> str:
        """Return the text of a node and its descendants as a single string.

        Newlines are replaced by spaces. The content of ``term``, ``disp-formula``
        and ``inline-formula`` elements is left out, but the text following them
        (their tail) is kept.

        Args:
            node: The element to extract the text from.
            sep: Optional separator enforced after each child element.

        Returns:
            The concatenated text, not stripped.
        """
        skip_tags = ["term", "disp-formula", "inline-formula"]
        text: str = (
            node.text.replace("\n", " ")
            if (node.tag not in skip_tags and node.text)
            else ""
        )
        for child in node:
            if child.tag not in skip_tags:
                # TODO: apply styling according to child.tag when supported by docling-core
                text += JatsDocumentBackend._get_text(child, sep)
            if sep:
                text = text.rstrip(sep) + sep
            text += child.tail.replace("\n", " ") if child.tail else ""

        return text

    @staticmethod
    def _first_text(node: etree._Element, path: str) -> str:
        """Return the stripped text of the first match of an XPath, or "" if none."""
        found = node.xpath(path)
        if not found:
            return ""
        return JatsDocumentBackend._get_text(found[0]).strip()

    @staticmethod
    def _parse_caption(node: etree._Element) -> str:
        """Return the flattened text of the ``<caption>`` child of a node.

        Caption children containing supplementary material are left out. The
        result is "" when the node has no caption.
        """
        caption_node = node.xpath("caption")
        if not caption_node:
            return ""
        parts = [
            JatsDocumentBackend._get_text(caption_par).strip()
            for caption_par in caption_node[0]
            if not caption_par.xpath(".//supplementary-material")
        ]
        return " ".join(parts).strip()

    def _find_metadata(self) -> Optional[etree._Element]:
        """Return the first ``article-meta`` or ``book-part-meta`` element, if any."""
        meta_names: list[str] = ["article-meta", "book-part-meta"]
        for name in meta_names:
            node = self.tree.xpath(f".//{name}")
            if len(node) > 0:
                return node[0]

        return None

    def _parse_abstract(self) -> list[Abstract]:
        """Collect label and paragraph text of every ``<abstract>`` element."""
        # TODO: address cases with multiple sections
        abs_list: list[Abstract] = []

        for abs_node in self.tree.xpath(".//abstract"):
            texts = [
                JatsDocumentBackend._get_text(abs_par).strip()
                for abs_par in abs_node.xpath("p")
            ]
            abstract: Abstract = {"label": "", "content": " ".join(texts)}

            label_node = abs_node.xpath("title|label")
            if len(label_node) > 0:
                # _get_text copes with headings wrapped in inline markup, for
                # which the element's own text is None.
                abstract["label"] = JatsDocumentBackend._get_text(
                    label_node[0]
                ).strip()

            abs_list.append(abstract)

        return abs_list

    def _parse_authors(self) -> list[Author]:
        """Collect the names and affiliations of the authors in the metadata.

        Contributors for which no name can be determined are left out.
        """
        authors: list[Author] = []
        meta: Optional[etree._Element] = self._find_metadata()
        if meta is None:
            return authors

        # Get mapping between affiliation ids and names
        affiliation_ids_names: dict[str, str] = {}
        for affiliation_node in meta.xpath(".//aff[@id]"):
            aff = ", ".join([t for t in affiliation_node.itertext() if t.strip()])
            aff = aff.replace("\n", " ")
            label = affiliation_node.xpath("label")
            if label:
                # TODO: once superscript is supported, add label with formatting
                aff = aff.removeprefix(f"{label[0].text}, ")
            affiliation_ids_names[affiliation_node.get("id")] = aff

        # Get author names and affiliation names
        for author_node in meta.xpath(
            './/contrib-group/contrib[@contrib-type="author"]'
        ):
            author: Author = {
                "name": "",
                "affiliation_names": [],
            }

            # Affiliation names. The rid attribute may be missing or may hold
            # several whitespace-separated identifiers.
            for xref_node in author_node.xpath('xref[@ref-type="aff"]'):
                for aff_id in (xref_node.get("rid") or "").split():
                    if aff_id in affiliation_ids_names:
                        author["affiliation_names"].append(
                            affiliation_ids_names[aff_id]
                        )

            # Name: either part may be missing (e.g., single-name authors), and
            # group authors are tagged with <collab> instead of <name>.
            name_parts = [
                JatsDocumentBackend._first_text(author_node, "name/given-names"),
                JatsDocumentBackend._first_text(author_node, "name/surname"),
            ]
            author["name"] = " ".join(
                part for part in name_parts if part
            ) or JatsDocumentBackend._first_text(author_node, "collab")
            if not author["name"]:
                continue

            authors.append(author)

        return authors

    def _parse_title(self) -> str:
        """Build the document title from the title groups found in the metadata.

        The title-like children of each title group are joined with spaces, and
        the title groups with " - ".
        """
        meta_names: list[str] = [
            "article-meta",
            "collection-meta",
            "book-meta",
            "book-part-meta",
        ]
        title_names: list[str] = ["article-title", "subtitle", "title", "label"]
        titles: list[str] = [
            " ".join(
                # _get_text keeps the text of titles containing inline markup
                JatsDocumentBackend._get_text(elem).strip()
                for elem in title_node
                if elem.tag in title_names
            ).strip()
            for title_node in self.tree.xpath(
                "|".join([f".//{item}/title-group" for item in meta_names])
            )
        ]

        return " - ".join(titles)

    def _parse_metadata(self) -> XMLComponents:
        """Parsing JATS document metadata."""
        xml_components: XMLComponents = {
            "title": self._parse_title(),
            "authors": self._parse_authors(),
            "abstract": self._parse_abstract(),
        }
        return xml_components

    def _add_abstract(
        self, doc: DoclingDocument, xml_components: XMLComponents
    ) -> None:
        """Add each non-empty abstract as a heading followed by a text item."""
        for abstract in xml_components["abstract"]:
            text: str = abstract["content"]
            title: str = abstract["label"] or DEFAULT_HEADER_ABSTRACT
            if not text:
                continue
            parent = doc.add_heading(parent=self.root, text=title)
            doc.add_text(
                parent=parent,
                text=text,
                label=DocItemLabel.TEXT,
            )

    def _add_authors(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:
        """Add one paragraph with the author names and one with their affiliations."""
        # TODO: once docling supports text formatting, add affiliation reference to
        # author names through superscripts
        authors: list = [item["name"] for item in xml_components["authors"]]
        authors_str = ", ".join(authors)
        affiliations: list = [
            item
            for author in xml_components["authors"]
            for item in author["affiliation_names"]
        ]
        # dict.fromkeys removes duplicates while preserving the order
        affiliations_str = "; ".join(dict.fromkeys(affiliations))
        if authors_str:
            doc.add_text(
                parent=self.root,
                text=authors_str,
                label=DocItemLabel.PARAGRAPH,
            )
        if affiliations_str:
            doc.add_text(
                parent=self.root,
                text=affiliations_str,
                label=DocItemLabel.PARAGRAPH,
            )

    def _add_citation(self, doc: DoclingDocument, parent: NodeItem, text: str) -> None:
        """Add a citation as a list item if the parent is a list, else as text."""
        if isinstance(parent, GroupItem) and parent.label == GroupLabel.LIST:
            doc.add_list_item(text=text, enumerated=False, parent=parent)
        else:
            doc.add_text(text=text, label=DocItemLabel.TEXT, parent=parent)

    def _parse_element_citation(self, node: etree._Element) -> str:
        """Flatten an ``<element-citation>`` into a single reference string.

        Elements that are missing or empty are skipped instead of raising.

        Args:
            node: The ``<element-citation>`` element.

        Returns:
            The citation as text.
        """
        first_text = JatsDocumentBackend._first_text
        citation: Citation = {
            "author_names": "",
            "title": "",
            "source": "",
            "year": "",
            "volume": "",
            "page": "",
            "pub_id": "",
            "publisher_name": "",
            "publisher_loc": "",
        }

        _log.debug("Citation parsing started")

        # Author names
        names = []
        for name_node in node.xpath(".//name"):
            name_parts = [
                first_text(name_node, "surname"),
                first_text(name_node, "given-names"),
            ]
            name_str = " ".join(part for part in name_parts if part)
            if name_str:
                names.append(name_str)
        etal_node = node.xpath(".//etal")
        if len(etal_node) > 0:
            etal_text = etal_node[0].text or DEFAULT_TEXT_ETAL
            names.append(etal_text)
        citation["author_names"] = ", ".join(names)

        # Title: the first title-like child, else the citation's own leading text
        titles: list[str] = [
            "article-title",
            "chapter-title",
            "data-title",
            "issue-title",
            "part-title",
            "trans-title",
        ]
        title_node: Optional[etree._Element] = None
        for name in titles:
            name_node = node.xpath(name)
            if len(name_node) > 0:
                title_node = name_node[0]
                break
        citation["title"] = (
            JatsDocumentBackend._get_text(title_node)
            if title_node is not None
            else (node.text or "").replace("\n", " ").strip()
        )

        # Journal, year, publisher name, publisher location, volume, elocation
        fields: list[str] = [
            "source",
            "year",
            "publisher-name",
            "publisher-loc",
            "volume",
        ]
        for item in fields:
            if len(node.xpath(item)) > 0:
                citation[item.replace("-", "_")] = first_text(  # type: ignore[literal-required]
                    node, item
                )

        # Publication identifier
        pub_id: list[str] = []
        for id_node in node.xpath("pub-id"):
            id_type = id_node.get("assigning-authority") or id_node.get(
                "pub-id-type"
            )
            id_text = id_node.text
            if id_type and id_text:
                pub_id.append(
                    id_type.replace("\n", " ").strip().upper()
                    + ": "
                    + id_text.replace("\n", " ").strip()
                )
        if pub_id:
            citation["pub_id"] = ", ".join(pub_id)

        # Pages
        if len(node.xpath("elocation-id")) > 0:
            citation["page"] = first_text(node, "elocation-id")
        elif len(node.xpath("fpage")) > 0:
            citation["page"] = first_text(node, "fpage")
            last_page = first_text(node, "lpage")
            if last_page:
                citation["page"] += "–" + last_page

        # Flatten the citation to string
        text = JatsDocumentBackend._flatten_citation(citation)

        _log.debug("Citation flattened")

        return text

    @staticmethod
    def _flatten_citation(citation: Citation) -> str:
        """Assemble the fields of a citation into a single reference string.

        The fields are appended in the order authors, title, source, publisher,
        volume, page, year and publication identifiers; empty fields are skipped.
        """
        text = ""
        if citation["author_names"]:
            text += citation["author_names"].rstrip(".") + ". "
        if citation["title"]:
            text += citation["title"] + ". "
        if citation["source"]:
            text += citation["source"] + ". "
        if citation["publisher_name"]:
            if citation["publisher_loc"]:
                text += f"{citation['publisher_loc']}: "
            text += citation["publisher_name"] + ". "
        if citation["volume"]:
            text = text.rstrip(". ")
            text += f" {citation['volume']}. "
        if citation["page"]:
            text = text.rstrip(". ")
            if citation["volume"]:
                text += ":"
            text += citation["page"] + ". "
        if citation["year"]:
            text = text.rstrip(". ")
            text += f" ({citation['year']})."
        if citation["pub_id"]:
            text = text.rstrip(".") + ". "
            text += citation["pub_id"]

        return text

    def _add_equation(
        self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
    ) -> None:
        """Add the formula of a ``<tex-math>`` node delimited by a pair of ``$$``.

        Nothing is added when the node text does not consist of exactly one
        ``$$...$$`` pair (including when the node has no text at all).
        """
        math_text = node.text or ""
        math_parts = math_text.split("$$")
        if len(math_parts) == 3:
            math_formula = math_parts[1]
            doc.add_text(label=DocItemLabel.FORMULA, text=math_formula, parent=parent)

    def _add_figure_captions(
        self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
    ) -> None:
        """Add a picture for a ``<fig>`` node, captioned with its label and caption."""
        label_node = node.xpath("label")
        label: str = (
            JatsDocumentBackend._get_text(label_node[0]).strip() if label_node else ""
        )
        # "" (rather than None) for a missing caption, so that the literal text
        # "None" never ends up in the figure caption.
        caption: str = JatsDocumentBackend._parse_caption(node)

        # TODO: format label vs caption once styling is supported
        fig_text: str = f"{label}{' ' if label and caption else ''}{caption}"
        fig_caption: Optional[TextItem] = (
            doc.add_text(label=DocItemLabel.CAPTION, text=fig_text)
            if fig_text
            else None
        )

        doc.add_picture(parent=parent, caption=fig_caption)

    # TODO: add footnotes when DocItemLabel.FOOTNOTE and styling are supported
    # def _add_footnote_group(self, doc: DoclingDocument, parent: NodeItem, node: etree._Element) -> None:
    #     new_parent = doc.add_group(label=GroupLabel.LIST, name="footnotes", parent=parent)
    #     for child in node.iterchildren(tag="fn"):
    #         text = JatsDocumentBackend._get_text(child)
    #         doc.add_list_item(text=text, parent=new_parent)

    def _add_metadata(
        self, doc: DoclingDocument, xml_components: XMLComponents
    ) -> None:
        """Add title, authors and abstracts to the document, in this order."""
        self._add_title(doc, xml_components)
        self._add_authors(doc, xml_components)
        self._add_abstract(doc, xml_components)

    def _add_table(
        self, doc: DoclingDocument, parent: NodeItem, table_xml_component: Table
    ) -> None:
        """Add a table parsed from serialized ``<table>`` markup to the document.

        Nothing is added when the content holds no ``<table>`` tag.
        """
        soup = BeautifulSoup(table_xml_component["content"], "html.parser")
        table_tag = soup.find("table")
        if not isinstance(table_tag, Tag):
            return

        data = HTMLDocumentBackend.parse_table_data(table_tag)

        # TODO: format label vs caption once styling is supported
        label = table_xml_component["label"]
        caption = table_xml_component["caption"]
        table_text: str = f"{label}{' ' if label and caption else ''}{caption}"
        table_caption: Optional[TextItem] = (
            doc.add_text(label=DocItemLabel.CAPTION, text=table_text)
            if table_text
            else None
        )

        if data is not None:
            doc.add_table(data=data, parent=parent, caption=table_caption)

    def _add_tables(
        self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
    ) -> None:
        """Extract content, caption and label of a ``<table-wrap>`` and add the table.

        A table that cannot be added is skipped with a warning.
        """
        table: Table = {"label": "", "caption": "", "content": ""}

        # Content: a direct <table> child takes precedence over <alternatives>
        content_nodes = node.xpath("table") or node.xpath("alternatives/table")
        if content_nodes:
            table["content"] = etree.tostring(content_nodes[0]).decode("utf-8")

        # Caption
        table["caption"] = JatsDocumentBackend._parse_caption(node)

        # Label: an empty <label> yields "" instead of None, which would be
        # rendered as the literal text "None" in the table caption.
        table["label"] = JatsDocumentBackend._first_text(node, "label")

        try:
            self._add_table(doc, parent, table)
        except Exception:
            _log.warning(f"Skipping unsupported table in {str(self.file)}")

    def _add_title(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:
        """Add the title to the document and make it the root of the hierarchy."""
        self.root = doc.add_text(
            parent=None,
            text=xml_components["title"],
            label=DocItemLabel.TITLE,
        )

    def _walk_linear(
        self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
    ) -> str:
        """Walk the children of a node in document order and fill the document.

        Structural elements (sections, lists, figures, tables, references,
        formulas) are added to the document as they are found. Plain text is
        accumulated and added as a text item when the node is a paragraph.

        Args:
            doc: The document being filled.
            parent: The document item under which new items are added.
            node: The XML element whose children are walked.

        Returns:
            "" if the accumulated text was added to the document as a
            paragraph, otherwise the accumulated text for the caller to use.
        """
        skip_tags = ["term"]
        flush_tags = ["ack", "sec", "list", "boxed-text", "disp-formula", "fig"]
        new_parent: NodeItem = parent
        node_text: str = (
            node.text.replace("\n", " ")
            if (node.tag not in skip_tags and node.text)
            else ""
        )
        # Text coming from the children of a flushing element placed inside a
        # paragraph is not appended to the text of this node.
        node_parent = node.getparent()
        drop_child_text: bool = (
            node_parent is not None
            and node_parent.tag == "p"
            and node.tag in flush_tags
        )

        for child in node:
            stop_walk: bool = False

            # flush text into TextItem for some tags in paragraph nodes
            if node.tag == "p" and node_text.strip() and child.tag in flush_tags:
                doc.add_text(
                    label=DocItemLabel.TEXT, text=node_text.strip(), parent=parent
                )
                node_text = ""

            # add elements and decide whether to stop walking
            if child.tag in ("sec", "ack"):
                header = child.xpath("title|label")
                text: Optional[str] = None
                if len(header) > 0:
                    text = JatsDocumentBackend._get_text(header[0])
                elif child.tag == "ack":
                    text = DEFAULT_HEADER_ACKNOWLEDGMENTS
                if text:
                    new_parent = doc.add_heading(text=text, parent=parent)
            elif child.tag == "list":
                new_parent = doc.add_group(
                    label=GroupLabel.LIST, name="list", parent=parent
                )
            elif child.tag == "list-item":
                # TODO: address any type of content (another list, formula,...)
                # TODO: address list type and item label
                text = JatsDocumentBackend._get_text(child).strip()
                new_parent = doc.add_list_item(text=text, parent=parent)
                stop_walk = True
            elif child.tag == "fig":
                self._add_figure_captions(doc, parent, child)
                stop_walk = True
            elif child.tag == "table-wrap":
                self._add_tables(doc, parent, child)
                stop_walk = True
            elif child.tag == "supplementary-material":
                # supplementary material is left out of the document
                stop_walk = True
            elif child.tag == "fn-group":
                # header = child.xpath(".//title") or child.xpath(".//label")
                # if header:
                #     text = JatsDocumentBackend._get_text(header[0])
                #     fn_parent = doc.add_heading(text=text, parent=new_parent)
                # self._add_footnote_group(doc, fn_parent, child)
                stop_walk = True
            elif child.tag == "ref-list" and node.tag != "ref-list":
                header = child.xpath("title|label")
                text = (
                    JatsDocumentBackend._get_text(header[0])
                    if len(header) > 0
                    else DEFAULT_HEADER_REFERENCES
                )
                new_parent = doc.add_heading(text=text, parent=parent)
                new_parent = doc.add_group(
                    parent=new_parent, label=GroupLabel.LIST, name="list"
                )
            elif child.tag == "element-citation":
                text = self._parse_element_citation(child)
                self._add_citation(doc, parent, text)
                stop_walk = True
            elif child.tag == "mixed-citation":
                text = JatsDocumentBackend._get_text(child).strip()
                self._add_citation(doc, parent, text)
                stop_walk = True
            elif child.tag == "tex-math":
                self._add_equation(doc, parent, child)
                stop_walk = True
            elif child.tag == "inline-formula":
                # TODO: address inline formulas when supported by docling-core
                stop_walk = True

            # step into child
            if not stop_walk:
                new_text = self._walk_linear(doc, new_parent, child)
                if not drop_child_text:
                    node_text += new_text

            # pick up the tail text
            node_text += child.tail.replace("\n", " ") if child.tail else ""

        # create paragraph
        if node.tag == "p" and node_text.strip():
            doc.add_text(label=DocItemLabel.TEXT, text=node_text.strip(), parent=parent)
            return ""

        # backpropagate the text
        return node_text