docling 2.22.0__py3-none-any.whl → 2.23.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,772 @@
1
+ import logging
2
+ import traceback
3
+ from io import BytesIO
4
+ from pathlib import Path
5
+ from typing import Final, Optional, Union
6
+
7
+ from bs4 import BeautifulSoup
8
+ from docling_core.types.doc import (
9
+ DocItemLabel,
10
+ DoclingDocument,
11
+ DocumentOrigin,
12
+ GroupItem,
13
+ GroupLabel,
14
+ NodeItem,
15
+ TableCell,
16
+ TableData,
17
+ TextItem,
18
+ )
19
+ from lxml import etree
20
+ from typing_extensions import TypedDict, override
21
+
22
+ from docling.backend.abstract_backend import DeclarativeDocumentBackend
23
+ from docling.datamodel.base_models import InputFormat
24
+ from docling.datamodel.document import InputDocument
25
+
26
# Module-level logger, named after this module.
_log = logging.getLogger(__name__)

# Keywords looked up inside DTD system URLs to recognise a JATS document.
JATS_DTD_URL: Final = [
    "JATS-journalpublishing",
    "JATS-archive",
]

# Fallback heading texts used when the XML does not provide a title/label.
DEFAULT_HEADER_ACKNOWLEDGMENTS: Final = "Acknowledgments"
DEFAULT_HEADER_ABSTRACT: Final = "Abstract"
DEFAULT_HEADER_REFERENCES: Final = "References"

# Fallback text for an <etal> element that carries no text of its own.
DEFAULT_TEXT_ETAL: Final = "et al."
33
+
34
+
35
class Abstract(TypedDict):
    """One abstract of the article, flattened to plain text."""

    # Heading of the abstract (empty string when the XML has none).
    label: str
    # Text of the abstract paragraphs joined into a single string.
    content: str
38
+
39
+
40
class Author(TypedDict):
    """One author of the article together with the affiliations resolved for it."""

    # Display name of the author.
    name: str
    # Names of the affiliations referenced by the author.
    affiliation_names: list[str]
43
+
44
+
45
class Citation(TypedDict):
    """Fields extracted from a bibliographic reference before it is flattened to text."""

    # Comma-separated author names.
    author_names: str
    # Title of the cited work.
    title: str
    # Source (e.g. journal or book) of the cited work.
    source: str
    # Publication year.
    year: str
    # Volume of the source.
    volume: str
    # Electronic location id, or first page optionally followed by last page.
    page: str
    # Publication identifiers, formatted as "TYPE: value" and comma-separated.
    pub_id: str
    # Name of the publisher.
    publisher_name: str
    # Location of the publisher.
    publisher_loc: str
55
+
56
+
57
class Table(TypedDict):
    """Raw pieces of a table wrapper gathered before the table is built."""

    # Label of the table (e.g. its number).
    label: str
    # Caption text of the table.
    caption: str
    # Serialized markup of the <table> element.
    content: str
61
+
62
+
63
class XMLComponents(TypedDict):
    """Metadata components parsed from the XML document."""

    # Title of the document.
    title: str
    # Authors listed in the document metadata.
    authors: list[Author]
    # Abstracts found in the document.
    abstract: list[Abstract]
67
+
68
+
69
+ class JatsDocumentBackend(DeclarativeDocumentBackend):
70
+ """Backend to parse articles in XML format tagged according to JATS definition.
71
+
72
+ The Journal Article Tag Suite (JATS) is a definition standard for the
73
+ representation of journal articles in XML format. Several publishers and journal
74
+ archives provide content in JATS format, including PubMed Central® (PMC), bioRxiv,
75
+ medRxiv, or Springer Nature.
76
+
77
+ Refer to https://jats.nlm.nih.gov for more details on JATS.
78
+
79
+ The code from this document backend has been developed by modifying parts of the
80
+ PubMed Parser library (version 0.5.0, released on 12.08.2024):
81
+ Achakulvisut et al., (2020).
82
+ Pubmed Parser: A Python Parser for PubMed Open-Access XML Subset and MEDLINE XML
83
+ Dataset XML Dataset.
84
+ Journal of Open Source Software, 5(46), 1979,
85
+ https://doi.org/10.21105/joss.01979
86
+ """
87
+
88
@override
def __init__(
    self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
) -> None:
    """Parse the XML input and check whether it declares a JATS DTD.

    The document is flagged as valid when one of the keywords in
    ``JATS_DTD_URL`` occurs in the system URL of the document type
    declaration, or in the system URL of an entity of the internal DTD subset.

    Args:
        in_doc: The input document this backend is created for.
        path_or_stream: Path or in-memory stream holding the XML content.

    Raises:
        RuntimeError: If parsing the input or inspecting its DTD fails.
    """
    super().__init__(in_doc, path_or_stream)
    self.path_or_stream = path_or_stream

    # Initialize the root of the document hierarchy
    self.root: Optional[NodeItem] = None

    self.valid = False
    try:
        if isinstance(self.path_or_stream, BytesIO):
            # make sure the stream is read from its beginning
            self.path_or_stream.seek(0)
        self.tree: etree._ElementTree = etree.parse(self.path_or_stream)

        doc_info: etree.DocInfo = self.tree.docinfo
        if doc_info.system_url and any(
            kwd in doc_info.system_url for kwd in JATS_DTD_URL
        ):
            self.valid = True
            return

        # A document without an internal DTD subset has no entities to
        # inspect: leave it flagged as invalid instead of failing.
        internal_dtd = doc_info.internalDTD
        if internal_dtd is None:
            return
        for ent in internal_dtd.iterentities():
            if ent.system_url and any(
                kwd in ent.system_url for kwd in JATS_DTD_URL
            ):
                self.valid = True
                return
    except Exception as exc:
        raise RuntimeError(
            f"Could not initialize JATS backend for file with hash {self.document_hash}."
        ) from exc
120
+
121
+ @override
122
+ def is_valid(self) -> bool:
123
+ return self.valid
124
+
125
+ @classmethod
126
+ @override
127
+ def supports_pagination(cls) -> bool:
128
+ return False
129
+
130
+ @override
131
+ def unload(self):
132
+ if isinstance(self.path_or_stream, BytesIO):
133
+ self.path_or_stream.close()
134
+ self.path_or_stream = None
135
+
136
@classmethod
@override
def supported_formats(cls) -> set[InputFormat]:
    """Return the set of input formats handled by this backend (JATS XML only)."""
    return {InputFormat.XML_JATS}
140
+
141
@override
def convert(self) -> DoclingDocument:
    """Convert the parsed XML tree into a DoclingDocument.

    The metadata (title, authors, abstracts) is added first, followed by the
    content of the ``body`` and ``back`` elements. Errors raised while
    filling the document are logged and the document built so far is
    returned.

    Returns:
        The (possibly partially filled) document.
    """
    # Create the empty document outside of the guarded section, so that the
    # final ``return doc`` can never hit an unbound name.
    origin = DocumentOrigin(
        filename=self.file.name or "file",
        mimetype="application/xml",
        binary_hash=self.document_hash,
    )
    doc = DoclingDocument(name=self.file.stem or "file", origin=origin)

    try:
        # Get metadata XML components
        xml_components: XMLComponents = self._parse_metadata()

        # Add metadata to the document
        self._add_metadata(doc, xml_components)

        # walk over the XML body
        body = self.tree.xpath("//body")
        if self.root and len(body) > 0:
            self._walk_linear(doc, self.root, body[0])

        # walk over the XML back matter
        back = self.tree.xpath("//back")
        if self.root and len(back) > 0:
            self._walk_linear(doc, self.root, back[0])
    except Exception:
        # best effort: keep whatever has been converted so far
        _log.error(traceback.format_exc())

    return doc
171
+
172
+ @staticmethod
173
+ def _get_text(node: etree._Element, sep: Optional[str] = None) -> str:
174
+ skip_tags = ["term", "disp-formula", "inline-formula"]
175
+ text: str = (
176
+ node.text.replace("\n", " ")
177
+ if (node.tag not in skip_tags and node.text)
178
+ else ""
179
+ )
180
+ for child in list(node):
181
+ if child.tag not in skip_tags:
182
+ # TODO: apply styling according to child.tag when supported by docling-core
183
+ text += JatsDocumentBackend._get_text(child, sep)
184
+ if sep:
185
+ text = text.rstrip(sep) + sep
186
+ text += child.tail.replace("\n", " ") if child.tail else ""
187
+
188
+ return text
189
+
190
+ def _find_metadata(self) -> Optional[etree._Element]:
191
+ meta_names: list[str] = ["article-meta", "book-part-meta"]
192
+ meta: Optional[etree._Element] = None
193
+ for name in meta_names:
194
+ node = self.tree.xpath(f".//{name}")
195
+ if len(node) > 0:
196
+ meta = node[0]
197
+ break
198
+
199
+ return meta
200
+
201
def _parse_abstract(self) -> list[Abstract]:
    """Extract the abstracts of the document.

    For every ``abstract`` element, the text of its direct ``p`` children is
    joined with spaces, and the first direct ``title`` or ``label`` child
    provides the label.

    Returns:
        One entry per ``abstract`` element, in document order.
    """
    # TODO: address cases with multiple sections
    abs_list: list[Abstract] = []

    for abs_node in self.tree.xpath(".//abstract"):
        texts = [
            JatsDocumentBackend._get_text(abs_par).strip()
            for abs_par in abs_node.xpath("p")
        ]
        abstract: Abstract = {"label": "", "content": " ".join(texts)}

        label_node = abs_node.xpath("title|label")
        if len(label_node) > 0:
            # The label text may be wrapped in inline markup, in which case
            # the element's own ``.text`` is None: collect the full text.
            abstract["label"] = JatsDocumentBackend._get_text(label_node[0]).strip()

        abs_list.append(abstract)

    return abs_list
219
+
220
def _parse_authors(self) -> list[Author]:
    """Extract the authors and their affiliations from the metadata element.

    Contributors without any given name or surname are skipped, and
    affiliation references that cannot be resolved are ignored.

    Returns:
        The authors in document order; empty when no metadata element exists.
    """
    authors: list[Author] = []
    meta: Optional[etree._Element] = self._find_metadata()
    if meta is None:
        return authors

    # Get mapping between affiliation ids and names
    affiliation_ids_names: dict[str, str] = {}
    for affiliation_node in meta.xpath(".//aff[@id]"):
        aff = ", ".join([t for t in affiliation_node.itertext() if t.strip()])
        aff = aff.replace("\n", " ")
        label = affiliation_node.xpath("label")
        if label:
            # TODO: once superscript is supported, add label with formatting
            aff = aff.removeprefix(f"{label[0].text}, ")
        affiliation_ids_names[affiliation_node.get("id")] = aff

    # Get author names and affiliation names
    for author_node in meta.xpath(
        './/contrib-group/contrib[@contrib-type="author"]'
    ):
        # Name: keep whichever of given names / surname is available
        name_parts: list[str] = []
        for name_path in ("name/given-names", "name/surname"):
            found = author_node.xpath(name_path)
            if len(found) > 0 and found[0].text:
                name_parts.append(found[0].text)
        if not name_parts:
            # e.g. a collaboration listed without a personal name
            continue

        # Affiliation names
        affiliation_refs = [
            xref.get("rid") for xref in author_node.xpath('xref[@ref-type="aff"]')
        ]
        author: Author = {
            "name": " ".join(name_parts),
            "affiliation_names": [
                affiliation_ids_names[ref]
                for ref in affiliation_refs
                if ref in affiliation_ids_names
            ],
        }

        authors.append(author)

    return authors
268
+
269
def _parse_title(self) -> str:
    """Build the document title from the ``title-group`` elements.

    For each ``title-group`` found under a metadata element, the text of its
    direct ``article-title``, ``subtitle``, ``title`` and ``label`` children
    is joined with spaces. The resulting titles are joined with " - ".

    Returns:
        The title text; an empty string when no title group exists.
    """
    meta_names: list[str] = [
        "article-meta",
        "collection-meta",
        "book-meta",
        "book-part-meta",
    ]
    title_names: list[str] = ["article-title", "subtitle", "title", "label"]
    title_groups = self.tree.xpath(
        "|".join([f".//{item}/title-group" for item in meta_names])
    )

    titles: list[str] = []
    for title_node in title_groups:
        # ``_get_text`` copes with elements whose own ``.text`` is None and
        # keeps the text found inside inline markup.
        parts = [
            JatsDocumentBackend._get_text(elem).strip()
            for elem in title_node
            if elem.tag in title_names
        ]
        titles.append(" ".join(parts).strip())

    return " - ".join(titles)
291
+
292
+ def _parse_metadata(self) -> XMLComponents:
293
+ """Parsing JATS document metadata."""
294
+ xml_components: XMLComponents = {
295
+ "title": self._parse_title(),
296
+ "authors": self._parse_authors(),
297
+ "abstract": self._parse_abstract(),
298
+ }
299
+ return xml_components
300
+
301
def _add_abstract(
    self, doc: DoclingDocument, xml_components: XMLComponents
) -> None:
    """Add every non-empty abstract as a heading followed by a text item.

    Abstracts without a label get the default abstract heading.
    """
    for entry in xml_components["abstract"]:
        body: str = entry["content"]
        if not body:
            # nothing to show for this abstract
            continue
        heading_text: str = entry["label"] or DEFAULT_HEADER_ABSTRACT
        heading = doc.add_heading(parent=self.root, text=heading_text)
        doc.add_text(parent=heading, text=body, label=DocItemLabel.TEXT)
318
+
319
def _add_authors(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:
    """Add one paragraph with the author names and one with their affiliations.

    Author names are joined with ", ". Affiliations are de-duplicated,
    keeping their first-seen order, and joined with "; ". Empty paragraphs
    are not added.
    """
    # TODO: once docling supports text formatting, add affiliation reference to
    # author names through superscripts
    author_entries = xml_components["authors"]
    names_line = ", ".join([entry["name"] for entry in author_entries])

    seen_affiliations: dict = {}
    for entry in author_entries:
        for affiliation in entry["affiliation_names"]:
            seen_affiliations.setdefault(affiliation, None)
    affiliations_line = "; ".join(seen_affiliations)

    for line in (names_line, affiliations_line):
        if line:
            doc.add_text(
                parent=self.root,
                text=line,
                label=DocItemLabel.PARAGRAPH,
            )
344
+
345
def _add_citation(self, doc: DoclingDocument, parent: NodeItem, text: str) -> None:
    """Add a citation as a list item when the parent is a list group,
    otherwise as a plain text item."""
    parent_is_list = isinstance(parent, GroupItem) and parent.label == GroupLabel.LIST
    if not parent_is_list:
        doc.add_text(text=text, label=DocItemLabel.TEXT, parent=parent)
        return

    doc.add_list_item(text=text, enumerated=False, parent=parent)
352
+
353
def _parse_element_citation(self, node: etree._Element) -> str:
    """Flatten an ``element-citation`` node into a single reference string.

    Missing or empty sub-elements are skipped instead of raising, so that a
    single incomplete reference does not interrupt the conversion.

    Args:
        node: The ``element-citation`` element.

    Returns:
        The citation rendered as text.
    """

    def clean(element: Optional[etree._Element]) -> str:
        # Own text of the element with newlines flattened; "" when missing.
        raw = element.text if element is not None else None
        return raw.replace("\n", " ").strip() if raw else ""

    def first(scope: etree._Element, expr: str) -> Optional[etree._Element]:
        # First match of the XPath expression, or None.
        found = scope.xpath(expr)
        return found[0] if len(found) > 0 else None

    citation: Citation = {
        "author_names": "",
        "title": "",
        "source": "",
        "year": "",
        "volume": "",
        "page": "",
        "pub_id": "",
        "publisher_name": "",
        "publisher_loc": "",
    }

    _log.debug("Citation parsing started")

    # Author names
    names = []
    for name_node in node.xpath(".//name"):
        surname = clean(first(name_node, "surname"))
        given_names = clean(first(name_node, "given-names"))
        name_str = " ".join(part for part in (surname, given_names) if part)
        if name_str:
            names.append(name_str)
    etal_node = node.xpath(".//etal")
    if len(etal_node) > 0:
        etal_text = etal_node[0].text or DEFAULT_TEXT_ETAL
        names.append(etal_text)
    citation["author_names"] = ", ".join(names)

    # Title: first available title element, else the node's own text
    titles: list[str] = [
        "article-title",
        "chapter-title",
        "data-title",
        "issue-title",
        "part-title",
        "trans-title",
    ]
    title_node: Optional[etree._Element] = None
    for name in titles:
        title_node = first(node, name)
        if title_node is not None:
            break
    citation["title"] = (
        JatsDocumentBackend._get_text(title_node)
        if title_node is not None
        else clean(node)
    )

    # Journal, year, publisher name, publisher location, volume, elocation
    fields: list[str] = [
        "source",
        "year",
        "publisher-name",
        "publisher-loc",
        "volume",
    ]
    for item in fields:
        item_node = first(node, item)
        if item_node is not None:
            citation[item.replace("-", "_")] = clean(item_node)  # type: ignore[literal-required]

    # Publication identifier
    pub_id: list[str] = []
    for id_node in node.xpath("pub-id"):
        id_type = id_node.get("assigning-authority") or id_node.get("pub-id-type")
        id_text = id_node.text
        if id_type and id_text:
            pub_id.append(
                id_type.replace("\n", " ").strip().upper()
                + ": "
                + id_text.replace("\n", " ").strip()
            )
    if pub_id:
        citation["pub_id"] = ", ".join(pub_id)

    # Pages
    elocation_node = first(node, "elocation-id")
    first_page_node = first(node, "fpage")
    if elocation_node is not None:
        citation["page"] = clean(elocation_node)
    elif first_page_node is not None:
        citation["page"] = clean(first_page_node)
        last_page = clean(first(node, "lpage"))
        if last_page:
            citation["page"] += "–" + last_page

    # Flatten the citation to string

    text = ""
    if citation["author_names"]:
        text += citation["author_names"].rstrip(".") + ". "
    if citation["title"]:
        text += citation["title"] + ". "
    if citation["source"]:
        text += citation["source"] + ". "
    if citation["publisher_name"]:
        if citation["publisher_loc"]:
            text += f"{citation['publisher_loc']}: "
        text += citation["publisher_name"] + ". "
    if citation["volume"]:
        text = text.rstrip(". ")
        text += f" {citation['volume']}. "
    if citation["page"]:
        text = text.rstrip(". ")
        if citation["volume"]:
            text += ":"
        text += citation["page"] + ". "
    if citation["year"]:
        text = text.rstrip(". ")
        text += f" ({citation['year']})."
    if citation["pub_id"]:
        text = text.rstrip(".") + ". "
        text += citation["pub_id"]

    _log.debug("Citation flattened")

    return text
478
+
479
def _add_equation(
    self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
) -> None:
    """Add the formula enclosed between two ``$$`` markers as a formula item.

    Nothing is added when the node has no text or when its text does not
    consist of exactly one ``$$``-delimited formula.
    """
    math_text = node.text
    if not math_text:
        # an element without own text cannot hold a delimited formula
        return

    math_parts = math_text.split("$$")
    if len(math_parts) == 3:
        math_formula = math_parts[1]
        doc.add_text(label=DocItemLabel.FORMULA, text=math_formula, parent=parent)

    return
489
+
490
def _add_figure_captions(
    self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
) -> None:
    """Add a picture item for a figure, with its label and caption as caption.

    The caption text is made of the figure label followed by the text of the
    caption children, leaving out those that contain supplementary material.
    A figure with neither label nor caption text gets no caption item.
    """
    label_node = node.xpath("label")
    label: str = (
        JatsDocumentBackend._get_text(label_node[0]).strip() if label_node else ""
    )

    # Default to an empty string: a ``None`` caption would be rendered as the
    # literal text "None" by the f-string below.
    caption: str = ""
    caption_node = node.xpath("caption")
    if len(caption_node) > 0:
        for caption_par in list(caption_node[0]):
            if caption_par.xpath(".//supplementary-material"):
                continue
            caption += JatsDocumentBackend._get_text(caption_par).strip() + " "
        caption = caption.strip()

    # TODO: format label vs caption once styling is supported
    fig_text: str = f"{label}{' ' if label and caption else ''}{caption}"
    fig_caption: Optional[TextItem] = (
        doc.add_text(label=DocItemLabel.CAPTION, text=fig_text)
        if fig_text
        else None
    )

    doc.add_picture(parent=parent, caption=fig_caption)

    return
521
+
522
+ # TODO: add footnotes when DocItemLabel.FOOTNOTE and styling are supported
523
+ # def _add_footnote_group(self, doc: DoclingDocument, parent: NodeItem, node: etree._Element) -> None:
524
+ # new_parent = doc.add_group(label=GroupLabel.LIST, name="footnotes", parent=parent)
525
+ # for child in node.iterchildren(tag="fn"):
526
+ # text = JatsDocumentBackend._get_text(child)
527
+ # doc.add_list_item(text=text, parent=new_parent)
528
+
529
+ def _add_metadata(
530
+ self, doc: DoclingDocument, xml_components: XMLComponents
531
+ ) -> None:
532
+ self._add_title(doc, xml_components)
533
+ self._add_authors(doc, xml_components)
534
+ self._add_abstract(doc, xml_components)
535
+
536
+ return
537
+
538
def _add_table(
    self, doc: DoclingDocument, parent: NodeItem, table_xml_component: Table
) -> None:
    """Build a table item from serialized table markup and add it to the document.

    Tables without a ``table`` element and tables containing nested tables
    are skipped with a warning. Label and caption are combined into a
    caption item attached to the table.

    Args:
        doc: The document to add the table to.
        parent: The parent item of the new table.
        table_xml_component: Label, caption and markup of the table.
    """
    soup = BeautifulSoup(table_xml_component["content"], "html.parser")
    table_tag = soup.find("table")
    if table_tag is None:
        _log.warning(f"Skipping table without <table> element in {str(self.file)}")
        return

    nested_tables = table_tag.find("table")
    if nested_tables:
        _log.warning(f"Skipping nested table in {str(self.file)}")
        return

    # Count the number of rows (number of <tr> elements)
    rows = table_tag.find_all("tr")
    num_rows = len(rows)

    # Find the number of columns (taking into account colspan)
    num_cols = 0
    for row in rows:
        col_count = sum(
            int(cell.get("colspan", 1)) for cell in row.find_all(["td", "th"])
        )
        num_cols = max(num_cols, col_count)

    # Occupancy grid: a non-None entry marks a position already covered by
    # a cell, possibly one spanning from a previous row or column.
    grid: list[list[Optional[str]]] = [
        [None for _ in range(num_cols)] for _ in range(num_rows)
    ]

    data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])

    # Iterate over the rows in the table
    for row_idx, row in enumerate(rows):
        # For each row, find all the column cells (both <td> and <th>)
        cells = row.find_all(["td", "th"])

        # A row without any <td> cell is considered a row of column headers
        col_header = all(html_cell.name != "td" for html_cell in cells)

        col_idx = 0
        for html_cell in cells:
            # extract inline formulas
            for formula in html_cell.find_all("inline-formula"):
                math_parts = formula.text.split("$$")
                if len(math_parts) == 3:
                    math_formula = f"$${math_parts[1]}$$"
                    formula.replace_with(math_formula)
            text = html_cell.text

            col_span = int(html_cell.get("colspan", 1))
            row_span = int(html_cell.get("rowspan", 1))

            # move to the next free position of the row
            while grid[row_idx][col_idx] is not None:
                col_idx += 1
            for r in range(row_span):
                for c in range(col_span):
                    grid[row_idx + r][col_idx + c] = text

            cell = TableCell(
                text=text,
                row_span=row_span,
                col_span=col_span,
                start_row_offset_idx=row_idx,
                end_row_offset_idx=row_idx + row_span,
                start_col_offset_idx=col_idx,
                end_col_offset_idx=col_idx + col_span,
                col_header=col_header,
                row_header=((not col_header) and html_cell.name == "th"),
            )
            data.table_cells.append(cell)

    # TODO: format label vs caption once styling is supported
    label = table_xml_component["label"]
    caption = table_xml_component["caption"]
    table_text: str = f"{label}{' ' if label and caption else ''}{caption}"
    table_caption: Optional[TextItem] = (
        doc.add_text(label=DocItemLabel.CAPTION, text=table_text)
        if table_text
        else None
    )

    doc.add_table(data=data, parent=parent, caption=table_caption)

    return
622
+
623
def _add_tables(
    self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
) -> None:
    """Collect markup, caption and label of a table wrapper and add the table.

    The table markup is taken from a direct ``table`` child or, failing
    that, from ``alternatives/table``. Tables that cannot be built are
    skipped with a warning.
    """
    table: Table = {"label": "", "caption": "", "content": ""}

    # Content
    table_content_node: Optional[etree._Element] = None
    for table_path in ("table", "alternatives/table"):
        candidates = node.xpath(table_path)
        if len(candidates) > 0:
            table_content_node = candidates[0]
            break
    if table_content_node is not None:
        table["content"] = etree.tostring(table_content_node).decode("utf-8")

    # Caption
    caption_node = node.xpath("caption")
    if caption_node:
        caption = ""
        for caption_par in list(caption_node[0]):
            if caption_par.xpath(".//supplementary-material"):
                continue
            caption += JatsDocumentBackend._get_text(caption_par).strip() + " "
        table["caption"] = caption.strip()

    # Label: collect the full text, since the element's own ``.text`` is None
    # when the label is wrapped in inline markup (it would otherwise end up
    # as the literal text "None" in the table caption).
    label_node = node.xpath("label")
    if len(label_node) > 0:
        table["label"] = JatsDocumentBackend._get_text(label_node[0]).strip()

    try:
        self._add_table(doc, parent, table)
    except Exception:
        # best effort: a table that cannot be built must not stop the walk
        _log.warning(f"Skipping unsupported table in {str(self.file)}")

    return
664
+
665
def _add_title(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:
    """Add the document title at the top level and keep it as the root item
    under which the rest of the content is attached."""
    title_text = xml_components["title"]
    self.root = doc.add_text(parent=None, text=title_text, label=DocItemLabel.TITLE)
672
+
673
def _walk_linear(
    self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
) -> str:
    """Walk the children of a node in document order and fill the document.

    Headings, lists, figures, tables, citations and formulas are added as
    they are met. The text of paragraph nodes is added as text items; the
    text of any other node is returned to the caller.

    Args:
        doc: The document being filled.
        parent: The item under which new items are added.
        node: The XML element to walk.

    Returns:
        An empty string when the node is a paragraph whose text has been
        added to the document, otherwise the text collected from the node.
    """
    # _log.debug(f"Walking on {node.tag} with {len(list(node))} children")
    skip_tags = ["term"]
    flush_tags = ["ack", "sec", "list", "boxed-text", "disp-formula", "fig"]
    new_parent: NodeItem = parent
    node_text: str = (
        node.text.replace("\n", " ")
        if (node.tag not in skip_tags and node.text)
        else ""
    )

    # Text returned by the children is dropped when this node is one of the
    # flush tags sitting directly inside a paragraph. The root element has
    # no parent, hence the None check.
    node_parent = node.getparent()
    drop_child_text: bool = (
        node_parent is not None
        and node_parent.tag == "p"
        and node.tag in flush_tags
    )

    for child in list(node):
        stop_walk: bool = False

        # flush text into TextItem for some tags in paragraph nodes
        if node.tag == "p" and node_text.strip() and child.tag in flush_tags:
            doc.add_text(
                label=DocItemLabel.TEXT, text=node_text.strip(), parent=parent
            )
            node_text = ""

        # add elements and decide whether to stop walking
        if child.tag in ("sec", "ack"):
            header = child.xpath("title|label")
            text: Optional[str] = None
            if len(header) > 0:
                text = JatsDocumentBackend._get_text(header[0])
            elif child.tag == "ack":
                text = DEFAULT_HEADER_ACKNOWLEDGMENTS
            if text:
                new_parent = doc.add_heading(text=text, parent=parent)
        elif child.tag == "list":
            new_parent = doc.add_group(
                label=GroupLabel.LIST, name="list", parent=parent
            )
        elif child.tag == "list-item":
            # TODO: address any type of content (another list, formula,...)
            # TODO: address list type and item label
            text = JatsDocumentBackend._get_text(child).strip()
            new_parent = doc.add_list_item(text=text, parent=parent)
            stop_walk = True
        elif child.tag == "fig":
            self._add_figure_captions(doc, parent, child)
            stop_walk = True
        elif child.tag == "table-wrap":
            self._add_tables(doc, parent, child)
            stop_walk = True
        elif child.tag == "supplementary-material":
            # supplementary material is not converted
            stop_walk = True
        elif child.tag == "fn-group":
            # TODO: add footnotes when DocItemLabel.FOOTNOTE and styling are
            # supported
            stop_walk = True
        elif child.tag == "ref-list" and node.tag != "ref-list":
            header = child.xpath("title|label")
            text = (
                JatsDocumentBackend._get_text(header[0])
                if len(header) > 0
                else DEFAULT_HEADER_REFERENCES
            )
            new_parent = doc.add_heading(text=text, parent=parent)
            new_parent = doc.add_group(
                parent=new_parent, label=GroupLabel.LIST, name="list"
            )
        elif child.tag == "element-citation":
            text = self._parse_element_citation(child)
            self._add_citation(doc, parent, text)
            stop_walk = True
        elif child.tag == "mixed-citation":
            text = JatsDocumentBackend._get_text(child).strip()
            self._add_citation(doc, parent, text)
            stop_walk = True
        elif child.tag == "tex-math":
            self._add_equation(doc, parent, child)
            stop_walk = True
        elif child.tag == "inline-formula":
            # TODO: address inline formulas when supported by docling-core
            stop_walk = True

        # step into child
        if not stop_walk:
            new_text = self._walk_linear(doc, new_parent, child)
            if not drop_child_text:
                node_text += new_text

        # pick up the tail text
        node_text += child.tail.replace("\n", " ") if child.tail else ""

    # create paragraph
    if node.tag == "p" and node_text.strip():
        doc.add_text(label=DocItemLabel.TEXT, text=node_text.strip(), parent=parent)
        return ""
    else:
        # backpropagate the text
        return node_text