docling 2.21.0__py3-none-any.whl → 2.23.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,592 +0,0 @@
- import logging
- from io import BytesIO
- from pathlib import Path
- from typing import Any, Set, Union
-
- import lxml
- from bs4 import BeautifulSoup
- from docling_core.types.doc import (
-     DocItemLabel,
-     DoclingDocument,
-     DocumentOrigin,
-     GroupLabel,
-     TableCell,
-     TableData,
- )
- from lxml import etree
- from typing_extensions import TypedDict, override
-
- from docling.backend.abstract_backend import DeclarativeDocumentBackend
- from docling.datamodel.base_models import InputFormat
- from docling.datamodel.document import InputDocument
-
- _log = logging.getLogger(__name__)
-
-
- class Paragraph(TypedDict):
-     text: str
-     headers: list[str]
-
-
- class Author(TypedDict):
-     name: str
-     affiliation_names: list[str]
-
-
- class Table(TypedDict):
-     label: str
-     caption: str
-     content: str
-
-
- class FigureCaption(TypedDict):
-     label: str
-     caption: str
-
-
- class Reference(TypedDict):
-     author_names: str
-     title: str
-     journal: str
-     year: str
-
-
- class XMLComponents(TypedDict):
-     title: str
-     authors: list[Author]
-     abstract: str
-     paragraphs: list[Paragraph]
-     tables: list[Table]
-     figure_captions: list[FigureCaption]
-     references: list[Reference]
-
-
- class PubMedDocumentBackend(DeclarativeDocumentBackend):
-     """
-     The code from this document backend has been developed by modifying parts of the PubMed Parser library (version 0.5.0, released on 12.08.2024):
-     Achakulvisut et al., (2020).
-     Pubmed Parser: A Python Parser for PubMed Open-Access XML Subset and MEDLINE XML Dataset.
-     Journal of Open Source Software, 5(46), 1979,
-     https://doi.org/10.21105/joss.01979
-     """
-
-     @override
-     def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
-         super().__init__(in_doc, path_or_stream)
-         self.path_or_stream = path_or_stream
-
-         # Initialize parents for the document hierarchy
-         self.parents: dict = {}
-
-         self.valid = False
-         try:
-             if isinstance(self.path_or_stream, BytesIO):
-                 self.path_or_stream.seek(0)
-             self.tree: lxml.etree._ElementTree = etree.parse(self.path_or_stream)
-             if "/NLM//DTD JATS" in self.tree.docinfo.public_id:
-                 self.valid = True
-         except Exception as exc:
-             raise RuntimeError(
-                 f"Could not initialize PubMed backend for file with hash {self.document_hash}."
-             ) from exc
-
-     @override
-     def is_valid(self) -> bool:
-         return self.valid
-
-     @classmethod
-     @override
-     def supports_pagination(cls) -> bool:
-         return False
-
-     @override
-     def unload(self):
-         if isinstance(self.path_or_stream, BytesIO):
-             self.path_or_stream.close()
-         self.path_or_stream = None
-
-     @classmethod
-     @override
-     def supported_formats(cls) -> Set[InputFormat]:
-         return {InputFormat.XML_PUBMED}
-
-     @override
-     def convert(self) -> DoclingDocument:
-         # Create empty document
-         origin = DocumentOrigin(
-             filename=self.file.name or "file",
-             mimetype="application/xml",
-             binary_hash=self.document_hash,
-         )
-         doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
-
-         _log.debug("Trying to convert PubMed XML document...")
-
-         # Get parsed XML components
-         xml_components: XMLComponents = self._parse()
-
-         # Add XML components to the document
-         doc = self._populate_document(doc, xml_components)
-         return doc
-
-     def _parse_title(self) -> str:
-         title: str = " ".join(
-             [
-                 t.replace("\n", "")
-                 for t in self.tree.xpath(".//title-group/article-title")[0].itertext()
-             ]
-         )
-         return title
-
-     def _parse_authors(self) -> list[Author]:
-         # Get mapping between affiliation ids and names
-         affiliation_names = []
-         for affiliation_node in self.tree.xpath(".//aff[@id]"):
-             affiliation_names.append(
-                 ": ".join([t for t in affiliation_node.itertext() if t != "\n"])
-             )
-         affiliation_ids_names = {
-             id: name
-             for id, name in zip(self.tree.xpath(".//aff[@id]/@id"), affiliation_names)
-         }
-
-         # Get author names and affiliation names
-         authors: list[Author] = []
-         for author_node in self.tree.xpath(
-             './/contrib-group/contrib[@contrib-type="author"]'
-         ):
-             author: Author = {
-                 "name": "",
-                 "affiliation_names": [],
-             }
-
-             # Affiliation names
-             affiliation_ids = [
-                 a.attrib["rid"] for a in author_node.xpath('xref[@ref-type="aff"]')
-             ]
-             for id in affiliation_ids:
-                 if id in affiliation_ids_names:
-                     author["affiliation_names"].append(affiliation_ids_names[id])
-
-             # Name
-             author["name"] = (
-                 author_node.xpath("name/surname")[0].text
-                 + " "
-                 + author_node.xpath("name/given-names")[0].text
-             )
-
-             authors.append(author)
-         return authors
-
-     def _parse_abstract(self) -> str:
-         texts = []
-         for abstract_node in self.tree.xpath(".//abstract"):
-             for text in abstract_node.itertext():
-                 texts.append(text.replace("\n", ""))
-         abstract: str = "".join(texts)
-         return abstract
-
-     def _parse_main_text(self) -> list[Paragraph]:
-         paragraphs: list[Paragraph] = []
-         for paragraph_node in self.tree.xpath("//body//p"):
-             # Skip captions
-             if "/caption" in paragraph_node.getroottree().getpath(paragraph_node):
-                 continue
-
-             paragraph: Paragraph = {"text": "", "headers": []}
-
-             # Text
-             paragraph["text"] = "".join(
-                 [t.replace("\n", "") for t in paragraph_node.itertext()]
-             )
-
-             # Header
-             path = "../title"
-             while len(paragraph_node.xpath(path)) > 0:
-                 paragraph["headers"].append(
-                     "".join(
-                         [
-                             t.replace("\n", "")
-                             for t in paragraph_node.xpath(path)[0].itertext()
-                         ]
-                     )
-                 )
-                 path = "../" + path
-
-             paragraphs.append(paragraph)
-
-         return paragraphs
-
-     def _parse_tables(self) -> list[Table]:
-         tables: list[Table] = []
-         for table_node in self.tree.xpath(".//body//table-wrap"):
-             table: Table = {"label": "", "caption": "", "content": ""}
-
-             # Content
-             if len(table_node.xpath("table")) > 0:
-                 table_content_node = table_node.xpath("table")[0]
-             elif len(table_node.xpath("alternatives/table")) > 0:
-                 table_content_node = table_node.xpath("alternatives/table")[0]
-             else:
-                 table_content_node = None
-             if table_content_node != None:
-                 table["content"] = etree.tostring(table_content_node).decode("utf-8")
-
-             # Caption
-             if len(table_node.xpath("caption/p")) > 0:
-                 caption_node = table_node.xpath("caption/p")[0]
-             elif len(table_node.xpath("caption/title")) > 0:
-                 caption_node = table_node.xpath("caption/title")[0]
-             else:
-                 caption_node = None
-             if caption_node != None:
-                 table["caption"] = "".join(
-                     [t.replace("\n", "") for t in caption_node.itertext()]
-                 )
-
-             # Label
-             if len(table_node.xpath("label")) > 0:
-                 table["label"] = table_node.xpath("label")[0].text
-
-             tables.append(table)
-         return tables
-
-     def _parse_figure_captions(self) -> list[FigureCaption]:
-         figure_captions: list[FigureCaption] = []
-
-         if not (self.tree.xpath(".//fig")):
-             return figure_captions
-
-         for figure_node in self.tree.xpath(".//fig"):
-             figure_caption: FigureCaption = {
-                 "caption": "",
-                 "label": "",
-             }
-
-             # Label
-             if figure_node.xpath("label"):
-                 figure_caption["label"] = "".join(
-                     [
-                         t.replace("\n", "")
-                         for t in figure_node.xpath("label")[0].itertext()
-                     ]
-                 )
-
-             # Caption
-             if figure_node.xpath("caption"):
-                 caption = ""
-                 for caption_node in figure_node.xpath("caption")[0].getchildren():
-                     caption += (
-                         "".join([t.replace("\n", "") for t in caption_node.itertext()])
-                         + "\n"
-                     )
-                 figure_caption["caption"] = caption
-
-             figure_captions.append(figure_caption)
-
-         return figure_captions
-
-     def _parse_references(self) -> list[Reference]:
-         references: list[Reference] = []
-         for reference_node_abs in self.tree.xpath(".//ref-list/ref"):
-             reference: Reference = {
-                 "author_names": "",
-                 "title": "",
-                 "journal": "",
-                 "year": "",
-             }
-             reference_node: Any = None
-             for tag in ["mixed-citation", "element-citation", "citation"]:
-                 if len(reference_node_abs.xpath(tag)) > 0:
-                     reference_node = reference_node_abs.xpath(tag)[0]
-                     break
-
-             if reference_node is None:
-                 continue
-
-             if all(
-                 not (ref_type in ["citation-type", "publication-type"])
-                 for ref_type in reference_node.attrib.keys()
-             ):
-                 continue
-
-             # Author names
-             names = []
-             if len(reference_node.xpath("name")) > 0:
-                 for name_node in reference_node.xpath("name"):
-                     name_str = " ".join(
-                         [t.text for t in name_node.getchildren() if (t.text != None)]
-                     )
-                     names.append(name_str)
-             elif len(reference_node.xpath("person-group")) > 0:
-                 for name_node in reference_node.xpath("person-group")[0]:
-                     name_str = (
-                         name_node.xpath("given-names")[0].text
-                         + " "
-                         + name_node.xpath("surname")[0].text
-                     )
-                     names.append(name_str)
-             reference["author_names"] = "; ".join(names)
-
-             # Title
-             if len(reference_node.xpath("article-title")) > 0:
-                 reference["title"] = " ".join(
-                     [
-                         t.replace("\n", " ")
-                         for t in reference_node.xpath("article-title")[0].itertext()
-                     ]
-                 )
-
-             # Journal
-             if len(reference_node.xpath("source")) > 0:
-                 reference["journal"] = reference_node.xpath("source")[0].text
-
-             # Year
-             if len(reference_node.xpath("year")) > 0:
-                 reference["year"] = reference_node.xpath("year")[0].text
-
-             if (
-                 not (reference_node.xpath("article-title"))
-                 and not (reference_node.xpath("journal"))
-                 and not (reference_node.xpath("year"))
-             ):
-                 reference["title"] = reference_node.text
-
-             references.append(reference)
-         return references
-
-     def _parse(self) -> XMLComponents:
-         """Parsing PubMed document."""
-         xml_components: XMLComponents = {
-             "title": self._parse_title(),
-             "authors": self._parse_authors(),
-             "abstract": self._parse_abstract(),
-             "paragraphs": self._parse_main_text(),
-             "tables": self._parse_tables(),
-             "figure_captions": self._parse_figure_captions(),
-             "references": self._parse_references(),
-         }
-         return xml_components
-
-     def _populate_document(
-         self, doc: DoclingDocument, xml_components: XMLComponents
-     ) -> DoclingDocument:
-         self._add_title(doc, xml_components)
-         self._add_authors(doc, xml_components)
-         self._add_abstract(doc, xml_components)
-         self._add_main_text(doc, xml_components)
-
-         if xml_components["tables"]:
-             self._add_tables(doc, xml_components)
-
-         if xml_components["figure_captions"]:
-             self._add_figure_captions(doc, xml_components)
-
-         self._add_references(doc, xml_components)
-         return doc
-
-     def _add_figure_captions(
-         self, doc: DoclingDocument, xml_components: XMLComponents
-     ) -> None:
-         self.parents["Figures"] = doc.add_heading(
-             parent=self.parents["Title"], text="Figures"
-         )
-         for figure_caption_xml_component in xml_components["figure_captions"]:
-             figure_caption_text = (
-                 figure_caption_xml_component["label"]
-                 + ": "
-                 + figure_caption_xml_component["caption"].strip()
-             )
-             fig_caption = doc.add_text(
-                 label=DocItemLabel.CAPTION, text=figure_caption_text
-             )
-             doc.add_picture(
-                 parent=self.parents["Figures"],
-                 caption=fig_caption,
-             )
-         return
-
-     def _add_title(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:
-         self.parents["Title"] = doc.add_text(
-             parent=None,
-             text=xml_components["title"],
-             label=DocItemLabel.TITLE,
-         )
-         return
-
-     def _add_authors(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:
-         authors_affiliations: list = []
-         for author in xml_components["authors"]:
-             authors_affiliations.append(author["name"])
-             authors_affiliations.append(", ".join(author["affiliation_names"]))
-         authors_affiliations_str = "; ".join(authors_affiliations)
-
-         doc.add_text(
-             parent=self.parents["Title"],
-             text=authors_affiliations_str,
-             label=DocItemLabel.PARAGRAPH,
-         )
-         return
-
-     def _add_abstract(
-         self, doc: DoclingDocument, xml_components: XMLComponents
-     ) -> None:
-         abstract_text: str = xml_components["abstract"]
-         self.parents["Abstract"] = doc.add_heading(
-             parent=self.parents["Title"], text="Abstract"
-         )
-         doc.add_text(
-             parent=self.parents["Abstract"],
-             text=abstract_text,
-             label=DocItemLabel.TEXT,
-         )
-         return
-
-     def _add_main_text(
-         self, doc: DoclingDocument, xml_components: XMLComponents
-     ) -> None:
-         added_headers: list = []
-         for paragraph in xml_components["paragraphs"]:
-             if not (paragraph["headers"]):
-                 continue
-
-             # Header
-             for i, header in enumerate(reversed(paragraph["headers"])):
-                 if header in added_headers:
-                     continue
-                 added_headers.append(header)
-
-                 if ((i - 1) >= 0) and list(reversed(paragraph["headers"]))[
-                     i - 1
-                 ] in self.parents:
-                     parent = self.parents[list(reversed(paragraph["headers"]))[i - 1]]
-                 else:
-                     parent = self.parents["Title"]
-
-                 self.parents[header] = doc.add_heading(parent=parent, text=header)
-
-             # Paragraph text
-             if paragraph["headers"][0] in self.parents:
-                 parent = self.parents[paragraph["headers"][0]]
-             else:
-                 parent = self.parents["Title"]
-
-             doc.add_text(parent=parent, label=DocItemLabel.TEXT, text=paragraph["text"])
-         return
-
-     def _add_references(
-         self, doc: DoclingDocument, xml_components: XMLComponents
-     ) -> None:
-         self.parents["References"] = doc.add_heading(
-             parent=self.parents["Title"], text="References"
-         )
-         current_list = doc.add_group(
-             parent=self.parents["References"], label=GroupLabel.LIST, name="list"
-         )
-         for reference in xml_components["references"]:
-             reference_text: str = ""
-             if reference["author_names"]:
-                 reference_text += reference["author_names"] + ". "
-
-             if reference["title"]:
-                 reference_text += reference["title"]
-                 if reference["title"][-1] != ".":
-                     reference_text += "."
-                 reference_text += " "
-
-             if reference["journal"]:
-                 reference_text += reference["journal"]
-
-             if reference["year"]:
-                 reference_text += " (" + reference["year"] + ")"
-
-             if not (reference_text):
-                 _log.debug(f"Skipping reference for: {str(self.file)}")
-                 continue
-
-             doc.add_list_item(
-                 text=reference_text, enumerated=False, parent=current_list
-             )
-         return
-
-     def _add_tables(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:
-         self.parents["Tables"] = doc.add_heading(
-             parent=self.parents["Title"], text="Tables"
-         )
-         for table_xml_component in xml_components["tables"]:
-             try:
-                 self._add_table(doc, table_xml_component)
-             except Exception as e:
-                 _log.debug(f"Skipping unsupported table for: {str(self.file)}")
-                 pass
-         return
-
-     def _add_table(self, doc: DoclingDocument, table_xml_component: Table) -> None:
-         soup = BeautifulSoup(table_xml_component["content"], "html.parser")
-         table_tag = soup.find("table")
-
-         nested_tables = table_tag.find("table")
-         if nested_tables:
-             _log.debug(f"Skipping nested table for: {str(self.file)}")
-             return
-
-         # Count the number of rows (number of <tr> elements)
-         num_rows = len(table_tag.find_all("tr"))
-
-         # Find the number of columns (taking into account colspan)
-         num_cols = 0
-         for row in table_tag.find_all("tr"):
-             col_count = 0
-             for cell in row.find_all(["td", "th"]):
-                 colspan = int(cell.get("colspan", 1))
-                 col_count += colspan
-             num_cols = max(num_cols, col_count)
-
-         grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
-
-         data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
-
-         # Iterate over the rows in the table
-         for row_idx, row in enumerate(table_tag.find_all("tr")):
-             # For each row, find all the column cells (both <td> and <th>)
-             cells = row.find_all(["td", "th"])
-
-             # Check if each cell in the row is a header -> means it is a column header
-             col_header = True
-             for j, html_cell in enumerate(cells):
-                 if html_cell.name == "td":
-                     col_header = False
-
-             # Extract and print the text content of each cell
-             col_idx = 0
-             for _, html_cell in enumerate(cells):
-                 text = html_cell.text
-
-                 col_span = int(html_cell.get("colspan", 1))
-                 row_span = int(html_cell.get("rowspan", 1))
-
-                 while grid[row_idx][col_idx] != None:
-                     col_idx += 1
-                 for r in range(row_span):
-                     for c in range(col_span):
-                         grid[row_idx + r][col_idx + c] = text
-
-                 cell = TableCell(
-                     text=text,
-                     row_span=row_span,
-                     col_span=col_span,
-                     start_row_offset_idx=row_idx,
-                     end_row_offset_idx=row_idx + row_span,
-                     start_col_offset_idx=col_idx,
-                     end_col_offset_idx=col_idx + col_span,
-                     col_header=col_header,
-                     row_header=((not col_header) and html_cell.name == "th"),
-                 )
-                 data.table_cells.append(cell)
-
-         table_caption = doc.add_text(
-             label=DocItemLabel.CAPTION,
-             text=table_xml_component["label"] + ": " + table_xml_component["caption"],
-         )
-         doc.add_table(data=data, parent=self.parents["Tables"], caption=table_caption)
-         return
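
For context, the removed backend registered itself for InputFormat.XML_PUBMED (see supported_formats above) and was normally reached through docling's converter front end rather than instantiated directly. A minimal usage sketch, assuming docling's public DocumentConverter API and a hypothetical local JATS/PubMed XML file name (neither appears in the diff itself):

    from docling.document_converter import DocumentConverter

    # Hypothetical input file; any JATS-conformant PubMed Open Access XML article would do.
    source = "PMC0000000.xml"

    # The converter picks the backend from the detected input format.
    converter = DocumentConverter()
    result = converter.convert(source)

    # The backend's convert() returns a DoclingDocument, exportable to Markdown.
    print(result.document.export_to_markdown())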