aurelian-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aurelian/__init__.py +9 -0
- aurelian/agents/__init__.py +0 -0
- aurelian/agents/amigo/__init__.py +3 -0
- aurelian/agents/amigo/amigo_agent.py +77 -0
- aurelian/agents/amigo/amigo_config.py +85 -0
- aurelian/agents/amigo/amigo_evals.py +73 -0
- aurelian/agents/amigo/amigo_gradio.py +52 -0
- aurelian/agents/amigo/amigo_mcp.py +152 -0
- aurelian/agents/amigo/amigo_tools.py +152 -0
- aurelian/agents/biblio/__init__.py +42 -0
- aurelian/agents/biblio/biblio_agent.py +95 -0
- aurelian/agents/biblio/biblio_config.py +40 -0
- aurelian/agents/biblio/biblio_gradio.py +67 -0
- aurelian/agents/biblio/biblio_mcp.py +115 -0
- aurelian/agents/biblio/biblio_tools.py +164 -0
- aurelian/agents/biblio_agent.py +46 -0
- aurelian/agents/checklist/__init__.py +44 -0
- aurelian/agents/checklist/checklist_agent.py +86 -0
- aurelian/agents/checklist/checklist_config.py +28 -0
- aurelian/agents/checklist/checklist_gradio.py +70 -0
- aurelian/agents/checklist/checklist_mcp.py +86 -0
- aurelian/agents/checklist/checklist_tools.py +141 -0
- aurelian/agents/checklist/content/checklists.yaml +7 -0
- aurelian/agents/checklist/content/streams.csv +136 -0
- aurelian/agents/checklist_agent.py +40 -0
- aurelian/agents/chemistry/__init__.py +3 -0
- aurelian/agents/chemistry/chemistry_agent.py +47 -0
- aurelian/agents/chemistry/chemistry_config.py +71 -0
- aurelian/agents/chemistry/chemistry_evals.py +79 -0
- aurelian/agents/chemistry/chemistry_gradio.py +50 -0
- aurelian/agents/chemistry/chemistry_mcp.py +120 -0
- aurelian/agents/chemistry/chemistry_tools.py +121 -0
- aurelian/agents/chemistry/image_agent.py +15 -0
- aurelian/agents/d4d/__init__.py +30 -0
- aurelian/agents/d4d/d4d_agent.py +73 -0
- aurelian/agents/d4d/d4d_config.py +46 -0
- aurelian/agents/d4d/d4d_gradio.py +58 -0
- aurelian/agents/d4d/d4d_mcp.py +71 -0
- aurelian/agents/d4d/d4d_tools.py +157 -0
- aurelian/agents/d4d_agent.py +64 -0
- aurelian/agents/diagnosis/__init__.py +33 -0
- aurelian/agents/diagnosis/diagnosis_agent.py +54 -0
- aurelian/agents/diagnosis/diagnosis_config.py +48 -0
- aurelian/agents/diagnosis/diagnosis_evals.py +76 -0
- aurelian/agents/diagnosis/diagnosis_gradio.py +52 -0
- aurelian/agents/diagnosis/diagnosis_mcp.py +141 -0
- aurelian/agents/diagnosis/diagnosis_tools.py +204 -0
- aurelian/agents/diagnosis_agent.py +28 -0
- aurelian/agents/draw/__init__.py +3 -0
- aurelian/agents/draw/draw_agent.py +39 -0
- aurelian/agents/draw/draw_config.py +26 -0
- aurelian/agents/draw/draw_gradio.py +50 -0
- aurelian/agents/draw/draw_mcp.py +94 -0
- aurelian/agents/draw/draw_tools.py +100 -0
- aurelian/agents/draw/judge_agent.py +18 -0
- aurelian/agents/filesystem/__init__.py +0 -0
- aurelian/agents/filesystem/filesystem_config.py +27 -0
- aurelian/agents/filesystem/filesystem_gradio.py +49 -0
- aurelian/agents/filesystem/filesystem_mcp.py +89 -0
- aurelian/agents/filesystem/filesystem_tools.py +95 -0
- aurelian/agents/filesystem/py.typed +0 -0
- aurelian/agents/github/__init__.py +0 -0
- aurelian/agents/github/github_agent.py +83 -0
- aurelian/agents/github/github_cli.py +248 -0
- aurelian/agents/github/github_config.py +22 -0
- aurelian/agents/github/github_gradio.py +152 -0
- aurelian/agents/github/github_mcp.py +252 -0
- aurelian/agents/github/github_tools.py +408 -0
- aurelian/agents/github/github_tools.py.tmp +413 -0
- aurelian/agents/goann/__init__.py +13 -0
- aurelian/agents/goann/documents/Transcription_Factors_Annotation_Guidelines.md +1000 -0
- aurelian/agents/goann/documents/Transcription_Factors_Annotation_Guidelines.pdf +0 -0
- aurelian/agents/goann/documents/Transcription_Factors_Annotation_Guidelines_Paper.md +693 -0
- aurelian/agents/goann/documents/Transcription_Factors_Annotation_Guidelines_Paper.pdf +0 -0
- aurelian/agents/goann/goann_agent.py +90 -0
- aurelian/agents/goann/goann_config.py +90 -0
- aurelian/agents/goann/goann_evals.py +104 -0
- aurelian/agents/goann/goann_gradio.py +62 -0
- aurelian/agents/goann/goann_mcp.py +0 -0
- aurelian/agents/goann/goann_tools.py +65 -0
- aurelian/agents/gocam/__init__.py +52 -0
- aurelian/agents/gocam/documents/DNA-binding transcription factor activity annotation guidelines.docx +0 -0
- aurelian/agents/gocam/documents/DNA-binding transcription factor activity annotation guidelines.pdf +0 -0
- aurelian/agents/gocam/documents/DNA-binding_transcription_factor_activity_annotation_guidelines.md +100 -0
- aurelian/agents/gocam/documents/E3 ubiquitin ligases.docx +0 -0
- aurelian/agents/gocam/documents/E3 ubiquitin ligases.pdf +0 -0
- aurelian/agents/gocam/documents/E3_ubiquitin_ligases.md +134 -0
- aurelian/agents/gocam/documents/GO-CAM annotation guidelines README.docx +0 -0
- aurelian/agents/gocam/documents/GO-CAM annotation guidelines README.pdf +0 -0
- aurelian/agents/gocam/documents/GO-CAM modelling guidelines TO DO.docx +0 -0
- aurelian/agents/gocam/documents/GO-CAM modelling guidelines TO DO.pdf +0 -0
- aurelian/agents/gocam/documents/GO-CAM_annotation_guidelines_README.md +1 -0
- aurelian/agents/gocam/documents/GO-CAM_modelling_guidelines_TO_DO.md +3 -0
- aurelian/agents/gocam/documents/How to annotate complexes in GO-CAM.docx +0 -0
- aurelian/agents/gocam/documents/How to annotate complexes in GO-CAM.pdf +0 -0
- aurelian/agents/gocam/documents/How to annotate molecular adaptors.docx +0 -0
- aurelian/agents/gocam/documents/How to annotate molecular adaptors.pdf +0 -0
- aurelian/agents/gocam/documents/How to annotate sequestering proteins.docx +0 -0
- aurelian/agents/gocam/documents/How to annotate sequestering proteins.pdf +0 -0
- aurelian/agents/gocam/documents/How_to_annotate_complexes_in_GO-CAM.md +29 -0
- aurelian/agents/gocam/documents/How_to_annotate_molecular_adaptors.md +31 -0
- aurelian/agents/gocam/documents/How_to_annotate_sequestering_proteins.md +42 -0
- aurelian/agents/gocam/documents/Molecular adaptor activity.docx +0 -0
- aurelian/agents/gocam/documents/Molecular adaptor activity.pdf +0 -0
- aurelian/agents/gocam/documents/Molecular carrier activity.docx +0 -0
- aurelian/agents/gocam/documents/Molecular carrier activity.pdf +0 -0
- aurelian/agents/gocam/documents/Molecular_adaptor_activity.md +51 -0
- aurelian/agents/gocam/documents/Molecular_carrier_activity.md +41 -0
- aurelian/agents/gocam/documents/Protein sequestering activity.docx +0 -0
- aurelian/agents/gocam/documents/Protein sequestering activity.pdf +0 -0
- aurelian/agents/gocam/documents/Protein_sequestering_activity.md +50 -0
- aurelian/agents/gocam/documents/Signaling receptor activity annotation guidelines.docx +0 -0
- aurelian/agents/gocam/documents/Signaling receptor activity annotation guidelines.pdf +0 -0
- aurelian/agents/gocam/documents/Signaling_receptor_activity_annotation_guidelines.md +187 -0
- aurelian/agents/gocam/documents/Transcription coregulator activity.docx +0 -0
- aurelian/agents/gocam/documents/Transcription coregulator activity.pdf +0 -0
- aurelian/agents/gocam/documents/Transcription_coregulator_activity.md +36 -0
- aurelian/agents/gocam/documents/Transporter activity annotation annotation guidelines.docx +0 -0
- aurelian/agents/gocam/documents/Transporter activity annotation annotation guidelines.pdf +0 -0
- aurelian/agents/gocam/documents/Transporter_activity_annotation_annotation_guidelines.md +43 -0
- Regulatory Processes in GO-CAM.docx +0 -0
- Regulatory Processes in GO-CAM.pdf +0 -0
- aurelian/agents/gocam/documents/WIP_-_Regulation_and_Regulatory_Processes_in_GO-CAM.md +31 -0
- aurelian/agents/gocam/documents/md/DNA-binding_transcription_factor_activity_annotation_guidelines.md +131 -0
- aurelian/agents/gocam/documents/md/E3_ubiquitin_ligases.md +166 -0
- aurelian/agents/gocam/documents/md/GO-CAM_annotation_guidelines_README.md +1 -0
- aurelian/agents/gocam/documents/md/GO-CAM_modelling_guidelines_TO_DO.md +5 -0
- aurelian/agents/gocam/documents/md/How_to_annotate_complexes_in_GO-CAM.md +28 -0
- aurelian/agents/gocam/documents/md/How_to_annotate_molecular_adaptors.md +19 -0
- aurelian/agents/gocam/documents/md/How_to_annotate_sequestering_proteins.md +38 -0
- aurelian/agents/gocam/documents/md/Molecular_adaptor_activity.md +52 -0
- aurelian/agents/gocam/documents/md/Molecular_carrier_activity.md +59 -0
- aurelian/agents/gocam/documents/md/Protein_sequestering_activity.md +52 -0
- aurelian/agents/gocam/documents/md/Signaling_receptor_activity_annotation_guidelines.md +271 -0
- aurelian/agents/gocam/documents/md/Transcription_coregulator_activity.md +54 -0
- aurelian/agents/gocam/documents/md/Transporter_activity_annotation_annotation_guidelines.md +38 -0
- aurelian/agents/gocam/documents/md/WIP_-_Regulation_and_Regulatory_Processes_in_GO-CAM.md +39 -0
- aurelian/agents/gocam/documents/pandoc_md/Signaling_receptor_activity_annotation_guidelines.md +334 -0
- aurelian/agents/gocam/gocam_agent.py +243 -0
- aurelian/agents/gocam/gocam_config.py +85 -0
- aurelian/agents/gocam/gocam_curator_agent.py +46 -0
- aurelian/agents/gocam/gocam_evals.py +64 -0
- aurelian/agents/gocam/gocam_gradio.py +89 -0
- aurelian/agents/gocam/gocam_mcp.py +224 -0
- aurelian/agents/gocam/gocam_tools.py +294 -0
- aurelian/agents/linkml/__init__.py +0 -0
- aurelian/agents/linkml/linkml_agent.py +62 -0
- aurelian/agents/linkml/linkml_config.py +48 -0
- aurelian/agents/linkml/linkml_evals.py +66 -0
- aurelian/agents/linkml/linkml_gradio.py +45 -0
- aurelian/agents/linkml/linkml_mcp.py +181 -0
- aurelian/agents/linkml/linkml_tools.py +102 -0
- aurelian/agents/literature/__init__.py +3 -0
- aurelian/agents/literature/literature_agent.py +75 -0
- aurelian/agents/literature/literature_config.py +35 -0
- aurelian/agents/literature/literature_gradio.py +52 -0
- aurelian/agents/literature/literature_mcp.py +174 -0
- aurelian/agents/literature/literature_tools.py +182 -0
- aurelian/agents/monarch/__init__.py +0 -0
- aurelian/agents/monarch/monarch_agent.py +45 -0
- aurelian/agents/monarch/monarch_config.py +45 -0
- aurelian/agents/monarch/monarch_gradio.py +51 -0
- aurelian/agents/monarch/monarch_mcp.py +65 -0
- aurelian/agents/monarch/monarch_tools.py +112 -0
- aurelian/agents/oak/__init__.py +0 -0
- aurelian/agents/oak/oak_config.py +27 -0
- aurelian/agents/oak/oak_gradio.py +57 -0
- aurelian/agents/ontology_mapper/__init__.py +31 -0
- aurelian/agents/ontology_mapper/ontology_mapper_agent.py +57 -0
- aurelian/agents/ontology_mapper/ontology_mapper_config.py +50 -0
- aurelian/agents/ontology_mapper/ontology_mapper_evals.py +108 -0
- aurelian/agents/ontology_mapper/ontology_mapper_gradio.py +58 -0
- aurelian/agents/ontology_mapper/ontology_mapper_mcp.py +81 -0
- aurelian/agents/ontology_mapper/ontology_mapper_tools.py +147 -0
- aurelian/agents/paperqa/__init__.py +27 -0
- aurelian/agents/paperqa/paperqa_agent.py +66 -0
- aurelian/agents/paperqa/paperqa_cli.py +305 -0
- aurelian/agents/paperqa/paperqa_config.py +142 -0
- aurelian/agents/paperqa/paperqa_gradio.py +90 -0
- aurelian/agents/paperqa/paperqa_mcp.py +155 -0
- aurelian/agents/paperqa/paperqa_tools.py +566 -0
- aurelian/agents/phenopackets/__init__.py +3 -0
- aurelian/agents/phenopackets/phenopackets_agent.py +58 -0
- aurelian/agents/phenopackets/phenopackets_config.py +72 -0
- aurelian/agents/phenopackets/phenopackets_evals.py +99 -0
- aurelian/agents/phenopackets/phenopackets_gradio.py +55 -0
- aurelian/agents/phenopackets/phenopackets_mcp.py +178 -0
- aurelian/agents/phenopackets/phenopackets_tools.py +127 -0
- aurelian/agents/rag/__init__.py +40 -0
- aurelian/agents/rag/rag_agent.py +84 -0
- aurelian/agents/rag/rag_config.py +80 -0
- aurelian/agents/rag/rag_gradio.py +67 -0
- aurelian/agents/rag/rag_mcp.py +107 -0
- aurelian/agents/rag/rag_tools.py +189 -0
- aurelian/agents/rag_agent.py +54 -0
- aurelian/agents/robot/__init__.py +0 -0
- aurelian/agents/robot/assets/__init__.py +3 -0
- aurelian/agents/robot/assets/template.md +384 -0
- aurelian/agents/robot/robot_config.py +25 -0
- aurelian/agents/robot/robot_gradio.py +46 -0
- aurelian/agents/robot/robot_mcp.py +100 -0
- aurelian/agents/robot/robot_ontology_agent.py +139 -0
- aurelian/agents/robot/robot_tools.py +50 -0
- aurelian/agents/talisman/__init__.py +3 -0
- aurelian/agents/talisman/__main__.py +17 -0
- aurelian/agents/talisman/cli.py +70 -0
- aurelian/agents/talisman/run_talisman.py +18 -0
- aurelian/agents/talisman/talisman_agent.py +143 -0
- aurelian/agents/talisman/talisman_config.py +66 -0
- aurelian/agents/talisman/talisman_gradio.py +50 -0
- aurelian/agents/talisman/talisman_mcp.py +75 -0
- aurelian/agents/talisman/talisman_tools.py +962 -0
- aurelian/agents/ubergraph/__init__.py +40 -0
- aurelian/agents/ubergraph/ubergraph_agent.py +72 -0
- aurelian/agents/ubergraph/ubergraph_config.py +79 -0
- aurelian/agents/ubergraph/ubergraph_gradio.py +48 -0
- aurelian/agents/ubergraph/ubergraph_mcp.py +69 -0
- aurelian/agents/ubergraph/ubergraph_tools.py +118 -0
- aurelian/agents/uniprot/__init__.py +0 -0
- aurelian/agents/uniprot/uniprot_agent.py +43 -0
- aurelian/agents/uniprot/uniprot_config.py +43 -0
- aurelian/agents/uniprot/uniprot_evals.py +99 -0
- aurelian/agents/uniprot/uniprot_gradio.py +48 -0
- aurelian/agents/uniprot/uniprot_mcp.py +168 -0
- aurelian/agents/uniprot/uniprot_tools.py +136 -0
- aurelian/agents/web/__init__.py +0 -0
- aurelian/agents/web/web_config.py +27 -0
- aurelian/agents/web/web_gradio.py +48 -0
- aurelian/agents/web/web_mcp.py +50 -0
- aurelian/agents/web/web_tools.py +121 -0
- aurelian/chat.py +23 -0
- aurelian/cli.py +1004 -0
- aurelian/dependencies/__init__.py +0 -0
- aurelian/dependencies/workdir.py +78 -0
- aurelian/evaluators/model.py +9 -0
- aurelian/evaluators/substring_evaluator.py +30 -0
- aurelian/mcp/__init__.py +0 -0
- aurelian/mcp/amigo_mcp_test.py +86 -0
- aurelian/mcp/config_generator.py +123 -0
- aurelian/mcp/example_config.json +43 -0
- aurelian/mcp/generate_sample_config.py +37 -0
- aurelian/mcp/gocam_mcp_test.py +126 -0
- aurelian/mcp/linkml_mcp_tools.py +190 -0
- aurelian/mcp/mcp_discovery.py +87 -0
- aurelian/mcp/mcp_test.py +31 -0
- aurelian/mcp/phenopackets_mcp_test.py +103 -0
- aurelian/tools/__init__.py +0 -0
- aurelian/tools/web/__init__.py +0 -0
- aurelian/tools/web/url_download.py +51 -0
- aurelian/utils/__init__.py +0 -0
- aurelian/utils/async_utils.py +18 -0
- aurelian/utils/data_utils.py +32 -0
- aurelian/utils/documentation_manager.py +59 -0
- aurelian/utils/doi_fetcher.py +238 -0
- aurelian/utils/ontology_utils.py +68 -0
- aurelian/utils/pdf_fetcher.py +23 -0
- aurelian/utils/process_logs.py +100 -0
- aurelian/utils/pubmed_utils.py +238 -0
- aurelian/utils/pytest_report_to_markdown.py +67 -0
- aurelian/utils/robot_ontology_utils.py +112 -0
- aurelian/utils/search_utils.py +95 -0
- aurelian-0.1.0.dist-info/LICENSE +22 -0
- aurelian-0.1.0.dist-info/METADATA +109 -0
- aurelian-0.1.0.dist-info/RECORD +266 -0
- aurelian-0.1.0.dist-info/WHEEL +4 -0
- aurelian-0.1.0.dist-info/entry_points.txt +4 -0
@@ -0,0 +1,238 @@
import os
import re
from tempfile import NamedTemporaryFile
from typing import Any, Dict, List, Optional

import logfire
import requests
import requests_cache
from bs4 import BeautifulSoup
from markitdown import MarkItDown
from pydantic import BaseModel, Field


class FullTextInfo(BaseModel):
    """Data model for full text information."""

    success: bool = True
    abstract: Optional[str] = Field(None, description="Abstract of the article")
    text: Optional[str] = Field(None, description="Full text of the article")
    source: Optional[str] = Field(None, description="Source of the full text")
    metadata: Optional[Dict[str, Any]] = Field(None, description="Metadata of the article")
    pdf_url: Optional[str] = Field(None, description="URL to the PDF version of the article")


class DOIFetcher:
    """Fetch metadata and full text for a DOI using various APIs."""

    def __init__(self, email: Optional[str] = None, url_prefixes: Optional[List[str]] = None):
        """Initialize the DOI fetcher with a contact email (required by some APIs).

        Args:
            email (str): Contact email for API access
            url_prefixes (List[str]): List of URL prefixes to check for full text

        """
        self.email = email or os.getenv("EMAIL") or "test@example.com"
        self.url_prefixes = url_prefixes or os.getenv("DOI_FULL_TEXT_URLS", "").split(",")
        self.headers = {"User-Agent": f"DOIFetcher/1.0 (mailto:{self.email})", "Accept": "application/json"}

    def clean_text(self, text: str) -> str:
        """Clean extracted text by collapsing whitespace and removing non-printable characters.

        Args:
            text: The text to clean

        Returns:
            str: The cleaned text

        """
        # Collapse runs of whitespace into single spaces
        text = re.sub(r"\s+", " ", text)
        # Remove non-printable characters
        text = "".join(char for char in text if char.isprintable())
        return text.strip()

    def get_metadata(self, doi: str, strict=False) -> Optional[Dict[str, Any]]:
        """Fetch metadata for a DOI using the Crossref API.

        Args:
            doi (str): The DOI to look up
            strict (bool): Raise exceptions if API call fails

        Returns:
            Optional[Dict[str, Any]]: Metadata dictionary if successful, None otherwise

        """
        base_url = "https://api.crossref.org/works/"
        try:
            response = requests.get(f"{base_url}{doi}", headers=self.headers)
            response.raise_for_status()
            return response.json()["message"]
        except Exception as e:
            if strict:
                raise e
            logfire.warn(f"Error fetching metadata: {e}")
            return None

    def get_unpaywall_info(self, doi: str, strict=False) -> Optional[Dict[str, Any]]:
        """Check Unpaywall for open access versions.

        Example:
            >>> fetcher = DOIFetcher()
            >>> doi = "10.1038/nature12373"
            >>> unpaywall_data = fetcher.get_unpaywall_info(doi)
            >>> assert unpaywall_data["doi"] == doi
            >>> unpaywall_data["best_oa_location"]["url_for_pdf"]
            'https://europepmc.org/articles/pmc4221854?pdf=render'

        Args:
            doi (str): The DOI to look up
            strict (bool): Raise exceptions if API call fails

        Returns:
            Optional[Dict[str, Any]]: Unpaywall data if successful, None otherwise

        """
        base_url = f"https://api.unpaywall.org/v2/{doi}"
        try:
            response = requests.get(f"{base_url}?email={self.email}")
            response.raise_for_status()
            return response.json()
        except Exception as e:
            if strict:
                raise e
            logfire.warn(f"Error fetching Unpaywall data: {e}")
            return None

    def get_full_text(self, doi: str, fallback_to_abstract=True) -> Optional[str]:
        """Get the full text of a paper using various methods.

        Example:
            >>> fetcher = DOIFetcher()
            >>> doi = "10.1128/msystems.00045-18"
            >>> full_text = fetcher.get_full_text(doi)
            >>> assert "Populus Microbiome" in full_text

        Args:
            doi: The DOI of the article
            fallback_to_abstract: Return the abstract when no full text source is found

        Returns:
            str: The full text if available, else abstract text if fallback_to_abstract,
            else None

        """
        info = self.get_full_text_info(doi)
        if not info:
            return None
        text = info.text
        if text:
            return self.clean_text(text)
        if info.pdf_url:
            text = self.text_from_pdf_url(info.pdf_url)
            if text:
                return self.clean_text(text)
        message = "FULL TEXT NOT AVAILABLE"
        if fallback_to_abstract:
            metadata = info.metadata or {}
            abstract = metadata.get("abstract")
            if abstract:
                return self.clean_text(abstract) + f"\n\n{message}"
        return message

    def get_full_text_info(self, doi: str) -> Optional[FullTextInfo]:
        """Attempt to get the full text of a paper using various methods.

        >>> fetcher = DOIFetcher()
        >>> doi = "10.1128/msystems.00045-18"
        >>> info = fetcher.get_full_text_info(doi)
        >>> metadata = info.metadata
        >>> metadata["type"]
        'journal-article'
        >>> metadata["title"][0][0:20]
        'Exploration of the B'
        >>> assert info.pdf_url is not None
        >>> info.pdf_url
        'https://europepmc.org/articles/pmc6172771?pdf=render'

        Args:
            doi (str): The DOI to fetch

        Returns:
            FullTextInfo: Full text information

        """
        # Get metadata
        metadata = self.get_metadata(doi)

        # Check Unpaywall
        unpaywall_data = self.get_unpaywall_info(doi)
        if unpaywall_data and unpaywall_data.get("is_oa"):
            locations = unpaywall_data.get("oa_locations", [])
            if unpaywall_data.get("best_oa_location"):
                best_oa_location = unpaywall_data.get("best_oa_location")
                locations = [best_oa_location] + locations

            # Find best open access location
            for location in locations:
                pdf_url = location.get("url_for_pdf")
                if pdf_url:
                    return FullTextInfo(text=None, pdf_url=pdf_url, source="unpaywall", metadata=metadata)

        # Fallback: probe the configured full-text URL prefixes
        url_prefixes = self.url_prefixes or os.getenv("DOI_FULL_TEXT_URLS", "").split(",")

        for url_prefix in url_prefixes:
            url_prefix = url_prefix.rstrip("/")
            url = f"{url_prefix}/{doi}"
            try:
                response = requests.get(url)
                if response.status_code == 200:
                    soup = BeautifulSoup(response.text, "html.parser")
                    pdf_embed = soup.find("embed", id="pdf")
                    if pdf_embed and pdf_embed.get("src"):
                        pdf_url = pdf_embed["src"]
                        # Remove any URL parameters after #
                        pdf_url = pdf_url.split("#")[0]
                        if not pdf_url.startswith("http"):
                            pdf_url = "https:" + pdf_url
                        return FullTextInfo(
                            pdf_url=pdf_url,
                            source=url,
                            metadata=metadata,
                        )
            except Exception:
                continue
        return None

    def text_from_pdf_url(self, pdf_url: str, raise_for_status=False) -> Optional[str]:
        """Extract text from a PDF URL.

        Example:
            >>> fetcher = DOIFetcher()
            >>> pdf_url = "https://ceur-ws.org/Vol-1747/IT201_ICBO2016.pdf"
            >>> text = fetcher.text_from_pdf_url(pdf_url)
            >>> assert "biosphere" in text

        Args:
            pdf_url: URL of the PDF to download
            raise_for_status: Raise an exception on HTTP errors instead of returning None

        Returns:
            Optional[str]: The extracted text, or None if the download failed

        """
        session = requests_cache.CachedSession("pdf_cache")
        # Download the PDF
        response = session.get(pdf_url)
        if raise_for_status:
            response.raise_for_status()
        if response.status_code != 200:
            return None
        # Write the PDF to a temporary file, then convert it to text
        with NamedTemporaryFile(delete=False) as tmpf:
            tmpf.write(response.content)
            tmp_name = tmpf.name
        md = MarkItDown()
        return md.convert(tmp_name).text_content
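
This hunk appears to correspond to `aurelian/utils/doi_fetcher.py` in the manifest above (the only +238-line utils module defining `DOIFetcher`). A minimal usage sketch, assuming network access and a DOI that resolves via Crossref/Unpaywall as in the doctests; the contact email is hypothetical:

    from aurelian.utils.doi_fetcher import DOIFetcher

    fetcher = DOIFetcher(email="curator@example.org")  # hypothetical contact address
    metadata = fetcher.get_metadata("10.1128/msystems.00045-18")
    info = fetcher.get_full_text_info("10.1128/msystems.00045-18")
    if info and info.pdf_url:
        text = fetcher.text_from_pdf_url(info.pdf_url)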
@@ -0,0 +1,68 @@
import logfire
import pystow
from cachetools.func import lru_cache
from linkml_store.api import Collection
from linkml_store.api.stores.duckdb import DuckDBDatabase
from linkml_store.index import LLMIndexer
from oaklib import BasicOntologyInterface, get_adapter

llm_indexer = LLMIndexer()


@lru_cache
def get_collection_for_adapter(handle: str, name: str) -> Collection:
    """
    Retrieve or create a cached ontology collection.

    Args:
        handle (str): The ontology handle (e.g., `sqlite:obo:uberon`).
        name (str): The name of the ontology (e.g., `uberon`).

    Returns:
        Collection: The indexed ontology collection.
    """
    adapter = get_adapter(handle)
    cache_dir = pystow.join("aurelian", "indexes")
    duckdb_path = str(cache_dir / f"{name}.duckdb")
    database = DuckDBDatabase(duckdb_path)
    collection = database.get_collection(name, create_if_not_exists=True)

    if collection.size() > 0:
        return collection

    objs = [{"id": id, "label": lbl} for id, lbl in adapter.labels(adapter.entities())]
    collection.insert(objs)
    return collection


def search_ontology(adapter: BasicOntologyInterface, query: str, limit=10):
    """
    Search the ontology for the given query term.

    Example:
        >>> from oaklib import get_adapter
        >>> adapter = get_adapter("sqlite:obo:uberon")
        >>> terms = search_ontology(adapter, "manus")
        >>> assert len(terms) > 1
        >>> terms = search_ontology(adapter, "l~digit", limit=5)
        >>> assert len(terms) == 5

    Args:
        adapter (BasicOntologyInterface): The ontology adapter.
        query (str): The query term.
        limit (int): The maximum number of search results to return.

    Returns:
        List[Tuple[str, str]]: A list of tuples containing ontology term IDs and labels.
    """
    scheme = adapter.resource.scheme
    name = adapter.resource.slug
    local_name = name.split(":")[-1]
    handle = f"{scheme}:{name}"

    collection = get_collection_for_adapter(handle, local_name)
    with logfire.span("search_ontology {name} {query}", name=name, query=query):
        print(f"Searching {scheme}:{name} for {query}")
        qr = collection.search(query, limit=limit, index_name="llm")
        objs = [(obj["id"], obj["label"]) for obj in qr.rows]
    return objs
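
This hunk matches `aurelian/utils/ontology_utils.py` (+68) in the manifest. A sketch of the search flow mirroring the doctest, assuming the `sqlite:obo:uberon` adapter can be downloaded and the "llm" index used by `collection.search` can be built in your environment (the first call populates a DuckDB collection under pystow's `aurelian/indexes` directory):

    from oaklib import get_adapter
    from aurelian.utils.ontology_utils import search_ontology

    adapter = get_adapter("sqlite:obo:uberon")
    for term_id, label in search_ontology(adapter, "manus", limit=5):
        print(term_id, label)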
@@ -0,0 +1,23 @@
import tempfile
import requests
from pdfminer.high_level import extract_text


def extract_text_from_pdf(pdf_url: str) -> str:
    """
    Download and extract text from a PDF given its URL, using a temporary file.
    """
    response = requests.get(pdf_url)
    if response.status_code != 200:
        return "Error: Unable to retrieve PDF."

    try:
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=True) as temp_pdf:
            temp_pdf.write(response.content)
            temp_pdf.flush()  # Ensure all data is written before reading

            text = extract_text(temp_pdf.name)
            return text.strip() if text else "Error: No text extracted from PDF."

    except Exception as e:
        return f"Error extracting PDF text: {e}"
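
This hunk matches `aurelian/utils/pdf_fetcher.py` (+23). A short usage sketch; the URL is the one used illustratively in the doctests of the first hunk, and failures are returned as strings rather than raised:

    from aurelian.utils.pdf_fetcher import extract_text_from_pdf

    text = extract_text_from_pdf("https://ceur-ws.org/Vol-1747/IT201_ICBO2016.pdf")
    if text.startswith("Error"):
        print(text)        # download or extraction failure
    else:
        print(text[:200])  # first 200 characters of the extracted text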
@@ -0,0 +1,100 @@
import json
from pathlib import Path
from collections import defaultdict
import re


def parse_reportlog(log_path: str):
    """Parse pytest-reportlog output into structured format."""
    tests = defaultdict(dict)

    with open(log_path) as f:
        for line in f:
            entry = json.loads(line)

            # Only process TestReport entries
            if entry.get('$report_type') != 'TestReport':
                continue

            nodeid = entry['nodeid']

            # Store test outcome
            if 'outcome' in entry:
                tests[nodeid]['outcome'] = entry['outcome']

            # Store duration
            if 'duration' in entry:
                tests[nodeid]['duration'] = entry['duration']

            # Convert user_properties to dict
            if 'user_properties' in entry:
                props = dict(entry['user_properties'])
                tests[nodeid]['properties'] = props

            # Store parameters from nodeid
            # Extract from something like: test_search_ontology[sqlite:obo:bfo-3D spatial-10-expected0]
            if '[' in nodeid:
                param_str = nodeid[nodeid.index('[') + 1:nodeid.rindex(']')]
                # You might want to customize this parsing based on your parameter format
                tests[nodeid]['parameters'] = param_str

    return tests


def generate_markdown(tests):
    """Convert test results to markdown documentation."""
    md = []
    md.append("# Test Results Documentation\n")

    # Group tests by their base function name
    test_groups = defaultdict(list)
    for nodeid, data in tests.items():
        # Split nodeid into parts: path::function[params]
        base_name = nodeid.split('::')[1].split('[')[0] if '[' in nodeid else nodeid.split('::')[1]
        test_groups[base_name].append((nodeid, data))

    for base_name, group in test_groups.items():
        md.append(f"## {base_name}\n")

        # Create table for all test runs
        md.append("### Test Runs\n")

        # Headers: Parameters, Properties, Duration, Outcome
        md.append('| Parameters | Properties | Duration (s) | Outcome |')
        md.append('|------------|------------|-------------|---------|')

        for nodeid, data in group:
            # Extract parameters from nodeid
            params = nodeid.split('[')[1].rstrip(']') if '[' in nodeid else ''

            # Format properties
            props = data.get('properties', {})
            props_str = '; '.join(f"{k}: {v}" for k, v in props.items())

            # Format duration
            duration = f"{data.get('duration', 0):.3f}"

            row = [
                params,
                props_str,
                duration,
                data.get('outcome', '')
            ]

            md.append('| ' + ' | '.join(str(cell) for cell in row) + ' |')

        md.append('')
    return '\n'.join(md)


# Example usage:
if __name__ == '__main__':
    # Assume report.jsonl exists from running:
    # pytest test_examples.py --report-log=report.jsonl

    log_path = Path('report.jsonl')
    tests = parse_reportlog(log_path)
    markdown = generate_markdown(tests)

    # Write markdown to file
    with open('docs/unit_tests.md', 'w') as f:
        f.write(markdown)
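
This hunk matches `aurelian/utils/process_logs.py` (+100). The module already doubles as a script; a sketch of the equivalent programmatic use, assuming a `report.jsonl` produced by the pytest-reportlog plugin:

    # First produce the log:  pytest --report-log=report.jsonl
    from aurelian.utils.process_logs import parse_reportlog, generate_markdown

    tests = parse_reportlog("report.jsonl")
    with open("docs/unit_tests.md", "w") as f:
        f.write(generate_markdown(tests))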
@@ -0,0 +1,238 @@
import re
from typing import Optional

import requests
from bs4 import BeautifulSoup

from aurelian.utils.doi_fetcher import DOIFetcher

BIOC_URL = "https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_xml/{pmid}/ascii"
PUBMED_EUTILS_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id={pmid}&retmode=xml"
EFETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id={pmid}&retmode=xml"

DOI_PATTERN = r"/(10\.\d{4,9}/[\w\-.]+)"

doi_fetcher = DOIFetcher()


def extract_doi_from_url(url: str) -> Optional[str]:
    """Extracts the DOI from a given journal URL.

    Args:
        url (str): The URL of the article.

    Returns:
        Optional[str]: The extracted DOI if found, otherwise None.

    """
    doi_match = re.search(DOI_PATTERN, url)
    return doi_match.group(1) if doi_match else None


def doi_to_pmid(doi: str) -> Optional[str]:
    """Converts a DOI to a PMID using the NCBI ID Converter API.

    Args:
        doi (str): The DOI to be converted.

    Returns:
        Optional[str]: The corresponding PMID if found, otherwise None.

    """
    API_URL = f"https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?ids={doi}&format=json"
    response = requests.get(API_URL).json()
    records = response.get("records", [])
    pmid = records[0].get("pmid", None) if records else None
    return pmid


def get_doi_text(doi: str) -> str:
    """Fetch the full text of an article using a DOI.

    TODO: non pubmed sources

    Example:
        >>> doi = "10.1128/msystems.00045-18"
        >>> full_text = get_doi_text(doi)
        >>> assert "Populus Microbiome" in full_text

    Args:
        doi: The DOI of the article.

    Returns:
        The full text of the article if available, otherwise an explanatory message.

    """
    pmid = doi_to_pmid(doi)
    if not pmid:
        info = doi_fetcher.get_full_text(doi)
        if info:
            return info
        else:
            return f"PMID not found for {doi} and not available via unpaywall"
    return get_pmid_text(pmid)


def get_pmid_from_pmcid(pmcid):
    """Fetch the PMID from a PMC ID using the Entrez E-utilities `esummary`.

    Example:
        >>> pmcid = "PMC5048378"
        >>> pmid = get_pmid_from_pmcid(pmcid)
        >>> print(pmid)
        27629041

    Args:
        pmcid: The PMC identifier, with or without a prefix such as "PMCID:".

    Returns:
        The PMID string if found, otherwise "PMID not found".

    """
    if ":" in pmcid:
        pmcid = pmcid.split(":")[1]
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
    params = {"db": "pmc", "id": pmcid.replace("PMC", ""), "retmode": "json"}  # Remove "PMC" prefix if included

    response = requests.get(url, params=params)
    data = response.json()

    # Extract PMID
    try:
        uid = data["result"]["uids"][0]  # Extract the UID
        article_ids = data["result"][uid]["articleids"]  # Get article IDs
        for item in article_ids:
            if item["idtype"] == "pmid":
                return item["value"]
    except KeyError:
        return "PMID not found"


def get_pmcid_text(pmcid: str) -> str:
    """Fetch full text from PubMed Central Open Access BioC XML.

    Example:
        >>> pmcid = "PMC5048378"
        >>> full_text = get_pmcid_text(pmcid)
        >>> assert "integrated stress response (ISR)" in full_text

    Args:
        pmcid: The PMC identifier of the article.

    Returns:
        The full text of the article if available, otherwise the abstract.

    """
    pmid = get_pmid_from_pmcid(pmcid)
    return get_pmid_text(pmid)


def get_pmid_text(pmid: str) -> str:
    """Fetch full text from PubMed Central Open Access BioC XML.
    If full text is not available, fall back to fetching the abstract from PubMed.

    Example:
        >>> pmid = "11"
        >>> full_text = get_pmid_text(pmid)
        >>> print(full_text)
        Identification of adenylate cyclase-coupled beta-adrenergic receptors with radiolabeled beta-adrenergic antagonists.
        <BLANKLINE>
        No abstract available

    Args:
        pmid: PubMed ID of the article.

    Returns:
        The full text of the article if available, otherwise the abstract.

    """
    if ":" in pmid:
        pmid = pmid.split(":")[1]
    text = get_full_text_from_bioc(pmid)
    if not text:
        doi = pmid_to_doi(pmid)
        if doi:
            text = doi_fetcher.get_full_text(doi)
    if not text:
        text = get_abstract_from_pubmed(pmid)
    return text


def pmid_to_doi(pmid: str) -> Optional[str]:
    """Look up the DOI for a PubMed ID via the Entrez `esummary` endpoint."""
    if ":" in pmid:
        pmid = pmid.split(":")[1]
    url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id={pmid}&retmode=json"
    response = requests.get(url)
    data = response.json()

    try:
        article_info = data["result"][str(pmid)]
        for aid in article_info["articleids"]:
            if aid["idtype"] == "doi":
                return aid["value"]
        elocationid = article_info.get("elocationid", "")
        if elocationid.startswith("10."):  # DOI starts with "10."
            return elocationid
        else:
            return None
    except KeyError:
        return None


def get_full_text_from_bioc(pmid: str) -> str:
    """Fetch full text from PubMed Central Open Access BioC XML.

    Example:
        >>> pmid = "17299597"
        >>> full_text = get_full_text_from_bioc(pmid)
        >>> assert "Evolution of biological complexity." in full_text

    Args:
        pmid: PubMed ID of the article.

    Returns:
        The full text of the article if available, otherwise an empty string.

    """
    response = requests.get(BIOC_URL.format(pmid=pmid))

    if response.status_code != 200:
        return ""  # Return empty string if request fails

    soup = BeautifulSoup(response.text, "xml")

    # Extract ONLY text from <text> tags within <passage>
    text_sections = [text_tag.get_text() for text_tag in soup.find_all("text")]

    full_text = "\n".join(text_sections).strip()
    return full_text


def get_abstract_from_pubmed(pmid: str) -> str:
    """Fetch the title and abstract of an article from PubMed using Entrez E-utilities `efetch`.

    Example:
        >>> pmid = "31653696"
        >>> abstract = get_abstract_from_pubmed(pmid)
        >>> assert "The apparent deglycase activity of DJ-1" in abstract

    Args:
        pmid: PubMed ID of the article.

    Returns:
        The title and abstract text if available, otherwise an empty string.

    """
    response = requests.get(EFETCH_URL.format(pmid=pmid))

    if response.status_code != 200:
        return ""

    soup = BeautifulSoup(response.text, "xml")

    # Extract title
    title_tag = soup.find("ArticleTitle")
    title = title_tag.get_text().strip() if title_tag else "No title available"

    # Extract abstract (may contain multiple sections)
    abstract_tags = soup.find_all("AbstractText")
    abstract = "\n".join(tag.get_text().strip() for tag in abstract_tags) if abstract_tags else "No abstract available"

    return f"{title}\n\n{abstract}"
|