aurelian-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (266)
  1. aurelian/__init__.py +9 -0
  2. aurelian/agents/__init__.py +0 -0
  3. aurelian/agents/amigo/__init__.py +3 -0
  4. aurelian/agents/amigo/amigo_agent.py +77 -0
  5. aurelian/agents/amigo/amigo_config.py +85 -0
  6. aurelian/agents/amigo/amigo_evals.py +73 -0
  7. aurelian/agents/amigo/amigo_gradio.py +52 -0
  8. aurelian/agents/amigo/amigo_mcp.py +152 -0
  9. aurelian/agents/amigo/amigo_tools.py +152 -0
  10. aurelian/agents/biblio/__init__.py +42 -0
  11. aurelian/agents/biblio/biblio_agent.py +95 -0
  12. aurelian/agents/biblio/biblio_config.py +40 -0
  13. aurelian/agents/biblio/biblio_gradio.py +67 -0
  14. aurelian/agents/biblio/biblio_mcp.py +115 -0
  15. aurelian/agents/biblio/biblio_tools.py +164 -0
  16. aurelian/agents/biblio_agent.py +46 -0
  17. aurelian/agents/checklist/__init__.py +44 -0
  18. aurelian/agents/checklist/checklist_agent.py +86 -0
  19. aurelian/agents/checklist/checklist_config.py +28 -0
  20. aurelian/agents/checklist/checklist_gradio.py +70 -0
  21. aurelian/agents/checklist/checklist_mcp.py +86 -0
  22. aurelian/agents/checklist/checklist_tools.py +141 -0
  23. aurelian/agents/checklist/content/checklists.yaml +7 -0
  24. aurelian/agents/checklist/content/streams.csv +136 -0
  25. aurelian/agents/checklist_agent.py +40 -0
  26. aurelian/agents/chemistry/__init__.py +3 -0
  27. aurelian/agents/chemistry/chemistry_agent.py +47 -0
  28. aurelian/agents/chemistry/chemistry_config.py +71 -0
  29. aurelian/agents/chemistry/chemistry_evals.py +79 -0
  30. aurelian/agents/chemistry/chemistry_gradio.py +50 -0
  31. aurelian/agents/chemistry/chemistry_mcp.py +120 -0
  32. aurelian/agents/chemistry/chemistry_tools.py +121 -0
  33. aurelian/agents/chemistry/image_agent.py +15 -0
  34. aurelian/agents/d4d/__init__.py +30 -0
  35. aurelian/agents/d4d/d4d_agent.py +73 -0
  36. aurelian/agents/d4d/d4d_config.py +46 -0
  37. aurelian/agents/d4d/d4d_gradio.py +58 -0
  38. aurelian/agents/d4d/d4d_mcp.py +71 -0
  39. aurelian/agents/d4d/d4d_tools.py +157 -0
  40. aurelian/agents/d4d_agent.py +64 -0
  41. aurelian/agents/diagnosis/__init__.py +33 -0
  42. aurelian/agents/diagnosis/diagnosis_agent.py +54 -0
  43. aurelian/agents/diagnosis/diagnosis_config.py +48 -0
  44. aurelian/agents/diagnosis/diagnosis_evals.py +76 -0
  45. aurelian/agents/diagnosis/diagnosis_gradio.py +52 -0
  46. aurelian/agents/diagnosis/diagnosis_mcp.py +141 -0
  47. aurelian/agents/diagnosis/diagnosis_tools.py +204 -0
  48. aurelian/agents/diagnosis_agent.py +28 -0
  49. aurelian/agents/draw/__init__.py +3 -0
  50. aurelian/agents/draw/draw_agent.py +39 -0
  51. aurelian/agents/draw/draw_config.py +26 -0
  52. aurelian/agents/draw/draw_gradio.py +50 -0
  53. aurelian/agents/draw/draw_mcp.py +94 -0
  54. aurelian/agents/draw/draw_tools.py +100 -0
  55. aurelian/agents/draw/judge_agent.py +18 -0
  56. aurelian/agents/filesystem/__init__.py +0 -0
  57. aurelian/agents/filesystem/filesystem_config.py +27 -0
  58. aurelian/agents/filesystem/filesystem_gradio.py +49 -0
  59. aurelian/agents/filesystem/filesystem_mcp.py +89 -0
  60. aurelian/agents/filesystem/filesystem_tools.py +95 -0
  61. aurelian/agents/filesystem/py.typed +0 -0
  62. aurelian/agents/github/__init__.py +0 -0
  63. aurelian/agents/github/github_agent.py +83 -0
  64. aurelian/agents/github/github_cli.py +248 -0
  65. aurelian/agents/github/github_config.py +22 -0
  66. aurelian/agents/github/github_gradio.py +152 -0
  67. aurelian/agents/github/github_mcp.py +252 -0
  68. aurelian/agents/github/github_tools.py +408 -0
  69. aurelian/agents/github/github_tools.py.tmp +413 -0
  70. aurelian/agents/goann/__init__.py +13 -0
  71. aurelian/agents/goann/documents/Transcription_Factors_Annotation_Guidelines.md +1000 -0
  72. aurelian/agents/goann/documents/Transcription_Factors_Annotation_Guidelines.pdf +0 -0
  73. aurelian/agents/goann/documents/Transcription_Factors_Annotation_Guidelines_Paper.md +693 -0
  74. aurelian/agents/goann/documents/Transcription_Factors_Annotation_Guidelines_Paper.pdf +0 -0
  75. aurelian/agents/goann/goann_agent.py +90 -0
  76. aurelian/agents/goann/goann_config.py +90 -0
  77. aurelian/agents/goann/goann_evals.py +104 -0
  78. aurelian/agents/goann/goann_gradio.py +62 -0
  79. aurelian/agents/goann/goann_mcp.py +0 -0
  80. aurelian/agents/goann/goann_tools.py +65 -0
  81. aurelian/agents/gocam/__init__.py +52 -0
  82. aurelian/agents/gocam/documents/DNA-binding transcription factor activity annotation guidelines.docx +0 -0
  83. aurelian/agents/gocam/documents/DNA-binding transcription factor activity annotation guidelines.pdf +0 -0
  84. aurelian/agents/gocam/documents/DNA-binding_transcription_factor_activity_annotation_guidelines.md +100 -0
  85. aurelian/agents/gocam/documents/E3 ubiquitin ligases.docx +0 -0
  86. aurelian/agents/gocam/documents/E3 ubiquitin ligases.pdf +0 -0
  87. aurelian/agents/gocam/documents/E3_ubiquitin_ligases.md +134 -0
  88. aurelian/agents/gocam/documents/GO-CAM annotation guidelines README.docx +0 -0
  89. aurelian/agents/gocam/documents/GO-CAM annotation guidelines README.pdf +0 -0
  90. aurelian/agents/gocam/documents/GO-CAM modelling guidelines TO DO.docx +0 -0
  91. aurelian/agents/gocam/documents/GO-CAM modelling guidelines TO DO.pdf +0 -0
  92. aurelian/agents/gocam/documents/GO-CAM_annotation_guidelines_README.md +1 -0
  93. aurelian/agents/gocam/documents/GO-CAM_modelling_guidelines_TO_DO.md +3 -0
  94. aurelian/agents/gocam/documents/How to annotate complexes in GO-CAM.docx +0 -0
  95. aurelian/agents/gocam/documents/How to annotate complexes in GO-CAM.pdf +0 -0
  96. aurelian/agents/gocam/documents/How to annotate molecular adaptors.docx +0 -0
  97. aurelian/agents/gocam/documents/How to annotate molecular adaptors.pdf +0 -0
  98. aurelian/agents/gocam/documents/How to annotate sequestering proteins.docx +0 -0
  99. aurelian/agents/gocam/documents/How to annotate sequestering proteins.pdf +0 -0
  100. aurelian/agents/gocam/documents/How_to_annotate_complexes_in_GO-CAM.md +29 -0
  101. aurelian/agents/gocam/documents/How_to_annotate_molecular_adaptors.md +31 -0
  102. aurelian/agents/gocam/documents/How_to_annotate_sequestering_proteins.md +42 -0
  103. aurelian/agents/gocam/documents/Molecular adaptor activity.docx +0 -0
  104. aurelian/agents/gocam/documents/Molecular adaptor activity.pdf +0 -0
  105. aurelian/agents/gocam/documents/Molecular carrier activity.docx +0 -0
  106. aurelian/agents/gocam/documents/Molecular carrier activity.pdf +0 -0
  107. aurelian/agents/gocam/documents/Molecular_adaptor_activity.md +51 -0
  108. aurelian/agents/gocam/documents/Molecular_carrier_activity.md +41 -0
  109. aurelian/agents/gocam/documents/Protein sequestering activity.docx +0 -0
  110. aurelian/agents/gocam/documents/Protein sequestering activity.pdf +0 -0
  111. aurelian/agents/gocam/documents/Protein_sequestering_activity.md +50 -0
  112. aurelian/agents/gocam/documents/Signaling receptor activity annotation guidelines.docx +0 -0
  113. aurelian/agents/gocam/documents/Signaling receptor activity annotation guidelines.pdf +0 -0
  114. aurelian/agents/gocam/documents/Signaling_receptor_activity_annotation_guidelines.md +187 -0
  115. aurelian/agents/gocam/documents/Transcription coregulator activity.docx +0 -0
  116. aurelian/agents/gocam/documents/Transcription coregulator activity.pdf +0 -0
  117. aurelian/agents/gocam/documents/Transcription_coregulator_activity.md +36 -0
  118. aurelian/agents/gocam/documents/Transporter activity annotation annotation guidelines.docx +0 -0
  119. aurelian/agents/gocam/documents/Transporter activity annotation annotation guidelines.pdf +0 -0
  120. aurelian/agents/gocam/documents/Transporter_activity_annotation_annotation_guidelines.md +43 -0
  121. aurelian/agents/gocam/documents/WIP - Regulation and Regulatory Processes in GO-CAM.docx +0 -0
  122. aurelian/agents/gocam/documents/WIP - Regulation and Regulatory Processes in GO-CAM.pdf +0 -0
  123. aurelian/agents/gocam/documents/WIP_-_Regulation_and_Regulatory_Processes_in_GO-CAM.md +31 -0
  124. aurelian/agents/gocam/documents/md/DNA-binding_transcription_factor_activity_annotation_guidelines.md +131 -0
  125. aurelian/agents/gocam/documents/md/E3_ubiquitin_ligases.md +166 -0
  126. aurelian/agents/gocam/documents/md/GO-CAM_annotation_guidelines_README.md +1 -0
  127. aurelian/agents/gocam/documents/md/GO-CAM_modelling_guidelines_TO_DO.md +5 -0
  128. aurelian/agents/gocam/documents/md/How_to_annotate_complexes_in_GO-CAM.md +28 -0
  129. aurelian/agents/gocam/documents/md/How_to_annotate_molecular_adaptors.md +19 -0
  130. aurelian/agents/gocam/documents/md/How_to_annotate_sequestering_proteins.md +38 -0
  131. aurelian/agents/gocam/documents/md/Molecular_adaptor_activity.md +52 -0
  132. aurelian/agents/gocam/documents/md/Molecular_carrier_activity.md +59 -0
  133. aurelian/agents/gocam/documents/md/Protein_sequestering_activity.md +52 -0
  134. aurelian/agents/gocam/documents/md/Signaling_receptor_activity_annotation_guidelines.md +271 -0
  135. aurelian/agents/gocam/documents/md/Transcription_coregulator_activity.md +54 -0
  136. aurelian/agents/gocam/documents/md/Transporter_activity_annotation_annotation_guidelines.md +38 -0
  137. aurelian/agents/gocam/documents/md/WIP_-_Regulation_and_Regulatory_Processes_in_GO-CAM.md +39 -0
  138. aurelian/agents/gocam/documents/pandoc_md/Signaling_receptor_activity_annotation_guidelines.md +334 -0
  139. aurelian/agents/gocam/gocam_agent.py +243 -0
  140. aurelian/agents/gocam/gocam_config.py +85 -0
  141. aurelian/agents/gocam/gocam_curator_agent.py +46 -0
  142. aurelian/agents/gocam/gocam_evals.py +64 -0
  143. aurelian/agents/gocam/gocam_gradio.py +89 -0
  144. aurelian/agents/gocam/gocam_mcp.py +224 -0
  145. aurelian/agents/gocam/gocam_tools.py +294 -0
  146. aurelian/agents/linkml/__init__.py +0 -0
  147. aurelian/agents/linkml/linkml_agent.py +62 -0
  148. aurelian/agents/linkml/linkml_config.py +48 -0
  149. aurelian/agents/linkml/linkml_evals.py +66 -0
  150. aurelian/agents/linkml/linkml_gradio.py +45 -0
  151. aurelian/agents/linkml/linkml_mcp.py +181 -0
  152. aurelian/agents/linkml/linkml_tools.py +102 -0
  153. aurelian/agents/literature/__init__.py +3 -0
  154. aurelian/agents/literature/literature_agent.py +75 -0
  155. aurelian/agents/literature/literature_config.py +35 -0
  156. aurelian/agents/literature/literature_gradio.py +52 -0
  157. aurelian/agents/literature/literature_mcp.py +174 -0
  158. aurelian/agents/literature/literature_tools.py +182 -0
  159. aurelian/agents/monarch/__init__.py +0 -0
  160. aurelian/agents/monarch/monarch_agent.py +45 -0
  161. aurelian/agents/monarch/monarch_config.py +45 -0
  162. aurelian/agents/monarch/monarch_gradio.py +51 -0
  163. aurelian/agents/monarch/monarch_mcp.py +65 -0
  164. aurelian/agents/monarch/monarch_tools.py +112 -0
  165. aurelian/agents/oak/__init__.py +0 -0
  166. aurelian/agents/oak/oak_config.py +27 -0
  167. aurelian/agents/oak/oak_gradio.py +57 -0
  168. aurelian/agents/ontology_mapper/__init__.py +31 -0
  169. aurelian/agents/ontology_mapper/ontology_mapper_agent.py +57 -0
  170. aurelian/agents/ontology_mapper/ontology_mapper_config.py +50 -0
  171. aurelian/agents/ontology_mapper/ontology_mapper_evals.py +108 -0
  172. aurelian/agents/ontology_mapper/ontology_mapper_gradio.py +58 -0
  173. aurelian/agents/ontology_mapper/ontology_mapper_mcp.py +81 -0
  174. aurelian/agents/ontology_mapper/ontology_mapper_tools.py +147 -0
  175. aurelian/agents/paperqa/__init__.py +27 -0
  176. aurelian/agents/paperqa/paperqa_agent.py +66 -0
  177. aurelian/agents/paperqa/paperqa_cli.py +305 -0
  178. aurelian/agents/paperqa/paperqa_config.py +142 -0
  179. aurelian/agents/paperqa/paperqa_gradio.py +90 -0
  180. aurelian/agents/paperqa/paperqa_mcp.py +155 -0
  181. aurelian/agents/paperqa/paperqa_tools.py +566 -0
  182. aurelian/agents/phenopackets/__init__.py +3 -0
  183. aurelian/agents/phenopackets/phenopackets_agent.py +58 -0
  184. aurelian/agents/phenopackets/phenopackets_config.py +72 -0
  185. aurelian/agents/phenopackets/phenopackets_evals.py +99 -0
  186. aurelian/agents/phenopackets/phenopackets_gradio.py +55 -0
  187. aurelian/agents/phenopackets/phenopackets_mcp.py +178 -0
  188. aurelian/agents/phenopackets/phenopackets_tools.py +127 -0
  189. aurelian/agents/rag/__init__.py +40 -0
  190. aurelian/agents/rag/rag_agent.py +84 -0
  191. aurelian/agents/rag/rag_config.py +80 -0
  192. aurelian/agents/rag/rag_gradio.py +67 -0
  193. aurelian/agents/rag/rag_mcp.py +107 -0
  194. aurelian/agents/rag/rag_tools.py +189 -0
  195. aurelian/agents/rag_agent.py +54 -0
  196. aurelian/agents/robot/__init__.py +0 -0
  197. aurelian/agents/robot/assets/__init__.py +3 -0
  198. aurelian/agents/robot/assets/template.md +384 -0
  199. aurelian/agents/robot/robot_config.py +25 -0
  200. aurelian/agents/robot/robot_gradio.py +46 -0
  201. aurelian/agents/robot/robot_mcp.py +100 -0
  202. aurelian/agents/robot/robot_ontology_agent.py +139 -0
  203. aurelian/agents/robot/robot_tools.py +50 -0
  204. aurelian/agents/talisman/__init__.py +3 -0
  205. aurelian/agents/talisman/__main__.py +17 -0
  206. aurelian/agents/talisman/cli.py +70 -0
  207. aurelian/agents/talisman/run_talisman.py +18 -0
  208. aurelian/agents/talisman/talisman_agent.py +143 -0
  209. aurelian/agents/talisman/talisman_config.py +66 -0
  210. aurelian/agents/talisman/talisman_gradio.py +50 -0
  211. aurelian/agents/talisman/talisman_mcp.py +75 -0
  212. aurelian/agents/talisman/talisman_tools.py +962 -0
  213. aurelian/agents/ubergraph/__init__.py +40 -0
  214. aurelian/agents/ubergraph/ubergraph_agent.py +72 -0
  215. aurelian/agents/ubergraph/ubergraph_config.py +79 -0
  216. aurelian/agents/ubergraph/ubergraph_gradio.py +48 -0
  217. aurelian/agents/ubergraph/ubergraph_mcp.py +69 -0
  218. aurelian/agents/ubergraph/ubergraph_tools.py +118 -0
  219. aurelian/agents/uniprot/__init__.py +0 -0
  220. aurelian/agents/uniprot/uniprot_agent.py +43 -0
  221. aurelian/agents/uniprot/uniprot_config.py +43 -0
  222. aurelian/agents/uniprot/uniprot_evals.py +99 -0
  223. aurelian/agents/uniprot/uniprot_gradio.py +48 -0
  224. aurelian/agents/uniprot/uniprot_mcp.py +168 -0
  225. aurelian/agents/uniprot/uniprot_tools.py +136 -0
  226. aurelian/agents/web/__init__.py +0 -0
  227. aurelian/agents/web/web_config.py +27 -0
  228. aurelian/agents/web/web_gradio.py +48 -0
  229. aurelian/agents/web/web_mcp.py +50 -0
  230. aurelian/agents/web/web_tools.py +121 -0
  231. aurelian/chat.py +23 -0
  232. aurelian/cli.py +1004 -0
  233. aurelian/dependencies/__init__.py +0 -0
  234. aurelian/dependencies/workdir.py +78 -0
  235. aurelian/evaluators/model.py +9 -0
  236. aurelian/evaluators/substring_evaluator.py +30 -0
  237. aurelian/mcp/__init__.py +0 -0
  238. aurelian/mcp/amigo_mcp_test.py +86 -0
  239. aurelian/mcp/config_generator.py +123 -0
  240. aurelian/mcp/example_config.json +43 -0
  241. aurelian/mcp/generate_sample_config.py +37 -0
  242. aurelian/mcp/gocam_mcp_test.py +126 -0
  243. aurelian/mcp/linkml_mcp_tools.py +190 -0
  244. aurelian/mcp/mcp_discovery.py +87 -0
  245. aurelian/mcp/mcp_test.py +31 -0
  246. aurelian/mcp/phenopackets_mcp_test.py +103 -0
  247. aurelian/tools/__init__.py +0 -0
  248. aurelian/tools/web/__init__.py +0 -0
  249. aurelian/tools/web/url_download.py +51 -0
  250. aurelian/utils/__init__.py +0 -0
  251. aurelian/utils/async_utils.py +18 -0
  252. aurelian/utils/data_utils.py +32 -0
  253. aurelian/utils/documentation_manager.py +59 -0
  254. aurelian/utils/doi_fetcher.py +238 -0
  255. aurelian/utils/ontology_utils.py +68 -0
  256. aurelian/utils/pdf_fetcher.py +23 -0
  257. aurelian/utils/process_logs.py +100 -0
  258. aurelian/utils/pubmed_utils.py +238 -0
  259. aurelian/utils/pytest_report_to_markdown.py +67 -0
  260. aurelian/utils/robot_ontology_utils.py +112 -0
  261. aurelian/utils/search_utils.py +95 -0
  262. aurelian-0.1.0.dist-info/LICENSE +22 -0
  263. aurelian-0.1.0.dist-info/METADATA +109 -0
  264. aurelian-0.1.0.dist-info/RECORD +266 -0
  265. aurelian-0.1.0.dist-info/WHEEL +4 -0
  266. aurelian-0.1.0.dist-info/entry_points.txt +4 -0
aurelian/utils/doi_fetcher.py
@@ -0,0 +1,238 @@
+ import os
+ import re
+ from tempfile import NamedTemporaryFile
+ from typing import Any, Dict, List, Optional
+
+ import logfire
+ import requests
+ import requests_cache
+ from bs4 import BeautifulSoup
+ from markitdown import MarkItDown
+ from pydantic import BaseModel, Field
+
+
+ class FullTextInfo(BaseModel):
+     """Data model for full text information."""
+
+     success: bool = True
+     abstract: Optional[str] = Field(None, description="Abstract of the article")
+     text: Optional[str] = Field(None, description="Full text of the article")
+     source: Optional[str] = Field(None, description="Source of the full text")
+     metadata: Optional[Dict[str, Any]] = Field(None, description="Metadata of the article")
+     pdf_url: Optional[str] = Field(None, description="URL to the PDF version of the article")
+
+
+ class DOIFetcher:
+     """Fetch metadata and full text for a DOI using various APIs."""
+
+     def __init__(self, email: Optional[str] = None, url_prefixes: Optional[List[str]] = None):
+         """Initialize the DOI fetcher with a contact email (required by some APIs).
+
+         Args:
+             email (str): Contact email for API access
+             url_prefixes (List[str]): List of URL prefixes to check for full text
+
+         """
+         self.email = email or os.getenv("EMAIL") or "test@example.com"
+         self.url_prefixes = url_prefixes or [p for p in os.getenv("DOI_FULL_TEXT_URLS", "").split(",") if p]
+         self.headers = {"User-Agent": f"DOIFetcher/1.0 (mailto:{self.email})", "Accept": "application/json"}
+
+     def clean_text(self, text: str) -> str:
+         """Clean extracted text by normalizing whitespace and removing non-printable characters.
+
+         Args:
+             text: The text to clean
+
+         Returns:
+             str: The cleaned text
+
+         """
+         # Collapse runs of whitespace into single spaces
+         text = re.sub(r"\s+", " ", text)
+         # Remove non-printable characters
+         text = "".join(char for char in text if char.isprintable())
+         return text.strip()
+
+     def get_metadata(self, doi: str, strict=False) -> Optional[Dict[str, Any]]:
+         """Fetch metadata for a DOI using the Crossref API.
+
+         Args:
+             doi (str): The DOI to look up
+             strict (bool): Raise exceptions if API call fails
+
+         Returns:
+             Optional[Dict[str, Any]]: Metadata dictionary if successful, None otherwise
+
+         """
+         base_url = "https://api.crossref.org/works/"
+         try:
+             response = requests.get(f"{base_url}{doi}", headers=self.headers)
+             response.raise_for_status()
+             return response.json()["message"]
+         except Exception as e:
+             if strict:
+                 raise e
+             logfire.warn(f"Error fetching metadata: {e}")
+             return None
+
+     def get_unpaywall_info(self, doi: str, strict=False) -> Optional[Dict[str, Any]]:
+         """Check Unpaywall for open access versions.
+
+         Example:
+             >>> fetcher = DOIFetcher()
+             >>> doi = "10.1038/nature12373"
+             >>> unpaywall_data = fetcher.get_unpaywall_info(doi)
+             >>> assert unpaywall_data["doi"] == doi
+             >>> unpaywall_data["best_oa_location"]["url_for_pdf"]
+             'https://europepmc.org/articles/pmc4221854?pdf=render'
+
+         Args:
+             doi (str): The DOI to look up
+             strict (bool): Raise exceptions if API call fails
+
+         Returns:
+             Optional[Dict[str, Any]]: Unpaywall data if successful, None otherwise
+
+         """
+         base_url = f"https://api.unpaywall.org/v2/{doi}"
+         try:
+             response = requests.get(f"{base_url}?email={self.email}")
+             response.raise_for_status()
+             return response.json()
+         except Exception as e:
+             if strict:
+                 raise e
+             logfire.warn(f"Error fetching Unpaywall data: {e}")
+             return None
+
+     def get_full_text(self, doi: str, fallback_to_abstract=True) -> Optional[str]:
+         """Get the full text of a paper using various methods.
+
+         Example:
+             >>> fetcher = DOIFetcher()
+             >>> doi = "10.1128/msystems.00045-18"
+             >>> full_text = fetcher.get_full_text(doi)
+             >>> assert "Populus Microbiome" in full_text
+
+         Args:
+             doi: The DOI of the article
+             fallback_to_abstract: Return the abstract when no full text is available
+
+         Returns:
+             str: The full text if available; otherwise the abstract plus a
+                 "FULL TEXT NOT AVAILABLE" marker, or the marker alone
+
+         """
+         info = self.get_full_text_info(doi)
+         if not info:
+             return None
+         text = info.text
+         if text:
+             return self.clean_text(text)
+         if info.pdf_url:
+             text = self.text_from_pdf_url(info.pdf_url)
+             if text:
+                 return self.clean_text(text)
+         message = "FULL TEXT NOT AVAILABLE"
+         if fallback_to_abstract:
+             metadata = info.metadata or {}
+             abstract = metadata.get("abstract")
+             if abstract:
+                 return self.clean_text(abstract) + f"\n\n{message}"
+         return message
+
+     def get_full_text_info(self, doi: str) -> Optional[FullTextInfo]:
+         """Attempt to get the full text of a paper using various methods.
+
+         >>> fetcher = DOIFetcher()
+         >>> doi = "10.1128/msystems.00045-18"
+         >>> info = fetcher.get_full_text_info(doi)
+         >>> metadata = info.metadata
+         >>> metadata["type"]
+         'journal-article'
+         >>> metadata["title"][0][0:20]
+         'Exploration of the B'
+         >>> assert info.pdf_url is not None
+         >>> info.pdf_url
+         'https://europepmc.org/articles/pmc6172771?pdf=render'
+
+         Args:
+             doi (str): The DOI to fetch
+
+         Returns:
+             FullTextInfo: Full text information, or None if nothing was found
+
+         """
+         # Get metadata
+         metadata = self.get_metadata(doi)
+
+         # Check Unpaywall
+         unpaywall_data = self.get_unpaywall_info(doi)
+         if unpaywall_data and unpaywall_data.get("is_oa"):
+             locations = unpaywall_data.get("oa_locations", [])
+             if unpaywall_data.get("best_oa_location"):
+                 best_oa_location = unpaywall_data.get("best_oa_location")
+                 locations = [best_oa_location] + locations
+
+             # Find best open access location
+             for location in locations:
+                 pdf_url = location.get("url_for_pdf")
+                 if pdf_url:
+                     return FullTextInfo(text=None, pdf_url=pdf_url, source="unpaywall", metadata=metadata)
+
+         # Fallback: check any configured full-text URL prefixes
+         for url_prefix in self.url_prefixes:
+             url_prefix = url_prefix.rstrip("/")
+             url = f"{url_prefix}/{doi}"
+             try:
+                 response = requests.get(url)
+                 if response.status_code == 200:
+                     soup = BeautifulSoup(response.text, "html.parser")
+                     pdf_embed = soup.find("embed", id="pdf")
+                     if pdf_embed and pdf_embed.get("src"):
+                         pdf_url = pdf_embed["src"]
+                         # Remove any URL parameters after #
+                         pdf_url = pdf_url.split("#")[0]
+                         if not pdf_url.startswith("http"):
+                             pdf_url = "https:" + pdf_url
+                         return FullTextInfo(
+                             pdf_url=pdf_url,
+                             source=url,
+                             metadata=metadata,
+                         )
+             except Exception:
+                 continue
+         return None
+
+     def text_from_pdf_url(self, pdf_url: str, raise_for_status=False) -> Optional[str]:
+         """Extract text from a PDF URL.
+
+         Example:
+             >>> fetcher = DOIFetcher()
+             >>> pdf_url = "https://ceur-ws.org/Vol-1747/IT201_ICBO2016.pdf"
+             >>> text = fetcher.text_from_pdf_url(pdf_url)
+             >>> assert "biosphere" in text
+
+         Args:
+             pdf_url: URL of the PDF to download
+             raise_for_status: Raise an exception if the download fails
+
+         Returns:
+             The extracted text, or None if the download failed
+
+         """
+         session = requests_cache.CachedSession("pdf_cache")
+         # Download the PDF
+         response = session.get(pdf_url)
+         if raise_for_status:
+             response.raise_for_status()
+         if response.status_code != 200:
+             return None
+         # Write the PDF to a temporary file once, then convert it with MarkItDown;
+         # keep a .pdf suffix so the converter can recognize the format
+         with NamedTemporaryFile(delete=False, suffix=".pdf") as tmpf:
+             tmpf.write(response.content)
+             tmp_name = tmpf.name
+         md = MarkItDown()
+         return md.convert(tmp_name).text_content
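
A minimal usage sketch for the DOIFetcher API above, assuming network access and a real contact address in place of the placeholder (the DOI is the one from the module's own doctests):

    from aurelian.utils.doi_fetcher import DOIFetcher

    fetcher = DOIFetcher(email="curator@example.org")  # placeholder address; use your own
    doi = "10.1128/msystems.00045-18"

    metadata = fetcher.get_metadata(doi)
    if metadata:
        print(metadata.get("title"))

    # Falls back to the abstract (plus a marker) when no PDF can be retrieved
    text = fetcher.get_full_text(doi)
    print(text[:200] if text else "nothing found")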
aurelian/utils/ontology_utils.py
@@ -0,0 +1,68 @@
+ import logfire
+ import pystow
+ from cachetools.func import lru_cache
+ from linkml_store.api import Collection
+ from linkml_store.api.stores.duckdb import DuckDBDatabase
+ from linkml_store.index import LLMIndexer
+ from oaklib import BasicOntologyInterface, get_adapter
+
+ llm_indexer = LLMIndexer()
+
+
+ @lru_cache
+ def get_collection_for_adapter(handle: str, name: str) -> Collection:
+     """
+     Retrieve or create a cached ontology collection.
+
+     Args:
+         handle (str): The ontology handle (e.g., `sqlite:obo:uberon`).
+         name (str): The name of the ontology (e.g., `uberon`).
+
+     Returns:
+         Collection: The indexed ontology collection.
+     """
+     adapter = get_adapter(handle)
+     cache_dir = pystow.join("aurelian", "indexes")
+     duckdb_path = str(cache_dir / f"{name}.duckdb")
+     database = DuckDBDatabase(duckdb_path)
+     collection = database.get_collection(name, create_if_not_exists=True)
+
+     if collection.size() > 0:
+         return collection
+
+     objs = [{"id": id, "label": lbl} for id, lbl in adapter.labels(adapter.entities())]
+     collection.insert(objs)
+     return collection
+
+
+ def search_ontology(adapter: BasicOntologyInterface, query: str, limit=10):
+     """
+     Search the ontology for the given query term.
+
+     Example:
+         >>> from oaklib import get_adapter
+         >>> adapter = get_adapter("sqlite:obo:uberon")
+         >>> terms = search_ontology(adapter, "manus")
+         >>> assert len(terms) > 1
+         >>> terms = search_ontology(adapter, "l~digit", limit=5)
+         >>> assert len(terms) == 5
+
+     Args:
+         adapter (BasicOntologyInterface): The ontology adapter.
+         query (str): The query term.
+         limit (int): The maximum number of search results to return.
+
+     Returns:
+         List[Tuple[str, str]]: A list of tuples containing ontology term IDs and labels.
+     """
+     scheme = adapter.resource.scheme
+     name = adapter.resource.slug
+     local_name = name.split(":")[-1]
+     handle = f"{scheme}:{name}"
+
+     collection = get_collection_for_adapter(handle, local_name)
+     with logfire.span("search_ontology {name} {query}", name=name, query=query):
+         print(f"Searching {scheme}:{name} for {query}")
+         qr = collection.search(query, limit=limit, index_name="llm")
+         objs = [(obj["id"], obj["label"]) for obj in qr.rows]
+         return objs
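
A usage sketch for search_ontology, mirroring its doctest; on first use for a given ontology, get_collection_for_adapter builds a DuckDB-backed index under the pystow "aurelian/indexes" directory, so the initial call can be slow:

    from oaklib import get_adapter
    from aurelian.utils.ontology_utils import search_ontology

    adapter = get_adapter("sqlite:obo:uberon")
    for term_id, label in search_ontology(adapter, "manus", limit=5):
        print(term_id, label)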
aurelian/utils/pdf_fetcher.py
@@ -0,0 +1,23 @@
+ import tempfile
+ import requests
+ from pdfminer.high_level import extract_text
+
+
+ def extract_text_from_pdf(pdf_url: str) -> str:
+     """
+     Download and extract text from a PDF given its URL, using a temporary file.
+     """
+     response = requests.get(pdf_url)
+     if response.status_code != 200:
+         return "Error: Unable to retrieve PDF."
+
+     try:
+         with tempfile.NamedTemporaryFile(suffix=".pdf", delete=True) as temp_pdf:
+             temp_pdf.write(response.content)
+             temp_pdf.flush()  # Ensure all data is written before reading
+
+             text = extract_text(temp_pdf.name)
+             return text.strip() if text else "Error: No text extracted from PDF."
+
+     except Exception as e:
+         return f"Error extracting PDF text: {e}"
aurelian/utils/process_logs.py
@@ -0,0 +1,100 @@
+ import json
+ from pathlib import Path
+ from collections import defaultdict
+
+
+ def parse_reportlog(log_path: str):
+     """Parse pytest-reportlog output into structured format."""
+     tests = defaultdict(dict)
+
+     with open(log_path) as f:
+         for line in f:
+             entry = json.loads(line)
+
+             # Only process TestReport entries
+             if entry.get('$report_type') != 'TestReport':
+                 continue
+
+             nodeid = entry['nodeid']
+
+             # Store test outcome
+             if 'outcome' in entry:
+                 tests[nodeid]['outcome'] = entry['outcome']
+
+             # Store duration
+             if 'duration' in entry:
+                 tests[nodeid]['duration'] = entry['duration']
+
+             # Convert user_properties to dict
+             if 'user_properties' in entry:
+                 props = dict(entry['user_properties'])
+                 tests[nodeid]['properties'] = props
+
+             # Store parameters from nodeid
+             # Extract from something like: test_search_ontology[sqlite:obo:bfo-3D spatial-10-expected0]
+             if '[' in nodeid:
+                 param_str = nodeid[nodeid.index('[') + 1:nodeid.rindex(']')]
+                 # You might want to customize this parsing based on your parameter format
+                 tests[nodeid]['parameters'] = param_str
+
+     return tests
+
+
+ def generate_markdown(tests):
+     """Convert test results to markdown documentation."""
+     md = []
+     md.append("# Test Results Documentation\n")
+
+     # Group tests by their base function name
+     test_groups = defaultdict(list)
+     for nodeid, data in tests.items():
+         # Split nodeid into parts: path::function[params]
+         base_name = nodeid.split('::')[1].split('[')[0] if '[' in nodeid else nodeid.split('::')[1]
+         test_groups[base_name].append((nodeid, data))
+
+     for base_name, group in test_groups.items():
+         md.append(f"## {base_name}\n")
+
+         # Create table for all test runs
+         md.append("### Test Runs\n")
+
+         # Headers: Parameters, Properties, Duration, Outcome
+         md.append('| Parameters | Properties | Duration (s) | Outcome |')
+         md.append('|------------|------------|-------------|---------|')
+
+         for nodeid, data in group:
+             # Extract parameters from nodeid
+             params = nodeid.split('[')[1].rstrip(']') if '[' in nodeid else ''
+
+             # Format properties
+             props = data.get('properties', {})
+             props_str = '; '.join(f"{k}: {v}" for k, v in props.items())
+
+             # Format duration
+             duration = f"{data.get('duration', 0):.3f}"
+
+             row = [
+                 params,
+                 props_str,
+                 duration,
+                 data.get('outcome', '')
+             ]
+
+             md.append('| ' + ' | '.join(str(cell) for cell in row) + ' |')
+
+         md.append('')
+     return '\n'.join(md)
+
+
+ # Example usage:
+ if __name__ == '__main__':
+     # Assume report.jsonl exists from running:
+     # pytest test_examples.py --report-log=report.jsonl
+
+     log_path = Path('report.jsonl')
+     tests = parse_reportlog(log_path)
+     markdown = generate_markdown(tests)
+
+     # Write markdown to file
+     with open('docs/unit_tests.md', 'w') as f:
+         f.write(markdown)
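
The intended workflow, per the comments in the __main__ block: run pytest with the pytest-reportlog plugin, then convert the JSONL report to markdown. A sketch, using the file names the script itself assumes:

    # First: pytest test_examples.py --report-log=report.jsonl
    from aurelian.utils.process_logs import parse_reportlog, generate_markdown

    tests = parse_reportlog("report.jsonl")
    print(generate_markdown(tests))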
aurelian/utils/pubmed_utils.py
@@ -0,0 +1,238 @@
+ import re
+ from typing import Optional
+
+ import requests
+ from bs4 import BeautifulSoup
+
+ from aurelian.utils.doi_fetcher import DOIFetcher
+
+ BIOC_URL = "https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_xml/{pmid}/ascii"
+ PUBMED_EUTILS_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id={pmid}&retmode=xml"
+ EFETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id={pmid}&retmode=xml"
+
+ DOI_PATTERN = r"/(10\.\d{4,9}/[\w\-.]+)"
+
+ doi_fetcher = DOIFetcher()
+
+
+ def extract_doi_from_url(url: str) -> Optional[str]:
+     """Extract the DOI from a given journal URL.
+
+     Args:
+         url (str): The URL of the article.
+
+     Returns:
+         str: The extracted DOI if found, otherwise None.
+
+     """
+     doi_match = re.search(DOI_PATTERN, url)
+     return doi_match.group(1) if doi_match else None
+
+
+ def doi_to_pmid(doi: str) -> Optional[str]:
+     """Convert a DOI to a PMID using the NCBI ID Converter API.
+
+     Args:
+         doi (str): The DOI to be converted.
+
+     Returns:
+         str: The corresponding PMID if found, otherwise None.
+
+     """
+     api_url = f"https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?ids={doi}&format=json"
+     response = requests.get(api_url).json()
+     records = response.get("records", [])
+     pmid = records[0].get("pmid", None) if records else None
+     return pmid
+
+
+ def get_doi_text(doi: str) -> str:
+     """Fetch the full text of an article using a DOI.
+
+     TODO: non-PubMed sources
+
+     Example:
+         >>> doi = "10.1128/msystems.00045-18"
+         >>> full_text = get_doi_text(doi)
+         >>> assert "Populus Microbiome" in full_text
+
+     Args:
+         doi: The DOI of the article.
+
+     Returns:
+         The full text of the article if available, otherwise an error message.
+
+     """
+     pmid = doi_to_pmid(doi)
+     if not pmid:
+         text = doi_fetcher.get_full_text(doi)
+         if text:
+             return text
+         else:
+             return f"PMID not found for {doi} and not available via Unpaywall"
+     return get_pmid_text(pmid)
+
+
+ def get_pmid_from_pmcid(pmcid):
+     """Fetch the PMID from a PMC ID using the Entrez E-utilities `esummary`.
+
+     Example:
+         >>> pmcid = "PMC5048378"
+         >>> pmid = get_pmid_from_pmcid(pmcid)
+         >>> print(pmid)
+         27629041
+
+     Args:
+         pmcid: The PMC ID, with or without a "PMC" or CURIE prefix.
+
+     Returns:
+         The PMID as a string if found, otherwise "PMID not found".
+
+     """
+     if ":" in pmcid:
+         pmcid = pmcid.split(":")[1]
+     url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
+     params = {"db": "pmc", "id": pmcid.replace("PMC", ""), "retmode": "json"}  # Remove "PMC" prefix if included
+
+     response = requests.get(url, params=params)
+     data = response.json()
+
+     # Extract PMID
+     try:
+         uid = data["result"]["uids"][0]  # Extract the UID
+         article_ids = data["result"][uid]["articleids"]  # Get article IDs
+         for item in article_ids:
+             if item["idtype"] == "pmid":
+                 return item["value"]
+     except KeyError:
+         pass
+     return "PMID not found"
+
+
+ def get_pmcid_text(pmcid: str) -> str:
+     """Fetch full text from PubMed Central Open Access BioC XML.
+
+     Example:
+         >>> pmcid = "PMC5048378"
+         >>> full_text = get_pmcid_text(pmcid)
+         >>> assert "integrated stress response (ISR)" in full_text
+
+     Args:
+         pmcid: The PMC ID of the article.
+
+     Returns:
+         The full text of the article if available, otherwise the abstract.
+
+     """
+     pmid = get_pmid_from_pmcid(pmcid)
+     return get_pmid_text(pmid)
+
+
+ def get_pmid_text(pmid: str) -> str:
+     """Fetch full text from PubMed Central Open Access BioC XML.
+
+     If full text is not available, fall back to fetching the abstract from PubMed.
+
+     Example:
+         >>> pmid = "11"
+         >>> full_text = get_pmid_text(pmid)
+         >>> print(full_text)
+         Identification of adenylate cyclase-coupled beta-adrenergic receptors with radiolabeled beta-adrenergic antagonists.
+         <BLANKLINE>
+         No abstract available
+
+     Args:
+         pmid: PubMed ID of the article.
+
+     Returns:
+         The full text of the article if available, otherwise the abstract.
+
+     """
+     if ":" in pmid:
+         pmid = pmid.split(":")[1]
+     text = get_full_text_from_bioc(pmid)
+     if not text:
+         doi = pmid_to_doi(pmid)
+         if doi:
+             text = doi_fetcher.get_full_text(doi)
+     if not text:
+         text = get_abstract_from_pubmed(pmid)
+     return text
+
+
+ def pmid_to_doi(pmid: str) -> Optional[str]:
+     """Look up the DOI for a PMID using the Entrez E-utilities `esummary`."""
+     if ":" in pmid:
+         pmid = pmid.split(":")[1]
+     url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id={pmid}&retmode=json"
+     response = requests.get(url)
+     data = response.json()
+
+     try:
+         article_info = data["result"][str(pmid)]
+         for aid in article_info["articleids"]:
+             if aid["idtype"] == "doi":
+                 return aid["value"]
+         elocationid = article_info.get("elocationid", "")
+         if elocationid.startswith("10."):  # DOI starts with "10."
+             return elocationid
+         return None
+     except KeyError:
+         return None
+
+
+ def get_full_text_from_bioc(pmid: str) -> str:
+     """Fetch full text from PubMed Central Open Access BioC XML.
+
+     Example:
+         >>> pmid = "17299597"
+         >>> full_text = get_full_text_from_bioc(pmid)
+         >>> assert "Evolution of biological complexity." in full_text
+
+     Args:
+         pmid: PubMed ID of the article.
+
+     Returns:
+         The full text of the article if available, otherwise an empty string.
+
+     """
+     response = requests.get(BIOC_URL.format(pmid=pmid))
+
+     if response.status_code != 200:
+         return ""  # Return empty string if request fails
+
+     soup = BeautifulSoup(response.text, "xml")
+
+     # Extract ONLY text from <text> tags within <passage>
+     text_sections = [text_tag.get_text() for text_tag in soup.find_all("text")]
+
+     full_text = "\n".join(text_sections).strip()
+     return full_text
+
+
+ def get_abstract_from_pubmed(pmid: str) -> str:
+     """Fetch the title and abstract of an article from PubMed using Entrez E-utilities `efetch`.
+
+     Example:
+         >>> pmid = "31653696"
+         >>> abstract = get_abstract_from_pubmed(pmid)
+         >>> assert "The apparent deglycase activity of DJ-1" in abstract
+
+     Args:
+         pmid: PubMed ID of the article.
+
+     Returns:
+         The title and abstract text if available, otherwise an empty string.
+
+     """
+     response = requests.get(EFETCH_URL.format(pmid=pmid))
+
+     if response.status_code != 200:
+         return ""
+
+     soup = BeautifulSoup(response.text, "xml")
+
+     # Extract title
+     title_tag = soup.find("ArticleTitle")
+     title = title_tag.get_text().strip() if title_tag else "No title available"
+
+     # Extract abstract (may contain multiple sections)
+     abstract_tags = soup.find_all("AbstractText")
+     abstract = "\n".join(tag.get_text().strip() for tag in abstract_tags) if abstract_tags else "No abstract available"
+
+     return f"{title}\n\n{abstract}"