ursa-ai 0.0.3__tar.gz → 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of ursa-ai might be problematic.

Files changed (38)
  1. ursa_ai-0.2.1/LICENSE +8 -0
  2. ursa_ai-0.2.1/PKG-INFO +118 -0
  3. ursa_ai-0.2.1/README.md +77 -0
  4. ursa_ai-0.2.1/pyproject.toml +76 -0
  5. ursa_ai-0.2.1/src/ursa/agents/__init__.py +10 -0
  6. ursa_ai-0.2.1/src/ursa/agents/arxiv_agent.py +349 -0
  7. ursa_ai-0.2.1/src/ursa/agents/base.py +42 -0
  8. ursa_ai-0.2.1/src/ursa/agents/code_review_agent.py +332 -0
  9. ursa_ai-0.2.1/src/ursa/agents/execution_agent.py +497 -0
  10. ursa_ai-0.2.1/src/ursa/agents/hypothesizer_agent.py +597 -0
  11. ursa_ai-0.2.1/src/ursa/agents/mp_agent.py +257 -0
  12. ursa_ai-0.2.1/src/ursa/agents/planning_agent.py +138 -0
  13. ursa_ai-0.2.1/src/ursa/agents/recall_agent.py +25 -0
  14. ursa_ai-0.2.1/src/ursa/agents/websearch_agent.py +193 -0
  15. ursa_ai-0.2.1/src/ursa/prompt_library/code_review_prompts.py +51 -0
  16. ursa_ai-0.2.1/src/ursa/prompt_library/execution_prompts.py +36 -0
  17. ursa_ai-0.2.1/src/ursa/prompt_library/hypothesizer_prompts.py +17 -0
  18. ursa_ai-0.2.1/src/ursa/prompt_library/literature_prompts.py +11 -0
  19. ursa_ai-0.2.1/src/ursa/prompt_library/planning_prompts.py +79 -0
  20. ursa_ai-0.2.1/src/ursa/prompt_library/websearch_prompts.py +131 -0
  21. ursa_ai-0.2.1/src/ursa/tools/run_command.py +27 -0
  22. ursa_ai-0.2.1/src/ursa/tools/write_code.py +42 -0
  23. ursa_ai-0.2.1/src/ursa/util/diff_renderer.py +121 -0
  24. ursa_ai-0.2.1/src/ursa/util/memory_logger.py +171 -0
  25. ursa_ai-0.2.1/src/ursa/util/parse.py +89 -0
  26. ursa_ai-0.2.1/src/ursa_ai.egg-info/PKG-INFO +118 -0
  27. ursa_ai-0.2.1/src/ursa_ai.egg-info/SOURCES.txt +29 -0
  28. ursa_ai-0.2.1/src/ursa_ai.egg-info/requires.txt +21 -0
  29. ursa_ai-0.0.3/PKG-INFO +0 -7
  30. ursa_ai-0.0.3/README.md +0 -0
  31. ursa_ai-0.0.3/pyproject.toml +0 -33
  32. ursa_ai-0.0.3/src/ursa/__init__.py +0 -2
  33. ursa_ai-0.0.3/src/ursa/py.typed +0 -0
  34. ursa_ai-0.0.3/src/ursa_ai.egg-info/PKG-INFO +0 -7
  35. ursa_ai-0.0.3/src/ursa_ai.egg-info/SOURCES.txt +0 -8
  36. {ursa_ai-0.0.3 → ursa_ai-0.2.1}/setup.cfg +0 -0
  37. {ursa_ai-0.0.3 → ursa_ai-0.2.1}/src/ursa_ai.egg-info/dependency_links.txt +0 -0
  38. {ursa_ai-0.0.3 → ursa_ai-0.2.1}/src/ursa_ai.egg-info/top_level.txt +0 -0
ursa_ai-0.2.1/LICENSE ADDED
@@ -0,0 +1,8 @@
+ This program is Open-Source under the BSD-3 License.
+
+ Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+ Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+ Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+ Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ursa_ai-0.2.1/PKG-INFO ADDED
@@ -0,0 +1,118 @@
+ Metadata-Version: 2.4
+ Name: ursa-ai
+ Version: 0.2.1
+ Summary: Agents for science at LANL
+ Author-email: Mike Grosskopf <mikegros@lanl.gov>, Rahul Somasundaram <rsomasundaram@lanl.gov>, Arthur Lui <alui@lanl.gov>
+ Project-URL: Homepage, https://github.com/lanl/ursa
+ Project-URL: Repository, https://github.com/lanl/ursa
+ Project-URL: Issues, https://github.com/lanl/ursa/issues
+ Classifier: Operating System :: OS Independent
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Classifier: Programming Language :: Python :: 3.14
+ Requires-Python: >=3.10
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: arxiv<3.0,>=2.2.0
+ Requires-Dist: beautifulsoup4<5.0,>=4.13.4
+ Requires-Dist: coolname<3.0,>=2.2.0
+ Requires-Dist: langchain<0.4,>=0.3.22
+ Requires-Dist: langchain-community<0.4,>=0.3.20
+ Requires-Dist: langchain-litellm>=0.2.2
+ Requires-Dist: langchain-openai<0.4,>=0.3.12
+ Requires-Dist: langgraph>=0.5
+ Requires-Dist: pandas<3.0,>=2.2.3
+ Requires-Dist: pillow>=11.2.1
+ Requires-Dist: pymupdf<2.0,>=1.26.0
+ Requires-Dist: pypdf<6.0,>=5.4.0
+ Requires-Dist: rich<14.0,>=13.7.0
+ Requires-Dist: langchain-chroma>=0.2.4
+ Requires-Dist: chromadb>=1.0.15
+ Requires-Dist: mp-api>=0.45.8
+ Requires-Dist: langchain-google-genai>=2.1.9
+ Requires-Dist: langchain-anthropic>=0.3.18
+ Requires-Dist: langgraph-checkpoint-sqlite>=2.0.10
+ Requires-Dist: duckduckgo-search>=8.1.1
+ Requires-Dist: langchain-ollama>=0.3.6
+ Dynamic: license-file
+
+ # URSA - The Universal Research and Scientific Agent
+
+ <img src="./logos/logo.png" alt="URSA Logo" width="200" height="200">
+
+ A flexible agentic workflow for accelerating scientific tasks.
+ URSA composes information flow between agents for planning, code writing and execution, and online research to solve complex problems.
+
+ ## Installation
+ You can install `ursa` via `pip` or `uv`.
+
+ **pip**
+ ```bash
+ pip install ursa-ai
+ ```
+
+ **uv**
+ ```bash
+ uv add ursa-ai
+ ```
+
+ ## How to use this code
+ Full documentation is forthcoming. For now, the examples folder shows how to set up basic
+ problems and how to pass results from one agent to another. Multi-agent graphs and similar
+ features will be added in the future.
+
+ Documentation for each URSA agent:
+ - [Planning Agent](docs/planning_agent.md)
+ - [Execution Agent](docs/execution_agent.md)
+ - [ArXiv Agent](docs/arxiv_agent.md)
+ - [Web Search Agent](docs/web_search_agent.md)
+ - [Hypothesizer Agent](docs/hypothesizer_agent.md)
+
+ Documentation for combining agents:
+ - [ArXiv -> Execution for Materials](docs/combining_arxiv_and_execution.md)
+ - [ArXiv -> Execution for Neutron Star Properties](docs/combining_arxiv_and_execution_neutronStar.md)
+
+ ## Sandboxing
+ The Execution Agent can run system commands and write and execute code. Executing arbitrary
+ system commands or code has the potential to:
+ - Damage code or data on your computer
+ - Damage the computer itself
+ - Transmit your local data
+
+ The Web Search Agent scrapes data from URLs, so it may pull information from questionable sources.
+
+ Some suggestions for sandboxing the agent:
+ - Creating an environment that limits URSA's access to only what you intend. Examples:
+   - Creating/using a virtual machine that is sandboxed from the rest of your machine
+   - Creating a new account on your machine specifically for URSA
+ - Creating a network blacklist/whitelist to ensure that network commands and web scraping are restricted to safe sources
+
+ You are responsible for ensuring that you use URSA responsibly.
+
+ ## Development Dependencies
+
+ * [`uv`](https://docs.astral.sh/uv/)
+   * `uv` is an extremely fast Python package and project manager, written in Rust.
+     Follow the installation instructions
+     [here](https://docs.astral.sh/uv/getting-started/installation/)
+
+ * [`just`](https://github.com/casey/just)
+   * After installing `uv`, you can install `just` with `uv tool install rust-just`
+
+ ## Development Team
+
+ URSA has been developed at Los Alamos National Laboratory as part of the ArtIMis project.
+
+ <img src="./logos/artimis.png" alt="ArtIMis Logo" width="200" height="200">
+
+ ### Notice of Copyright Assertion (O4958):
+ *This program is Open-Source under the BSD-3 License.
+ Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:*
+ - *Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.*
+ - *Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.*
+ - *Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.*
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ursa_ai-0.2.1/README.md ADDED
@@ -0,0 +1,77 @@
+ # URSA - The Universal Research and Scientific Agent
+
+ <img src="./logos/logo.png" alt="URSA Logo" width="200" height="200">
+
+ A flexible agentic workflow for accelerating scientific tasks.
+ URSA composes information flow between agents for planning, code writing and execution, and online research to solve complex problems.
+
+ ## Installation
+ You can install `ursa` via `pip` or `uv`.
+
+ **pip**
+ ```bash
+ pip install ursa-ai
+ ```
+
+ **uv**
+ ```bash
+ uv add ursa-ai
+ ```
+
+ ## How to use this code
+ Full documentation is forthcoming. For now, the examples folder shows how to set up basic
+ problems and how to pass results from one agent to another (a short sketch follows the
+ documentation links below). Multi-agent graphs and similar features will be added in the future.
+
+ Documentation for each URSA agent:
+ - [Planning Agent](docs/planning_agent.md)
+ - [Execution Agent](docs/execution_agent.md)
+ - [ArXiv Agent](docs/arxiv_agent.md)
+ - [Web Search Agent](docs/web_search_agent.md)
+ - [Hypothesizer Agent](docs/hypothesizer_agent.md)
+
+ Documentation for combining agents:
+ - [ArXiv -> Execution for Materials](docs/combining_arxiv_and_execution.md)
+ - [ArXiv -> Execution for Neutron Star Properties](docs/combining_arxiv_and_execution_neutronStar.md)
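+
+ The hand-off between agents in these combined examples looks roughly like the sketch below. This is a minimal illustration: `ArxivAgent.run(arxiv_search_query, context)` matches the source in `arxiv_agent.py` later in this diff, but the `ExecutionAgent.run` call is an assumption based on the package's exports, not confirmed here.
+
+ ```python
+ # Sketch: pass one agent's output to another (ExecutionAgent API assumed).
+ from ursa.agents import ArxivAgent, ExecutionAgent
+
+ # Literature-review step; run() returns the aggregated summary string.
+ literature = ArxivAgent(llm="openai/o3-mini", max_results=3).run(
+     arxiv_search_query="Experimental constraints on neutron star radius",
+     context="Summarize current constraints on the neutron star radius.",
+ )
+
+ # Hand the summary to a downstream agent for code writing/execution.
+ # The exact ExecutionAgent interface is an assumption, not confirmed here.
+ executor = ExecutionAgent(llm="openai/o3-mini")
+ result = executor.run(
+     "Using this literature summary, plot the reported radius constraints:\n"
+     + literature
+ )
+ ```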
+
+ ## Sandboxing
+ The Execution Agent can run system commands and write and execute code. Executing arbitrary
+ system commands or code has the potential to:
+ - Damage code or data on your computer
+ - Damage the computer itself
+ - Transmit your local data
+
+ The Web Search Agent scrapes data from URLs, so it may pull information from questionable sources.
+
+ Some suggestions for sandboxing the agent (a container-based sketch follows this list):
+ - Creating an environment that limits URSA's access to only what you intend. Examples:
+   - Creating/using a virtual machine that is sandboxed from the rest of your machine
+   - Creating a new account on your machine specifically for URSA
+ - Creating a network blacklist/whitelist to ensure that network commands and web scraping are restricted to safe sources
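+
+ As one deliberately conservative sketch of the container approach, the following runs an URSA script inside Docker with networking disabled and a single directory mounted. The image name and script are placeholders, and note that with `--network none` the agents cannot reach LLM APIs, so in practice you would substitute an allow-listed network or a whitelisting proxy.
+
+ ```python
+ import os
+ import subprocess
+
+ # Run a hypothetical URSA task inside an isolated container.
+ subprocess.run(
+     [
+         "docker", "run", "--rm",
+         "--network", "none",                     # block all network access
+         "-v", f"{os.getcwd()}/workspace:/work",  # expose only ./workspace
+         "-w", "/work",
+         "python:3.12-slim",                      # placeholder image
+         "python", "run_ursa_task.py",            # placeholder script
+     ],
+     check=True,
+ )
+ ```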
+
+ You are responsible for ensuring that you use URSA responsibly.
+
+ ## Development Dependencies
+
+ * [`uv`](https://docs.astral.sh/uv/)
+   * `uv` is an extremely fast Python package and project manager, written in Rust.
+     Follow the installation instructions
+     [here](https://docs.astral.sh/uv/getting-started/installation/)
+
+ * [`just`](https://github.com/casey/just)
+   * After installing `uv`, you can install `just` with `uv tool install rust-just`
+
+ ## Development Team
+
+ URSA has been developed at Los Alamos National Laboratory as part of the ArtIMis project.
+
+ <img src="./logos/artimis.png" alt="ArtIMis Logo" width="200" height="200">
+
+ ### Notice of Copyright Assertion (O4958):
+ *This program is Open-Source under the BSD-3 License.
+ Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:*
+ - *Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.*
+ - *Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.*
+ - *Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.*
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ursa_ai-0.2.1/pyproject.toml ADDED
@@ -0,0 +1,76 @@
+ [project]
+ name = "ursa-ai"
+ dynamic = ["version"]
+ description = "Agents for science at LANL"
+ readme = "README.md"
+ authors = [
+     { name = "Mike Grosskopf", email = "mikegros@lanl.gov" },
+     { name = "Rahul Somasundaram", email = "rsomasundaram@lanl.gov" },
+     { name = "Arthur Lui", email = "alui@lanl.gov" }
+ ]
+ requires-python = ">=3.10"
+ dependencies = [
+     "arxiv>=2.2.0,<3.0",
+     "beautifulsoup4>=4.13.4,<5.0",
+     "coolname>=2.2.0,<3.0",
+     "langchain>=0.3.22,<0.4",
+     "langchain-community>=0.3.20,<0.4",
+     "langchain-litellm>=0.2.2",
+     "langchain-openai>=0.3.12,<0.4",
+     "langgraph>=0.5",
+     "pandas>=2.2.3,<3.0",
+     "pillow>=11.2.1",
+     "pymupdf>=1.26.0,<2.0",
+     "pypdf>=5.4.0,<6.0",
+     "rich>=13.7.0,<14.0",
+     "langchain-chroma>=0.2.4",
+     "chromadb>=1.0.15",
+     "mp-api>=0.45.8",
+     "langchain-google-genai>=2.1.9",
+     "langchain-anthropic>=0.3.18",
+     "langgraph-checkpoint-sqlite>=2.0.10",
+     "duckduckgo-search>=8.1.1",
+     "langchain-ollama>=0.3.6",
+ ]
+ classifiers = [
+     "Operating System :: OS Independent",
+     "License :: OSI Approved :: MIT License",
+     "Programming Language :: Python :: 3.10",
+     "Programming Language :: Python :: 3.11",
+     "Programming Language :: Python :: 3.12",
+     "Programming Language :: Python :: 3.13",
+     "Programming Language :: Python :: 3.14",
+ ]
+
+ [project.urls]
+ Homepage = "https://github.com/lanl/ursa"
+ Repository = "https://github.com/lanl/ursa"
+ Issues = "https://github.com/lanl/ursa/issues"
+
+ [build-system]
+ requires = ["setuptools>=74.1", "setuptools-git-versioning>=2.0,<3"]
+ build-backend = "setuptools.build_meta"
+
+ [tool.setuptools-git-versioning]
+ enabled = true
+
+ [tool.ruff]
+ line-length = 80
+
+ [tool.ruff.lint]
+ ignore = ["D100"]
+ extend-select = ["I", "W505"] # "D"
+ extend-unsafe-fixes = ["F401"]
+ pydocstyle.convention = "numpy"
+ pycodestyle.max-doc-length = 80
+
+ # Ignore test file documentation linting.
+ [tool.ruff.lint.extend-per-file-ignores]
+ "tests/**/*.py" = ["D"]
+
+ [dependency-groups]
+ dev = [
+     "langgraph-checkpoint-sqlite>=2.0.10",
+     "notebook>=7.3.3",
+     "scikit-optimize>=0.10.2",
+ ]
ursa_ai-0.2.1/src/ursa/agents/__init__.py ADDED
@@ -0,0 +1,10 @@
+ from .planning_agent import PlanningAgent, PlanningState
+ from .websearch_agent import WebSearchAgent, WebSearchState
+ from .execution_agent import ExecutionAgent, ExecutionState
+ from .code_review_agent import CodeReviewAgent, CodeReviewState
+ from .hypothesizer_agent import HypothesizerAgent, HypothesizerState
+ from .arxiv_agent import ArxivAgent, PaperState, PaperMetadata
+ from .recall_agent import RecallAgent
+ from .base import BaseAgent, BaseChatModel
+ from .mp_agent import MaterialsProjectAgent
+
ursa_ai-0.2.1/src/ursa/agents/arxiv_agent.py ADDED
@@ -0,0 +1,349 @@
+ import os
+ import pymupdf
+ import requests
+ import feedparser
+ from PIL import Image
+ from io import BytesIO
+ import base64
+ from urllib.parse import quote
+ from typing_extensions import TypedDict, List
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from tqdm import tqdm
+ import statistics
+ import re
+
+ from langchain_community.document_loaders import PyPDFLoader
+ from langchain_core.output_parsers import StrOutputParser
+ from langchain_core.prompts import ChatPromptTemplate
+ from langgraph.graph import StateGraph, END, START
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_chroma import Chroma
+
+ from .base import BaseAgent
+
+ # OpenAI is optional; image descriptions are skipped if it is unavailable.
+ try:
+     from openai import OpenAI
+ except ImportError:
+     pass
+
+ # embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
+ # embeddings = OpenAIEmbeddings()
+
+ class PaperMetadata(TypedDict):
+     arxiv_id: str
+     full_text: str
+
+ class PaperState(TypedDict, total=False):
+     query: str
+     context: str
+     papers: List[PaperMetadata]
+     summaries: List[str]
+     final_summary: str
+
+
+ def describe_image(image: Image.Image) -> str:
+     if 'OpenAI' not in globals():
+         print("Vision transformer for summarizing images currently only implemented for OpenAI API.")
+         return ""
+     client = OpenAI()
+
+     buffered = BytesIO()
+     image.save(buffered, format="PNG")
+     img_base64 = base64.b64encode(buffered.getvalue()).decode()
+
+     response = client.chat.completions.create(
+         model="gpt-4-vision-preview",
+         messages=[
+             {"role": "system", "content": "You are a scientific assistant who explains plots and scientific diagrams."},
+             {
+                 "role": "user",
+                 "content": [
+                     {"type": "text", "text": "Describe this scientific image or plot in detail."},
+                     {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_base64}"}}
+                 ],
+             },
+         ],
+         max_tokens=500,
+     )
+     return response.choices[0].message.content.strip()
+
+
+ def extract_and_describe_images(pdf_path: str, max_images: int = 5) -> List[str]:
+     doc = pymupdf.open(pdf_path)
+     descriptions = []
+     image_count = 0
+
+     for page_index in range(len(doc)):
+         if image_count >= max_images:
+             break
+         page = doc[page_index]
+         images = page.get_images(full=True)
+
+         for img_index, img in enumerate(images):
+             if image_count >= max_images:
+                 break
+             xref = img[0]
+             base_image = doc.extract_image(xref)
+             image_bytes = base_image["image"]
+             image = Image.open(BytesIO(image_bytes))
+
+             try:
+                 desc = describe_image(image)
+                 descriptions.append(f"Page {page_index + 1}, Image {img_index + 1}: {desc}")
+             except Exception as e:
+                 descriptions.append(f"Page {page_index + 1}, Image {img_index + 1}: [Error: {e}]")
+             image_count += 1
+
+     return descriptions
+
+
+ def remove_surrogates(text: str) -> str:
+     return re.sub(r'[\ud800-\udfff]', '', text)
+
+
+ class ArxivAgent(BaseAgent):
+     def __init__(self,
+                  llm="openai/o3-mini",
+                  summarize: bool = True,
+                  process_images=True,
+                  max_results: int = 3,
+                  download_papers: bool = True,
+                  rag_embedding=None,
+                  database_path='arxiv_papers',
+                  summaries_path='arxiv_generated_summaries',
+                  vectorstore_path='arxiv_vectorstores',
+                  **kwargs):
+
+         super().__init__(llm, **kwargs)
+         self.summarize = summarize
+         self.process_images = process_images
+         self.max_results = max_results
+         self.database_path = database_path
+         self.summaries_path = summaries_path
+         self.vectorstore_path = vectorstore_path
+         self.download_papers = download_papers
+         self.rag_embedding = rag_embedding
+
+         self.graph = self._build_graph()
+
+         os.makedirs(self.database_path, exist_ok=True)
+         os.makedirs(self.summaries_path, exist_ok=True)
+
+     def _fetch_papers(self, query: str) -> List[PaperMetadata]:
+         if self.download_papers:
+             encoded_query = quote(query)
+             url = f"http://export.arxiv.org/api/query?search_query=all:{encoded_query}&start=0&max_results={self.max_results}"
+             feed = feedparser.parse(url)
+
+             for i, entry in enumerate(feed.entries):
+                 full_id = entry.id.split('/abs/')[-1]
+                 arxiv_id = full_id.split('/')[-1]
+                 title = entry.title.strip()
+                 authors = ", ".join(author.name for author in entry.authors)
+                 pdf_url = f"https://arxiv.org/pdf/{full_id}.pdf"
+                 pdf_filename = os.path.join(self.database_path, f"{arxiv_id}.pdf")
+
+                 if os.path.exists(pdf_filename):
+                     print(f"Paper # {i+1}, Title: {title}, already exists in database")
+                 else:
+                     print(f"Downloading paper # {i+1}, Title: {title}")
+                     response = requests.get(pdf_url)
+                     with open(pdf_filename, 'wb') as f:
+                         f.write(response.content)
+
+         papers = []
+         pdf_files = [f for f in os.listdir(self.database_path) if f.lower().endswith(".pdf")]
+
+         for i, pdf_filename in enumerate(pdf_files):
+             full_text = ""
+             arxiv_id = pdf_filename.split('.pdf')[0]
+             vec_save_loc = self.vectorstore_path + '/' + arxiv_id
+
+             if self.summarize and not os.path.exists(vec_save_loc):
+                 try:
+                     loader = PyPDFLoader(os.path.join(self.database_path, pdf_filename))
+                     pages = loader.load()
+                     full_text = "\n".join([p.page_content for p in pages])
+
+                     if self.process_images:
+                         image_descriptions = extract_and_describe_images(os.path.join(self.database_path, pdf_filename))
+                         full_text += "\n\n[Image Interpretations]\n" + "\n".join(image_descriptions)
+
+                 except Exception as e:
+                     full_text = f"Error loading paper: {e}"
+
+             papers.append({
+                 "arxiv_id": arxiv_id,
+                 "full_text": full_text,
+             })
+
+         return papers
+
187
+ def _fetch_node(self, state: PaperState) -> PaperState:
188
+ papers = self._fetch_papers(state["query"])
189
+ return {**state, "papers": papers}
190
+
191
+
192
+ def _get_or_build_vectorstore(self, paper_text: str, arxiv_id: str):
193
+ os.makedirs(self.vectorstore_path, exist_ok=True)
194
+
195
+ persist_directory = os.path.join(self.vectorstore_path, arxiv_id)
196
+
197
+ if os.path.exists(persist_directory):
198
+ vectorstore = Chroma(persist_directory=persist_directory, embedding_function=self.rag_embedding)
199
+ else:
200
+ splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
201
+ docs = splitter.create_documents([paper_text])
202
+ vectorstore = Chroma.from_documents(docs, self.rag_embedding, persist_directory=persist_directory)
203
+
204
+ return vectorstore.as_retriever(search_kwargs={"k": 5})
205
+
206
+
207
+     def _summarize_node(self, state: PaperState) -> PaperState:
+         # Guard first: indexing state["papers"] below would raise if fetching failed.
+         if 'papers' not in state or len(state['papers']) == 0:
+             print("No papers retrieved - bad query or network connection to ArXiv?")
+             return {**state, "summaries": None}
+
+         prompt = ChatPromptTemplate.from_template("""
+         You are a scientific assistant responsible for summarizing extracts from research papers, in the context of the following task: {context}
+
+         Summarize the retrieved scientific content below.
+
+         {retrieved_content}
+         """)
+
+         chain = prompt | self.llm | StrOutputParser()
+
+         summaries = [None] * len(state["papers"])
+         relevancy_scores = [0.0] * len(state["papers"])
+
+         def process_paper(i, paper):
+             arxiv_id = paper["arxiv_id"]
+             summary_filename = os.path.join(self.summaries_path, f"{arxiv_id}_summary.txt")
+
+             try:
+                 cleaned_text = remove_surrogates(paper["full_text"])
+                 if self.rag_embedding:
+                     retriever = self._get_or_build_vectorstore(cleaned_text, arxiv_id)
+
+                     relevant_docs_with_scores = retriever.vectorstore.similarity_search_with_score(state["context"], k=5)
+
+                     if relevant_docs_with_scores:
+                         score = sum([s for _, s in relevant_docs_with_scores]) / len(relevant_docs_with_scores)
+                         relevancy_scores[i] = abs(1.0 - score)
+                     else:
+                         relevancy_scores[i] = 0.0
+
+                     retrieved_content = "\n\n".join([doc.page_content for doc, _ in relevant_docs_with_scores])
+                 else:
+                     retrieved_content = cleaned_text
+
+                 summary = chain.invoke({"retrieved_content": retrieved_content, "context": state["context"]})
+
+             except Exception as e:
+                 summary = f"Error summarizing paper: {e}"
+                 relevancy_scores[i] = 0.0
+
+             with open(summary_filename, "w") as f:
+                 f.write(summary)
+
+             return i, summary
+
+         with ThreadPoolExecutor(max_workers=min(32, len(state["papers"]))) as executor:
+             futures = [executor.submit(process_paper, i, paper) for i, paper in enumerate(state["papers"])]
+
+             for future in tqdm(as_completed(futures), total=len(futures), desc="Summarizing Papers"):
+                 i, result = future.result()
+                 summaries[i] = result
+
+         if self.rag_embedding:
+             print(f"\nMax Relevancy Score: {max(relevancy_scores)}")
+             print(f"Min Relevancy Score: {min(relevancy_scores)}")
+             print(f"Median Relevancy Score: {statistics.median(relevancy_scores)}\n")
+
+         return {**state, "summaries": summaries}
+
+
+     def _aggregate_node(self, state: PaperState) -> PaperState:
+         # Guard before indexing: _summarize_node may have set summaries to None.
+         if state.get("summaries") is None or state.get("papers") is None:
+             return {**state, "final_summary": None}
+
+         summaries = state["summaries"]
+         papers = state["papers"]
+         formatted = []
+
+         for i, (paper, summary) in enumerate(zip(papers, summaries)):
+             citation = f"[{i+1}] Arxiv ID: {paper['arxiv_id']}"
+             formatted.append(f"{citation}\n\nSummary:\n{summary}")
+
+         combined = "\n\n" + ("\n\n" + "-" * 40 + "\n\n").join(formatted)
+
+         with open(self.summaries_path + '/summaries_combined.txt', "w") as f:
+             f.write(combined)
+
+         prompt = ChatPromptTemplate.from_template("""
+         You are a scientific assistant helping extract insights from summaries of research papers.
+
+         Here are the summaries of a large number of extracts from scientific papers:
+
+         {Summaries}
+
+         Your task is to read all the summaries and provide a response to this task: {context}
+         """)
+
+         chain = prompt | self.llm | StrOutputParser()
+
+         final_summary = chain.invoke({"Summaries": combined, "context": state["context"]})
+
+         with open(self.summaries_path + '/final_summary.txt', "w") as f:
+             f.write(final_summary)
+
+         return {**state, "final_summary": final_summary}
+
+
+     def _build_graph(self):
+         builder = StateGraph(PaperState)
+         builder.add_node("fetch_papers", self._fetch_node)
+
+         if self.summarize:
+             builder.add_node("summarize_each", self._summarize_node)
+             builder.add_node("aggregate", self._aggregate_node)
+
+             builder.set_entry_point("fetch_papers")
+             builder.add_edge("fetch_papers", "summarize_each")
+             builder.add_edge("summarize_each", "aggregate")
+             builder.set_finish_point("aggregate")
+         else:
+             builder.set_entry_point("fetch_papers")
+             builder.set_finish_point("fetch_papers")
+
+         graph = builder.compile()
+         return graph
+
+     def run(self, arxiv_search_query: str, context: str) -> str:
+         result = self.graph.invoke({"query": arxiv_search_query, "context": context})
+
+         if self.summarize:
+             return result.get("final_summary", "No summary generated.")
+         else:
+             return "\n\nFinished Fetching papers!"
+
+
+ if __name__ == "__main__":
+     agent = ArxivAgent()
+     result = agent.run(arxiv_search_query="Experimental Constraints on neutron star radius",
+                        context="What are the constraints on the neutron star radius and what uncertainties are there on the constraints?")
+
+     print(result)
ursa_ai-0.2.1/src/ursa/agents/base.py ADDED
@@ -0,0 +1,42 @@
+ from langchain_core.language_models.chat_models import BaseChatModel
+ from langchain_litellm import ChatLiteLLM
+ from langgraph.checkpoint.base import BaseCheckpointSaver
+ from langchain_core.load import dumps
+
+ import json
+
+ class BaseAgent:
+     # llm: BaseChatModel
+     # llm_with_tools: Runnable[LanguageModelInput, BaseMessage]
+
+     def __init__(
+         self,
+         llm: str | BaseChatModel,
+         checkpointer: BaseCheckpointSaver = None,
+         **kwargs,
+     ):
+         match llm:
+             case BaseChatModel():
+                 self.llm = llm
+
+             case str():
+                 self.llm_provider, self.llm_model = llm.split("/")
+                 self.llm = ChatLiteLLM(
+                     model=llm,
+                     max_tokens=kwargs.pop("max_tokens", 10000),
+                     max_retries=kwargs.pop("max_retries", 2),
+                     **kwargs,
+                 )
+
+             case _:
+                 raise TypeError(
+                     "llm argument must be a string with the provider and model, or a BaseChatModel instance."
+                 )
+
+         self.checkpointer = checkpointer
+         self.thread_id = self.__class__.__name__
+
+     def write_state(self, filename, state):
+         json_state = dumps(state, ensure_ascii=False)
+         with open(filename, "w") as f:
+             f.write(json_state)
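
A note on the `llm` argument handled by `BaseAgent.__init__` above: a `"provider/model"` string is wrapped in `ChatLiteLLM`, while any existing `BaseChatModel` is used as-is. A minimal usage sketch (shown with `ArxivAgent` from this release; the `ChatOpenAI` import assumes the `langchain-openai` dependency declared in `pyproject.toml`):

```python
from langchain_openai import ChatOpenAI
from ursa.agents import ArxivAgent

# 1) "provider/model" string: wrapped in ChatLiteLLM internally;
#    extra kwargs like max_tokens/max_retries are forwarded to it.
agent = ArxivAgent(llm="openai/o3-mini", max_tokens=10000)

# 2) Any BaseChatModel instance is used as-is.
agent = ArxivAgent(llm=ChatOpenAI(model="gpt-4o"))
```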