ursa-ai 0.0.3__tar.gz → 0.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ursa-ai might be problematic. Click here for more details.

Files changed (38) hide show
  1. ursa_ai-0.2.2/LICENSE +8 -0
  2. ursa_ai-0.2.2/PKG-INFO +130 -0
  3. ursa_ai-0.2.2/README.md +88 -0
  4. ursa_ai-0.2.2/pyproject.toml +77 -0
  5. ursa_ai-0.2.2/src/ursa/agents/__init__.py +10 -0
  6. ursa_ai-0.2.2/src/ursa/agents/arxiv_agent.py +349 -0
  7. ursa_ai-0.2.2/src/ursa/agents/base.py +42 -0
  8. ursa_ai-0.2.2/src/ursa/agents/code_review_agent.py +332 -0
  9. ursa_ai-0.2.2/src/ursa/agents/execution_agent.py +497 -0
  10. ursa_ai-0.2.2/src/ursa/agents/hypothesizer_agent.py +597 -0
  11. ursa_ai-0.2.2/src/ursa/agents/mp_agent.py +257 -0
  12. ursa_ai-0.2.2/src/ursa/agents/planning_agent.py +138 -0
  13. ursa_ai-0.2.2/src/ursa/agents/recall_agent.py +25 -0
  14. ursa_ai-0.2.2/src/ursa/agents/websearch_agent.py +193 -0
  15. ursa_ai-0.2.2/src/ursa/prompt_library/code_review_prompts.py +51 -0
  16. ursa_ai-0.2.2/src/ursa/prompt_library/execution_prompts.py +36 -0
  17. ursa_ai-0.2.2/src/ursa/prompt_library/hypothesizer_prompts.py +17 -0
  18. ursa_ai-0.2.2/src/ursa/prompt_library/literature_prompts.py +11 -0
  19. ursa_ai-0.2.2/src/ursa/prompt_library/planning_prompts.py +79 -0
  20. ursa_ai-0.2.2/src/ursa/prompt_library/websearch_prompts.py +131 -0
  21. ursa_ai-0.2.2/src/ursa/tools/run_command.py +27 -0
  22. ursa_ai-0.2.2/src/ursa/tools/write_code.py +42 -0
  23. ursa_ai-0.2.2/src/ursa/util/diff_renderer.py +121 -0
  24. ursa_ai-0.2.2/src/ursa/util/memory_logger.py +171 -0
  25. ursa_ai-0.2.2/src/ursa/util/parse.py +89 -0
  26. ursa_ai-0.2.2/src/ursa_ai.egg-info/PKG-INFO +130 -0
  27. ursa_ai-0.2.2/src/ursa_ai.egg-info/SOURCES.txt +29 -0
  28. ursa_ai-0.2.2/src/ursa_ai.egg-info/requires.txt +21 -0
  29. ursa_ai-0.0.3/PKG-INFO +0 -7
  30. ursa_ai-0.0.3/README.md +0 -0
  31. ursa_ai-0.0.3/pyproject.toml +0 -33
  32. ursa_ai-0.0.3/src/ursa/__init__.py +0 -2
  33. ursa_ai-0.0.3/src/ursa/py.typed +0 -0
  34. ursa_ai-0.0.3/src/ursa_ai.egg-info/PKG-INFO +0 -7
  35. ursa_ai-0.0.3/src/ursa_ai.egg-info/SOURCES.txt +0 -8
  36. {ursa_ai-0.0.3 → ursa_ai-0.2.2}/setup.cfg +0 -0
  37. {ursa_ai-0.0.3 → ursa_ai-0.2.2}/src/ursa_ai.egg-info/dependency_links.txt +0 -0
  38. {ursa_ai-0.0.3 → ursa_ai-0.2.2}/src/ursa_ai.egg-info/top_level.txt +0 -0
ursa_ai-0.2.2/LICENSE ADDED
@@ -0,0 +1,8 @@
1
+ This program is Open-Source under the BSD-3 License.
2
+
3
+ Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
4
+ Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
5
+ Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
6
+ Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
7
+
8
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ursa_ai-0.2.2/PKG-INFO ADDED
@@ -0,0 +1,130 @@
1
+ Metadata-Version: 2.4
2
+ Name: ursa-ai
3
+ Version: 0.2.2
4
+ Summary: Agents for science at LANL
5
+ Author-email: Mike Grosskopf <mikegros@lanl.gov>, Rahul Somasundaram <rsomasundaram@lanl.gov>, Arthur Lui <alui@lanl.gov>
6
+ Project-URL: Homepage, https://github.com/lanl/ursa
7
+ Project-URL: Documentation, https://github.com/lanl/ursa/tree/main/docs
8
+ Project-URL: Repository, https://github.com/lanl/ursa
9
+ Project-URL: Issues, https://github.com/lanl/ursa/issues
10
+ Classifier: Operating System :: OS Independent
11
+ Classifier: License :: OSI Approved :: BSD License
12
+ Classifier: Programming Language :: Python :: 3.10
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Programming Language :: Python :: 3.13
16
+ Classifier: Programming Language :: Python :: 3.14
17
+ Requires-Python: >=3.10
18
+ Description-Content-Type: text/markdown
19
+ License-File: LICENSE
20
+ Requires-Dist: arxiv<3.0,>=2.2.0
21
+ Requires-Dist: beautifulsoup4<5.0,>=4.13.4
22
+ Requires-Dist: coolname<3.0,>=2.2.0
23
+ Requires-Dist: langchain<0.4,>=0.3.22
24
+ Requires-Dist: langchain-community<0.4,>=0.3.20
25
+ Requires-Dist: langchain-litellm>=0.2.2
26
+ Requires-Dist: langchain-openai<0.4,>=0.3.12
27
+ Requires-Dist: langgraph>=0.5
28
+ Requires-Dist: pandas<3.0,>=2.2.3
29
+ Requires-Dist: pillow>=11.2.1
30
+ Requires-Dist: pymupdf<2.0,>=1.26.0
31
+ Requires-Dist: pypdf<6.0,>=5.4.0
32
+ Requires-Dist: rich<14.0,>=13.7.0
33
+ Requires-Dist: langchain-chroma>=0.2.4
34
+ Requires-Dist: chromadb>=1.0.15
35
+ Requires-Dist: mp-api>=0.45.8
36
+ Requires-Dist: langchain-google-genai>=2.1.9
37
+ Requires-Dist: langchain-anthropic>=0.3.18
38
+ Requires-Dist: langgraph-checkpoint-sqlite>=2.0.10
39
+ Requires-Dist: duckduckgo-search>=8.1.1
40
+ Requires-Dist: langchain-ollama>=0.3.6
41
+ Dynamic: license-file
42
+
43
+ # URSA - The Universal Research and Scientific Agent
44
+
45
+ <img src="./logos/logo.png" alt="URSA Logo" width="200" height="200">
46
+
47
+ [![PyPI Version][pypi-version]](https://pypi.org/project/ursa-ai/)
48
+ [![PyPI Downloads][total-downloads]](https://pepy.tech/projects/ursa-ai)
49
+
50
+ The flexible agentic workflow for accelerating scientific tasks.
51
+ Composes information flow between agents for planning, code writing and execution, and online research to solve complex problems.
52
+
53
+ ## Installation
54
+ You can install `ursa` via `pip` or `uv`.
55
+
56
+ **pip**
57
+ ```bash
58
+ pip install ursa-ai
59
+ ```
60
+
61
+ **uv**
62
+ ```bash
63
+ uv add ursa-ai
64
+ ```
65
+
66
+ ## How to use this code
67
+ Better documentation will be incoming, but for now there are examples in the examples folder that should give
68
+ a decent idea for how to set up some basic problems. They also should give some idea of how to pass results from
69
+ one agent to another. I will look to add things with multi-agent graphs, etc. in the future.
70
+
71
+ Documentation for each URSA agent:
72
+ - [Planning Agent](docs/planning_agent.md)
73
+ - [Execution Agent](docs/execution_agent.md)
74
+ - [ArXiv Agent](docs/arxiv_agent.md)
75
+ - [Web Search Agent](docs/web_search_agent.md)
76
+ - [Hypothesizer Agent](docs/hypothesizer_agent.md)
77
+
78
+ Documentation for combining agents:
79
+ - [ArXiv -> Execution for Materials](docs/combining_arxiv_and_execution.md)
80
+ - [ArXiv -> Execution for Neutron Star Properties](docs/combining_arxiv_and_execution_neutronStar.md)
81
+
82
+ # Sandboxing
83
+ The Execution Agent is allowed to run system commands and write/run code. Being able to execute arbitrary system commands or write
84
+ and execute code has the potential to cause problems like:
85
+ - Damage code or data on the computer
86
+ - Damage the computer
87
+ - Transmit your local data
88
+
89
+ The Web Search Agent scrapes data from URLs, so it has the potential to pull information from questionable sources.
90
+
91
+ Some suggestions for sandboxing the agent:
92
+ - Creating a specific environment that limits URSA's access to only what you want. Examples:
93
+ - Creating/using a virtual machine that is sandboxed from the rest of your machine
94
+ - Creating a new account on your machine specifically for URSA
95
+ - Creating a network blacklist/whitelist to ensure that network commands and webscraping are contained to safe sources
96
+
97
+ You are responsible for ensuring that you use URSA responsibly.
98
+
99
+ ## Development Dependencies
100
+
101
+ * [`uv`](https://docs.astral.sh/uv/)
102
+ * `uv` is an extremely fast python package and project manager, written in Rust.
103
+ Follow installation instructions
104
+ [here](https://docs.astral.sh/uv/getting-started/installation/)
105
+
106
+ * [`ruff`](https://docs.astral.sh/ruff/)
107
+ * An extremely fast Python linter and code formatter, written in Rust.
108
+ * After installing `uv`, you can install ruff with `uv tool install ruff`
109
+
110
+ * [`just`](https://github.com/casey/just)
111
+ * A modern way to save and run project-specific commands
112
+ * After installing `uv`, you can install just with `uv tool install rust-just`
113
+
114
+ ## Development Team
115
+
116
+ URSA has been developed at Los Alamos National Laboratory as part of the ArtIMis project.
117
+
118
+ <img src="./logos/artimis.png" alt="ArtIMis Logo" width="200" height="200">
119
+
120
+ ### Notice of Copyright Assertion (O4958):
121
+ *This program is Open-Source under the BSD-3 License.
122
+ Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:*
123
+ - *Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.*
124
+ - *Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.*
125
+ - *Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.*
126
+
127
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
128
+
129
+ [pypi-version]: https://img.shields.io/pypi/v/ursa-ai?style=flat-square&label=PyPI
130
+ [total-downloads]: https://img.shields.io/pepy/dt/ursa-ai?style=flat-square&label=downloads&color=blue
@@ -0,0 +1,88 @@
1
+ # URSA - The Universal Research and Scientific Agent
2
+
3
+ <img src="./logos/logo.png" alt="URSA Logo" width="200" height="200">
4
+
5
+ [![PyPI Version][pypi-version]](https://pypi.org/project/ursa-ai/)
6
+ [![PyPI Downloads][total-downloads]](https://pepy.tech/projects/ursa-ai)
7
+
8
+ The flexible agentic workflow for accelerating scientific tasks.
9
+ Composes information flow between agents for planning, code writing and execution, and online research to solve complex problems.
10
+
11
+ ## Installation
12
+ You can install `ursa` via `pip` or `uv`.
13
+
14
+ **pip**
15
+ ```bash
16
+ pip install ursa-ai
17
+ ```
18
+
19
+ **uv**
20
+ ```bash
21
+ uv add ursa-ai
22
+ ```
23
+
24
+ ## How to use this code
25
+ Better documentation will be incoming, but for now there are examples in the examples folder that should give
26
+ a decent idea for how to set up some basic problems. They also should give some idea of how to pass results from
27
+ one agent to another. I will look to add things with multi-agent graphs, etc. in the future.
28
+
29
+ Documentation for each URSA agent:
30
+ - [Planning Agent](docs/planning_agent.md)
31
+ - [Execution Agent](docs/execution_agent.md)
32
+ - [ArXiv Agent](docs/arxiv_agent.md)
33
+ - [Web Search Agent](docs/web_search_agent.md)
34
+ - [Hypothesizer Agent](docs/hypothesizer_agent.md)
35
+
36
+ Documentation for combining agents:
37
+ - [ArXiv -> Execution for Materials](docs/combining_arxiv_and_execution.md)
38
+ - [ArXiv -> Execution for Neutron Star Properties](docs/combining_arxiv_and_execution_neutronStar.md)
39
+
40
+ # Sandboxing
41
+ The Execution Agent is allowed to run system commands and write/run code. Being able to execute arbitrary system commands or write
42
+ and execute code has the potential to cause problems like:
43
+ - Damage code or data on the computer
44
+ - Damage the computer
45
+ - Transmit your local data
46
+
47
+ The Web Search Agent scrapes data from URLs, so it has the potential to pull information from questionable sources.
48
+
49
+ Some suggestions for sandboxing the agent:
50
+ - Creating a specific environment that limits URSA's access to only what you want. Examples:
51
+ - Creating/using a virtual machine that is sandboxed from the rest of your machine
52
+ - Creating a new account on your machine specifically for URSA
53
+ - Creating a network blacklist/whitelist to ensure that network commands and webscraping are contained to safe sources
54
+
55
+ You are responsible for ensuring that you use URSA responsibly.
56
+
57
+ ## Development Dependencies
58
+
59
+ * [`uv`](https://docs.astral.sh/uv/)
60
+ * `uv` is an extremely fast python package and project manager, written in Rust.
61
+ Follow installation instructions
62
+ [here](https://docs.astral.sh/uv/getting-started/installation/)
63
+
64
+ * [`ruff`](https://docs.astral.sh/ruff/)
65
+ * An extremely fast Python linter and code formatter, written in Rust.
66
+ * After installing `uv`, you can install ruff with `uv tool install ruff`
67
+
68
+ * [`just`](https://github.com/casey/just)
69
+ * A modern way to save and run project-specific commands
70
+ * After installing `uv`, you can install just with `uv tool install rust-just`
71
+
72
+ ## Development Team
73
+
74
+ URSA has been developed at Los Alamos National Laboratory as part of the ArtIMis project.
75
+
76
+ <img src="./logos/artimis.png" alt="ArtIMis Logo" width="200" height="200">
77
+
78
+ ### Notice of Copyright Assertion (O4958):
79
+ *This program is Open-Source under the BSD-3 License.
80
+ Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:*
81
+ - *Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.*
82
+ - *Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.*
83
+ - *Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.*
84
+
85
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
86
+
87
+ [pypi-version]: https://img.shields.io/pypi/v/ursa-ai?style=flat-square&label=PyPI
88
+ [total-downloads]: https://img.shields.io/pepy/dt/ursa-ai?style=flat-square&label=downloads&color=blue
@@ -0,0 +1,77 @@
1
+ [project]
2
+ name = "ursa-ai"
3
+ dynamic = ["version"]
4
+ description = "Agents for science at LANL"
5
+ readme = "README.md"
6
+ authors = [
7
+ { name = "Mike Grosskopf", email = "mikegros@lanl.gov" },
8
+ { name = "Rahul Somasundaram", email = "rsomasundaram@lanl.gov" },
9
+ { name = "Arthur Lui", email = "alui@lanl.gov" }
10
+ ]
11
+ requires-python = ">=3.10"
12
+ dependencies = [
13
+ "arxiv>=2.2.0,<3.0",
14
+ "beautifulsoup4>=4.13.4,<5.0",
15
+ "coolname>=2.2.0,<3.0",
16
+ "langchain>=0.3.22,<0.4",
17
+ "langchain-community>=0.3.20,<0.4",
18
+ "langchain-litellm>=0.2.2",
19
+ "langchain-openai>=0.3.12,<0.4",
20
+ "langgraph>=0.5",
21
+ "pandas>=2.2.3,<3.0",
22
+ "pillow>=11.2.1",
23
+ "pymupdf>=1.26.0,<2.0",
24
+ "pypdf>=5.4.0,<6.0",
25
+ "rich>=13.7.0,<14.0",
26
+ "langchain-chroma>=0.2.4",
27
+ "chromadb>=1.0.15",
28
+ "mp-api>=0.45.8",
29
+ "langchain-google-genai>=2.1.9",
30
+ "langchain-anthropic>=0.3.18",
31
+ "langgraph-checkpoint-sqlite>=2.0.10",
32
+ "duckduckgo-search>=8.1.1",
33
+ "langchain-ollama>=0.3.6",
34
+ ]
35
+ classifiers = [
36
+ "Operating System :: OS Independent",
37
+ "License :: OSI Approved :: BSD License",
38
+ "Programming Language :: Python :: 3.10",
39
+ "Programming Language :: Python :: 3.11",
40
+ "Programming Language :: Python :: 3.12",
41
+ "Programming Language :: Python :: 3.13",
42
+ "Programming Language :: Python :: 3.14",
43
+ ]
44
+
45
+ [project.urls]
46
+ Homepage = "https://github.com/lanl/ursa"
47
+ Documentation = "https://github.com/lanl/ursa/tree/main/docs"
48
+ Repository = "https://github.com/lanl/ursa"
49
+ Issues = "https://github.com/lanl/ursa/issues"
50
+
51
+ [build-system]
52
+ requires = ["setuptools>=74.1", "setuptools-git-versioning>=2.0,<3"]
53
+ build-backend = "setuptools.build_meta"
54
+
55
+ [tool.setuptools-git-versioning]
56
+ enabled = true
57
+
58
+ [tool.ruff]
59
+ line-length = 80
60
+
61
+ [tool.ruff.lint]
62
+ ignore = ["D100"]
63
+ extend-select = ["I", "W505"] # "D"
64
+ extend-unsafe-fixes = ["F401"]
65
+ pydocstyle.convention = "numpy"
66
+ pycodestyle.max-doc-length = 80
67
+
68
+ # Ignore test file documentation linting.
69
+ [tool.ruff.lint.extend-per-file-ignores]
70
+ "tests/**/*.py" = ["D"]
71
+
72
+ [dependency-groups]
73
+ dev = [
74
+ "langgraph-checkpoint-sqlite>=2.0.10",
75
+ "notebook>=7.3.3",
76
+ "scikit-optimize>=0.10.2",
77
+ ]
@@ -0,0 +1,10 @@
1
+ from .planning_agent import PlanningAgent, PlanningState
2
+ from .websearch_agent import WebSearchAgent, WebSearchState
3
+ from .execution_agent import ExecutionAgent, ExecutionState
4
+ from .code_review_agent import CodeReviewAgent, CodeReviewState
5
+ from .hypothesizer_agent import HypothesizerAgent, HypothesizerState
6
+ from .arxiv_agent import ArxivAgent, PaperState, PaperMetadata
7
+ from .recall_agent import RecallAgent
8
+ from .base import BaseAgent, BaseChatModel
9
+ from .mp_agent import MaterialsProjectAgent
10
+
@@ -0,0 +1,349 @@
1
+ import os
2
+ import pymupdf
3
+ import requests
4
+ import feedparser
5
+ from PIL import Image
6
+ from io import BytesIO
7
+ import base64
8
+ from urllib.parse import quote
9
+ from typing_extensions import TypedDict, List
10
+ from concurrent.futures import ThreadPoolExecutor, as_completed
11
+ from tqdm import tqdm
12
+ import statistics
13
+ import re
14
+
15
+ from langchain_community.document_loaders import PyPDFLoader
16
+ from langchain_core.output_parsers import StrOutputParser
17
+ from langchain_core.prompts import ChatPromptTemplate
18
+ from langgraph.graph import StateGraph, END, START
19
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
20
+ from langchain_chroma import Chroma
21
+
22
+ from .base import BaseAgent
23
+
24
# ``openai`` is an optional dependency that is only needed for image
# description. When it is missing the name ``OpenAI`` is deliberately left
# undefined: ``describe_image`` checks for its presence in the module globals.
try:
    from openai import OpenAI
except ImportError:
    # Catch only import failures so that KeyboardInterrupt / SystemExit are
    # not silently swallowed the way a bare ``except:`` would.
    pass
28
+
29
+ # embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
30
+ # embeddings = OpenAIEmbeddings()
31
+
32
class PaperMetadata(TypedDict):
    """Record describing one paper found in the local PDF database."""

    # PDF file name with the ".pdf" suffix removed (see
    # ``ArxivAgent._fetch_papers``).
    arxiv_id: str
    # Text extracted from the PDF, optionally followed by image descriptions.
    # It is an empty string when extraction was skipped and an
    # "Error loading paper: ..." message when extraction failed.
    full_text: str
35
+
36
class PaperState(TypedDict, total=False):
    """State passed between the nodes of the ``ArxivAgent`` graph.

    Declared with ``total=False``, so every key is optional; nodes add keys
    as the graph progresses.
    """

    # Search string sent to the arXiv API.
    query: str
    # Task description inserted into the summarization prompts.
    context: str
    # Papers produced by the fetch node.
    papers: List[PaperMetadata]
    # One summary per entry of ``papers``; the summarize node sets this to
    # None when no papers were retrieved.
    summaries: List[str]
    # Aggregated answer produced by the aggregate node.
    final_summary: str
42
+
43
+
44
def describe_image(image: Image.Image) -> str:
    """Describe a scientific image or plot using the OpenAI chat API.

    Parameters
    ----------
    image : PIL.Image.Image
        Image to describe. It is re-encoded as PNG and sent inline as a
        base64 ``data:`` URL.

    Returns
    -------
    str
        The model's description with surrounding whitespace stripped. An
        empty string is returned when the ``openai`` package could not be
        imported or when the response carries no text content.
    """
    # ``OpenAI`` only exists in the module namespace when the optional import
    # at the top of the file succeeded.
    client_factory = globals().get("OpenAI")
    if client_factory is None:
        print("Vision transformer for summarizing images currently only implemented for OpenAI API.")
        return ""
    client = client_factory()

    buffered = BytesIO()
    image.save(buffered, format="PNG")
    img_base64 = base64.b64encode(buffered.getvalue()).decode()

    response = client.chat.completions.create(
        model="gpt-4-vision-preview",
        messages=[
            {
                "role": "system",
                "content": "You are a scientific assistant who explains plots and scientific diagrams.",
            },
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Describe this scientific image or plot in detail."},
                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_base64}"}},
                ],
            },
        ],
        max_tokens=500,
    )

    # ``message.content`` is optional in the API response; guard against None
    # instead of failing with AttributeError on ``.strip()``.
    content = response.choices[0].message.content
    return content.strip() if content else ""
69
+
70
+
71
def extract_and_describe_images(pdf_path: str, max_images: int = 5) -> List[str]:
    """Describe up to ``max_images`` images embedded in a PDF.

    Parameters
    ----------
    pdf_path : str
        Path of the PDF file to open with PyMuPDF.
    max_images : int, optional
        Upper bound on the number of images processed across all pages.

    Returns
    -------
    List[str]
        One entry per processed image, formatted as
        ``"Page <p>, Image <i>: <description>"``. When an image cannot be
        extracted, decoded or described, the entry holds ``[Error: ...]``
        in place of the description.
    """
    descriptions = []

    doc = pymupdf.open(pdf_path)
    try:
        for page_index in range(len(doc)):
            if len(descriptions) >= max_images:
                break
            images = doc[page_index].get_images(full=True)

            for img_index, img in enumerate(images):
                if len(descriptions) >= max_images:
                    break
                label = f"Page {page_index + 1}, Image {img_index + 1}"

                # Extraction and decoding are inside the ``try`` so that one
                # unreadable image is reported in place instead of aborting
                # the processing of the whole document.
                try:
                    xref = img[0]
                    image_bytes = doc.extract_image(xref)["image"]
                    image = Image.open(BytesIO(image_bytes))
                    desc = describe_image(image)
                    descriptions.append(f"{label}: {desc}")
                except Exception as e:
                    descriptions.append(f"{label}: [Error: {e}]")
    finally:
        # Release the file handle even if page iteration fails.
        doc.close()

    return descriptions
98
+
99
+
100
def remove_surrogates(text: str) -> str:
    """Return ``text`` with every code point in U+D800..U+DFFF removed."""
    surrogate_pattern = re.compile(r'[\ud800-\udfff]')
    return surrogate_pattern.sub('', text)
102
+
103
+
104
class ArxivAgent(BaseAgent):
    """Fetch arXiv papers, summarize each one and aggregate the summaries.

    The agent is a small graph: ``fetch_papers`` -> ``summarize_each`` ->
    ``aggregate``. When ``summarize`` is false only ``fetch_papers`` runs.

    Parameters
    ----------
    llm
        Model specification forwarded unchanged to ``BaseAgent.__init__``.
    summarize : bool, optional
        Whether to extract text, summarize every paper and aggregate.
    process_images : bool, optional
        Whether to append descriptions of embedded images to each paper text.
    max_results : int, optional
        Maximum number of search results requested from the arXiv API.
    download_papers : bool, optional
        Whether to query arXiv and download PDFs that are not stored yet.
        When false, only PDFs already in ``database_path`` are used.
    rag_embedding : optional
        Embedding function handed to Chroma. When falsy, the whole paper
        text is summarized instead of the retrieved chunks.
    database_path : str, optional
        Directory holding the downloaded PDFs.
    summaries_path : str, optional
        Directory where per-paper and combined summaries are written.
    vectorstore_path : str, optional
        Directory holding one persisted Chroma store per paper.
    **kwargs
        Forwarded unchanged to ``BaseAgent.__init__``.
    """

    # Seconds to wait for the PDF server before abandoning a download.
    _DOWNLOAD_TIMEOUT = 60

    def __init__(self,
                 llm="openai/o3-mini",
                 summarize: bool = True,
                 process_images=True,
                 max_results: int = 3,
                 download_papers: bool = True,
                 rag_embedding=None,
                 database_path='arxiv_papers',
                 summaries_path='arxiv_generated_summaries',
                 vectorstore_path='arxiv_vectorstores', **kwargs):
        super().__init__(llm, **kwargs)
        self.summarize = summarize
        self.process_images = process_images
        self.max_results = max_results
        self.database_path = database_path
        self.summaries_path = summaries_path
        self.vectorstore_path = vectorstore_path
        self.download_papers = download_papers
        self.rag_embedding = rag_embedding

        self.graph = self._build_graph()

        os.makedirs(self.database_path, exist_ok=True)
        os.makedirs(self.summaries_path, exist_ok=True)

    def _download_new_papers(self, query: str) -> None:
        """Query arXiv and download every result that is not stored yet."""
        encoded_query = quote(query)
        url = f"http://export.arxiv.org/api/query?search_query=all:{encoded_query}&start=0&max_results={self.max_results}"
        feed = feedparser.parse(url)

        for i, entry in enumerate(feed.entries):
            full_id = entry.id.split('/abs/')[-1]
            # Drop any category prefix so the ID is usable as a file name.
            arxiv_id = full_id.split('/')[-1]
            title = entry.title.strip()
            pdf_url = f"https://arxiv.org/pdf/{full_id}.pdf"
            pdf_filename = os.path.join(self.database_path, f"{arxiv_id}.pdf")

            if os.path.exists(pdf_filename):
                print(f"Paper # {i+1}, Title: {title}, already exists in database")
                continue

            print(f"Downloading paper # {i+1}, Title: {title}")
            try:
                response = requests.get(pdf_url, timeout=self._DOWNLOAD_TIMEOUT)
                response.raise_for_status()
            except requests.RequestException as e:
                # Do not store an error page as a ".pdf"; leaving the file
                # absent lets a later run retry the download.
                print(f"Failed to download paper # {i+1}, Title: {title}: {e}")
                continue

            with open(pdf_filename, 'wb') as f:
                f.write(response.content)

    def _load_paper_text(self, pdf_path: str) -> str:
        """Return the text of a PDF, or an error message if loading fails."""
        try:
            pages = PyPDFLoader(pdf_path).load()
            full_text = "\n".join([p.page_content for p in pages])

            if self.process_images:
                image_descriptions = extract_and_describe_images(pdf_path)
                full_text += "\n\n[Image Interpretations]\n" + "\n".join(image_descriptions)
        except Exception as e:
            full_text = f"Error loading paper: {e}"
        return full_text

    def _fetch_papers(self, query: str) -> List[PaperMetadata]:
        """Download new papers if enabled and load every PDF in the database.

        Parameters
        ----------
        query : str
            Search string for the arXiv API; only used when
            ``download_papers`` is true.

        Returns
        -------
        List[PaperMetadata]
            One record per ``.pdf`` file in ``database_path`` (not only the
            papers matching ``query``). ``full_text`` is empty when
            ``summarize`` is false or a vector store directory for the paper
            already exists.
        """
        if self.download_papers:
            self._download_new_papers(query)

        papers = []
        pdf_files = [f for f in os.listdir(self.database_path) if f.lower().endswith(".pdf")]

        for pdf_filename in pdf_files:
            full_text = ""
            # splitext handles any capitalization of the ".pdf" suffix,
            # matching the case-insensitive filter above.
            arxiv_id = os.path.splitext(pdf_filename)[0]
            vec_save_loc = os.path.join(self.vectorstore_path, arxiv_id)

            # Text extraction is skipped when a vector store directory for
            # this paper already exists.
            if self.summarize and not os.path.exists(vec_save_loc):
                full_text = self._load_paper_text(os.path.join(self.database_path, pdf_filename))

            papers.append({
                "arxiv_id": arxiv_id,
                "full_text": full_text,
            })

        return papers

    def _fetch_node(self, state: PaperState) -> PaperState:
        """Graph node: store the fetched papers under ``state["papers"]``."""
        papers = self._fetch_papers(state["query"])
        return {**state, "papers": papers}

    def _get_or_build_vectorstore(self, paper_text: str, arxiv_id: str):
        """Return a retriever (k=5) over the Chroma store of one paper.

        An existing persisted store under ``vectorstore_path/<arxiv_id>`` is
        reused; otherwise ``paper_text`` is split into 1000-character chunks
        with 200 characters of overlap and a new store is created there.
        """
        os.makedirs(self.vectorstore_path, exist_ok=True)

        persist_directory = os.path.join(self.vectorstore_path, arxiv_id)

        if os.path.exists(persist_directory):
            vectorstore = Chroma(persist_directory=persist_directory, embedding_function=self.rag_embedding)
        else:
            splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
            docs = splitter.create_documents([paper_text])
            vectorstore = Chroma.from_documents(docs, self.rag_embedding, persist_directory=persist_directory)

        return vectorstore.as_retriever(search_kwargs={"k": 5})

    def _summarize_node(self, state: PaperState) -> PaperState:
        """Graph node: summarize every paper in parallel.

        Each summary is written to ``<summaries_path>/<arxiv_id>_summary.txt``
        and collected, in paper order, under ``state["summaries"]``. When no
        papers are available ``summaries`` is set to ``None``.
        """
        # Check for papers before anything indexes into the state.
        papers = state.get("papers")
        if not papers:
            print("No papers retrieved - bad query or network connection to ArXiv?")
            return {**state, "summaries": None}

        prompt = ChatPromptTemplate.from_template("""
        You are a scientific assistant responsible for summarizing extracts from research papers, in the context of the following task: {context}

        Summarize the retrieved scientific content below.

        {retrieved_content}
        """)

        chain = prompt | self.llm | StrOutputParser()

        summaries = [None] * len(papers)
        relevancy_scores = [0.0] * len(papers)

        def process_paper(i, paper):
            arxiv_id = paper["arxiv_id"]
            summary_filename = os.path.join(self.summaries_path, f"{arxiv_id}_summary.txt")

            try:
                cleaned_text = remove_surrogates(paper["full_text"])
                if self.rag_embedding:
                    retriever = self._get_or_build_vectorstore(cleaned_text, arxiv_id)

                    relevant_docs_with_scores = retriever.vectorstore.similarity_search_with_score(state["context"], k=5)

                    if relevant_docs_with_scores:
                        # Relevancy is |1 - mean score| over retrieved chunks.
                        score = sum(s for _, s in relevant_docs_with_scores) / len(relevant_docs_with_scores)
                        relevancy_scores[i] = abs(1.0 - score)
                    else:
                        relevancy_scores[i] = 0.0

                    retrieved_content = "\n\n".join([doc.page_content for doc, _ in relevant_docs_with_scores])
                else:
                    retrieved_content = cleaned_text

                summary = chain.invoke({"retrieved_content": retrieved_content, "context": state["context"]})

            except Exception as e:
                summary = f"Error summarizing paper: {e}"
                relevancy_scores[i] = 0.0

            # Explicit encoding: summaries may contain non-ASCII characters
            # that the platform default encoding cannot represent.
            with open(summary_filename, "w", encoding="utf-8") as f:
                f.write(summary)

            return i, summary

        with ThreadPoolExecutor(max_workers=min(32, len(papers))) as executor:
            futures = [executor.submit(process_paper, i, paper) for i, paper in enumerate(papers)]

            for future in tqdm(as_completed(futures), total=len(futures), desc="Summarizing Papers"):
                i, result = future.result()
                summaries[i] = result

        if self.rag_embedding:
            print(f"\nMax Relevancy Score: {max(relevancy_scores)}")
            print(f"Min Relevancy Score: {min(relevancy_scores)}")
            print(f"Median Relevancy Score: {statistics.median(relevancy_scores)}\n")

        return {**state, "summaries": summaries}

    def _aggregate_node(self, state: PaperState) -> PaperState:
        """Graph node: combine the per-paper summaries into one answer.

        Writes ``summaries_combined.txt`` and ``final_summary.txt`` into
        ``summaries_path``. ``final_summary`` is ``None`` when there are no
        summaries or no papers to aggregate.
        """
        # Use .get so a missing key reaches the guard instead of raising
        # KeyError before it.
        summaries = state.get("summaries")
        papers = state.get("papers")
        if summaries is None or papers is None:
            return {**state, "final_summary": None}

        formatted = []
        for i, (paper, summary) in enumerate(zip(papers, summaries)):
            citation = f"[{i+1}] Arxiv ID: {paper['arxiv_id']}"
            formatted.append(f"{citation}\n\nSummary:\n{summary}")

        combined = "\n\n" + ("\n\n" + "-" * 40 + "\n\n").join(formatted)

        with open(os.path.join(self.summaries_path, 'summaries_combined.txt'), "w", encoding="utf-8") as f:
            f.write(combined)

        prompt = ChatPromptTemplate.from_template("""
        You are a scientific assistant helping extract insights from summaries of research papers.

        Here are the summaries of a large number of extracts from scientific papers:

        {Summaries}

        Your task is to read all the summaries and provide a response to this task: {context}
        """)

        chain = prompt | self.llm | StrOutputParser()

        final_summary = chain.invoke({"Summaries": combined, "context": state["context"]})

        with open(os.path.join(self.summaries_path, 'final_summary.txt'), "w", encoding="utf-8") as f:
            f.write(final_summary)

        return {**state, "final_summary": final_summary}

    def _build_graph(self):
        """Compile the state graph; summarization nodes are optional."""
        builder = StateGraph(PaperState)
        builder.add_node("fetch_papers", self._fetch_node)

        if self.summarize:
            builder.add_node("summarize_each", self._summarize_node)
            builder.add_node("aggregate", self._aggregate_node)

            builder.set_entry_point("fetch_papers")
            builder.add_edge("fetch_papers", "summarize_each")
            builder.add_edge("summarize_each", "aggregate")
            builder.set_finish_point("aggregate")

        else:
            builder.set_entry_point("fetch_papers")
            builder.set_finish_point("fetch_papers")

        graph = builder.compile()
        return graph

    def run(self, arxiv_search_query: str, context: str) -> str:
        """Run the graph and return its textual result.

        Parameters
        ----------
        arxiv_search_query : str
            Search string for the arXiv API.
        context : str
            Task description used in the summarization prompts.

        Returns
        -------
        str
            The aggregated summary, ``"No summary generated."`` when none
            was produced, or a completion notice when ``summarize`` is false.
        """
        result = self.graph.invoke({"query": arxiv_search_query, "context": context})

        if self.summarize:
            # ``final_summary`` can be present but None (no papers found);
            # fall back to the message so a string is always returned.
            return result.get("final_summary") or "No summary generated."
        return "\n\nFinished Fetching papers!"
339
+
340
+
341
+
342
if __name__ == "__main__":
    # Manual demo: run the agent with its default settings on one query.
    demo_query = "Experimental Constraints on neutron star radius"
    demo_context = "What are the constraints on the neutron star radius and what uncertainties are there on the constraints?"

    print(ArxivAgent().run(arxiv_search_query=demo_query, context=demo_context))
348
+
349
+