tooluniverse 0.1.4__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of tooluniverse might be problematic. Click here for more details.
- tooluniverse/__init__.py +340 -4
- tooluniverse/admetai_tool.py +84 -0
- tooluniverse/agentic_tool.py +563 -0
- tooluniverse/alphafold_tool.py +96 -0
- tooluniverse/base_tool.py +129 -6
- tooluniverse/boltz_tool.py +207 -0
- tooluniverse/chem_tool.py +192 -0
- tooluniverse/compose_scripts/__init__.py +1 -0
- tooluniverse/compose_scripts/biomarker_discovery.py +293 -0
- tooluniverse/compose_scripts/comprehensive_drug_discovery.py +186 -0
- tooluniverse/compose_scripts/drug_safety_analyzer.py +89 -0
- tooluniverse/compose_scripts/literature_tool.py +34 -0
- tooluniverse/compose_scripts/output_summarizer.py +279 -0
- tooluniverse/compose_scripts/tool_description_optimizer.py +681 -0
- tooluniverse/compose_scripts/tool_discover.py +705 -0
- tooluniverse/compose_scripts/tool_graph_composer.py +448 -0
- tooluniverse/compose_tool.py +371 -0
- tooluniverse/ctg_tool.py +1002 -0
- tooluniverse/custom_tool.py +81 -0
- tooluniverse/dailymed_tool.py +108 -0
- tooluniverse/data/admetai_tools.json +155 -0
- tooluniverse/data/agentic_tools.json +1156 -0
- tooluniverse/data/alphafold_tools.json +87 -0
- tooluniverse/data/boltz_tools.json +9 -0
- tooluniverse/data/chembl_tools.json +16 -0
- tooluniverse/data/clait_tools.json +108 -0
- tooluniverse/data/clinicaltrials_gov_tools.json +326 -0
- tooluniverse/data/compose_tools.json +202 -0
- tooluniverse/data/dailymed_tools.json +70 -0
- tooluniverse/data/dataset_tools.json +646 -0
- tooluniverse/data/disease_target_score_tools.json +712 -0
- tooluniverse/data/efo_tools.json +17 -0
- tooluniverse/data/embedding_tools.json +319 -0
- tooluniverse/data/enrichr_tools.json +31 -0
- tooluniverse/data/europe_pmc_tools.json +22 -0
- tooluniverse/data/expert_feedback_tools.json +10 -0
- tooluniverse/data/fda_drug_adverse_event_tools.json +491 -0
- tooluniverse/data/fda_drug_labeling_tools.json +544 -168
- tooluniverse/data/fda_drugs_with_brand_generic_names_for_tool.py +76929 -148860
- tooluniverse/data/finder_tools.json +209 -0
- tooluniverse/data/gene_ontology_tools.json +113 -0
- tooluniverse/data/gwas_tools.json +1082 -0
- tooluniverse/data/hpa_tools.json +333 -0
- tooluniverse/data/humanbase_tools.json +47 -0
- tooluniverse/data/idmap_tools.json +74 -0
- tooluniverse/data/mcp_client_tools_example.json +113 -0
- tooluniverse/data/mcpautoloadertool_defaults.json +28 -0
- tooluniverse/data/medlineplus_tools.json +141 -0
- tooluniverse/data/monarch_tools.json +1 -1
- tooluniverse/data/openalex_tools.json +36 -0
- tooluniverse/data/opentarget_tools.json +82 -58
- tooluniverse/data/output_summarization_tools.json +101 -0
- tooluniverse/data/packages/bioinformatics_core_tools.json +1756 -0
- tooluniverse/data/packages/categorized_tools.txt +206 -0
- tooluniverse/data/packages/cheminformatics_tools.json +347 -0
- tooluniverse/data/packages/earth_sciences_tools.json +74 -0
- tooluniverse/data/packages/genomics_tools.json +776 -0
- tooluniverse/data/packages/image_processing_tools.json +38 -0
- tooluniverse/data/packages/machine_learning_tools.json +789 -0
- tooluniverse/data/packages/neuroscience_tools.json +62 -0
- tooluniverse/data/packages/original_tools.txt +0 -0
- tooluniverse/data/packages/physics_astronomy_tools.json +62 -0
- tooluniverse/data/packages/scientific_computing_tools.json +560 -0
- tooluniverse/data/packages/single_cell_tools.json +453 -0
- tooluniverse/data/packages/software_tools.json +4954 -0
- tooluniverse/data/packages/structural_biology_tools.json +396 -0
- tooluniverse/data/packages/visualization_tools.json +399 -0
- tooluniverse/data/pubchem_tools.json +215 -0
- tooluniverse/data/pubtator_tools.json +68 -0
- tooluniverse/data/rcsb_pdb_tools.json +1332 -0
- tooluniverse/data/reactome_tools.json +19 -0
- tooluniverse/data/semantic_scholar_tools.json +26 -0
- tooluniverse/data/special_tools.json +2 -25
- tooluniverse/data/tool_composition_tools.json +88 -0
- tooluniverse/data/toolfinderkeyword_defaults.json +34 -0
- tooluniverse/data/txagent_client_tools.json +9 -0
- tooluniverse/data/uniprot_tools.json +211 -0
- tooluniverse/data/url_fetch_tools.json +94 -0
- tooluniverse/data/uspto_downloader_tools.json +9 -0
- tooluniverse/data/uspto_tools.json +811 -0
- tooluniverse/data/xml_tools.json +3275 -0
- tooluniverse/dataset_tool.py +296 -0
- tooluniverse/default_config.py +165 -0
- tooluniverse/efo_tool.py +42 -0
- tooluniverse/embedding_database.py +630 -0
- tooluniverse/embedding_sync.py +396 -0
- tooluniverse/enrichr_tool.py +266 -0
- tooluniverse/europe_pmc_tool.py +52 -0
- tooluniverse/execute_function.py +1775 -95
- tooluniverse/extended_hooks.py +444 -0
- tooluniverse/gene_ontology_tool.py +194 -0
- tooluniverse/graphql_tool.py +158 -36
- tooluniverse/gwas_tool.py +358 -0
- tooluniverse/hpa_tool.py +1645 -0
- tooluniverse/humanbase_tool.py +389 -0
- tooluniverse/logging_config.py +254 -0
- tooluniverse/mcp_client_tool.py +764 -0
- tooluniverse/mcp_integration.py +413 -0
- tooluniverse/mcp_tool_registry.py +925 -0
- tooluniverse/medlineplus_tool.py +337 -0
- tooluniverse/openalex_tool.py +228 -0
- tooluniverse/openfda_adv_tool.py +283 -0
- tooluniverse/openfda_tool.py +393 -160
- tooluniverse/output_hook.py +1122 -0
- tooluniverse/package_tool.py +195 -0
- tooluniverse/pubchem_tool.py +158 -0
- tooluniverse/pubtator_tool.py +168 -0
- tooluniverse/rcsb_pdb_tool.py +38 -0
- tooluniverse/reactome_tool.py +108 -0
- tooluniverse/remote/boltz/boltz_mcp_server.py +50 -0
- tooluniverse/remote/depmap_24q2/depmap_24q2_mcp_tool.py +442 -0
- tooluniverse/remote/expert_feedback/human_expert_mcp_tools.py +2013 -0
- tooluniverse/remote/expert_feedback/simple_test.py +23 -0
- tooluniverse/remote/expert_feedback/start_web_interface.py +188 -0
- tooluniverse/remote/expert_feedback/web_only_interface.py +0 -0
- tooluniverse/remote/expert_feedback_mcp/human_expert_mcp_server.py +1611 -0
- tooluniverse/remote/expert_feedback_mcp/simple_test.py +34 -0
- tooluniverse/remote/expert_feedback_mcp/start_web_interface.py +91 -0
- tooluniverse/remote/immune_compass/compass_tool.py +327 -0
- tooluniverse/remote/pinnacle/pinnacle_tool.py +328 -0
- tooluniverse/remote/transcriptformer/transcriptformer_tool.py +586 -0
- tooluniverse/remote/uspto_downloader/uspto_downloader_mcp_server.py +61 -0
- tooluniverse/remote/uspto_downloader/uspto_downloader_tool.py +120 -0
- tooluniverse/remote_tool.py +99 -0
- tooluniverse/restful_tool.py +53 -30
- tooluniverse/scripts/generate_tool_graph.py +408 -0
- tooluniverse/scripts/visualize_tool_graph.py +829 -0
- tooluniverse/semantic_scholar_tool.py +62 -0
- tooluniverse/smcp.py +2452 -0
- tooluniverse/smcp_server.py +975 -0
- tooluniverse/test/mcp_server_test.py +0 -0
- tooluniverse/test/test_admetai_tool.py +370 -0
- tooluniverse/test/test_agentic_tool.py +129 -0
- tooluniverse/test/test_alphafold_tool.py +71 -0
- tooluniverse/test/test_chem_tool.py +37 -0
- tooluniverse/test/test_compose_lieraturereview.py +63 -0
- tooluniverse/test/test_compose_tool.py +448 -0
- tooluniverse/test/test_dailymed.py +69 -0
- tooluniverse/test/test_dataset_tool.py +200 -0
- tooluniverse/test/test_disease_target_score.py +56 -0
- tooluniverse/test/test_drugbank_filter_examples.py +179 -0
- tooluniverse/test/test_efo.py +31 -0
- tooluniverse/test/test_enrichr_tool.py +21 -0
- tooluniverse/test/test_europe_pmc_tool.py +20 -0
- tooluniverse/test/test_fda_adv.py +95 -0
- tooluniverse/test/test_fda_drug_labeling.py +91 -0
- tooluniverse/test/test_gene_ontology_tools.py +66 -0
- tooluniverse/test/test_gwas_tool.py +139 -0
- tooluniverse/test/test_hpa.py +625 -0
- tooluniverse/test/test_humanbase_tool.py +20 -0
- tooluniverse/test/test_idmap_tools.py +61 -0
- tooluniverse/test/test_mcp_server.py +211 -0
- tooluniverse/test/test_mcp_tool.py +247 -0
- tooluniverse/test/test_medlineplus.py +220 -0
- tooluniverse/test/test_openalex_tool.py +32 -0
- tooluniverse/test/test_opentargets.py +28 -0
- tooluniverse/test/test_pubchem_tool.py +116 -0
- tooluniverse/test/test_pubtator_tool.py +37 -0
- tooluniverse/test/test_rcsb_pdb_tool.py +86 -0
- tooluniverse/test/test_reactome.py +54 -0
- tooluniverse/test/test_semantic_scholar_tool.py +24 -0
- tooluniverse/test/test_software_tools.py +147 -0
- tooluniverse/test/test_tool_description_optimizer.py +49 -0
- tooluniverse/test/test_tool_finder.py +26 -0
- tooluniverse/test/test_tool_finder_llm.py +252 -0
- tooluniverse/test/test_tools_find.py +195 -0
- tooluniverse/test/test_uniprot_tools.py +74 -0
- tooluniverse/test/test_uspto_tool.py +72 -0
- tooluniverse/test/test_xml_tool.py +113 -0
- tooluniverse/tool_finder_embedding.py +267 -0
- tooluniverse/tool_finder_keyword.py +693 -0
- tooluniverse/tool_finder_llm.py +699 -0
- tooluniverse/tool_graph_web_ui.py +955 -0
- tooluniverse/tool_registry.py +416 -0
- tooluniverse/uniprot_tool.py +155 -0
- tooluniverse/url_tool.py +253 -0
- tooluniverse/uspto_tool.py +240 -0
- tooluniverse/utils.py +369 -41
- tooluniverse/xml_tool.py +369 -0
- tooluniverse-1.0.0.dist-info/METADATA +377 -0
- tooluniverse-1.0.0.dist-info/RECORD +186 -0
- {tooluniverse-0.1.4.dist-info → tooluniverse-1.0.0.dist-info}/WHEEL +1 -1
- tooluniverse-1.0.0.dist-info/entry_points.txt +9 -0
- tooluniverse-0.1.4.dist-info/METADATA +0 -141
- tooluniverse-0.1.4.dist-info/RECORD +0 -18
- {tooluniverse-0.1.4.dist-info → tooluniverse-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {tooluniverse-0.1.4.dist-info → tooluniverse-1.0.0.dist-info}/top_level.txt +0 -0
tooluniverse/url_tool.py
ADDED
|
@@ -0,0 +1,253 @@
|
|
|
1
|
+
import requests
|
|
2
|
+
import re
|
|
3
|
+
from .base_tool import BaseTool
|
|
4
|
+
from html import unescape
|
|
5
|
+
from .tool_registry import register_tool
|
|
6
|
+
import io
|
|
7
|
+
import os
|
|
8
|
+
import sys
|
|
9
|
+
import subprocess
|
|
10
|
+
import pdfplumber
|
|
11
|
+
from playwright.sync_api import sync_playwright
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@register_tool("URLHTMLTagTool")
class URLHTMLTagTool(BaseTool):
    """
    Fetch a webpage and extract the content of one HTML tag.

    The tag to extract is read from the tool configuration
    (``fields.tag``, default ``"title"``); the key under which the result
    is returned is read from ``fields.return_key`` (default ``"content"``).

    Expects: {"url": "https://..."}
    Optional: {"timeout": <seconds>} (default 20)
    Returns: {"<return_key>": "<extracted content>"} or {"error": "..."}
    """

    def __init__(self, tool_config):
        super().__init__(tool_config)
        self.tag_to_fetch = tool_config["fields"].get("tag", "title")
        self.return_key = tool_config["fields"].get("return_key", "content")

    def run(self, arguments: dict):
        """
        Download ``arguments["url"]`` and return the first matching tag body.

        Args:
            arguments: Must contain "url" (http/https). May contain
                "timeout" in seconds, passed to ``requests.get``.

        Returns:
            ``{return_key: text}`` with whitespace collapsed and HTML
            entities unescaped, or ``{"error": ...}`` describing the failure.
        """
        url = arguments.get("url")
        if not url:
            return {"error": "Parameter 'url' is required."}

        # Basic validation; the isinstance check turns a non-string URL into
        # an error result instead of an AttributeError.
        if not isinstance(url, str) or not url.startswith(("http://", "https://")):
            return {"error": "URL must start with http:// or https://"}

        timeout = arguments.get("timeout", 20)
        try:
            resp = requests.get(url, timeout=timeout)
        except requests.Timeout:
            return {"error": "Request timed out."}
        except Exception as e:
            return {"error": f"Request failed: {e}"}

        if resp.status_code != 200:
            return {"error": f"HTTP {resp.status_code}", "detail": resp.text[:300]}

        # Accept HTML, and still attempt extraction for any other text/* type.
        ctype = resp.headers.get("Content-Type", "").lower()
        if "html" not in ctype and not ctype.startswith("text/"):
            return {"error": "Response is not HTML."}

        # Match <tag ...>...</tag>. The opening tag may carry attributes
        # (e.g. <title lang="en">), and the tag name is escaped so that it is
        # always treated literally inside the pattern.
        tag = re.escape(self.tag_to_fetch)
        m = re.search(
            rf"<{tag}(?:\s[^>]*)?>(.*?)</{tag}\s*>",
            resp.text,
            flags=re.IGNORECASE | re.DOTALL,
        )
        if not m:
            return {"error": f"No <{self.tag_to_fetch}> tag found."}

        # Collapse whitespace first, then decode entities such as &amp;.
        cleaned = re.sub(r"\s+", " ", m.group(1).strip())
        cleaned = unescape(cleaned)

        return {self.return_key: cleaned}
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
@register_tool("URLToPDFTextTool")
class URLToPDFTextTool(BaseTool):
    """
    Load a webpage (with JavaScript), export it as a PDF, and extract text.

    Expects: {"url": "https://..."}
    Optional: {"timeout": <seconds>} (default 30)
    Returns: {"<return_key>": "<extracted text>"} or {"error": "..."}

    ``<return_key>`` is ``fields.return_key`` from the tool configuration
    (default ``"text"``).
    """

    # Values of PLAYWRIGHT_SKIP_BROWSER_INSTALL that disable auto-install.
    _SKIP_INSTALL_VALUES = ("1", "true", "True")

    def __init__(self, tool_config):
        super().__init__(tool_config)
        self.return_key = tool_config["fields"].get("return_key", "text")

    @staticmethod
    def _running_loop():
        """Return the asyncio loop running in this thread, or None.

        A running loop is what is found when this code is called from
        Colab/Jupyter; in a plain script there is none.
        """
        import asyncio

        try:
            return asyncio.get_running_loop()
        except RuntimeError:
            return None

    @staticmethod
    def _run_on_running_loop(loop, coro_factory):
        """Run ``coro_factory()`` to completion on an already-running loop.

        ``loop.run_until_complete`` refuses to run on a loop that is already
        running unless the loop has been patched by ``nest_asyncio``, so the
        patch is applied first. The coroutine is created only after patching
        succeeded, so no un-awaited coroutine is left behind on failure.
        """
        import nest_asyncio

        nest_asyncio.apply(loop)
        return loop.run_until_complete(coro_factory())

    def _try_launch(self, browser_name, loop):
        """Launch and close one headless browser.

        Uses the async Playwright API when ``loop`` is a running event loop
        and the sync API otherwise.

        Returns:
            ``(True, None)`` on success, ``(False, "<error text>")`` otherwise.
        """
        try:
            if loop is not None:
                from playwright.async_api import async_playwright

                async def launch_async():
                    async with async_playwright() as p:
                        launcher = getattr(p, browser_name)
                        browser = await launcher.launch(headless=True, timeout=10_000)
                        await browser.close()

                self._run_on_running_loop(loop, launch_async)
            else:
                from playwright.sync_api import sync_playwright

                with sync_playwright() as p:
                    launcher = getattr(p, browser_name)
                    browser = launcher.launch(headless=True, timeout=10_000)
                    browser.close()
            return True, None
        except Exception as e:
            return False, str(e)

    def _ensure_playwright_browsers(
        self,
        browsers=("chromium",),
        with_deps: bool = False,
        timeout_seconds: int = 600,
    ):
        """
        Ensure Playwright browser binaries are installed.

        The first entry of ``browsers`` is test-launched. If that fails,
        ``python -m playwright install <browsers>`` is run (unless disabled
        through the PLAYWRIGHT_SKIP_BROWSER_INSTALL environment variable) and
        the launch is tried once more.

        Args:
            browsers: Browser names to install; only the first is probed.
            with_deps: Append ``--with-deps`` to the install command.
            timeout_seconds: Timeout for the install subprocess.

        Returns:
            None on success, or an error string on failure.
        """
        if not browsers:
            return "No Playwright browsers were requested."

        loop = self._running_loop()

        ok, msg = self._try_launch(browsers[0], loop)
        if ok:
            return None  # browsers are already installed

        # Allow the user to skip auto-install via env var. This only disables
        # the install step; a working browser has already returned above.
        if os.environ.get("PLAYWRIGHT_SKIP_BROWSER_INSTALL", "") in self._SKIP_INSTALL_VALUES:
            return (
                "PLAYWRIGHT_SKIP_BROWSER_INSTALL is set; skipping browser install. "
                f"Browser launch failed: {msg}"
            )

        # Attempt install using the same Python executable
        cmd = [sys.executable, "-m", "playwright", "install"] + list(browsers)
        if with_deps:
            cmd.append("--with-deps")

        try:
            subprocess.run(
                cmd, check=True, capture_output=True, text=True, timeout=timeout_seconds
            )
        except subprocess.CalledProcessError as e:
            stdout = e.stdout or ""
            stderr = e.stderr or ""
            return f"playwright install failed (exit {e.returncode}). stdout:\n{stdout}\nstderr:\n{stderr}"
        except Exception as e:
            return f"Failed to run playwright install: {e}"

        # Try launching again after install
        ok2, msg2 = self._try_launch(browsers[0], loop)
        if ok2:
            return None
        return f"Browsers installed but launch still fails: {msg2}"

    def run(self, arguments: dict):
        """
        Render ``arguments["url"]`` to a PDF in headless Chromium and return its text.

        Args:
            arguments: Must contain "url" (http/https). May contain
                "timeout" in seconds for the page navigation.

        Returns:
            ``{return_key: text}`` or ``{"error": ...}``.
        """
        url = arguments.get("url")
        if not url:
            return {"error": "Parameter 'url' is required."}
        if not isinstance(url, str) or not url.startswith(("http://", "https://")):
            return {"error": "URL must start with http:// or https://"}

        timeout = arguments.get("timeout", 30)

        # Ensure browsers are installed (auto-install if needed)
        ensure_error = self._ensure_playwright_browsers(
            browsers=("chromium",), with_deps=False
        )
        if ensure_error is not None:
            return {"error": f"Playwright browser check/install failed: {ensure_error}"}

        # Step 1: Render the page to PDF. Inside an active asyncio event loop
        # (Colab/Jupyter) the async Playwright API is used; otherwise the
        # sync API.
        loop = self._running_loop()
        if loop is not None:

            async def async_pdf():
                from playwright.async_api import async_playwright

                async with async_playwright() as p:
                    browser = await p.chromium.launch(headless=True)
                    try:
                        page = await browser.new_page()
                        await page.goto(
                            url, timeout=timeout * 1000, wait_until="networkidle"
                        )
                        return await page.pdf(format="A4", print_background=True)
                    finally:
                        # Close the browser even when navigation/export fails.
                        await browser.close()

            try:
                pdf_bytes = self._run_on_running_loop(loop, async_pdf)
            except Exception as e:
                return {"error": f"Failed to render webpage to PDF (async): {e}"}
        else:
            try:
                with sync_playwright() as p:
                    browser = p.chromium.launch(headless=True)
                    try:
                        page = browser.new_page()
                        page.goto(url, timeout=timeout * 1000, wait_until="networkidle")
                        pdf_bytes = page.pdf(format="A4", print_background=True)
                    finally:
                        # Close the browser even when navigation/export fails.
                        browser.close()
            except Exception as e:
                return {"error": f"Failed to render webpage to PDF (sync): {e}"}

        # Step 2: Extract text from PDF
        try:
            with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
                page_texts = [page.extract_text() for page in pdf.pages]
            # Pages without extractable text yield None/"" and are skipped.
            text = "\n".join(t for t in page_texts if t).strip()
            if not text:
                return {"error": "No text could be extracted from rendered PDF."}
            return {self.return_key: text}
        except Exception as e:
            return {"error": f"Failed to extract text from PDF: {e}"}
|
|
tooluniverse/uspto_tool.py
ADDED
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
import requests
|
|
2
|
+
import json
|
|
3
|
+
import re
|
|
4
|
+
import os
|
|
5
|
+
from requests.adapters import HTTPAdapter
|
|
6
|
+
from urllib3.util.retry import Retry
|
|
7
|
+
from .base_tool import BaseTool
|
|
8
|
+
from .tool_registry import register_tool
|
|
9
|
+
from dotenv import load_dotenv, find_dotenv
|
|
10
|
+
|
|
11
|
+
# Load variables from a .env file, located by searching upward from the
# current working directory, into the process environment.
load_dotenv(dotenv_path=find_dotenv(usecwd=True))

# USPTO Open Data Portal API key; None when the variable is not set.
USPTO_API_KEY = os.getenv("USPTO_API_KEY")
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@register_tool("USPTOOpenDataPortalTool")
class USPTOOpenDataPortalTool(BaseTool):
    """
    A tool for interacting with the USPTO Open Data Portal API to search for and retrieve patent information.
    The run method dynamically constructs API requests based on the provided tool configuration.
    """

    # Short field names accepted in the "sort" / "rangeFilters" arguments and
    # the fully qualified names they are rewritten to before the request.
    _FIELD_MAPPINGS = {
        "filingDate": "applicationMetaData.filingDate",
        "grantDate": "applicationMetaData.grantDate",
    }

    def __init__(
        self,
        tool_config,
        api_key=USPTO_API_KEY,
        base_url="https://api.uspto.gov/api/v1",
    ):
        """
        Initializes the USPTOOpenDataPortalTool.

        Args:
            tool_config: The configuration for the specific tool being run.
            api_key: Your USPTO Open Data Portal API key.
            base_url: The base URL for the USPTO API.

        Raises:
            ValueError: If no API key (or the placeholder "YOUR_API_KEY") is given.
        """
        super().__init__(tool_config)
        self.base_url = base_url
        if api_key == "YOUR_API_KEY" or not api_key:
            raise ValueError(
                "You must set a USPTO API key via the USPTO_API_KEY environment variable."
            )
        self.headers = {"X-API-KEY": api_key, "Accept": "application/json"}
        self.session = requests.Session()
        # Retry rate-limit and server errors with exponential backoff; with
        # raise_on_status=False the last response is returned once retries
        # are exhausted, so run() can report its status and body.
        retry_strategy = Retry(
            total=5,
            status_forcelist=[429, 500, 502, 503, 504],
            backoff_factor=5,
            raise_on_status=False,
        )
        adapter = HTTPAdapter(max_retries=retry_strategy)
        self.session.mount("https://", adapter)

    def get_by_path(self, d, keys):
        """Safely navigate nested dicts by a list of keys.

        Returns None as soon as a key is missing or a non-dict is reached.
        """
        for k in keys:
            if not isinstance(d, dict):
                return None
            d = d.get(k)
        return d

    def assign_by_path(self, d, path, value):
        """Create nested dicts for a dot-path and set the final key to value."""
        *parents, leaf = path.split(".")
        for k in parents:
            d = d.setdefault(k, {})
        d[leaf] = value

    def prune_item(self, item, return_fields):
        """Reduce one result item to the requested fields.

        Args:
            item: One (nested) dict from the API response.
            return_fields: Field specs. ``"a.b"`` selects a nested value;
                ``"a.b/prop"`` selects ``prop`` from every dict element of
                the list found at ``a.b``.

        Returns:
            A dict holding the selected values at their original paths plus a
            ``"missing_fields"`` list naming every spec that matched nothing.
        """
        out = {}
        missing_fields = []

        # 1) First, handle all the list-of-objects fields (the "/" ones),
        #    grouping them by their list-path prefix.
        list_groups = {}
        for field in return_fields:
            if "/" in field:
                list_path, prop = field.split("/", 1)
                list_groups.setdefault(list_path, []).append(prop)

        for list_path, props in list_groups.items():
            raw_list = self.get_by_path(item, list_path.split("."))
            if not isinstance(raw_list, list):
                missing_fields.extend(f"{list_path}/{prop}" for prop in props)
                continue

            pruned_list = []
            found_props = set()
            for el in raw_list:
                if not isinstance(el, dict):
                    continue
                pruned_el = {prop: el[prop] for prop in props if prop in el}
                found_props.update(pruned_el)
                if pruned_el:
                    pruned_list.append(pruned_el)

            # Track properties that no element of the list provided
            missing_fields.extend(
                f"{list_path}/{prop}" for prop in props if prop not in found_props
            )

            if pruned_list:
                self.assign_by_path(out, list_path, pruned_list)

        # 2) Then handle all the scalar or nested-dict fields (the "." ones without "/").
        for field in return_fields:
            if "/" in field:
                continue  # already done
            raw_value = self.get_by_path(item, field.split("."))
            if raw_value is None:
                missing_fields.append(field)
                continue
            self.assign_by_path(out, field, raw_value)

        out["missing_fields"] = missing_fields
        return out

    @classmethod
    def _qualify_field_names(cls, value):
        """Rewrite short field names in ``value`` to their qualified names.

        Names that are already qualified (preceded by ".") are left alone so
        they are not prefixed twice. Non-string values are returned unchanged.
        """
        if not isinstance(value, str):
            return value
        for short_name, qualified_name in cls._FIELD_MAPPINGS.items():
            value = re.sub(
                rf"(?<![\w.]){re.escape(short_name)}\b", qualified_name, value
            )
        return value

    def run(self, arguments):
        """
        Runs the specified tool by constructing and executing an API call based on the tool's configuration.

        Args:
            arguments: A dictionary of arguments for the tool, matching the parameters in the tool definition.

        Returns:
            The parsed JSON response (pruned to ``return_fields`` when the
            tool configuration defines them), or a dict with an "error" key.
        """
        endpoint = self.tool_config.get("api_endpoint")
        if not endpoint:
            return {"error": "API endpoint not found in tool configuration."}

        path_params = re.findall(r"\{(\w+)\}", endpoint)
        missing_path_params = [p for p in path_params if p not in arguments]
        if missing_path_params:
            # Otherwise the literal "{name}" placeholder would be sent to the API.
            return {
                "error": "Missing required path parameter(s): "
                + ", ".join(missing_path_params)
            }

        # Substitute path parameters and build query string parameters
        query_params = {}
        for key, value in arguments.items():
            if key in path_params:
                endpoint = endpoint.replace(f"{{{key}}}", str(value))
            else:
                query_params[key] = value

        # Replace None values with the parameter's configured default, or drop
        # them. A new dict is built because entries cannot be deleted from a
        # dict while it is being iterated.
        properties = (self.tool_config.get("parameter") or {}).get("properties") or {}
        resolved_params = {}
        for k, v in query_params.items():
            if v is None:
                v = (properties.get(k) or {}).get("default")
            if v is not None:
                resolved_params[k] = v
        query_params = resolved_params

        # default parameters if not provided
        for k, v in (self.tool_config.get("default_query_params") or {}).items():
            if query_params.get(k) is None:
                query_params[k] = v

        # Special handling for the inputs to this tool
        if self.tool_config.get("name") == "get_patent_overview_by_text_query":
            if "query" not in query_params:
                return {"error": "Missing required parameter 'query'."}
            query_params["q"] = query_params.pop("query")

            # "exact_match" is a tool-level switch, not an API parameter:
            # always remove it, and quote the query when it is truthy.
            if query_params.pop("exact_match", False):
                query_params["q"] = f'"{query_params["q"]}"'

            # NOTE(review): SOURCE's indentation was lost; this mapping is
            # assumed to belong to the text-query branch — confirm.
            for param in ("sort", "rangeFilters"):
                if param in query_params:
                    query_params[param] = self._qualify_field_names(
                        query_params[param]
                    )

        try:
            # The timeout for downloads can be longer
            timeout = 120 if "download" in self.tool_config.get("name", "") else 30

            response = self.session.get(
                f"{self.base_url}/{endpoint}",
                headers=self.headers,
                params=query_params,
                timeout=timeout,
            )
            response.raise_for_status()

            result = response.json()
            return_fields = self.tool_config.get("return_fields", [])
            if return_fields and isinstance(result, dict):
                # Filter the JSON response to only include specified fields
                result["patentFileWrapperDataBag"] = [
                    self.prune_item(patent, return_fields)
                    for patent in result.get("patentFileWrapperDataBag", [])
                ]
            return result

        except requests.exceptions.HTTPError as http_err:
            # Attempt to return the structured error from the API response body
            try:
                error_details = http_err.response.json()
            except ValueError:
                # Every JSON decode error raised by Response.json() is a
                # ValueError, whichever JSON backend requests uses.
                error_details = http_err.response.text
            return {
                "error": f"HTTP Error: {http_err.response.status_code}",
                "details": error_details,
            }
        except requests.exceptions.RequestException as e:
            return {"error": "API request failed", "details": str(e)}
        except ValueError as e:
            # A successful response whose body is not JSON (requests versions
            # whose decode error is not a RequestException).
            return {"error": "API response was not valid JSON", "details": str(e)}
|