ursa-ai 0.7.0rc2__py3-none-any.whl → 0.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of ursa-ai might be problematic.
- ursa/agents/__init__.py +13 -2
- ursa/agents/acquisition_agents.py +812 -0
- ursa/agents/arxiv_agent.py +1 -1
- ursa/agents/base.py +352 -91
- ursa/agents/chat_agent.py +58 -0
- ursa/agents/execution_agent.py +506 -260
- ursa/agents/lammps_agent.py +81 -31
- ursa/agents/planning_agent.py +27 -2
- ursa/agents/websearch_agent.py +2 -2
- ursa/cli/__init__.py +5 -1
- ursa/cli/hitl.py +46 -34
- ursa/observability/pricing.json +85 -0
- ursa/observability/pricing.py +20 -18
- ursa/util/parse.py +316 -0
- {ursa_ai-0.7.0rc2.dist-info → ursa_ai-0.7.1.dist-info}/METADATA +5 -1
- {ursa_ai-0.7.0rc2.dist-info → ursa_ai-0.7.1.dist-info}/RECORD +20 -17
- {ursa_ai-0.7.0rc2.dist-info → ursa_ai-0.7.1.dist-info}/WHEEL +0 -0
- {ursa_ai-0.7.0rc2.dist-info → ursa_ai-0.7.1.dist-info}/entry_points.txt +0 -0
- {ursa_ai-0.7.0rc2.dist-info → ursa_ai-0.7.1.dist-info}/licenses/LICENSE +0 -0
- {ursa_ai-0.7.0rc2.dist-info → ursa_ai-0.7.1.dist-info}/top_level.txt +0 -0
ursa/observability/pricing.py  CHANGED

@@ -4,7 +4,8 @@ import json
 import os
 from dataclasses import asdict, dataclass
 from decimal import ROUND_HALF_UP, Decimal, getcontext
-from
+from importlib import resources
+from typing import Any, Optional, Tuple

 getcontext().prec = 28  # robust money math

@@ -24,7 +25,7 @@ class ModelPricing:
         "1"
     )  # e.g., 0.25 if your provider discounts cached prompt tokens

-    def price_tokens(self, usage:
+    def price_tokens(self, usage: dict[str, Any]) -> dict[str, Decimal]:
         """Compute cost components from a usage dict with keys like input_tokens, output_tokens, reasoning_tokens, cached_tokens."""

         def _to_dec(x) -> Decimal:
@@ -83,7 +84,7 @@ def _dec(x: str | float | int) -> Decimal:

 # DEFAULTS: keep $0.00 so you don’t accidentally attribute costs.
 # Fill in values for your org as needed (USD per 1K tokens).
-DEFAULT_REGISTRY:
+DEFAULT_REGISTRY: dict[str, ModelPricing] = {
     # Examples — edit to match your negotiated prices:
     # "openai/gpt-4o": ModelPricing(_dec("5.00"), _dec("15.00")),
     # "openai/o3-mini": ModelPricing(_dec("2.50"), _dec("10.00"), reasoning_per_1k=_dec("5.00")),
@@ -98,7 +99,7 @@ def normalize_model_name(name: str) -> str:
     return (name or "").strip().lower()


-def resolve_model_name(event:
+def resolve_model_name(event: dict[str, Any]) -> str:
     m = (
         ((event.get("metadata") or {}).get("model"))
         or ((event.get("metadata") or {}).get("ls_model_name"))
@@ -108,7 +109,7 @@ def resolve_model_name(event: Dict[str, Any]) -> str:


 def find_pricing(
-    model: str, registry:
+    model: str, registry: dict[str, ModelPricing]
 ) -> Optional[ModelPricing]:
     if model in registry:
         return registry[model]
@@ -124,15 +125,16 @@ def find_pricing(


 def default_registry_path() -> str:
-    """
-
+    """Path to pricing file shipped with this package"""
+    path = resources.files("ursa") / "observability" / "pricing.json"
+    return str(path)


 def load_registry(
     path: Optional[str] = None,
-    overrides: Optional[
+    overrides: Optional[dict[str, Any]] = None,
     use_default_if_missing: bool = True,
-) ->
+) -> dict[str, ModelPricing]:
     """
     Load pricing registry from:
     1) explicit `path` (if provided), else
@@ -140,7 +142,7 @@ def load_registry(
     3) pricing.json next to pricing.py (if present, and use_default_if_missing)
     4) fall back to DEFAULT_REGISTRY
     """
-    reg:
+    reg: dict[str, ModelPricing] = dict(DEFAULT_REGISTRY)

     # 1) explicit path from caller wins
     candidate = path
@@ -194,7 +196,7 @@ def load_registry(
 # ---------- Core pricing application ----------


-def _has_provider_cost(roll:
+def _has_provider_cost(roll: dict[str, Any]) -> bool:
     # Treat nonzero provider totals as authoritative
     try:
         return any([
@@ -211,10 +213,10 @@ def _round_money(x: Decimal) -> float:


 def price_event(
-    event:
-    registry:
+    event: dict[str, Any],
+    registry: dict[str, ModelPricing],
     overwrite: bool = False,
-) -> Tuple[
+) -> Tuple[dict[str, Any], Optional[Decimal], str]:
     """
     Returns (event, total_cost_decimal_or_None, cost_source)
     cost_source ∈ {"provider", "computed", "no_usage", "no_pricing"}
@@ -255,10 +257,10 @@ def price_event(


 def price_payload(
-    payload:
-    registry: Optional[
+    payload: dict[str, Any],
+    registry: Optional[dict[str, ModelPricing]] = None,
     overwrite: bool = False,
-) ->
+) -> dict[str, Any]:
     """
     Enriches payload in-place with computed costs where missing.
     Adds a `costs` block with totals and by-model aggregation.
@@ -266,7 +268,7 @@ def price_payload(
     reg = registry or load_registry()
     llm_events = payload.get("llm_events") or []
     total = Decimal("0")
-    by_model:
+    by_model: dict[str, Decimal] = {}
     sources = {"provider": 0, "computed": 0, "no_usage": 0, "no_pricing": 0}

     for ev in llm_events:
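Taken together, the pricing.py changes move the annotations to built-in `dict` generics, import `importlib.resources`, and point `default_registry_path()` at the `pricing.json` now shipped inside the package. A minimal usage sketch based only on the signatures and docstrings shown above; the empty payload is illustrative, not taken from the release:

    from ursa.observability.pricing import (
        default_registry_path,
        load_registry,
        price_payload,
    )

    # Registry resolution order per the docstring: explicit path, then the
    # packaged pricing.json, then DEFAULT_REGISTRY.
    registry = load_registry()      # dict[str, ModelPricing]
    print(default_registry_path())  # .../ursa/observability/pricing.json

    # Illustrative payload: price_payload() enriches it in place and, per the
    # docstring, adds a `costs` block with totals and by-model aggregation.
    payload = {"llm_events": []}
    priced = price_payload(payload, registry=registry)
    print(priced.get("costs"))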
ursa/util/parse.py  CHANGED

@@ -1,5 +1,15 @@
 import json
+import os
 import re
+import shutil
+import unicodedata
+from typing import Any, Optional, Tuple
+from urllib.parse import urljoin, urlparse
+
+import justext
+import requests
+import trafilatura
+from bs4 import BeautifulSoup


 def extract_json(text: str) -> list[dict]:
@@ -87,3 +97,309 @@ def extract_json(text: str) -> list[dict]:
         return json.loads(json_str)
     except json.JSONDecodeError as e:
         raise ValueError("Extracted content is not valid JSON.") from e
+
+
+PDF_CT_HINTS = (
+    "application/pdf",
+    "binary/octet-stream",
+)  # some servers mislabel
+PDF_EXT_RE = re.compile(r"\.pdf($|\?)", re.IGNORECASE)
+
+
+def _is_pdf_response(resp: requests.Response) -> bool:
+    ct = resp.headers.get("Content-Type", "").lower()
+    if any(hint in ct for hint in PDF_CT_HINTS):
+        return True
+    # Sometimes servers omit CT but set filename
+    cd = resp.headers.get("Content-Disposition", "")
+    if "filename" in cd and ".pdf" in cd.lower():
+        return True
+    # Last resort: URL extension
+    return bool(PDF_EXT_RE.search(resp.url))
+
+
+def _derive_filename_from_cd_or_url(
+    resp: requests.Response, fallback: str
+) -> str:
+    cd = resp.headers.get("Content-Disposition", "")
+    m = re.search(r'filename\*?=(?:UTF-8\'\')?"?([^\";]+)"?', cd, re.IGNORECASE)
+    if m:
+        name = m.group(1)
+        # Some headers include quotes
+        name = name.strip("\"'")
+
+        # RFC 5987 may encode UTF-8 in filename*; we’re treating as plain here.
+        if not name.lower().endswith(".pdf"):
+            name += ".pdf"
+        return name
+
+    # use URL last path segment if looks like PDF
+    parsed = urlparse(resp.url)
+    base = os.path.basename(parsed.path) or fallback
+    if not base.lower().endswith(".pdf"):
+        if PDF_EXT_RE.search(resp.url):
+            base = re.sub(
+                r"(\.pdf)(?:$|\?).*", r"\1", base, flags=re.IGNORECASE
+            )
+            if not base.lower().endswith(".pdf"):
+                base += ".pdf"
+        else:
+            base = (
+                fallback
+                if fallback.lower().endswith(".pdf")
+                else fallback + ".pdf"
+            )
+    return base
+
+
+def _download_stream_to(path: str, resp: requests.Response) -> str:
+    os.makedirs(os.path.dirname(path), exist_ok=True)
+    with open(path, "wb") as f:
+        shutil.copyfileobj(resp.raw, f)
+    return path
+
+
+def _get_soup(
+    url: str, timeout: int = 20, headers: Optional[dict[str, str]] = None
+) -> BeautifulSoup:
+    r = requests.get(url, timeout=timeout, headers=headers or {})
+    r.raise_for_status()
+    return BeautifulSoup(r.text, "html.parser")
+
+
+def _find_pdf_on_landing(soup: BeautifulSoup, base_url: str) -> Optional[str]:
+    # 1) meta citation_pdf_url
+    meta = soup.find("meta", attrs={"name": "citation_pdf_url"})
+    if meta and meta.get("content"):
+        return urljoin(base_url, meta["content"])
+
+    # 2) obvious anchors: text contains 'PDF' or 'Download'
+    for a in soup.find_all("a", href=True):
+        label = (a.get_text(" ", strip=True) or "").lower()
+        href = a["href"]
+        if "pdf" in label or "download" in label or PDF_EXT_RE.search(href):
+            return urljoin(base_url, href)
+
+    # 3) buttons that wrap an anchor
+    for btn in soup.find_all(["button", "a"], href=True):
+        label = (btn.get_text(" ", strip=True) or "").lower()
+        href = btn.get("href")
+        if href and (
+            "pdf" in label or "download" in label or PDF_EXT_RE.search(href)
+        ):
+            return urljoin(base_url, href)
+
+    return None
+
+
+# def _resolve_pdf_via_unpaywall(doi: str, email: str, timeout: int = 15) -> Optional[str]:
+#     # Optional helper: respects publisher OA; returns None if no OA PDF
+#     try:
+#         url = f"https://api.unpaywall.org/v2/{doi}"
+#         r = requests.get(url, params={"email": email}, timeout=timeout)
+#         r.raise_for_status()
+#         data = r.json()
+#         loc = data.get("best_oa_location") or {}
+#         pdf = loc.get("url_for_pdf") or loc.get("url")
+#         if pdf and PDF_EXT_RE.search(pdf):
+#             return pdf
+#         # Sometimes url points to landing; try it anyway.
+#         return pdf
+#     except Exception:
+#         return None
+
+
+def resolve_pdf_from_osti_record(
+    rec: dict[str, Any],
+    *,
+    headers: Optional[dict[str, str]] = None,
+    unpaywall_email: Optional[str] = None,
+    timeout: int = 25,
+) -> Tuple[Optional[str], Optional[str], str]:
+    """
+    Returns (pdf_url, landing_used, note)
+    - pdf_url: direct downloadable PDF URL if found (or a strong candidate)
+    - landing_used: landing page URL we parsed (if any)
+    - note: brief trace of how we found it
+    """
+    headers = headers or {"User-Agent": "Mozilla/5.0"}
+    note_parts: list[str] = []
+
+    links = rec.get("links", []) or []
+    # doi = rec.get("doi")
+
+    # 1) Try 'fulltext' first (OSTI purl)
+    fulltext = None
+    for link in links:
+        if link.get("rel") == "fulltext":
+            fulltext = link.get("href")
+            break
+
+    if fulltext:
+        note_parts.append("Tried links[fulltext] purl")
+        try:
+            # Follow redirects; stream to peek headers without loading whole body
+            r = requests.get(
+                fulltext,
+                headers=headers,
+                timeout=timeout,
+                allow_redirects=True,
+                stream=True,
+            )
+            r.raise_for_status()
+
+            if _is_pdf_response(r):
+                note_parts.append("fulltext resolved directly to PDF")
+                return (r.url, None, " | ".join(note_parts))
+
+            # Not a PDF: parse page HTML for meta or obvious PDF anchors
+            # (If server sent binary but CT lied, _is_pdf_response would have caught via CD or ext)
+            r.close()
+            soup = _get_soup(fulltext, timeout=timeout, headers=headers)
+            candidate = _find_pdf_on_landing(soup, fulltext)
+            if candidate:
+                note_parts.append(
+                    "found PDF via meta/anchor on fulltext landing"
+                )
+                return (candidate, fulltext, " | ".join(note_parts))
+        except Exception as e:
+            note_parts.append(f"fulltext failed: {e}")
+
+    # 2) Try DOE PAGES landing (citation_doe_pages)
+    doe_pages = None
+    for link in links:
+        if link.get("rel") == "citation_doe_pages":
+            doe_pages = link.get("href")
+            break
+
+    if doe_pages:
+        note_parts.append("Tried links[citation_doe_pages] landing")
+        try:
+            soup = _get_soup(doe_pages, timeout=timeout, headers=headers)
+            candidate = _find_pdf_on_landing(soup, doe_pages)
+            if candidate:
+                # Candidate may itself be a landing—check if it serves PDF
+                try:
+                    r2 = requests.get(
+                        candidate,
+                        headers=headers,
+                        timeout=timeout,
+                        allow_redirects=True,
+                        stream=True,
+                    )
+                    r2.raise_for_status()
+                    if _is_pdf_response(r2):
+                        note_parts.append("citation_doe_pages → direct PDF")
+                        return (r2.url, doe_pages, " | ".join(note_parts))
+                    r2.close()
+                except Exception:
+                    pass
+                # If not clearly PDF, still return as a candidate (agent will fetch & parse)
+                note_parts.append(
+                    "citation_doe_pages → PDF-like candidate (not confirmed by headers)"
+                )
+                return (candidate, doe_pages, " | ".join(note_parts))
+        except Exception as e:
+            note_parts.append(f"citation_doe_pages failed: {e}")
+
+    # # 3) Optional: DOI → Unpaywall OA
+    # if doi and unpaywall_email:
+    #     note_parts.append("Tried Unpaywall via DOI")
+    #     pdf_from_ua = _resolve_pdf_via_unpaywall(doi, unpaywall_email)
+    #     if pdf_from_ua:
+    #         # May be direct PDF or landing; the caller will validate headers during download
+    #         note_parts.append("Unpaywall returned candidate")
+    #         return (pdf_from_ua, None, " | ".join(note_parts))
+
+    # 4) Give up
+    note_parts.append("No PDF found")
+    return (None, None, " | ".join(note_parts))
+
+
+def _normalize_ws(text: str) -> str:
+    # Normalize unicode, collapse whitespace, and strip control chars
+    text = unicodedata.normalize("NFKC", text)
+    text = re.sub(r"[ \t\r\f\v]+", " ", text)
+    text = re.sub(r"\s*\n\s*", "\n", text)
+    text = re.sub(r"\n{3,}", "\n\n", text)
+    text = text.strip()
+    return text
+
+
+def _dedupe_lines(text: str, min_len: int = 40) -> str:
+    seen = set()
+    out = []
+    for line in text.splitlines():
+        stripped = line.strip()
+        # Ignore very short or repeated lines (menus, cookie banners, etc.)
+        if len(stripped) < min_len:
+            continue
+        key = stripped.lower()
+        if key in seen:
+            continue
+        seen.add(key)
+        out.append(stripped)
+    return "\n\n".join(out)
+
+
+def extract_main_text_only(html: str, *, max_chars: int = 250_000) -> str:
+    """
+    Returns plain text with navigation/ads/scripts removed.
+    Prefers trafilatura -> jusText -> BS4 paragraphs.
+    """
+    # 1) Trafilatura
+    # You can tune config: with_metadata, include_comments, include_images, favor_recall, etc.
+    cfg = trafilatura.settings.use_config()
+    cfg.set("DEFAULT", "include_comments", "false")
+    cfg.set("DEFAULT", "include_tables", "false")
+    cfg.set("DEFAULT", "favor_recall", "false")  # be stricter; less noise
+    try:
+        # If you fetched HTML already, use extract() on string; otherwise, fetch_url(url)
+        txt = trafilatura.extract(
+            html,
+            config=cfg,
+            include_comments=False,
+            include_tables=False,
+            favor_recall=False,
+        )
+        if txt and txt.strip():
+            txt = _normalize_ws(txt)
+            txt = _dedupe_lines(txt)
+            return txt[:max_chars]
+    except Exception:
+        pass
+
+    # 2) jusText
+    try:
+        paragraphs = justext.justext(html, justext.get_stoplist("English"))
+        body_paras = [p.text for p in paragraphs if not p.is_boilerplate]
+        if body_paras:
+            txt = _normalize_ws("\n\n".join(body_paras))
+            txt = _dedupe_lines(txt)
+            return txt[:max_chars]
+    except Exception:
+        pass
+
+    # 4) last-resort: BS4 paragraphs/headings only
+    from bs4 import BeautifulSoup
+
+    soup = BeautifulSoup(html, "html.parser")
+    for tag in soup([
+        "script",
+        "style",
+        "noscript",
+        "header",
+        "footer",
+        "nav",
+        "form",
+        "aside",
+    ]):
+        tag.decompose()
+    chunks = []
+    for el in soup.find_all(["h1", "h2", "h3", "p", "li", "figcaption"]):
+        t = el.get_text(" ", strip=True)
+        if t:
+            chunks.append(t)
+    txt = _normalize_ws("\n\n".join(chunks))
+    txt = _dedupe_lines(txt)
+    return txt[:max_chars]
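The new parse.py helpers cover two tasks: `resolve_pdf_from_osti_record` walks an OSTI record's `links` (the `fulltext` purl first, then the `citation_doe_pages` landing page) looking for a downloadable PDF, and `extract_main_text_only` strips boilerplate from fetched HTML via trafilatura, then jusText, then a BeautifulSoup fallback. A rough sketch of how they might be called; the OSTI-style record below is made up for illustration:

    import requests

    from ursa.util.parse import (
        extract_main_text_only,
        resolve_pdf_from_osti_record,
    )

    # Hypothetical OSTI-style record; real ones come from the OSTI API.
    record = {
        "links": [
            {"rel": "fulltext", "href": "https://www.osti.gov/servlets/purl/0000000"},
        ]
    }

    pdf_url, landing, note = resolve_pdf_from_osti_record(record)
    print(note)  # e.g. "Tried links[fulltext] purl | fulltext resolved directly to PDF"

    # For HTML landing pages, reduce the markup to readable main text.
    if landing:
        html = requests.get(landing, timeout=25).text
        print(extract_main_text_only(html)[:500])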
{ursa_ai-0.7.0rc2.dist-info → ursa_ai-0.7.1.dist-info}/METADATA  CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ursa-ai
-Version: 0.7.0rc2
+Version: 0.7.1
 Summary: Agents for science at LANL
 Author-email: Mike Grosskopf <mikegros@lanl.gov>, Nathan Debardeleben <ndebard@lanl.gov>, Rahul Somasundaram <rsomasundaram@lanl.gov>, Isaac Michaud <imichaud@lanl.gov>, Avanish Mishra <avanish@lanl.gov>, Arthur Lui <alui@lanl.gov>, Russell Bent <rbent@lanl.gov>, Earl Lawrence <earl@lanl.gov>
 License-Expression: BSD-3-Clause
@@ -39,6 +39,8 @@ Requires-Dist: langgraph-checkpoint-sqlite<3.0,>=2.0.10
 Requires-Dist: langchain-ollama<0.4,>=0.3.6
 Requires-Dist: ddgs>=9.5.5
 Requires-Dist: typer>=0.16.1
+Requires-Dist: trafilatura<1.7,>=1.6.1
+Requires-Dist: selectolax<0.5,>=0.4.0
 Dynamic: license-file

 # URSA - The Universal Research and Scientific Agent
@@ -51,6 +53,8 @@ Dynamic: license-file
 The flexible agentic workflow for accelerating scientific tasks.
 Composes information flow between agents for planning, code writing and execution, and online research to solve complex problems.

+The original arxiv paper is [here](https://arxiv.org/abs/2506.22653).
+
 ## Installation
 You can install `ursa` via `pip` or `uv`.

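The METADATA changes bump the version to 0.7.1, add pins for two new dependencies (`trafilatura` and `selectolax`), and link the arXiv paper. To confirm which build and pins are actually installed, the standard library is enough; a small sketch:

    from importlib.metadata import requires, version

    print(version("ursa-ai"))  # expected: 0.7.1
    new_pins = [
        r for r in (requires("ursa-ai") or [])
        if r.startswith(("trafilatura", "selectolax"))
    ]
    print(new_pins)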
{ursa_ai-0.7.0rc2.dist-info → ursa_ai-0.7.1.dist-info}/RECORD  CHANGED

@@ -1,20 +1,23 @@
 ursa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ursa/agents/__init__.py,sha256=
-ursa/agents/
-ursa/agents/
+ursa/agents/__init__.py,sha256=XApBa4fFidXOt0TKrqNZ3UtybKkdGPtyGp9vOgYwP1I,1524
+ursa/agents/acquisition_agents.py,sha256=6TdJvrLdvTKa_mfFNvD6KHZ-Z-2DBHxiMe-yR0tXSu4,28148
+ursa/agents/arxiv_agent.py,sha256=7veUmjnTmlOHm1Y9N65q-IxBxK1ZvlCtHyR7JBZklDw,14576
+ursa/agents/base.py,sha256=rd2b9EQ83gh1ShzGJ2DeEu3CY80vu6OV5tQXHqqK-_E,26373
+ursa/agents/chat_agent.py,sha256=Pw8Tweu7IL7Y5YIWRdbTNr9eC9Gc5bKu9ERBQJxNptk,1726
 ursa/agents/code_review_agent.py,sha256=F3RJmS1jBYOoWPg5Hiy-xirutLTtw_u1_hqiKIsA2eY,11563
-ursa/agents/execution_agent.py,sha256=
+ursa/agents/execution_agent.py,sha256=UsCDTA8An4TbwTDrJgpvxKSPrw9GkgfSfXa26f8JJvQ,29369
 ursa/agents/hypothesizer_agent.py,sha256=2jSOHkR8MjNhS5HMeOhZCkhT4wGDLNBs-aLgQhYBHr4,23124
-ursa/agents/lammps_agent.py,sha256=
+ursa/agents/lammps_agent.py,sha256=63uDq0zsR06tjT51Gv_cbGvKGzJYvDm_BmZ1h79FWLw,16375
 ursa/agents/mp_agent.py,sha256=HvD4JC9OuGeEicW3mKZBQmQ1yn0HVB03118E1urXwx4,6833
 ursa/agents/optimization_agent.py,sha256=siJKyujfy_oNQDbnIFaCUkE_DLvmxPYsBJBDms6Cgbg,13945
-ursa/agents/planning_agent.py,sha256=
+ursa/agents/planning_agent.py,sha256=JbP5RIRTKCZX7G-B2ymmS5dbUdJCkL48lNsbtWHqzk0,7320
 ursa/agents/rag_agent.py,sha256=kc7ZYWxgWNh-LbMpREaJY90IVbs2SKPrXdCutxrEdT0,10099
 ursa/agents/recall_agent.py,sha256=MoS0FaQFAb14DeYSCo6sYKNTjvE_u-egTOvuXSTXLpk,1693
-ursa/agents/websearch_agent.py,sha256=
-ursa/cli/__init__.py,sha256=
-ursa/cli/hitl.py,sha256=
-ursa/observability/pricing.
+ursa/agents/websearch_agent.py,sha256=KV8H_9B3gLNpQx6B_WtMaf6Zahk_caZ1TCOFAiJBc3I,7771
+ursa/cli/__init__.py,sha256=jkP-5aFXo3jn7faevyYlbJUEh04SdDs7at_QJDpFbRI,4042
+ursa/cli/hitl.py,sha256=u2JYGaqi0fjvwHSqbi5A72ER2lsQb7i5gC1VQc6uW9g,15299
+ursa/observability/pricing.json,sha256=c5d334-oBU9CP1xYXAsVf_bUrmT0g0yUE5WZUdmAuvY,6714
+ursa/observability/pricing.py,sha256=Xe397VWXzdVu3wOLDbraHxkMzixu3CUyTyoVpHvTvIA,10575
 ursa/observability/timing.py,sha256=yQX5ZtkxRBH_1V61KjWmEBph02DOeZuP-v0-OtJNffQ,48612
 ursa/prompt_library/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ursa/prompt_library/code_review_prompts.py,sha256=-HuhwW9W_p2LDn44bXLntxLADHCOyl-2KIXxRHto66w,2444
@@ -34,10 +37,10 @@ ursa/util/diff_renderer.py,sha256=1L1q2qWWb8gLhR532-LgJn2TrqXDx0gUpPVOWD_sqeU,40
 ursa/util/helperFunctions.py,sha256=cs-pQEcXyOr4nb5FoH9Ssg3hktycEapW07-MKJcUNOA,4122
 ursa/util/memory_logger.py,sha256=GiKYbQBpxlNRLKyqKFJyrbSbVCkXpRB7Yr5so43tUAw,6097
 ursa/util/optimization_schema.py,sha256=b2wO0BjCIgQb15Q3adDu-ZyG3dfncme9OGQw6FmDGDc,2731
-ursa/util/parse.py,sha256=
-ursa_ai-0.7.
-ursa_ai-0.7.
-ursa_ai-0.7.
-ursa_ai-0.7.
-ursa_ai-0.7.
-ursa_ai-0.7.
+ursa/util/parse.py,sha256=QCQM_gEHs89WZ63CzNoaxWVRDUTADNyOrJZyUZmG36E,13657
+ursa_ai-0.7.1.dist-info/licenses/LICENSE,sha256=4Vr6_u2zTHIUvYjoOBg9ztDbfpV3hyCFv3mTCS87gYU,1482
+ursa_ai-0.7.1.dist-info/METADATA,sha256=IytCA3xYHrKn7OU7O2g0qdXI0q7vTc2hF7gFwTNsmO4,9943
+ursa_ai-0.7.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ursa_ai-0.7.1.dist-info/entry_points.txt,sha256=B0bbgM-NcAqLujZ1lfmChJZpQMYQDYFJck4moU89Y4E,39
+ursa_ai-0.7.1.dist-info/top_level.txt,sha256=OjA1gRYSUAeiXGnpqPC8iOOGfcjFO1IlP848qMnYSdY,5
+ursa_ai-0.7.1.dist-info/RECORD,,
The remaining dist-info files (WHEEL, entry_points.txt, licenses/LICENSE, top_level.txt) are unchanged.