ursa-ai 0.9.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ursa/__init__.py +3 -0
- ursa/agents/__init__.py +32 -0
- ursa/agents/acquisition_agents.py +812 -0
- ursa/agents/arxiv_agent.py +429 -0
- ursa/agents/base.py +728 -0
- ursa/agents/chat_agent.py +60 -0
- ursa/agents/code_review_agent.py +341 -0
- ursa/agents/execution_agent.py +915 -0
- ursa/agents/hypothesizer_agent.py +614 -0
- ursa/agents/lammps_agent.py +465 -0
- ursa/agents/mp_agent.py +204 -0
- ursa/agents/optimization_agent.py +410 -0
- ursa/agents/planning_agent.py +219 -0
- ursa/agents/rag_agent.py +304 -0
- ursa/agents/recall_agent.py +54 -0
- ursa/agents/websearch_agent.py +196 -0
- ursa/cli/__init__.py +363 -0
- ursa/cli/hitl.py +516 -0
- ursa/cli/hitl_api.py +75 -0
- ursa/observability/metrics_charts.py +1279 -0
- ursa/observability/metrics_io.py +11 -0
- ursa/observability/metrics_session.py +750 -0
- ursa/observability/pricing.json +97 -0
- ursa/observability/pricing.py +321 -0
- ursa/observability/timing.py +1466 -0
- ursa/prompt_library/__init__.py +0 -0
- ursa/prompt_library/code_review_prompts.py +51 -0
- ursa/prompt_library/execution_prompts.py +50 -0
- ursa/prompt_library/hypothesizer_prompts.py +17 -0
- ursa/prompt_library/literature_prompts.py +11 -0
- ursa/prompt_library/optimization_prompts.py +131 -0
- ursa/prompt_library/planning_prompts.py +79 -0
- ursa/prompt_library/websearch_prompts.py +131 -0
- ursa/tools/__init__.py +0 -0
- ursa/tools/feasibility_checker.py +114 -0
- ursa/tools/feasibility_tools.py +1075 -0
- ursa/tools/run_command.py +27 -0
- ursa/tools/write_code.py +42 -0
- ursa/util/__init__.py +0 -0
- ursa/util/diff_renderer.py +128 -0
- ursa/util/helperFunctions.py +142 -0
- ursa/util/logo_generator.py +625 -0
- ursa/util/memory_logger.py +183 -0
- ursa/util/optimization_schema.py +78 -0
- ursa/util/parse.py +405 -0
- ursa_ai-0.9.1.dist-info/METADATA +304 -0
- ursa_ai-0.9.1.dist-info/RECORD +51 -0
- ursa_ai-0.9.1.dist-info/WHEEL +5 -0
- ursa_ai-0.9.1.dist-info/entry_points.txt +2 -0
- ursa_ai-0.9.1.dist-info/licenses/LICENSE +8 -0
- ursa_ai-0.9.1.dist-info/top_level.txt +1 -0
ursa/util/memory_logger.py
ADDED

@@ -0,0 +1,183 @@

import os
import shutil
from pathlib import Path
from typing import Any, Dict, Optional, Sequence

from langchain_chroma import Chroma
from langchain_core.documents import Document


class AgentMemory:
    """
    Simple wrapper around a persistent Chroma vector-store for agent-conversation memory.

    Parameters
    ----------
    path : str | Path | None
        Where to keep the on-disk Chroma DB. If *None*, the default location
        ``~/.cache/ursa/rag/db`` is used.
    collection_name : str
        Name of the Chroma collection.
    embedding_model : <TODO> | None
        The embedding model.

    Notes
    -----
    * Requires `langchain-chroma` and `chromadb`.
    """

    @classmethod
    def get_db_path(cls, path: Optional[str | Path]) -> Path:
        match path:
            case None:
                return Path.home() / ".cache" / "ursa" / "rag" / "db"
            case str():
                return Path(path)
            case Path():
                return path
            case _:
                raise TypeError(
                    f"Type of path is `{type(path)}` "
                    "but `Optional[str | Path]` was expected."
                )

    def __init__(
        self,
        embedding_model,
        path: Optional[str | Path] = None,
        collection_name: str = "agent_memory",
    ) -> None:
        self.path = self.get_db_path(path)
        self.collection_name = collection_name
        self.path.mkdir(parents=True, exist_ok=True)
        self.embeddings = embedding_model

        # If a DB already exists, load it; otherwise defer creation until `build_index`.
        self.vectorstore: Optional[Chroma] = None
        if any(self.path.iterdir()):
            self.vectorstore = Chroma(
                collection_name=self.collection_name,
                embedding_function=self.embeddings,
                persist_directory=str(self.path),
            )

    # --------------------------------------------------------------------- #
    # ❶ Build & index a brand-new database                                   #
    # --------------------------------------------------------------------- #
    def build_index(
        self,
        chunks: Sequence[str],
        metadatas: Optional[Sequence[Dict[str, Any]]] = None,
    ) -> None:
        """
        Create a fresh vector store from ``chunks``. Existing data (if any)
        are overwritten.

        Parameters
        ----------
        chunks : Sequence[str]
            Text snippets (already chunked) to embed.
        metadatas : Sequence[dict] | None
            Optional metadata dict for each chunk, same length as ``chunks``.
        """
        docs = [
            Document(
                page_content=text, metadata=metadatas[i] if metadatas else {}
            )
            for i, text in enumerate(chunks)
        ]

        # Create (or overwrite) the collection
        self.vectorstore = Chroma.from_documents(
            documents=docs,
            embedding=self.embeddings,
            collection_name=self.collection_name,
            persist_directory=str(self.path),
        )

    # --------------------------------------------------------------------- #
    # ❷ Add new chunks and re-index                                          #
    # --------------------------------------------------------------------- #
    def add_memories(
        self,
        new_chunks: Sequence[str],
        metadatas: Optional[Sequence[Dict[str, Any]]] = None,
    ) -> None:
        """
        Append new text chunks to the existing store. If the store has not
        been initialised yet, ``build_index`` is called first.
        """
        if self.vectorstore is None:
            self.build_index(new_chunks, metadatas)
            print("----- Vector store initialised -----")

        docs = []
        for i, text in enumerate(new_chunks):
            if len(text) > 0:  # only add non-empty documents
                docs.append(
                    Document(
                        page_content=text,
                        metadata=metadatas[i] if metadatas else {},
                    )
                )
        self.vectorstore.add_documents(docs)

    # --------------------------------------------------------------------- #
    # ❸ Retrieve relevant chunks (RAG query)                                 #
    # --------------------------------------------------------------------- #
    def retrieve(
        self,
        query: str,
        k: int = 4,
        with_scores: bool = False,
        **search_kwargs,
    ):
        """
        Return the *k* most similar chunks for `query`.

        Parameters
        ----------
        query : str
            Natural-language question or statement.
        k : int
            How many results to return.
        with_scores : bool
            If True, also return similarity scores.
        **search_kwargs
            Extra kwargs forwarded to Chroma's ``similarity_search*`` helpers.

        Returns
        -------
        list[Document] | list[tuple[Document, float]]
        """
        if self.vectorstore is None:
            return ["None"]

        if with_scores:
            return self.vectorstore.similarity_search_with_score(
                query, k=k, **search_kwargs
            )
        return self.vectorstore.similarity_search(query, k=k, **search_kwargs)


def delete_database(path: Optional[str | Path] = None):
    """
    Delete the persistent Chroma vector-store used for agent-conversation memory.

    Parameters
    ----------
    path : str | Path | None
        Location of the on-disk Chroma DB to delete. If *None*, the default
        location ``~/.cache/ursa/rag/db`` is used.
    """
    db_path = AgentMemory.get_db_path(path)
    if os.path.exists(db_path):
        shutil.rmtree(db_path)
        print(f"Database: {db_path} has been deleted.")
    else:
        print("No database found to delete.")
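
Usage sketch for the memory API above (illustrative, not part of the wheel). It assumes an
OpenAI embedding object from the separate `langchain-openai` package; any LangChain-compatible
embedding model can be passed instead.

from langchain_openai import OpenAIEmbeddings  # assumed embedding backend, not a ursa dependency

from ursa.util.memory_logger import AgentMemory, delete_database

memory = AgentMemory(embedding_model=OpenAIEmbeddings(), path="./demo_memory_db")

# First call creates the collection on disk; later calls append to it.
memory.build_index(
    ["The execution agent wrote results to results.csv."],
    metadatas=[{"agent": "execution"}],
)
memory.add_memories(["The planner chose a three-step workflow."])

for doc in memory.retrieve("What did the execution agent produce?", k=2):
    print(doc.page_content)

delete_database("./demo_memory_db")  # remove the on-disk Chroma store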

ursa/util/optimization_schema.py
ADDED

@@ -0,0 +1,78 @@

from typing import Any, List, Literal, Optional, TypedDict


class DecisionVariableType(TypedDict):
    name: str  # decision variable name
    type: Literal[
        "continuous",
        "integer",
        "logical",
        "infinite-dimensional",
        "finite-dimensional",
    ]  # decision variable type
    domain: str  # allowable values of variable
    description: str  # natural language description


class ParameterType(TypedDict):
    name: str  # parameter name
    value: Optional[Any]  # parameter value; None
    description: str  # natural language description
    is_user_supplied: bool  # True if user supplied parameter


class ObjectiveType(TypedDict):
    sense: Literal["minimize", "maximize"]  # objective sense
    expression_nl: str  # sympy-representable mathematical expression
    tags: List[
        Literal["linear", "quadratic", "nonlinear", "convex", "nonconvex"]
    ]  # objective type


class ConstraintType(TypedDict):
    name: str  # constraint name
    expression_nl: str  # sympy-representable mathematical expression
    tags: List[
        Literal[
            "linear",
            "integer",
            "nonlinear",
            "equality",
            "inequality",
            "infinite-dimensional",
            "finite-dimensional",
        ]
    ]  # constraint type


class NotesType(TypedDict):
    verifier: str  # problem verification status and explanation
    feasibility: str  # problem feasibility status
    user: str  # notes to user
    assumptions: str  # assumptions made during formulation


class ProblemSpec(TypedDict):
    title: str  # name of the problem
    description_nl: str  # natural language description
    decision_variables: List[
        DecisionVariableType
    ]  # list of all decision variables
    parameters: List[ParameterType]  # list of all parameters
    objective: ObjectiveType  # structured objective function details
    constraints: List[ConstraintType]  # structured constraint details
    problem_class: Optional[str]  # optimization problem class
    latex: Optional[str]  # latex formulation of the problem
    status: Literal["DRAFT", "VERIFIED", "ERROR"]  # problem status
    notes: NotesType  # structured notes data


class SolverSpec(TypedDict):
    solver: str  # name of the solver; replace with Literal["Gurobi", "Ipopt", ...] to restrict solvers
    library: str  # library or relevant packages for the solver
    algorithm: Optional[str]  # algorithm used to solve the problem
    license: Optional[
        str
    ]  # license status of the solver (open-source, commercial, etc.)
    parameters: Optional[List[dict]]  # other parameters relevant to the problem
    notes: Optional[str]  # justifying the choice of solver
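
Sketch of how an agent might populate these schemas (illustrative values only; nothing below is
taken from the package):

from ursa.util.optimization_schema import ProblemSpec, SolverSpec

problem: ProblemSpec = {
    "title": "Production planning",
    "description_nl": "Choose a production quantity to maximize profit subject to capacity.",
    "decision_variables": [
        {
            "name": "x",
            "type": "continuous",
            "domain": "x >= 0",
            "description": "units produced",
        }
    ],
    "parameters": [
        {
            "name": "profit_per_unit",
            "value": 5.0,
            "description": "profit earned per unit produced",
            "is_user_supplied": True,
        }
    ],
    "objective": {
        "sense": "maximize",
        "expression_nl": "profit_per_unit * x",
        "tags": ["linear"],
    },
    "constraints": [
        {
            "name": "capacity",
            "expression_nl": "x <= 100",
            "tags": ["linear", "inequality"],
        }
    ],
    "problem_class": "linear program",
    "latex": None,
    "status": "DRAFT",
    "notes": {"verifier": "", "feasibility": "", "user": "", "assumptions": ""},
}

solver: SolverSpec = {
    "solver": "HiGHS",
    "library": "scipy",
    "algorithm": "dual simplex",
    "license": "open-source",
    "parameters": None,
    "notes": "Small LP; an open-source solver is sufficient.",
}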
ursa/util/parse.py
ADDED

@@ -0,0 +1,405 @@

import json
import os
import re
import shutil
import unicodedata
from typing import Any, Optional, Tuple
from urllib.parse import urljoin, urlparse

import justext
import requests
import trafilatura
from bs4 import BeautifulSoup


def extract_json(text: str) -> list[dict]:
    """
    Extract a JSON object or array from text that might contain markdown or other content.

    The function attempts three strategies:
    1. Extract JSON from a markdown code block labeled as JSON.
    2. Extract JSON from any markdown code block.
    3. Use bracket matching to extract a JSON substring starting with '{' or '['.

    Returns:
        A Python object parsed from the JSON string (dict or list).

    Raises:
        ValueError: If no valid JSON is found.
    """
    # Approach 1: Look for a markdown code block specifically labeled as JSON.
    labeled_block = re.search(
        r"```json\s*([\[{].*?[\]}])\s*```", text, re.DOTALL
    )
    if labeled_block:
        json_str = labeled_block.group(1).strip()
        try:
            return json.loads(json_str)
        except json.JSONDecodeError:
            # Fall back to the next approach if parsing fails.
            pass

    # Approach 2: Look for any code block delimited by triple backticks.
    generic_block = re.search(r"```(.*?)```", text, re.DOTALL)
    if generic_block:
        json_str = generic_block.group(1).strip()
        if json_str.startswith("{") or json_str.startswith("["):
            try:
                return json.loads(json_str)
            except json.JSONDecodeError:
                pass

    # Approach 3: Attempt to extract JSON using bracket matching.
    # Find the first occurrence of either '{' or '['.
    first_obj = text.find("{")
    first_arr = text.find("[")
    if first_obj == -1 and first_arr == -1:
        raise ValueError("No JSON object or array found in the text.")

    # Determine which bracket comes first.
    if first_obj == -1:
        start = first_arr
        open_bracket = "["
        close_bracket = "]"
    elif first_arr == -1:
        start = first_obj
        open_bracket = "{"
        close_bracket = "}"
    else:
        if first_obj < first_arr:
            start = first_obj
            open_bracket = "{"
            close_bracket = "}"
        else:
            start = first_arr
            open_bracket = "["
            close_bracket = "]"

    # Bracket matching: find the matching closing bracket.
    depth = 0
    end = None
    for i in range(start, len(text)):
        if text[i] == open_bracket:
            depth += 1
        elif text[i] == close_bracket:
            depth -= 1
            if depth == 0:
                end = i
                break

    if end is None:
        raise ValueError(
            "Could not find matching closing bracket for JSON content."
        )

    json_str = text[start : end + 1]
    try:
        return json.loads(json_str)
    except json.JSONDecodeError as e:
        raise ValueError("Extracted content is not valid JSON.") from e

PDF_CT_HINTS = (
    "application/pdf",
    "binary/octet-stream",
)  # some servers mislabel
PDF_EXT_RE = re.compile(r"\.pdf($|\?)", re.IGNORECASE)


def _is_pdf_response(resp: requests.Response) -> bool:
    ct = resp.headers.get("Content-Type", "").lower()
    if any(hint in ct for hint in PDF_CT_HINTS):
        return True
    # Sometimes servers omit CT but set filename
    cd = resp.headers.get("Content-Disposition", "")
    if "filename" in cd and ".pdf" in cd.lower():
        return True
    # Last resort: URL extension
    return bool(PDF_EXT_RE.search(resp.url))


def _derive_filename_from_cd_or_url(
    resp: requests.Response, fallback: str
) -> str:
    cd = resp.headers.get("Content-Disposition", "")
    m = re.search(r'filename\*?=(?:UTF-8\'\')?"?([^\";]+)"?', cd, re.IGNORECASE)
    if m:
        name = m.group(1)
        # Some headers include quotes
        name = name.strip("\"'")

        # RFC 5987 may encode UTF-8 in filename*; we're treating as plain here.
        if not name.lower().endswith(".pdf"):
            name += ".pdf"
        return name

    # use URL last path segment if looks like PDF
    parsed = urlparse(resp.url)
    base = os.path.basename(parsed.path) or fallback
    if not base.lower().endswith(".pdf"):
        if PDF_EXT_RE.search(resp.url):
            base = re.sub(
                r"(\.pdf)(?:$|\?).*", r"\1", base, flags=re.IGNORECASE
            )
            if not base.lower().endswith(".pdf"):
                base += ".pdf"
        else:
            base = (
                fallback
                if fallback.lower().endswith(".pdf")
                else fallback + ".pdf"
            )
    return base


def _download_stream_to(path: str, resp: requests.Response) -> str:
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "wb") as f:
        shutil.copyfileobj(resp.raw, f)
    return path


def _get_soup(
    url: str, timeout: int = 20, headers: Optional[dict[str, str]] = None
) -> BeautifulSoup:
    r = requests.get(url, timeout=timeout, headers=headers or {})
    r.raise_for_status()
    return BeautifulSoup(r.text, "html.parser")


def _find_pdf_on_landing(soup: BeautifulSoup, base_url: str) -> Optional[str]:
    # 1) meta citation_pdf_url
    meta = soup.find("meta", attrs={"name": "citation_pdf_url"})
    if meta and meta.get("content"):
        return urljoin(base_url, meta["content"])

    # 2) obvious anchors: text contains 'PDF' or 'Download'
    for a in soup.find_all("a", href=True):
        label = (a.get_text(" ", strip=True) or "").lower()
        href = a["href"]
        if "pdf" in label or "download" in label or PDF_EXT_RE.search(href):
            return urljoin(base_url, href)

    # 3) buttons that wrap an anchor
    for btn in soup.find_all(["button", "a"], href=True):
        label = (btn.get_text(" ", strip=True) or "").lower()
        href = btn.get("href")
        if href and (
            "pdf" in label or "download" in label or PDF_EXT_RE.search(href)
        ):
            return urljoin(base_url, href)

    return None


# def _resolve_pdf_via_unpaywall(doi: str, email: str, timeout: int = 15) -> Optional[str]:
#     # Optional helper: respects publisher OA; returns None if no OA PDF
#     try:
#         url = f"https://api.unpaywall.org/v2/{doi}"
#         r = requests.get(url, params={"email": email}, timeout=timeout)
#         r.raise_for_status()
#         data = r.json()
#         loc = data.get("best_oa_location") or {}
#         pdf = loc.get("url_for_pdf") or loc.get("url")
#         if pdf and PDF_EXT_RE.search(pdf):
#             return pdf
#         # Sometimes url points to landing; try it anyway.
#         return pdf
#     except Exception:
#         return None


def resolve_pdf_from_osti_record(
    rec: dict[str, Any],
    *,
    headers: Optional[dict[str, str]] = None,
    unpaywall_email: Optional[str] = None,
    timeout: int = 25,
) -> Tuple[Optional[str], Optional[str], str]:
    """
    Returns (pdf_url, landing_used, note)
      - pdf_url: direct downloadable PDF URL if found (or a strong candidate)
      - landing_used: landing page URL we parsed (if any)
      - note: brief trace of how we found it
    """
    headers = headers or {"User-Agent": "Mozilla/5.0"}
    note_parts: list[str] = []

    links = rec.get("links", []) or []
    # doi = rec.get("doi")

    # 1) Try 'fulltext' first (OSTI purl)
    fulltext = None
    for link in links:
        if link.get("rel") == "fulltext":
            fulltext = link.get("href")
            break

    if fulltext:
        note_parts.append("Tried links[fulltext] purl")
        try:
            # Follow redirects; stream to peek headers without loading whole body
            r = requests.get(
                fulltext,
                headers=headers,
                timeout=timeout,
                allow_redirects=True,
                stream=True,
            )
            r.raise_for_status()

            if _is_pdf_response(r):
                note_parts.append("fulltext resolved directly to PDF")
                return (r.url, None, " | ".join(note_parts))

            # Not a PDF: parse page HTML for meta or obvious PDF anchors
            # (If server sent binary but CT lied, _is_pdf_response would have caught via CD or ext)
            r.close()
            soup = _get_soup(fulltext, timeout=timeout, headers=headers)
            candidate = _find_pdf_on_landing(soup, fulltext)
            if candidate:
                note_parts.append(
                    "found PDF via meta/anchor on fulltext landing"
                )
                return (candidate, fulltext, " | ".join(note_parts))
        except Exception as e:
            note_parts.append(f"fulltext failed: {e}")

    # 2) Try DOE PAGES landing (citation_doe_pages)
    doe_pages = None
    for link in links:
        if link.get("rel") == "citation_doe_pages":
            doe_pages = link.get("href")
            break

    if doe_pages:
        note_parts.append("Tried links[citation_doe_pages] landing")
        try:
            soup = _get_soup(doe_pages, timeout=timeout, headers=headers)
            candidate = _find_pdf_on_landing(soup, doe_pages)
            if candidate:
                # Candidate may itself be a landing—check if it serves PDF
                try:
                    r2 = requests.get(
                        candidate,
                        headers=headers,
                        timeout=timeout,
                        allow_redirects=True,
                        stream=True,
                    )
                    r2.raise_for_status()
                    if _is_pdf_response(r2):
                        note_parts.append("citation_doe_pages → direct PDF")
                        return (r2.url, doe_pages, " | ".join(note_parts))
                    r2.close()
                except Exception:
                    pass
                # If not clearly PDF, still return as a candidate (agent will fetch & parse)
                note_parts.append(
                    "citation_doe_pages → PDF-like candidate (not confirmed by headers)"
                )
                return (candidate, doe_pages, " | ".join(note_parts))
        except Exception as e:
            note_parts.append(f"citation_doe_pages failed: {e}")

    # # 3) Optional: DOI → Unpaywall OA
    # if doi and unpaywall_email:
    #     note_parts.append("Tried Unpaywall via DOI")
    #     pdf_from_ua = _resolve_pdf_via_unpaywall(doi, unpaywall_email)
    #     if pdf_from_ua:
    #         # May be direct PDF or landing; the caller will validate headers during download
    #         note_parts.append("Unpaywall returned candidate")
    #         return (pdf_from_ua, None, " | ".join(note_parts))

    # 4) Give up
    note_parts.append("No PDF found")
    return (None, None, " | ".join(note_parts))
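
Sketch (not from the package) of calling the resolver on an OSTI API record; the record below is a
hypothetical minimal shape containing only the `links`/`rel` fields the function inspects, and the
purl URL is a placeholder:

from ursa.util.parse import resolve_pdf_from_osti_record

record = {
    "links": [
        {"rel": "fulltext", "href": "https://www.osti.gov/servlets/purl/0000000"},
    ],
}

pdf_url, landing_used, note = resolve_pdf_from_osti_record(record, timeout=15)
print("PDF candidate:", pdf_url)
print("Landing parsed:", landing_used)
print("Trace:", note)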

def _normalize_ws(text: str) -> str:
    # Normalize unicode, collapse whitespace, and strip control chars
    text = unicodedata.normalize("NFKC", text)
    text = re.sub(r"[ \t\r\f\v]+", " ", text)
    text = re.sub(r"\s*\n\s*", "\n", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    text = text.strip()
    return text


def _dedupe_lines(text: str, min_len: int = 40) -> str:
    seen = set()
    out = []
    for line in text.splitlines():
        stripped = line.strip()
        # Ignore very short or repeated lines (menus, cookie banners, etc.)
        if len(stripped) < min_len:
            continue
        key = stripped.lower()
        if key in seen:
            continue
        seen.add(key)
        out.append(stripped)
    return "\n\n".join(out)


def extract_main_text_only(html: str, *, max_chars: int = 250_000) -> str:
    """
    Returns plain text with navigation/ads/scripts removed.
    Prefers trafilatura -> jusText -> BS4 paragraphs.
    """
    # 1) Trafilatura
    # You can tune config: with_metadata, include_comments, include_images, favor_recall, etc.
    cfg = trafilatura.settings.use_config()
    cfg.set("DEFAULT", "include_comments", "false")
    cfg.set("DEFAULT", "include_tables", "false")
    cfg.set("DEFAULT", "favor_recall", "false")  # be stricter; less noise
    try:
        # If you fetched HTML already, use extract() on string; otherwise, fetch_url(url)
        txt = trafilatura.extract(
            html,
            config=cfg,
            include_comments=False,
            include_tables=False,
            favor_recall=False,
        )
        if txt and txt.strip():
            txt = _normalize_ws(txt)
            txt = _dedupe_lines(txt)
            return txt[:max_chars]
    except Exception:
        pass

    # 2) jusText
    try:
        paragraphs = justext.justext(html, justext.get_stoplist("English"))
        body_paras = [p.text for p in paragraphs if not p.is_boilerplate]
        if body_paras:
            txt = _normalize_ws("\n\n".join(body_paras))
            txt = _dedupe_lines(txt)
            return txt[:max_chars]
    except Exception:
        pass

    # 3) last-resort: BS4 paragraphs/headings only
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, "html.parser")
    for tag in soup([
        "script",
        "style",
        "noscript",
        "header",
        "footer",
        "nav",
        "form",
        "aside",
    ]):
        tag.decompose()
    chunks = []
    for el in soup.find_all(["h1", "h2", "h3", "p", "li", "figcaption"]):
        t = el.get_text(" ", strip=True)
        if t:
            chunks.append(t)
    txt = _normalize_ws("\n\n".join(chunks))
    txt = _dedupe_lines(txt)
    return txt[:max_chars]