ursa-ai 0.9.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. ursa/__init__.py +3 -0
  2. ursa/agents/__init__.py +32 -0
  3. ursa/agents/acquisition_agents.py +812 -0
  4. ursa/agents/arxiv_agent.py +429 -0
  5. ursa/agents/base.py +728 -0
  6. ursa/agents/chat_agent.py +60 -0
  7. ursa/agents/code_review_agent.py +341 -0
  8. ursa/agents/execution_agent.py +915 -0
  9. ursa/agents/hypothesizer_agent.py +614 -0
  10. ursa/agents/lammps_agent.py +465 -0
  11. ursa/agents/mp_agent.py +204 -0
  12. ursa/agents/optimization_agent.py +410 -0
  13. ursa/agents/planning_agent.py +219 -0
  14. ursa/agents/rag_agent.py +304 -0
  15. ursa/agents/recall_agent.py +54 -0
  16. ursa/agents/websearch_agent.py +196 -0
  17. ursa/cli/__init__.py +363 -0
  18. ursa/cli/hitl.py +516 -0
  19. ursa/cli/hitl_api.py +75 -0
  20. ursa/observability/metrics_charts.py +1279 -0
  21. ursa/observability/metrics_io.py +11 -0
  22. ursa/observability/metrics_session.py +750 -0
  23. ursa/observability/pricing.json +97 -0
  24. ursa/observability/pricing.py +321 -0
  25. ursa/observability/timing.py +1466 -0
  26. ursa/prompt_library/__init__.py +0 -0
  27. ursa/prompt_library/code_review_prompts.py +51 -0
  28. ursa/prompt_library/execution_prompts.py +50 -0
  29. ursa/prompt_library/hypothesizer_prompts.py +17 -0
  30. ursa/prompt_library/literature_prompts.py +11 -0
  31. ursa/prompt_library/optimization_prompts.py +131 -0
  32. ursa/prompt_library/planning_prompts.py +79 -0
  33. ursa/prompt_library/websearch_prompts.py +131 -0
  34. ursa/tools/__init__.py +0 -0
  35. ursa/tools/feasibility_checker.py +114 -0
  36. ursa/tools/feasibility_tools.py +1075 -0
  37. ursa/tools/run_command.py +27 -0
  38. ursa/tools/write_code.py +42 -0
  39. ursa/util/__init__.py +0 -0
  40. ursa/util/diff_renderer.py +128 -0
  41. ursa/util/helperFunctions.py +142 -0
  42. ursa/util/logo_generator.py +625 -0
  43. ursa/util/memory_logger.py +183 -0
  44. ursa/util/optimization_schema.py +78 -0
  45. ursa/util/parse.py +405 -0
  46. ursa_ai-0.9.1.dist-info/METADATA +304 -0
  47. ursa_ai-0.9.1.dist-info/RECORD +51 -0
  48. ursa_ai-0.9.1.dist-info/WHEEL +5 -0
  49. ursa_ai-0.9.1.dist-info/entry_points.txt +2 -0
  50. ursa_ai-0.9.1.dist-info/licenses/LICENSE +8 -0
  51. ursa_ai-0.9.1.dist-info/top_level.txt +1 -0
ursa/util/memory_logger.py ADDED
@@ -0,0 +1,183 @@
+ import os
+ import shutil
+ from pathlib import Path
+ from typing import Any, Dict, Optional, Sequence
+
+ from langchain_chroma import Chroma
+ from langchain_core.documents import Document
+
+
+ class AgentMemory:
+     """
+     Simple wrapper around a persistent Chroma vector store for agent-conversation memory.
+
+     Parameters
+     ----------
+     path : str | Path | None
+         Where to keep the on-disk Chroma DB. If *None*, the default
+         location ``~/.cache/ursa/rag/db`` is used.
+     collection_name : str
+         Name of the Chroma collection.
+     embedding_model : <TODO> | None
+         The embedding model used to embed stored chunks.
+
+     Notes
+     -----
+     * Requires `langchain-chroma` and `chromadb`.
+     """
+
+     @classmethod
+     def get_db_path(cls, path: Optional[str | Path]) -> Path:
+         match path:
+             case None:
+                 return Path.home() / ".cache" / "ursa" / "rag" / "db"
+             case str():
+                 return Path(path)
+             case Path():
+                 return path
+             case _:
+                 raise TypeError(
+                     f"Type of path is `{type(path)}` "
+                     "but `Optional[str | Path]` was expected."
+                 )
+
+     def __init__(
+         self,
+         embedding_model,
+         path: Optional[str | Path] = None,
+         collection_name: str = "agent_memory",
+     ) -> None:
+         self.path = self.get_db_path(path)
+         self.collection_name = collection_name
+         self.path.mkdir(parents=True, exist_ok=True)
+         self.embeddings = embedding_model
+
+         # If a DB already exists, load it; otherwise defer creation until `build_index`.
+         self.vectorstore: Optional[Chroma] = None
+         if any(self.path.iterdir()):
+             self.vectorstore = Chroma(
+                 collection_name=self.collection_name,
+                 embedding_function=self.embeddings,
+                 persist_directory=str(self.path),
+             )
+
+     # --------------------------------------------------------------------- #
+     # ❶ Build & index a brand-new database                                   #
+     # --------------------------------------------------------------------- #
+     def build_index(
+         self,
+         chunks: Sequence[str],
+         metadatas: Optional[Sequence[Dict[str, Any]]] = None,
+     ) -> None:
+         """
+         Create a fresh vector store from ``chunks``. Existing data (if any)
+         are overwritten.
+
+         Parameters
+         ----------
+         chunks : Sequence[str]
+             Text snippets (already chunked) to embed.
+         metadatas : Sequence[dict] | None
+             Optional metadata dict for each chunk, same length as ``chunks``.
+         """
+         docs = [
+             Document(
+                 page_content=text, metadata=metadatas[i] if metadatas else {}
+             )
+             for i, text in enumerate(chunks)
+         ]
+
+         # Create (or overwrite) the collection
+         self.vectorstore = Chroma.from_documents(
+             documents=docs,
+             embedding=self.embeddings,
+             collection_name=self.collection_name,
+             persist_directory=str(self.path),
+         )
+
+     # --------------------------------------------------------------------- #
+     # ❷ Add new chunks and re-index                                          #
+     # --------------------------------------------------------------------- #
+     def add_memories(
+         self,
+         new_chunks: Sequence[str],
+         metadatas: Optional[Sequence[Dict[str, Any]]] = None,
+     ) -> None:
+         """
+         Append new text chunks to the existing store. If the vector store is
+         not yet initialised, it is built from ``new_chunks`` first.
+
+         Notes
+         -----
+         Only non-empty chunks are added; empty strings are skipped when
+         embedding documents.
+         """
+         if self.vectorstore is None:
+             self.build_index(new_chunks, metadatas)
+             return  # `build_index` has already embedded these chunks
+
+         docs = []
+         for i, text in enumerate(new_chunks):
+             if len(text) > 0:  # only add non-empty documents
+                 docs.append(
+                     Document(
+                         page_content=text,
+                         metadata=metadatas[i] if metadatas else {},
+                     )
+                 )
+         self.vectorstore.add_documents(docs)
+
+     # --------------------------------------------------------------------- #
+     # ❸ Retrieve relevant chunks (RAG query)                                 #
+     # --------------------------------------------------------------------- #
+     def retrieve(
+         self,
+         query: str,
+         k: int = 4,
+         with_scores: bool = False,
+         **search_kwargs,
+     ):
+         """
+         Return the *k* most similar chunks for `query`.
+
+         Parameters
+         ----------
+         query : str
+             Natural-language question or statement.
+         k : int
+             How many results to return.
+         with_scores : bool
+             If True, also return similarity scores.
+         **search_kwargs
+             Extra kwargs forwarded to Chroma's ``similarity_search*`` helpers.
+
+         Returns
+         -------
+         list[Document] | list[tuple[Document, float]]
+         """
+         if self.vectorstore is None:
+             return ["None"]
+
+         if with_scores:
+             return self.vectorstore.similarity_search_with_score(
+                 query, k=k, **search_kwargs
+             )
+         return self.vectorstore.similarity_search(query, k=k, **search_kwargs)
+
+
+ def delete_database(path: Optional[str | Path] = None):
+     """
+     Delete the on-disk Chroma database used for agent-conversation memory.
+
+     Parameters
+     ----------
+     path : str | Path | None
+         Location of the on-disk Chroma DB to delete. If *None*, the default
+         location ``~/.cache/ursa/rag/db`` is used.
+     """
+     db_path = AgentMemory.get_db_path(path)
+     if os.path.exists(db_path):
+         shutil.rmtree(db_path)
+         print(f"Database: {db_path} has been deleted.")
+     else:
+         print("No database found to delete.")
ursa/util/optimization_schema.py ADDED
@@ -0,0 +1,78 @@
+ from typing import Any, List, Literal, Optional, TypedDict
+
+
+ class DecisionVariableType(TypedDict):
+     name: str  # decision variable name
+     type: Literal[
+         "continuous",
+         "integer",
+         "logical",
+         "infinite-dimensional",
+         "finite-dimensional",
+     ]  # decision variable type
+     domain: str  # allowable values of the variable
+     description: str  # natural language description
+
+
+ class ParameterType(TypedDict):
+     name: str  # parameter name
+     value: Optional[Any]  # parameter value; None if not supplied
+     description: str  # natural language description
+     is_user_supplied: bool  # True if the user supplied the parameter
+
+
+ class ObjectiveType(TypedDict):
+     sense: Literal["minimize", "maximize"]  # objective sense
+     expression_nl: str  # sympy-representable mathematical expression
+     tags: List[
+         Literal["linear", "quadratic", "nonlinear", "convex", "nonconvex"]
+     ]  # objective type
+
+
+ class ConstraintType(TypedDict):
+     name: str  # constraint name
+     expression_nl: str  # sympy-representable mathematical expression
+     tags: List[
+         Literal[
+             "linear",
+             "integer",
+             "nonlinear",
+             "equality",
+             "inequality",
+             "infinite-dimensional",
+             "finite-dimensional",
+         ]
+     ]  # constraint type
+
+
+ class NotesType(TypedDict):
+     verifier: str  # problem verification status and explanation
+     feasibility: str  # problem feasibility status
+     user: str  # notes to the user
+     assumptions: str  # assumptions made during formulation
+
+
+ class ProblemSpec(TypedDict):
+     title: str  # name of the problem
+     description_nl: str  # natural language description
+     decision_variables: List[
+         DecisionVariableType
+     ]  # list of all decision variables
+     parameters: List[ParameterType]  # list of all parameters
+     objective: ObjectiveType  # structured objective function details
+     constraints: List[ConstraintType]  # structured constraint details
+     problem_class: Optional[str]  # optimization problem class
+     latex: Optional[str]  # LaTeX formulation of the problem
+     status: Literal["DRAFT", "VERIFIED", "ERROR"]  # problem status
+     notes: NotesType  # structured notes data
+
+
+ class SolverSpec(TypedDict):
+     solver: str  # name of the solver; replace with Literal["Gurobi", "Ipopt", ...] to restrict solvers
+     library: str  # library or relevant packages for the solver
+     algorithm: Optional[str]  # algorithm used to solve the problem
+     license: Optional[
+         str
+     ]  # license status of the solver (open-source, commercial, etc.)
+     parameters: Optional[List[dict]]  # other parameters relevant to the problem
+     notes: Optional[str]  # justification for the choice of solver
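For orientation, here is an illustrative instance of the schema above: a tiny two-variable LP expressed as a ProblemSpec plus a matching SolverSpec. All field values are invented for the example and are not produced by any ursa agent; the solver and library names are arbitrary choices.

# Illustrative sketch only; shows how the TypedDicts above fit together.
from ursa.util.optimization_schema import ProblemSpec, SolverSpec

problem: ProblemSpec = {
    "title": "Production mix",
    "description_nl": "Choose production quantities x1, x2 to maximize profit.",
    "decision_variables": [
        {
            "name": "x1",
            "type": "continuous",
            "domain": "x1 >= 0",
            "description": "units of product 1",
        },
        {
            "name": "x2",
            "type": "continuous",
            "domain": "x2 >= 0",
            "description": "units of product 2",
        },
    ],
    "parameters": [
        {
            "name": "labor_budget",
            "value": 40,
            "description": "available labor hours",
            "is_user_supplied": True,
        }
    ],
    "objective": {
        "sense": "maximize",
        "expression_nl": "3*x1 + 5*x2",
        "tags": ["linear", "convex"],
    },
    "constraints": [
        {
            "name": "labor",
            "expression_nl": "x1 + 2*x2 <= labor_budget",
            "tags": ["linear", "inequality", "finite-dimensional"],
        }
    ],
    "problem_class": "LP",
    "latex": None,
    "status": "DRAFT",
    "notes": {
        "verifier": "not yet verified",
        "feasibility": "unknown",
        "user": "",
        "assumptions": "profit coefficients assumed constant",
    },
}

solver: SolverSpec = {
    "solver": "HiGHS",
    "library": "scipy.optimize.linprog",
    "algorithm": "dual simplex",
    "license": "open-source",
    "parameters": None,
    "notes": "Small LP; any simplex-based solver suffices.",
}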
ursa/util/parse.py ADDED
@@ -0,0 +1,405 @@
+ import json
+ import os
+ import re
+ import shutil
+ import unicodedata
+ from typing import Any, Optional, Tuple
+ from urllib.parse import urljoin, urlparse
+
+ import justext
+ import requests
+ import trafilatura
+ from bs4 import BeautifulSoup
+
+
+ def extract_json(text: str) -> list[dict]:
+     """
+     Extract a JSON object or array from text that might contain markdown or other content.
+
+     The function attempts three strategies:
+     1. Extract JSON from a markdown code block labeled as JSON.
+     2. Extract JSON from any markdown code block.
+     3. Use bracket matching to extract a JSON substring starting with '{' or '['.
+
+     Returns:
+         A Python object parsed from the JSON string (dict or list).
+
+     Raises:
+         ValueError: If no valid JSON is found.
+     """
+     # Approach 1: Look for a markdown code block specifically labeled as JSON.
+     labeled_block = re.search(
+         r"```json\s*([\[{].*?[\]}])\s*```", text, re.DOTALL
+     )
+     if labeled_block:
+         json_str = labeled_block.group(1).strip()
+         try:
+             return json.loads(json_str)
+         except json.JSONDecodeError:
+             # Fall back to the next approach if parsing fails.
+             pass
+
+     # Approach 2: Look for any code block delimited by triple backticks.
+     generic_block = re.search(r"```(.*?)```", text, re.DOTALL)
+     if generic_block:
+         json_str = generic_block.group(1).strip()
+         if json_str.startswith("{") or json_str.startswith("["):
+             try:
+                 return json.loads(json_str)
+             except json.JSONDecodeError:
+                 pass
+
+     # Approach 3: Attempt to extract JSON using bracket matching.
+     # Find the first occurrence of either '{' or '['.
+     first_obj = text.find("{")
+     first_arr = text.find("[")
+     if first_obj == -1 and first_arr == -1:
+         raise ValueError("No JSON object or array found in the text.")
+
+     # Determine which bracket comes first.
+     if first_obj == -1:
+         start = first_arr
+         open_bracket = "["
+         close_bracket = "]"
+     elif first_arr == -1:
+         start = first_obj
+         open_bracket = "{"
+         close_bracket = "}"
+     else:
+         if first_obj < first_arr:
+             start = first_obj
+             open_bracket = "{"
+             close_bracket = "}"
+         else:
+             start = first_arr
+             open_bracket = "["
+             close_bracket = "]"
+
+     # Bracket matching: find the matching closing bracket.
+     depth = 0
+     end = None
+     for i in range(start, len(text)):
+         if text[i] == open_bracket:
+             depth += 1
+         elif text[i] == close_bracket:
+             depth -= 1
+             if depth == 0:
+                 end = i
+                 break
+
+     if end is None:
+         raise ValueError(
+             "Could not find matching closing bracket for JSON content."
+         )
+
+     json_str = text[start : end + 1]
+     try:
+         return json.loads(json_str)
+     except json.JSONDecodeError as e:
+         raise ValueError("Extracted content is not valid JSON.") from e
+
+
+ PDF_CT_HINTS = (
+     "application/pdf",
+     "binary/octet-stream",
+ )  # some servers mislabel PDFs
+ PDF_EXT_RE = re.compile(r"\.pdf($|\?)", re.IGNORECASE)
+
+
+ def _is_pdf_response(resp: requests.Response) -> bool:
+     ct = resp.headers.get("Content-Type", "").lower()
+     if any(hint in ct for hint in PDF_CT_HINTS):
+         return True
+     # Sometimes servers omit the Content-Type but set a filename
+     cd = resp.headers.get("Content-Disposition", "")
+     if "filename" in cd and ".pdf" in cd.lower():
+         return True
+     # Last resort: URL extension
+     return bool(PDF_EXT_RE.search(resp.url))
+
+
+ def _derive_filename_from_cd_or_url(
+     resp: requests.Response, fallback: str
+ ) -> str:
+     cd = resp.headers.get("Content-Disposition", "")
+     m = re.search(r'filename\*?=(?:UTF-8\'\')?"?([^\";]+)"?', cd, re.IGNORECASE)
+     if m:
+         name = m.group(1)
+         # Some headers include quotes
+         name = name.strip("\"'")
+
+         # RFC 5987 may encode UTF-8 in filename*; we're treating it as plain here.
+         if not name.lower().endswith(".pdf"):
+             name += ".pdf"
+         return name
+
+     # Use the URL's last path segment if it looks like a PDF
+     parsed = urlparse(resp.url)
+     base = os.path.basename(parsed.path) or fallback
+     if not base.lower().endswith(".pdf"):
+         if PDF_EXT_RE.search(resp.url):
+             base = re.sub(
+                 r"(\.pdf)(?:$|\?).*", r"\1", base, flags=re.IGNORECASE
+             )
+             if not base.lower().endswith(".pdf"):
+                 base += ".pdf"
+         else:
+             base = (
+                 fallback
+                 if fallback.lower().endswith(".pdf")
+                 else fallback + ".pdf"
+             )
+     return base
+
+
+ def _download_stream_to(path: str, resp: requests.Response) -> str:
+     os.makedirs(os.path.dirname(path), exist_ok=True)
+     with open(path, "wb") as f:
+         shutil.copyfileobj(resp.raw, f)
+     return path
+
+
+ def _get_soup(
+     url: str, timeout: int = 20, headers: Optional[dict[str, str]] = None
+ ) -> BeautifulSoup:
+     r = requests.get(url, timeout=timeout, headers=headers or {})
+     r.raise_for_status()
+     return BeautifulSoup(r.text, "html.parser")
+
+
+ def _find_pdf_on_landing(soup: BeautifulSoup, base_url: str) -> Optional[str]:
+     # 1) meta citation_pdf_url
+     meta = soup.find("meta", attrs={"name": "citation_pdf_url"})
+     if meta and meta.get("content"):
+         return urljoin(base_url, meta["content"])
+
+     # 2) obvious anchors: text contains 'PDF' or 'Download'
+     for a in soup.find_all("a", href=True):
+         label = (a.get_text(" ", strip=True) or "").lower()
+         href = a["href"]
+         if "pdf" in label or "download" in label or PDF_EXT_RE.search(href):
+             return urljoin(base_url, href)
+
+     # 3) buttons that wrap an anchor
+     for btn in soup.find_all(["button", "a"], href=True):
+         label = (btn.get_text(" ", strip=True) or "").lower()
+         href = btn.get("href")
+         if href and (
+             "pdf" in label or "download" in label or PDF_EXT_RE.search(href)
+         ):
+             return urljoin(base_url, href)
+
+     return None
+
+
+ # def _resolve_pdf_via_unpaywall(doi: str, email: str, timeout: int = 15) -> Optional[str]:
+ #     # Optional helper: respects publisher OA; returns None if no OA PDF
+ #     try:
+ #         url = f"https://api.unpaywall.org/v2/{doi}"
+ #         r = requests.get(url, params={"email": email}, timeout=timeout)
+ #         r.raise_for_status()
+ #         data = r.json()
+ #         loc = data.get("best_oa_location") or {}
+ #         pdf = loc.get("url_for_pdf") or loc.get("url")
+ #         if pdf and PDF_EXT_RE.search(pdf):
+ #             return pdf
+ #         # Sometimes the url points to a landing page; try it anyway.
+ #         return pdf
+ #     except Exception:
+ #         return None
+
+
+ def resolve_pdf_from_osti_record(
+     rec: dict[str, Any],
+     *,
+     headers: Optional[dict[str, str]] = None,
+     unpaywall_email: Optional[str] = None,
+     timeout: int = 25,
+ ) -> Tuple[Optional[str], Optional[str], str]:
+     """
+     Returns (pdf_url, landing_used, note)
+       - pdf_url: direct downloadable PDF URL if found (or a strong candidate)
+       - landing_used: landing page URL we parsed (if any)
+       - note: brief trace of how we found it
+     """
+     headers = headers or {"User-Agent": "Mozilla/5.0"}
+     note_parts: list[str] = []
+
+     links = rec.get("links", []) or []
+     # doi = rec.get("doi")
+
+     # 1) Try 'fulltext' first (OSTI purl)
+     fulltext = None
+     for link in links:
+         if link.get("rel") == "fulltext":
+             fulltext = link.get("href")
+             break
+
+     if fulltext:
+         note_parts.append("Tried links[fulltext] purl")
+         try:
+             # Follow redirects; stream to peek at headers without loading the whole body
+             r = requests.get(
+                 fulltext,
+                 headers=headers,
+                 timeout=timeout,
+                 allow_redirects=True,
+                 stream=True,
+             )
+             r.raise_for_status()
+
+             if _is_pdf_response(r):
+                 note_parts.append("fulltext resolved directly to PDF")
+                 return (r.url, None, " | ".join(note_parts))
+
+             # Not a PDF: parse the page HTML for meta tags or obvious PDF anchors
+             # (If the server sent binary but the Content-Type lied, _is_pdf_response would have caught it via CD or extension)
+             r.close()
+             soup = _get_soup(fulltext, timeout=timeout, headers=headers)
+             candidate = _find_pdf_on_landing(soup, fulltext)
+             if candidate:
+                 note_parts.append(
+                     "found PDF via meta/anchor on fulltext landing"
+                 )
+                 return (candidate, fulltext, " | ".join(note_parts))
+         except Exception as e:
+             note_parts.append(f"fulltext failed: {e}")
+
+     # 2) Try the DOE PAGES landing page (citation_doe_pages)
+     doe_pages = None
+     for link in links:
+         if link.get("rel") == "citation_doe_pages":
+             doe_pages = link.get("href")
+             break
+
+     if doe_pages:
+         note_parts.append("Tried links[citation_doe_pages] landing")
+         try:
+             soup = _get_soup(doe_pages, timeout=timeout, headers=headers)
+             candidate = _find_pdf_on_landing(soup, doe_pages)
+             if candidate:
+                 # The candidate may itself be a landing page—check whether it serves a PDF
+                 try:
+                     r2 = requests.get(
+                         candidate,
+                         headers=headers,
+                         timeout=timeout,
+                         allow_redirects=True,
+                         stream=True,
+                     )
+                     r2.raise_for_status()
+                     if _is_pdf_response(r2):
+                         note_parts.append("citation_doe_pages → direct PDF")
+                         return (r2.url, doe_pages, " | ".join(note_parts))
+                     r2.close()
+                 except Exception:
+                     pass
+                 # If not clearly a PDF, still return it as a candidate (the agent will fetch & parse)
+                 note_parts.append(
+                     "citation_doe_pages → PDF-like candidate (not confirmed by headers)"
+                 )
+                 return (candidate, doe_pages, " | ".join(note_parts))
+         except Exception as e:
+             note_parts.append(f"citation_doe_pages failed: {e}")
+
+     # # 3) Optional: DOI → Unpaywall OA
+     # if doi and unpaywall_email:
+     #     note_parts.append("Tried Unpaywall via DOI")
+     #     pdf_from_ua = _resolve_pdf_via_unpaywall(doi, unpaywall_email)
+     #     if pdf_from_ua:
+     #         # May be a direct PDF or a landing page; the caller will validate headers during download
+     #         note_parts.append("Unpaywall returned candidate")
+     #         return (pdf_from_ua, None, " | ".join(note_parts))
+
+     # 4) Give up
+     note_parts.append("No PDF found")
+     return (None, None, " | ".join(note_parts))
+
+
+ def _normalize_ws(text: str) -> str:
+     # Normalize unicode, collapse whitespace, and strip control chars
+     text = unicodedata.normalize("NFKC", text)
+     text = re.sub(r"[ \t\r\f\v]+", " ", text)
+     text = re.sub(r"\s*\n\s*", "\n", text)
+     text = re.sub(r"\n{3,}", "\n\n", text)
+     text = text.strip()
+     return text
+
+
+ def _dedupe_lines(text: str, min_len: int = 40) -> str:
+     seen = set()
+     out = []
+     for line in text.splitlines():
+         stripped = line.strip()
+         # Ignore very short or repeated lines (menus, cookie banners, etc.)
+         if len(stripped) < min_len:
+             continue
+         key = stripped.lower()
+         if key in seen:
+             continue
+         seen.add(key)
+         out.append(stripped)
+     return "\n\n".join(out)
+
+
+ def extract_main_text_only(html: str, *, max_chars: int = 250_000) -> str:
+     """
+     Returns plain text with navigation/ads/scripts removed.
+     Prefers trafilatura -> jusText -> BS4 paragraphs.
+     """
+     # 1) Trafilatura
+     # You can tune the config: with_metadata, include_comments, include_images, favor_recall, etc.
+     cfg = trafilatura.settings.use_config()
+     cfg.set("DEFAULT", "include_comments", "false")
+     cfg.set("DEFAULT", "include_tables", "false")
+     cfg.set("DEFAULT", "favor_recall", "false")  # be stricter; less noise
+     try:
+         # If you fetched the HTML already, use extract() on the string; otherwise, fetch_url(url)
+         txt = trafilatura.extract(
+             html,
+             config=cfg,
+             include_comments=False,
+             include_tables=False,
+             favor_recall=False,
+         )
+         if txt and txt.strip():
+             txt = _normalize_ws(txt)
+             txt = _dedupe_lines(txt)
+             return txt[:max_chars]
+     except Exception:
+         pass
+
+     # 2) jusText
+     try:
+         paragraphs = justext.justext(html, justext.get_stoplist("English"))
+         body_paras = [p.text for p in paragraphs if not p.is_boilerplate]
+         if body_paras:
+             txt = _normalize_ws("\n\n".join(body_paras))
+             txt = _dedupe_lines(txt)
+             return txt[:max_chars]
+     except Exception:
+         pass
+
+     # 3) Last resort: BS4 paragraphs/headings only
+     from bs4 import BeautifulSoup
+
+     soup = BeautifulSoup(html, "html.parser")
+     for tag in soup([
+         "script",
+         "style",
+         "noscript",
+         "header",
+         "footer",
+         "nav",
+         "form",
+         "aside",
+     ]):
+         tag.decompose()
+     chunks = []
+     for el in soup.find_all(["h1", "h2", "h3", "p", "li", "figcaption"]):
+         t = el.get_text(" ", strip=True)
+         if t:
+             chunks.append(t)
+     txt = _normalize_ws("\n\n".join(chunks))
+     txt = _dedupe_lines(txt)
+     return txt[:max_chars]
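To close, a small usage sketch of two of the helpers above. It is illustrative only: the example.com URL is a placeholder, the fetch requires network access, and the sample LLM reply is invented for the example.

# Illustrative sketch only; exercises extract_json and extract_main_text_only.
import requests

from ursa.util.parse import extract_json, extract_main_text_only

# extract_json tolerates JSON wrapped in markdown fences or surrounding prose.
reply = """Here is the plan:
```json
[{"step": 1, "action": "search arxiv"}, {"step": 2, "action": "summarize"}]
```"""
print(extract_json(reply))  # -> a list of two dicts

# extract_main_text_only strips navigation and boilerplate from fetched HTML.
html = requests.get("https://example.com", timeout=20).text
print(extract_main_text_only(html)[:200])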