cite_agent-1.0.3-py3-none-any.whl → cite_agent-1.0.5-py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
Potentially problematic release.
This version of cite-agent might be problematic.
- cite_agent/__init__.py +1 -1
- cite_agent/agent_backend_only.py +30 -4
- cite_agent/cli.py +24 -26
- cite_agent/cli_conversational.py +294 -0
- cite_agent/enhanced_ai_agent.py +2776 -118
- cite_agent/streaming_ui.py +252 -0
- {cite_agent-1.0.3.dist-info → cite_agent-1.0.5.dist-info}/METADATA +4 -3
- cite_agent-1.0.5.dist-info/RECORD +50 -0
- {cite_agent-1.0.3.dist-info → cite_agent-1.0.5.dist-info}/top_level.txt +1 -0
- src/__init__.py +1 -0
- src/services/__init__.py +132 -0
- src/services/auth_service/__init__.py +3 -0
- src/services/auth_service/auth_manager.py +33 -0
- src/services/graph/__init__.py +1 -0
- src/services/graph/knowledge_graph.py +194 -0
- src/services/llm_service/__init__.py +5 -0
- src/services/llm_service/llm_manager.py +495 -0
- src/services/paper_service/__init__.py +5 -0
- src/services/paper_service/openalex.py +231 -0
- src/services/performance_service/__init__.py +1 -0
- src/services/performance_service/rust_performance.py +395 -0
- src/services/research_service/__init__.py +23 -0
- src/services/research_service/chatbot.py +2056 -0
- src/services/research_service/citation_manager.py +436 -0
- src/services/research_service/context_manager.py +1441 -0
- src/services/research_service/conversation_manager.py +597 -0
- src/services/research_service/critical_paper_detector.py +577 -0
- src/services/research_service/enhanced_research.py +121 -0
- src/services/research_service/enhanced_synthesizer.py +375 -0
- src/services/research_service/query_generator.py +777 -0
- src/services/research_service/synthesizer.py +1273 -0
- src/services/search_service/__init__.py +5 -0
- src/services/search_service/indexer.py +186 -0
- src/services/search_service/search_engine.py +342 -0
- src/services/simple_enhanced_main.py +287 -0
- cite_agent/__distribution__.py +0 -7
- cite_agent-1.0.3.dist-info/RECORD +0 -23
- {cite_agent-1.0.3.dist-info → cite_agent-1.0.5.dist-info}/WHEEL +0 -0
- {cite_agent-1.0.3.dist-info → cite_agent-1.0.5.dist-info}/entry_points.txt +0 -0
- {cite_agent-1.0.3.dist-info → cite_agent-1.0.5.dist-info}/licenses/LICENSE +0 -0
src/services/paper_service/openalex.py
@@ -0,0 +1,231 @@
+"""Asynchronous client for the OpenAlex scholarly metadata API.
+
+The implementation focuses on resilience. It maintains a small in-memory cache
+and provides graceful fallbacks when the upstream service is unavailable so that
+advanced features inside `SophisticatedResearchEngine` can continue operating
+in restricted CI environments.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+import os
+import re
+from datetime import datetime
+from typing import Any, Dict, Iterable, List, Optional, Tuple
+
+import httpx
+
+logger = logging.getLogger(__name__)
+
+_OPENALEX_HOST = os.getenv("OPENALEX_BASE_URL", "https://api.openalex.org")
+_DEFAULT_MAILTO = os.getenv("OPENALEX_MAILTO", "research@nocturnal.dev")
+_CACHE_TTL_SECONDS = int(os.getenv("OPENALEX_CACHE_TTL", "1800"))
+_DEFAULT_TIMEOUT = float(os.getenv("OPENALEX_TIMEOUT", "12.0"))
+
+
+class OpenAlexClient:
+    """Thin asynchronous wrapper around OpenAlex endpoints with caching."""
+
+    def __init__(
+        self,
+        *,
+        api_key: Optional[str] = None,
+        mailto: Optional[str] = None,
+        timeout: float = _DEFAULT_TIMEOUT,
+        cache_ttl: int = _CACHE_TTL_SECONDS,
+    ) -> None:
+        self.api_key = api_key or os.getenv("OPENALEX_API_KEY")
+        self.mailto = mailto or _DEFAULT_MAILTO
+        self.timeout = timeout
+        self.cache_ttl = cache_ttl
+        self._cache: Dict[str, Tuple[float, Any]] = {}
+        self._cache_lock = asyncio.Lock()
+        self._session_lock = asyncio.Lock()
+        self._session: Optional[httpx.AsyncClient] = None
+
+    # ------------------------------------------------------------------
+    async def get_paper_by_id(self, paper_id: str) -> Optional[Dict[str, Any]]:
+        """Retrieve a single paper by OpenAlex ID or DOI.
+
+        Returns `None` if the paper cannot be retrieved. When the OpenAlex API
+        is unreachable a best-effort synthetic document is returned so downstream
+        synthesis can continue in offline environments.
+        """
+
+        normalized_id = self._normalise_id(paper_id)
+        cache_key = f"work:{normalized_id}"
+        cached = await self._read_cache(cache_key)
+        if cached is not None:
+            return cached
+
+        params = {"mailto": self.mailto}
+        if self.api_key:
+            params["api_key"] = self.api_key
+
+        url = f"{_OPENALEX_HOST}/works/{normalized_id}"
+        try:
+            session = await self._get_session()
+            response = await session.get(url, params=params)
+            response.raise_for_status()
+            data = response.json()
+            await self._write_cache(cache_key, data)
+            return data
+        except Exception as exc:
+            logger.warning("OpenAlex work lookup failed", extra={"paper_id": paper_id, "error": str(exc)})
+            fallback = self._fallback_document(normalized_id)
+            await self._write_cache(cache_key, fallback)
+            return fallback
+
+    async def get_papers_bulk(self, paper_ids: Iterable[str]) -> List[Dict[str, Any]]:
+        """Retrieve multiple papers concurrently with caching."""
+
+        tasks = [self.get_paper_by_id(pid) for pid in paper_ids]
+        results = await asyncio.gather(*tasks, return_exceptions=True)
+        papers = []
+        for result in results:
+            if isinstance(result, dict) and result:
+                papers.append(result)
+        return papers
+
+    async def search_works(
+        self,
+        query: str,
+        *,
+        limit: int = 10,
+        filters: Optional[Dict[str, str]] = None,
+        sort: str = "relevance_score:desc",
+    ) -> Dict[str, Any]:
+        """Execute a search against OpenAlex works endpoint."""
+
+        limit = max(1, min(limit, 200))
+        params: Dict[str, Any] = {
+            "search": query,
+            "per-page": limit,
+            "page": 1,
+            "sort": sort,
+            "mailto": self.mailto,
+        }
+        if filters:
+            params["filter"] = ",".join(f"{k}:{v}" for k, v in filters.items())
+        if self.api_key:
+            params["api_key"] = self.api_key
+
+        cache_key = self._make_cache_key("search", params)
+        cached = await self._read_cache(cache_key)
+        if cached is not None:
+            return cached
+
+        url = f"{_OPENALEX_HOST}/works"
+        try:
+            session = await self._get_session()
+            response = await session.get(url, params=params)
+            response.raise_for_status()
+            payload = response.json()
+            await self._write_cache(cache_key, payload)
+            return payload
+        except Exception as exc:
+            logger.warning("OpenAlex search failed", extra={"query": query, "error": str(exc)})
+            # Provide deterministic empty payload to callers
+            empty = {"results": [], "meta": {"count": 0, "page": 1}}
+            await self._write_cache(cache_key, empty)
+            return empty
+
+    async def get_related_works(self, paper_id: str, limit: int = 5) -> List[Dict[str, Any]]:
+        """Fetch related works leveraging OpenAlex's recommendation endpoint."""
+
+        normalized_id = self._normalise_id(paper_id)
+        params = {"per-page": max(1, min(limit, 50)), "mailto": self.mailto}
+        if self.api_key:
+            params["api_key"] = self.api_key
+
+        url = f"{_OPENALEX_HOST}/works/{normalized_id}/related"
+        cache_key = self._make_cache_key("related", normalized_id, params)
+        cached = await self._read_cache(cache_key)
+        if cached is not None:
+            return cached
+
+        try:
+            session = await self._get_session()
+            response = await session.get(url, params=params)
+            response.raise_for_status()
+            data = response.json().get("results", [])
+            await self._write_cache(cache_key, data)
+            return data
+        except Exception as exc:
+            logger.info("OpenAlex related works unavailable", extra={"paper": paper_id, "error": str(exc)})
+            await self._write_cache(cache_key, [])
+            return []
+
+    async def close(self) -> None:
+        async with self._session_lock:
+            if self._session is not None:
+                try:
+                    await self._session.aclose()
+                finally:
+                    self._session = None
+
+    # ------------------------------------------------------------------
+    async def _get_session(self) -> httpx.AsyncClient:
+        async with self._session_lock:
+            if self._session is None:
+                headers = {
+                    "User-Agent": "Nocturnal-Archive/advanced-research (contact@nocturnal.dev)",
+                    "Accept": "application/json",
+                }
+                self._session = httpx.AsyncClient(timeout=self.timeout, headers=headers)
+            return self._session
+
+    def _normalise_id(self, paper_id: str) -> str:
+        if paper_id.startswith("http"):
+            return paper_id.rstrip("/").split("/")[-1]
+        if paper_id.startswith("doi:"):
+            return paper_id
+        if "/" in paper_id and not paper_id.startswith("W"):
+            return f"doi:{paper_id}"
+        return paper_id
+
+    def _make_cache_key(self, namespace: str, *parts: Any) -> str:
+        raw = "|".join(str(part) for part in parts)
+        return f"{namespace}:{hash(raw) & 0xFFFFFFFFFFFF:x}"
+
+    async def _read_cache(self, key: str) -> Optional[Any]:
+        async with self._cache_lock:
+            entry = self._cache.get(key)
+            if not entry:
+                return None
+            expires_at, value = entry
+            if datetime.utcnow().timestamp() > expires_at:
+                self._cache.pop(key, None)
+                return None
+            return value
+
+    async def _write_cache(self, key: str, value: Any) -> None:
+        async with self._cache_lock:
+            self._cache[key] = (datetime.utcnow().timestamp() + self.cache_ttl, value)
+
+    def _fallback_document(self, paper_id: str) -> Dict[str, Any]:
+        """Generate a deterministic placeholder when OpenAlex is unreachable."""
+
+        safe_id = re.sub(r"[^A-Za-z0-9]", "", paper_id) or "paper"
+        title = f"Placeholder synthesis for {safe_id}"
+        abstract = (
+            "OpenAlex was unavailable during retrieval. This placeholder combines "
+            "the paper identifier with contextual heuristics so downstream "
+            "components can continue operating."
+        )
+        return {
+            "id": paper_id,
+            "title": title,
+            "abstract": abstract,
+            "concepts": [],
+            "authorships": [],
+            "publication_year": datetime.utcnow().year,
+            "cited_by_count": 0,
+            "doi": paper_id if paper_id.startswith("doi:") else "",
+            "fallback": True,
+        }
+
+
+__all__ = ["OpenAlexClient"]
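For orientation, here is a minimal usage sketch of the client added above. It is not part of the diff; the import path is inferred from the new `src/` layout and the DOI is a hypothetical placeholder.

    import asyncio

    from src.services.paper_service.openalex import OpenAlexClient

    async def main() -> None:
        client = OpenAlexClient()  # configuration falls back to OPENALEX_* environment variables
        try:
            # Search; on network failure the client caches and returns an empty payload
            found = await client.search_works("research synthesis", limit=5)
            for work in found.get("results", []):
                print(work.get("title"))
            # Lookup by DOI; offline this yields a synthetic document with "fallback": True
            paper = await client.get_paper_by_id("doi:10.1234/example")  # hypothetical DOI
            print(paper.get("title") if paper else "not found")
        finally:
            await client.close()

    asyncio.run(main())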
src/services/performance_service/__init__.py
@@ -0,0 +1 @@
+"""Performance service package for high-performance operations"""
src/services/performance_service/rust_performance.py
@@ -0,0 +1,395 @@
+"""
+High-performance Rust-powered services for web scraping and text processing.
+This module provides Python bindings to the Rust performance library.
+"""
+
+import asyncio
+import logging
+from typing import List, Dict, Optional, Any
+from dataclasses import dataclass
+from datetime import datetime, timezone
+import json
+
+logger = logging.getLogger(__name__)
+
+
+def _utc_now() -> datetime:
+    return datetime.now(timezone.utc)
+
+try:
+    import nocturnal_performance as rust_perf  # type: ignore[import]
+    RUST_AVAILABLE = True
+    logger.info("Rust performance module loaded successfully")
+except ImportError:
+    RUST_AVAILABLE = False
+    logger.warning("Rust performance module not available, falling back to Python implementations")
+
+@dataclass
+class ScrapedContent:
+    """Represents scraped content from a URL."""
+    url: str
+    title: str
+    content: str
+    metadata: Dict[str, str]
+    timestamp: datetime
+
+@dataclass
+class ProcessedText:
+    """Represents processed text with various analyses."""
+    original: str
+    cleaned: str
+    chunks: List[str]
+    keywords: List[str]
+    summary: str
+
+class HighPerformanceService:
+    """High-performance service using Rust backend when available."""
+
+    def __init__(self, max_concurrent: int = 10):
+        self.max_concurrent = max_concurrent
+        self.rust_scraper = None
+
+        if RUST_AVAILABLE:
+            try:
+                self.rust_scraper = rust_perf.HighPerformanceScraper(max_concurrent)
+                logger.info("Initialized Rust performance scraper")
+            except Exception as e:
+                logger.error(f"Failed to initialize Rust scraper: {e}")
+                self.rust_scraper = None
+
+    async def scrape_urls(self, urls: List[str]) -> List[ScrapedContent]:
+        """Scrape multiple URLs concurrently with high performance."""
+        if not urls:
+            return []
+
+        if self.rust_scraper and RUST_AVAILABLE:
+            try:
+                # Use Rust implementation
+                rust_results = await self.rust_scraper.scrape_urls(urls)
+
+                # Convert Rust results to Python objects
+                results = []
+                for result in rust_results:
+                    scraped = ScrapedContent(
+                        url=result["url"],
+                        title=result["title"],
+                        content=result["content"],
+                        metadata=result["metadata"],
+                        timestamp=datetime.fromisoformat(result["timestamp"].replace("Z", "+00:00"))
+                    )
+                    results.append(scraped)
+
+                logger.info(f"Scraped {len(results)} URLs using Rust backend")
+                return results
+
+            except Exception as e:
+                logger.error(f"Rust scraping failed, falling back to Python: {e}")
+
+        # Fallback to Python implementation
+        return await self._scrape_urls_python(urls)
+
+    async def process_text_batch(self, texts: List[str]) -> List[ProcessedText]:
+        """Process multiple texts concurrently with high performance."""
+        if not texts:
+            return []
+
+        if self.rust_scraper and RUST_AVAILABLE:
+            try:
+                # Use Rust implementation
+                rust_results = await self.rust_scraper.process_text_batch(texts)
+
+                # Convert Rust results to Python objects
+                results = []
+                for result in rust_results:
+                    processed = ProcessedText(
+                        original=result["original"],
+                        cleaned=result["cleaned"],
+                        chunks=result["chunks"],
+                        keywords=result["keywords"],
+                        summary=result["summary"]
+                    )
+                    results.append(processed)
+
+                logger.info(f"Processed {len(results)} texts using Rust backend")
+                return results
+
+            except Exception as e:
+                logger.error(f"Rust text processing failed, falling back to Python: {e}")
+
+        # Fallback to Python implementation
+        return await self._process_text_batch_python(texts)
+
+    async def extract_keywords(self, text: str, max_keywords: int = 10) -> List[str]:
+        """Extract keywords from text using high-performance algorithms."""
+        if self.rust_scraper and RUST_AVAILABLE:
+            try:
+                return await self.rust_scraper.extract_keywords(text, max_keywords)
+            except Exception as e:
+                logger.error(f"Rust keyword extraction failed, falling back to Python: {e}")
+
+        # Fallback to Python implementation
+        return await self._extract_keywords_python(text, max_keywords)
+
+    async def chunk_text(self, text: str, chunk_size: int = 1000, overlap: int = 200) -> List[str]:
+        """Chunk text into smaller pieces with overlap."""
+        if self.rust_scraper and RUST_AVAILABLE:
+            try:
+                return await self.rust_scraper.chunk_text(text, chunk_size, overlap)
+            except Exception as e:
+                logger.error(f"Rust text chunking failed, falling back to Python: {e}")
+
+        # Fallback to Python implementation
+        return await self._chunk_text_python(text, chunk_size, overlap)
+
+    def fast_text_clean(self, text: str) -> str:
+        """Fast text cleaning using Rust implementation."""
+        if RUST_AVAILABLE:
+            try:
+                return rust_perf.fast_text_clean(text)
+            except Exception as e:
+                logger.error(f"Rust text cleaning failed, falling back to Python: {e}")
+
+        # Fallback to Python implementation
+        return self._clean_text_python(text)
+
+    def fast_url_validation(self, url: str) -> bool:
+        """Fast URL validation using Rust implementation."""
+        if RUST_AVAILABLE:
+            try:
+                return rust_perf.fast_url_validation(url)
+            except Exception as e:
+                logger.error(f"Rust URL validation failed, falling back to Python: {e}")
+
+        # Fallback to Python implementation
+        return self._validate_url_python(url)
+
+    def fast_text_similarity(self, text1: str, text2: str) -> float:
+        """Fast text similarity calculation using Rust implementation."""
+        if RUST_AVAILABLE:
+            try:
+                return rust_perf.fast_text_similarity(text1, text2)
+            except Exception as e:
+                logger.error(f"Rust text similarity failed, falling back to Python: {e}")
+
+        # Fallback to Python implementation
+        return self._calculate_similarity_python(text1, text2)
+
+    # Python fallback implementations
+    async def _scrape_urls_python(self, urls: List[str]) -> List[ScrapedContent]:
+        """Python fallback implementation for URL scraping."""
+        import aiohttp
+        from bs4 import BeautifulSoup
+        import re
+
+        async def scrape_single_url(session: aiohttp.ClientSession, url: str) -> Optional[ScrapedContent]:
+            try:
+                async with session.get(url, timeout=30) as response:
+                    if response.status != 200:
+                        return None
+
+                    html = await response.text()
+                    soup = BeautifulSoup(html, 'html.parser')
+
+                    # Extract title
+                    title = soup.find('title')
+                    title_text = title.get_text() if title else "No title"
+
+                    # Extract content
+                    content_selectors = ['article', 'main', '[role="main"]', '.content', '.main-content', 'body']
+                    content = ""
+                    for selector in content_selectors:
+                        element = soup.select_one(selector)
+                        if element:
+                            content = element.get_text(separator=' ', strip=True)
+                            break
+
+                    # Extract metadata
+                    metadata = {}
+                    for meta in soup.find_all('meta'):
+                        name = meta.get('name') or meta.get('property')
+                        content_attr = meta.get('content')
+                        if name and content_attr:
+                            metadata[name] = content_attr
+
+                    return ScrapedContent(
+                        url=url,
+                        title=title_text,
+                        content=content,
+                        metadata=metadata,
+                        timestamp=_utc_now()
+                    )
+
+            except Exception as e:
+                logger.error(f"Error scraping {url}: {e}")
+                return None
+
+        # Scrape URLs concurrently
+        connector = aiohttp.TCPConnector(limit=self.max_concurrent)
+        async with aiohttp.ClientSession(connector=connector) as session:
+            tasks = [scrape_single_url(session, url) for url in urls]
+            results = await asyncio.gather(*tasks, return_exceptions=True)
+
+        # Filter out None results and exceptions
+        scraped_content = []
+        for result in results:
+            if isinstance(result, ScrapedContent):
+                scraped_content.append(result)
+
+        return scraped_content
+
+    async def _process_text_batch_python(self, texts: List[str]) -> List[ProcessedText]:
+        """Python fallback implementation for text processing."""
+        tasks = [self._process_single_text_python(text) for text in texts]
+        return await asyncio.gather(*tasks)
+
+    async def _process_single_text_python(self, text: str) -> ProcessedText:
+        """Process a single text using Python implementation."""
+        cleaned = self._clean_text_python(text)
+        chunks = await self._chunk_text_python(cleaned, 1000, 200)
+        keywords = await self._extract_keywords_python(cleaned, 10)
+        summary = await self._generate_summary_python(cleaned)
+
+        return ProcessedText(
+            original=text,
+            cleaned=cleaned,
+            chunks=chunks,
+            keywords=keywords,
+            summary=summary
+        )
+
+    def _clean_text_python(self, text: str) -> str:
+        """Python implementation of text cleaning."""
+        import re
+
+        # Remove extra whitespace
+        text = re.sub(r'\s+', ' ', text)
+
+        # Remove special characters but keep basic punctuation
+        text = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)\[\]\{\}]', '', text)
+
+        # Normalize quotes and dashes
+        text = text.replace('“', '"').replace('”', '"')
+        text = text.replace('‘', "'").replace('’', "'")
+        text = text.replace('–', '-').replace('—', '-')
+
+        return text.strip()
+
+    async def _chunk_text_python(self, text: str, chunk_size: int, overlap: int) -> List[str]:
+        """Python implementation of text chunking."""
+        import re
+
+        # Split into sentences
+        sentences = re.split(r'[.!?]+', text)
+        sentences = [s.strip() for s in sentences if s.strip()]
+
+        chunks = []
+        current_chunk = ""
+        current_size = 0
+
+        for sentence in sentences:
+            sentence_size = len(sentence)
+
+            if current_size + sentence_size > chunk_size and current_chunk:
+                chunks.append(current_chunk.strip())
+
+                # Start new chunk with overlap
+                if overlap > 0:
+                    words = current_chunk.split()
+                    overlap_words = min(overlap // 10, len(words))
+                    if overlap_words > 0:
+                        current_chunk = " ".join(words[-overlap_words:]) + " "
+                        current_size = len(current_chunk)
+                    else:
+                        current_chunk = ""
+                        current_size = 0
+                else:
+                    current_chunk = ""
+                    current_size = 0
+
+            current_chunk += sentence + ". "
+            current_size += sentence_size + 2
+
+        if current_chunk.strip():
+            chunks.append(current_chunk.strip())
+
+        return chunks
+
+    async def _extract_keywords_python(self, text: str, max_keywords: int) -> List[str]:
+        """Python implementation of keyword extraction."""
+        import re
+        from collections import Counter
+
+        # Stop words
+        stop_words = {
+            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with',
+            'by', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
+            'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'must',
+            'can', 'this', 'that', 'these', 'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they'
+        }
+
+        # Extract words
+        words = re.findall(r'\b[a-zA-Z]+\b', text.lower())
+        words = [word for word in words if len(word) > 2 and word not in stop_words]
+
+        # Count frequencies
+        word_freq = Counter(words)
+
+        # Return top keywords
+        return [word for word, _ in word_freq.most_common(max_keywords)]
+
+    async def _generate_summary_python(self, text: str) -> str:
+        """Python implementation of text summarization."""
+        import re
+        from collections import Counter
+
+        # Split into sentences
+        sentences = re.split(r'[.!?]+', text)
+        sentences = [s.strip() for s in sentences if s.strip()]
+
+        if len(sentences) <= 3:
+            return text
+
+        # Calculate word frequencies
+        words = re.findall(r'\b[a-zA-Z]+\b', text.lower())
+        word_freq = Counter(words)
+
+        # Score sentences
+        sentence_scores = []
+        for i, sentence in enumerate(sentences):
+            sentence_words = re.findall(r'\b[a-zA-Z]+\b', sentence.lower())
+            score = sum(word_freq.get(word, 0) for word in sentence_words)
+            sentence_scores.append((i, score))
+
+        # Sort by score and take top sentences
+        sentence_scores.sort(key=lambda x: x[1], reverse=True)
+        top_indices = sorted([i for i, _ in sentence_scores[:3]])
+
+        summary = ". ".join(sentences[i] for i in top_indices)
+        return summary + "."
+
+    def _validate_url_python(self, url: str) -> bool:
+        """Python implementation of URL validation."""
+        try:
+            from urllib.parse import urlparse
+            result = urlparse(url)
+            return all([result.scheme, result.netloc])
+        except:
+            return False
+
+    def _calculate_similarity_python(self, text1: str, text2: str) -> float:
+        """Python implementation of text similarity."""
+        import re
+        from collections import Counter
+
+        # Extract words
+        words1 = set(re.findall(r'\b[a-zA-Z]+\b', text1.lower()))
+        words2 = set(re.findall(r'\b[a-zA-Z]+\b', text2.lower()))
+
+        # Calculate Jaccard similarity
+        intersection = len(words1.intersection(words2))
+        union = len(words1.union(words2))
+
+        return intersection / union if union > 0 else 0.0
+
+# Global instance
+performance_service = HighPerformanceService()
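A short sketch of how the new service might be driven, assuming the module-level `performance_service` instance created at import time; the URL is illustrative, and the pure-Python fallback path additionally requires `aiohttp` and `beautifulsoup4`.

    import asyncio

    from src.services.performance_service.rust_performance import performance_service

    async def demo() -> None:
        # Uses the Rust backend when the optional nocturnal_performance module is importable,
        # otherwise transparently falls back to the Python implementations shown above.
        pages = await performance_service.scrape_urls(["https://example.com"])  # illustrative URL
        for page in pages:
            processed = (await performance_service.process_text_batch([page.content]))[0]
            print(page.title, processed.keywords[:5])

    asyncio.run(demo())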
src/services/research_service/__init__.py
@@ -0,0 +1,23 @@
+"""Research service package exposing conversation and synthesis utilities."""
+
+from .chatbot import ResearchChatbot
+from .citation_manager import CitationManager
+from .context_manager import ResearchContextManager
+from .conversation_manager import ResearchConversationManager
+from .critical_paper_detector import CriticalPaperDetector
+from .enhanced_research import EnhancedResearchPipeline
+from .enhanced_synthesizer import EnhancedSynthesizer
+from .query_generator import ResearchQueryGenerator
+from .synthesizer import ResearchSynthesizer
+
+__all__ = [
+    "ResearchChatbot",
+    "CitationManager",
+    "ResearchContextManager",
+    "ResearchConversationManager",
+    "CriticalPaperDetector",
+    "EnhancedResearchPipeline",
+    "EnhancedSynthesizer",
+    "ResearchQueryGenerator",
+    "ResearchSynthesizer",
+]
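With these re-exports in place, callers can import the research utilities from the package itself rather than from the individual modules; a one-line sketch (constructor arguments live in the modules this diff adds and are not shown here):

    from src.services.research_service import CitationManager, ResearchSynthesizer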