async-web-search 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- async_web_search-0.1.0/.gitignore +6 -0
- async_web_search-0.1.0/LICENSE +21 -0
- async_web_search-0.1.0/PKG-INFO +59 -0
- async_web_search-0.1.0/README.md +38 -0
- async_web_search-0.1.0/pyproject.toml +64 -0
- async_web_search-0.1.0/src/web_search/__init__.py +17 -0
- async_web_search-0.1.0/src/web_search/config.py +36 -0
- async_web_search-0.1.0/src/web_search/google.py +143 -0
- async_web_search-0.1.0/src/web_search/knowledge.py +137 -0
- async_web_search-0.1.0/src/web_search/search.py +31 -0

async_web_search-0.1.0/LICENSE

@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 Your Name
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

async_web_search-0.1.0/PKG-INFO

@@ -0,0 +1,59 @@
+Metadata-Version: 2.3
+Name: async-web-search
+Version: 0.1.0
+Summary: Async web search library supporting Google, Wikipedia, and arXiv
+Project-URL: Homepage, https://github.com/nwaughachukwuma/web-search
+Project-URL: Bug Tracker, https://github.com/nwaughachukwuma/web-search/issues
+Author: Chukwuma
+License: MIT
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Classifier: Topic :: Internet :: WWW/HTTP :: Dynamic Content
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Requires-Python: >=3.9
+Requires-Dist: asyncio
+Requires-Dist: beautifulsoup4
+Requires-Dist: httpx
+Requires-Dist: lxml
+Requires-Dist: wikipedia
+Description-Content-Type: text/markdown
+
+# Web Search
+
+Async web search library supporting Google Custom Search, Wikipedia, and arXiv APIs.
+
+## Installation
+
+```bash
+pip install web-search
+```
+
+## Usage
+
+```python
+from web_search import WebSearch, WebSearchConfig, GoogleSearchConfig
+
+config = WebSearchConfig(
+    sources=["google", "wikipedia"],
+    google_config=GoogleSearchConfig(
+        api_key="your_api_key",
+        cse_id="your_cse_id"
+    )
+)
+
+search = WebSearch(config)
+results = await search.search("quantum computing")
+print(results)
+```
+
+## Features
+
+- Async/concurrent searching
+- Multiple source support (Google, Wikipedia, arXiv)
+- Content extraction and cleaning
+- Configurable search parameters
+
+## License
+
+MIT

async_web_search-0.1.0/README.md

@@ -0,0 +1,38 @@
+# Web Search
+
+Async web search library supporting Google Custom Search, Wikipedia, and arXiv APIs.
+
+## Installation
+
+```bash
+pip install web-search
+```
+
+## Usage
+
+```python
+from web_search import WebSearch, WebSearchConfig, GoogleSearchConfig
+
+config = WebSearchConfig(
+    sources=["google", "wikipedia"],
+    google_config=GoogleSearchConfig(
+        api_key="your_api_key",
+        cse_id="your_cse_id"
+    )
+)
+
+search = WebSearch(config)
+results = await search.search("quantum computing")
+print(results)
+```
+
+## Features
+
+- Async/concurrent searching
+- Multiple source support (Google, Wikipedia, arXiv)
+- Content extraction and cleaning
+- Configurable search parameters
+
+## License
+
+MIT
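
Two notes on the README above: the distribution is named `async-web-search` in pyproject.toml, so installation would presumably be `pip install async-web-search` rather than the `pip install web-search` shown; and the usage snippet awaits at module level, so it needs an event loop. A minimal sketch of how the same calls would be driven from a script, assuming the package is installed and importable as `web_search` (credentials and query are placeholders):

```python
# Sketch: the README example wrapped in asyncio.run().
# Assumes `pip install async-web-search` and the `web_search` import name
# used throughout this package.
import asyncio

from web_search import GoogleSearchConfig, WebSearch, WebSearchConfig


async def main() -> None:
    config = WebSearchConfig(
        sources=["google", "wikipedia"],
        google_config=GoogleSearchConfig(api_key="your_api_key", cse_id="your_cse_id"),
    )
    search = WebSearch(config)
    results = await search.search("quantum computing")  # newline-joined previews
    print(results)


if __name__ == "__main__":
    asyncio.run(main())
```

Note that in the 0.1.0 sources below, `GoogleSearch.__init__` reassigns `self.config` before `WebSearch.__init__` reads `self.config.knowledge_config` on the following line, so this construction path appears to raise an `AttributeError` as shipped; the sketch reflects the documented API rather than a verified run.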

async_web_search-0.1.0/pyproject.toml

@@ -0,0 +1,64 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project]
+name = "async-web-search"
+requires-python = ">=3.9"
+version = "0.1.0"
+description = "Async web search library supporting Google, Wikipedia, and arXiv"
+
+readme = "README.md"
+license = "MIT"
+authors = [
+    { name="Chukwuma" },
+]
+classifiers = [
+    "Programming Language :: Python :: 3",
+    "License :: OSI Approved :: MIT License",
+    "Operating System :: OS Independent",
+    "Topic :: Internet :: WWW/HTTP :: Dynamic Content",
+    "Topic :: Software Development :: Libraries :: Python Modules"
+]
+dependencies = [
+    "asyncio",
+    "httpx",
+    "beautifulsoup4",
+    "wikipedia",
+    "lxml"
+]
+
+[project.urls]
+"Homepage" = "https://github.com/nwaughachukwuma/web-search"
+"Bug Tracker" = "https://github.com/nwaughachukwuma/web-search/issues"
+
+[tool.hatch.build]
+include = [
+    "src/**/*.py",
+    "LICENSE",
+    "README.md"
+]
+
+[tool.hatch.build.targets.wheel]
+packages = ["src/web_search"]
+
+[tool.ruff]
+line-length = 120
+include = ["src/**/*.(py|pyi)", "tests/**/*.(py|pyi)", "pyproject.toml"]
+
+[tool.ruff.lint]
+select = ["D102", "F401"]
+
+[tool.ruff.lint.isort]
+case-sensitive = true
+relative-imports-order = "closest-to-furthest"
+
+[tool.ruff.format]
+quote-style = "double"
+indent-style = "space"
+
+[tool.pylint]
+max-line-length = 120
+
+[tool.mypy]
+ignore_missing_imports = false

async_web_search-0.1.0/src/web_search/__init__.py

@@ -0,0 +1,17 @@
+from .search import WebSearch
+from .config import (
+    WebSearchConfig,
+    GoogleSearchConfig,
+    KnowledgeSearchConfig,
+    SearchSources,
+    SearchResult,
+)
+
+__all__ = [
+    "WebSearch",
+    "WebSearchConfig",
+    "GoogleSearchConfig",
+    "KnowledgeSearchConfig",
+    "SearchSources",
+    "SearchResult",
+]

async_web_search-0.1.0/src/web_search/config.py

@@ -0,0 +1,36 @@
+from dataclasses import dataclass, field
+from typing import Literal
+
+SearchSources = Literal["google", "wikipedia", "arxiv"]
+
+
+@dataclass
+class GoogleSearchConfig:
+    api_key: str
+    cse_id: str
+    max_results: int = 3
+    app_domain: str | None = None
+
+
+@dataclass
+class KnowledgeSearchConfig:
+    max_results: int = 3
+    max_sources: int = 10
+    max_preview_chars: int = 1024
+
+
+@dataclass
+class WebSearchConfig:
+    sources: list[SearchSources] = field(default_factory=lambda: ["google"])
+    google_config: GoogleSearchConfig | None = None
+    knowledge_config: KnowledgeSearchConfig | None = None
+
+
+@dataclass
+class SearchResult:
+    url: str
+    title: str
+    preview: str
+
+    def __str__(self):
+        return f"Title: {self.title}\nPreview: {self.preview}"

async_web_search-0.1.0/src/web_search/google.py

@@ -0,0 +1,143 @@
+import asyncio
+from typing import Any, Coroutine, Dict, List
+from urllib.parse import unquote
+
+import httpx
+from bs4 import BeautifulSoup
+
+from .config import GoogleSearchConfig, SearchResult
+
+GOOGLE_SEARCH_URL = "https://www.googleapis.com/customsearch/v1"
+
+
+class GoogleSearch:
+    config: GoogleSearchConfig
+
+    def __init__(self, config: GoogleSearchConfig | None = None):
+        self.config = config if config else GoogleSearchConfig()
+
+    async def _compile_google_search(self, query: str):
+        results = await self._google_search(query)
+        return "\n\n".join(str(item) for item in results if item.preview)
+
+    async def _google_search(self, query: str, **kwargs):
+        """
+        Perform a Google search using the Custom Search Engine API
+        """
+        params = {
+            "q": unquote(query),
+            "key": self.config.api_key,
+            "cx": self.config.cse_id,
+            "num": 5,
+        }
+        params.update(kwargs)
+        headers = {"Referer": self.config.app_domain}
+
+        async with httpx.AsyncClient() as client:
+            response = await client.get(
+                GOOGLE_SEARCH_URL, params=params, headers=headers
+            )
+            response.raise_for_status()
+
+        json_data = response.json()
+
+        items = json_data.get("items", [])[: self.config.max_results]
+        result = await self.extract_relevant_items(items)
+        return result
+
+    async def extract_relevant_items(
+        self, search_results: List[Dict[str, Any]]
+    ) -> List[SearchResult]:
+        """
+        Extract relevant items from the search results
+        """
+        tasks: list[Coroutine[Any, Any, SearchResult | None]] = []
+
+        for item in search_results:
+            url = item.get("link")
+            if url and self._is_valid_url(url):
+                tasks.append(self._process_search_item(url, item))
+
+        if not tasks:
+            return []
+
+        results = await asyncio.gather(*tasks, return_exceptions=True)
+        return [item for item in results if isinstance(item, SearchResult)]
+
+    def _is_valid_url(self, url: str) -> bool:
+        invalid_extensions = (
+            ".pdf",
+            ".doc",
+            ".docx",
+            ".ppt",
+            ".pptx",
+            ".xls",
+            ".xlsx",
+            ".zip",
+            ".rar",
+        )
+        invalid_domains = ("youtube.com", "vimeo.com", "facebook.com", "twitter.com")
+        return not (
+            url.endswith(invalid_extensions)
+            or any(domain in url for domain in invalid_domains)
+        )
+
+    async def _process_search_item(
+        self, url: str, item: Dict, char_limit=2000
+    ) -> SearchResult | None:
+        """
+        Process and fetch the result of a single search item url
+        """
+        try:
+            content = await self._scrape_page_content(url)
+            return SearchResult(
+                url=url, title=item.get("title", ""), preview=content[:char_limit]
+            )
+        except Exception:
+            return None
+
+    async def _scrape_page_content(self, url: str) -> str:
+        """
+        Fetch and extract content from a webpage
+        """
+        try:
+            async with httpx.AsyncClient() as client:
+                response = await client.get(url)
+                response.raise_for_status()
+
+            soup = BeautifulSoup(response.text, "lxml")
+            # Remove unwanted elements
+            for element in soup.find_all(
+                ["script", "style", "nav", "header", "footer", "ads"]
+            ):
+                element.decompose()
+
+            content_elements = soup.find_all(
+                ["article", "main", "div"],
+                class_=["content", "article", "post", "entry", "main-content"],
+            )
+
+            if not content_elements:
+                # Fallback to paragraph extraction if no main content container found
+                content_elements = soup.find_all("p")
+
+            # Extract text from found elements
+            content = "\n".join(
+                element.get_text(strip=True)
+                for element in content_elements
+                if element.get_text(strip=True)
+            )
+
+            # If still no content, try getting all text
+            if not content:
+                content = soup.get_text(strip=True)
+
+            return self._clean_content(content)
+        except Exception:
+            return ""
+
+    def _clean_content(self, content: str) -> str:
+        """Remove very short lines (likely navigation/menu items)"""
+        content = " ".join(content.split())
+        lines = [line for line in content.split("\n") if len(line) > 30]
+        return "\n".join(lines)

async_web_search-0.1.0/src/web_search/knowledge.py

@@ -0,0 +1,137 @@
+import asyncio
+
+import httpx
+import wikipedia
+from bs4 import BeautifulSoup
+from .config import KnowledgeSearchConfig, SearchResult
+
+
+class KnowledgeSearch:
+    config: KnowledgeSearchConfig
+
+    def __init__(self, config: KnowledgeSearchConfig | None = None):
+        self.config = config if config else KnowledgeSearchConfig()
+
+    async def fetch_knowledge(self, query: str):
+        """
+        Fetch knowledge from multiple sources concurrently,
+        including Wikipedia, arXiv, and other scientific sources
+        """
+        # listed in order of importance
+        tasks = [
+            self._search_wikipedia(query),
+            self._search_arxiv_papers(query),
+            # add more knowledge sources here
+        ]
+
+        sources: list[SearchResult] = []
+
+        results = await asyncio.gather(*tasks, return_exceptions=True)
+        for result in results:
+            if isinstance(result, list):
+                sources.extend(result)
+
+        sources = sources[: self.config.max_sources]
+        return "\n\n".join(str(source) for source in sources if source.preview)
+
+    async def _compile_wikipedia(self, query: str) -> str:
+        results = await self._search_wikipedia(query)
+        return "\n\n".join(str(item) for item in results)
+
+    async def _compile_arxiv_papers(self, query: str) -> str:
+        results = await self._search_arxiv_papers(query)
+        return "\n\n".join(str(item) for item in results)
+
+    async def _search_wikipedia(self, query: str) -> list[SearchResult]:
+        """
+        Fetch relevant Wikipedia articles
+        """
+        try:
+            sources: list[SearchResult] = []
+            search_results = wikipedia.search(query, results=self.config.max_results)
+
+            for title in search_results:
+                try:
+                    page = wikipedia.page(title)
+                    if not page.content:
+                        continue
+
+                    preview = self._extract_relevant_wiki_sections(page.content)
+                    if not preview:
+                        continue
+
+                    sources.append(
+                        SearchResult(url=page.url, title=page.title, preview=preview)
+                    )
+                except wikipedia.exceptions.DisambiguationError:
+                    continue
+                except wikipedia.exceptions.PageError:
+                    continue
+
+            return sources
+        except Exception:
+            return []
+
+    async def _search_arxiv_papers(self, query: str) -> list[SearchResult]:
+        """
+        Fetch papers from arXiv and other scientific sources
+        """
+        ARXIV_URL = "http://export.arxiv.org/api/query"
+        try:
+            params = {
+                "search_query": f"all:{query}",
+                "start": 0,
+                "max_results": self.config.max_results,
+                "sortBy": "relevance",
+                "sortOrder": "descending",
+            }
+            async with httpx.AsyncClient(timeout=20) as client:
+                response = await client.get(ARXIV_URL, params=params)
+                response.raise_for_status()
+
+            soup = BeautifulSoup(response.text, "lxml-xml")
+            entries = soup.find_all("entry")
+
+            sources: list[SearchResult] = []
+            for entry in entries:
+                title = entry.title.text.strip()
+                url = entry.id.text.strip()
+                preview = entry.summary.text.strip()
+
+                if not preview:
+                    continue
+
+                sources.append(SearchResult(url=url, title=title, preview=preview))
+
+            return sources
+        except Exception:
+            return []
+
+    def _extract_relevant_wiki_sections(self, content: str) -> str:
+        """
+        Extract the most relevant sections from Wikipedia content
+        """
+        paragraphs = content.split("\n\n")
+        # Remove references and other metadata
+        cleaned_paragraphs = [
+            p
+            for p in paragraphs
+            if not any(
+                marker in p.lower()
+                for marker in [
+                    "references",
+                    "external links",
+                    "see also",
+                    "== notes ==",
+                ]
+            )
+        ]
+
+        result = ""
+        for p in cleaned_paragraphs:
+            if len(result + p) <= self.config.max_preview_chars:
+                result += p + "\n\n"
+            else:
+                break
+
+        return result.strip()

async_web_search-0.1.0/src/web_search/search.py

@@ -0,0 +1,31 @@
+import asyncio
+from typing import Any, Coroutine, List
+
+from .google import GoogleSearch
+from .knowledge import KnowledgeSearch
+from .config import WebSearchConfig
+
+
+class WebSearch(GoogleSearch, KnowledgeSearch):
+    def __init__(self, config: WebSearchConfig | None = None):
+        self.config = config if config else WebSearchConfig()
+
+        self.sources = self.config.sources
+        GoogleSearch.__init__(self, self.config.google_config)
+        KnowledgeSearch.__init__(self, config=self.config.knowledge_config)
+
+    async def search(self, query: str):
+        """
+        Search the web for relevant content
+        """
+        tasks: List[Coroutine[Any, Any, str]] = []
+
+        if "google" in self.sources:
+            tasks.append(self._compile_google_search(query))
+        if "wikipedia" in self.sources:
+            tasks.append(self._compile_wikipedia(query))
+        if "arxiv" in self.sources:
+            tasks.append(self._compile_arxiv_papers(query))
+
+        results = await asyncio.gather(*tasks, return_exceptions=True)
+        return "\n\n".join(item for item in results if isinstance(item, str))