async-web-search 0.1.0 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,6 @@
+ __pycache__
+ .venv/
+ venv/
+ *.pyc
+
+ __ruff_cache__/
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 Your Name
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
--- /dev/null
+++ b/PKG-INFO
@@ -0,0 +1,81 @@
+ Metadata-Version: 2.3
+ Name: async-web-search
+ Version: 0.1.0
+ Summary: Async web search library supporting Google, Wikipedia, and arXiv
+ Project-URL: Homepage, https://github.com/nwaughachukwuma/web-search
+ Project-URL: Bug Tracker, https://github.com/nwaughachukwuma/web-search/issues
+ Author: Chukwuma
+ License: MIT
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: OS Independent
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Topic :: Internet :: WWW/HTTP :: Dynamic Content
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
+ Requires-Python: >=3.9
+ Requires-Dist: asyncio
+ Requires-Dist: beautifulsoup4
+ Requires-Dist: httpx
+ Requires-Dist: lxml
+ Requires-Dist: wikipedia
+ Description-Content-Type: text/markdown
+
+ # Web Search
+
+ Async web search library supporting the Google Custom Search, Wikipedia, and arXiv APIs.
+
+ ## Installation
+
+ ```bash
+ pip install async-web-search
+ ```
+
+ ## Usage
+
+ ```python
+ import asyncio
+
+ from web_search import WebSearch, WebSearchConfig, GoogleSearchConfig
+
+ config = WebSearchConfig(
+     sources=["google", "wikipedia"],
+     google_config=GoogleSearchConfig(
+         api_key="your_api_key",
+         cse_id="your_cse_id"
+     )
+ )
+
+ async def main():
+     search = WebSearch(config)
+     results = await search.search("quantum computing")
+     print(results)
+
+ asyncio.run(main())
+ ```
+
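+ ### Wikipedia and arXiv only
+
+ A minimal sketch of a key-free search that skips Google and queries only the knowledge sources; the `KnowledgeSearchConfig` shown is optional and simply restates this version's defaults:
+
+ ```python
+ import asyncio
+
+ from web_search import WebSearch, WebSearchConfig, KnowledgeSearchConfig
+
+ config = WebSearchConfig(
+     sources=["wikipedia", "arxiv"],
+     knowledge_config=KnowledgeSearchConfig(max_results=3, max_preview_chars=1024),
+ )
+
+ print(asyncio.run(WebSearch(config).search("quantum computing")))
+ ```
+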
+ ## Features
+
+ - Async/concurrent searching
+ - Multiple source support (Google, Wikipedia, arXiv)
+ - Content extraction and cleaning
+ - Configurable search parameters
+
+ ## License
+
+ MIT
--- /dev/null
+++ b/README.md
@@ -0,0 +1,60 @@
+ # Web Search
+
+ Async web search library supporting the Google Custom Search, Wikipedia, and arXiv APIs.
+
+ ## Installation
+
+ ```bash
+ pip install async-web-search
+ ```
+
+ ## Usage
+
+ ```python
+ import asyncio
+
+ from web_search import WebSearch, WebSearchConfig, GoogleSearchConfig
+
+ config = WebSearchConfig(
+     sources=["google", "wikipedia"],
+     google_config=GoogleSearchConfig(
+         api_key="your_api_key",
+         cse_id="your_cse_id"
+     )
+ )
+
+ async def main():
+     search = WebSearch(config)
+     results = await search.search("quantum computing")
+     print(results)
+
+ asyncio.run(main())
+ ```
+
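+ ### Wikipedia and arXiv only
+
+ A minimal sketch of a key-free search that skips Google and queries only the knowledge sources; the `KnowledgeSearchConfig` shown is optional and simply restates this version's defaults:
+
+ ```python
+ import asyncio
+
+ from web_search import WebSearch, WebSearchConfig, KnowledgeSearchConfig
+
+ config = WebSearchConfig(
+     sources=["wikipedia", "arxiv"],
+     knowledge_config=KnowledgeSearchConfig(max_results=3, max_preview_chars=1024),
+ )
+
+ print(asyncio.run(WebSearch(config).search("quantum computing")))
+ ```
+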
+ ## Features
+
+ - Async/concurrent searching
+ - Multiple source support (Google, Wikipedia, arXiv)
+ - Content extraction and cleaning
+ - Configurable search parameters
+
+ ## License
+
+ MIT
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,64 @@
+ [build-system]
+ requires = ["hatchling"]
+ build-backend = "hatchling.build"
+
+ [project]
+ name = "async-web-search"
+ requires-python = ">=3.9"
+ version = "0.1.0"
+ description = "Async web search library supporting Google, Wikipedia, and arXiv"
+
+ readme = "README.md"
+ license = "MIT"
+ authors = [
+     { name="Chukwuma" },
+ ]
+ classifiers = [
+     "Programming Language :: Python :: 3",
+     "License :: OSI Approved :: MIT License",
+     "Operating System :: OS Independent",
+     "Topic :: Internet :: WWW/HTTP :: Dynamic Content",
+     "Topic :: Software Development :: Libraries :: Python Modules"
+ ]
+ dependencies = [
+     "asyncio",
+     "httpx",
+     "beautifulsoup4",
+     "wikipedia",
+     "lxml"
+ ]
+
+ [project.urls]
+ "Homepage" = "https://github.com/nwaughachukwuma/web-search"
+ "Bug Tracker" = "https://github.com/nwaughachukwuma/web-search/issues"
+
+ [tool.hatch.build]
+ include = [
+     "src/**/*.py",
+     "LICENSE",
+     "README.md"
+ ]
+
+ [tool.hatch.build.targets.wheel]
+ packages = ["src/web_search"]
+
+ [tool.ruff]
+ line-length = 120
+ include = ["src/**/*.py", "src/**/*.pyi", "tests/**/*.py", "tests/**/*.pyi", "pyproject.toml"]
+
+ [tool.ruff.lint]
+ select = ["D102", "F401"]
+
+ [tool.ruff.lint.isort]
+ case-sensitive = true
+ relative-imports-order = "closest-to-furthest"
+
+ [tool.ruff.format]
+ quote-style = "double"
+ indent-style = "space"
+
+ [tool.pylint]
+ max-line-length = 120
+
+ [tool.mypy]
+ ignore_missing_imports = false
--- /dev/null
+++ b/src/web_search/__init__.py
@@ -0,0 +1,17 @@
+ from .search import WebSearch
+ from .config import (
+     WebSearchConfig,
+     GoogleSearchConfig,
+     KnowledgeSearchConfig,
+     SearchSources,
+     SearchResult,
+ )
+
+ __all__ = [
+     "WebSearch",
+     "WebSearchConfig",
+     "GoogleSearchConfig",
+     "KnowledgeSearchConfig",
+     "SearchSources",
+     "SearchResult",
+ ]
--- /dev/null
+++ b/src/web_search/config.py
@@ -0,0 +1,39 @@
+ # Postponed evaluation keeps the `X | None` annotations below valid on Python 3.9.
+ from __future__ import annotations
+
+ from dataclasses import dataclass, field
+ from typing import Literal
+
+ SearchSources = Literal["google", "wikipedia", "arxiv"]
+
+
+ @dataclass
+ class GoogleSearchConfig:
+     api_key: str
+     cse_id: str
+     max_results: int = 3
+     app_domain: str | None = None
+
+
+ @dataclass
+ class KnowledgeSearchConfig:
+     max_results: int = 3
+     max_sources: int = 10
+     max_preview_chars: int = 1024
+
+
+ @dataclass
+ class WebSearchConfig:
+     sources: list[SearchSources] = field(default_factory=lambda: ["google"])
+     google_config: GoogleSearchConfig | None = None
+     knowledge_config: KnowledgeSearchConfig | None = None
+
+
+ @dataclass
+ class SearchResult:
+     url: str
+     title: str
+     preview: str
+
+     def __str__(self):
+         return f"Title: {self.title}\nPreview: {self.preview}"
--- /dev/null
+++ b/src/web_search/google.py
@@ -0,0 +1,147 @@
+ # Postponed evaluation keeps the `X | None` annotations below valid on Python 3.9.
+ from __future__ import annotations
+
+ import asyncio
+ from typing import Any, Coroutine, Dict, List
+ from urllib.parse import unquote
+
+ import httpx
+ from bs4 import BeautifulSoup
+
+ from .config import GoogleSearchConfig, SearchResult
+
+ GOOGLE_SEARCH_URL = "https://www.googleapis.com/customsearch/v1"
+
+
+ class GoogleSearch:
+     config: GoogleSearchConfig
+
+     def __init__(self, config: GoogleSearchConfig | None = None):
+         self.config = config if config else GoogleSearchConfig()
+
+     async def _compile_google_search(self, query: str):
+         results = await self._google_search(query)
+         return "\n\n".join(str(item) for item in results if item.preview)
+
+     async def _google_search(self, query: str, **kwargs):
+         """
+         Perform a Google search using the Custom Search Engine API
+         """
+         params = {
+             "q": unquote(query),
+             "key": self.config.api_key,
+             "cx": self.config.cse_id,
+             "num": 5,
+         }
+         params.update(kwargs)
+         # Only send a Referer header when an app domain is configured.
+         headers = {"Referer": self.config.app_domain} if self.config.app_domain else {}
+
+         async with httpx.AsyncClient() as client:
+             response = await client.get(
+                 GOOGLE_SEARCH_URL, params=params, headers=headers
+             )
+             response.raise_for_status()
+
+             json_data = response.json()
+
+         items = json_data.get("items", [])[: self.config.max_results]
+         result = await self.extract_relevant_items(items)
+         return result
+
+     async def extract_relevant_items(
+         self, search_results: List[Dict[str, Any]]
+     ) -> List[SearchResult]:
+         """
+         Extract relevant items from the search results
+         """
+         tasks: list[Coroutine[Any, Any, SearchResult | None]] = []
+
+         for item in search_results:
+             url = item.get("link")
+             if url and self._is_valid_url(url):
+                 tasks.append(self._process_search_item(url, item))
+
+         if not tasks:
+             return []
+
+         results = await asyncio.gather(*tasks, return_exceptions=True)
+         return [item for item in results if isinstance(item, SearchResult)]
+
+     def _is_valid_url(self, url: str) -> bool:
+         invalid_extensions = (
+             ".pdf",
+             ".doc",
+             ".docx",
+             ".ppt",
+             ".pptx",
+             ".xls",
+             ".xlsx",
+             ".zip",
+             ".rar",
+         )
+         invalid_domains = ("youtube.com", "vimeo.com", "facebook.com", "twitter.com")
+         return not (
+             url.endswith(invalid_extensions)
+             or any(domain in url for domain in invalid_domains)
+         )
+
+     async def _process_search_item(
+         self, url: str, item: Dict, char_limit=2000
+     ) -> SearchResult | None:
+         """
+         Process and fetch the result of a single search item url
+         """
+         try:
+             content = await self._scrape_page_content(url)
+             return SearchResult(
+                 url=url, title=item.get("title", ""), preview=content[:char_limit]
+             )
+         except Exception:
+             return None
+
+     async def _scrape_page_content(self, url: str) -> str:
+         """
+         Fetch and extract content from a webpage
+         """
+         try:
+             async with httpx.AsyncClient() as client:
+                 response = await client.get(url)
+                 response.raise_for_status()
+
+             soup = BeautifulSoup(response.text, "lxml")
+             # Remove unwanted elements
+             for element in soup.find_all(
+                 ["script", "style", "nav", "header", "footer", "ads"]
+             ):
+                 element.decompose()
+
+             content_elements = soup.find_all(
+                 ["article", "main", "div"],
+                 class_=["content", "article", "post", "entry", "main-content"],
+             )
+
+             if not content_elements:
+                 # Fallback to paragraph extraction if no main content container found
+                 content_elements = soup.find_all("p")
+
+             # Extract text from found elements
+             content = "\n".join(
+                 element.get_text(strip=True)
+                 for element in content_elements
+                 if element.get_text(strip=True)
+             )
+
+             # If still no content, try getting all text
+             if not content:
+                 content = soup.get_text(strip=True)
+
+             return self._clean_content(content)
+         except Exception:
+             return ""
+
+     def _clean_content(self, content: str) -> str:
+         """Remove very short lines (likely navigation/menu items)"""
+         # Normalize whitespace within each line, then drop very short lines.
+         lines = (" ".join(line.split()) for line in content.split("\n"))
+         return "\n".join(line for line in lines if len(line) > 30)
--- /dev/null
+++ b/src/web_search/knowledge.py
@@ -0,0 +1,143 @@
+ # Postponed evaluation keeps the `X | None` annotations below valid on Python 3.9.
+ from __future__ import annotations
+
+ import asyncio
+
+ import httpx
+ import wikipedia
+ from bs4 import BeautifulSoup
+
+ from .config import KnowledgeSearchConfig, SearchResult
+
+
+ class KnowledgeSearch:
+     config: KnowledgeSearchConfig
+
+     def __init__(self, config: KnowledgeSearchConfig | None = None):
+         self.config = config if config else KnowledgeSearchConfig()
+
+     async def fetch_knowledge(self, query: str):
+         """
+         Fetch knowledge from multiple sources concurrently,
+         including Wikipedia, arXiv, and other scientific sources
+         """
+         # listed in order of importance
+         tasks = [
+             self._search_wikipedia(query),
+             self._search_arxiv_papers(query),
+             # add more knowledge sources here
+         ]
+
+         sources: list[SearchResult] = []
+
+         results = await asyncio.gather(*tasks, return_exceptions=True)
+         for result in results:
+             if isinstance(result, list):
+                 sources.extend(result)
+
+         sources = sources[: self.config.max_sources]
+         return "\n\n".join(str(source) for source in sources if source.preview)
+
+     async def _compile_wikipedia(self, query: str) -> str:
+         results = await self._search_wikipedia(query)
+         return "\n\n".join(str(item) for item in results)
+
+     async def _compile_arxiv_papers(self, query: str) -> str:
+         results = await self._search_arxiv_papers(query)
+         return "\n\n".join(str(item) for item in results)
+
+     async def _search_wikipedia(self, query: str) -> list[SearchResult]:
+         """
+         Fetch relevant Wikipedia articles
+         """
+         try:
+             sources: list[SearchResult] = []
+             # The wikipedia client is synchronous; run it in a worker thread so it
+             # does not block the event loop.
+             search_results = await asyncio.to_thread(wikipedia.search, query, results=self.config.max_results)
+
+             for title in search_results:
+                 try:
+                     page = await asyncio.to_thread(wikipedia.page, title)
+                     if not page.content:
+                         continue
+
+                     preview = self._extract_relevant_wiki_sections(page.content)
+                     if not preview:
+                         continue
+
+                     sources.append(
+                         SearchResult(url=page.url, title=page.title, preview=preview)
+                     )
+                 except wikipedia.exceptions.DisambiguationError:
+                     continue
+                 except wikipedia.exceptions.PageError:
+                     continue
+
+             return sources
+         except Exception:
+             return []
+
+     async def _search_arxiv_papers(self, query: str) -> list[SearchResult]:
+         """
+         Fetch papers from arXiv and other scientific sources
+         """
+         ARXIV_URL = "http://export.arxiv.org/api/query"
+         try:
+             params = {
+                 "search_query": f"all:{query}",
+                 "start": 0,
+                 "max_results": self.config.max_results,
+                 "sortBy": "relevance",
+                 "sortOrder": "descending",
+             }
+             async with httpx.AsyncClient(timeout=20) as client:
+                 response = await client.get(ARXIV_URL, params=params)
+                 response.raise_for_status()
+
+             soup = BeautifulSoup(response.text, "lxml-xml")
+             entries = soup.find_all("entry")
+
+             sources: list[SearchResult] = []
+             for entry in entries:
+                 title = entry.title.text.strip()
+                 url = entry.id.text.strip()
+                 preview = entry.summary.text.strip()
+
+                 if not preview:
+                     continue
+
+                 sources.append(SearchResult(url=url, title=title, preview=preview))
+
+             return sources
+         except Exception:
+             return []
+
+     def _extract_relevant_wiki_sections(self, content: str) -> str:
+         """
+         Extract the most relevant sections from Wikipedia content
+         """
+         paragraphs = content.split("\n\n")
+         # Remove references and other metadata
+         cleaned_paragraphs = [
+             p
+             for p in paragraphs
+             if not any(
+                 marker in p.lower()
+                 for marker in [
+                     "references",
+                     "external links",
+                     "see also",
+                     "== notes ==",
+                 ]
+             )
+         ]
+
+         result = ""
+         for p in cleaned_paragraphs:
+             if len(result + p) <= self.config.max_preview_chars:
+                 result += p + "\n\n"
+             else:
+                 break
+
+         return result.strip()
--- /dev/null
+++ b/src/web_search/search.py
@@ -0,0 +1,38 @@
+ # Postponed evaluation keeps the `X | None` annotation below valid on Python 3.9.
+ from __future__ import annotations
+
+ import asyncio
+ from typing import Any, Coroutine, List
+
+ from .google import GoogleSearch
+ from .knowledge import KnowledgeSearch
+ from .config import WebSearchConfig
+
+
+ class WebSearch:
+     def __init__(self, config: WebSearchConfig | None = None):
+         self.config = config if config else WebSearchConfig()
+         self.sources = self.config.sources
+
+         # Compose the two backends instead of mixing them in: each keeps its own
+         # config object, so one no longer overwrites the other's `self.config`.
+         self._google = (
+             GoogleSearch(self.config.google_config) if self.config.google_config else None
+         )
+         self._knowledge = KnowledgeSearch(config=self.config.knowledge_config)
+
+     async def search(self, query: str):
+         """
+         Search the web for relevant content
+         """
+         tasks: List[Coroutine[Any, Any, str]] = []
+
+         if "google" in self.sources and self._google:
+             tasks.append(self._google._compile_google_search(query))
+         if "wikipedia" in self.sources:
+             tasks.append(self._knowledge._compile_wikipedia(query))
+         if "arxiv" in self.sources:
+             tasks.append(self._knowledge._compile_arxiv_papers(query))
+
+         results = await asyncio.gather(*tasks, return_exceptions=True)
+         return "\n\n".join(item for item in results if isinstance(item, str))