better_websearch_cli-0.0.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
better_websearch_cli-0.0.1.dist-info/METADATA ADDED
@@ -0,0 +1,148 @@
+ Metadata-Version: 2.4
+ Name: better-websearch-cli
+ Version: 0.0.1
+ Summary: CLI tool to fetch URLs and search the web
+ Requires-Python: >=3.14
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: typer[all]>=0.12
+ Requires-Dist: httpx>=0.25
+ Requires-Dist: selectolax>=0.3
+ Requires-Dist: markdownify>=0.12
+ Requires-Dist: rich>=13.0
+ Provides-Extra: dev
+ Requires-Dist: pytest; extra == "dev"
+ Requires-Dist: ruff; extra == "dev"
+ Requires-Dist: mypy; extra == "dev"
+ Provides-Extra: spa
+ Requires-Dist: playwright>=1.40; extra == "spa"
+ Dynamic: license-file
+
+ <p align="center">
+   <picture>
+     <source media="(prefers-color-scheme: dark)" srcset="public/banner.jpg">
+     <source media="(prefers-color-scheme: light)" srcset="public/banner.jpg">
+     <img src="public/banner.jpg" alt="Websearch CLI Logo" width="100%">
+   </picture>
+ </p>
+
+ <h1 align="center">Websearch CLI</h1>
+
+ <p align="center">
+   Fetch URLs and search the web from your terminal. Fast, simple, and extensible.
+ </p>
+
+ <p align="center">
+   <a href="https://pypi.org/project/websearch/">
+     <img src="https://img.shields.io/pypi/v/websearch" alt="PyPI Version">
+   </a>
+   <a href="https://github.com/AliiiBenn/websearch">
+     <img src="https://img.shields.io/github/license/AliiiBenn/websearch" alt="License">
+   </a>
+   <a href="https://github.com/AliiiBenn/websearch/actions">
+     <img src="https://img.shields.io/github/actions/workflow/status/AliiiBenn/websearch/test" alt="Tests">
+   </a>
+   <a href="https://www.python.org/">
+     <img src="https://img.shields.io/badge/python-3.14+-blue" alt="Python">
+   </a>
+ </p>
+
+ Fetch web pages and search the internet - all from your command line.
+
+ ## Why Websearch CLI?
+
+ - **Fast** - Async HTTP powered by httpx
+ - **Smart caching** - Local cache with TTL and automatic eviction
+ - **Clean output** - HTML to Markdown conversion with XSS protection
+ - **Type-safe** - 100% type-annotated Python
+
+ ## Features
+
+ - Fetch URLs and convert to Markdown
+ - Web search via Brave Search API
+ - Local caching with TTL and size limits
+ - XSS protection and HTML sanitization
+ - SPA (Single Page App) detection
+ - Retry with exponential backoff (see the sketch below)
+
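+ The retry behaviour lives in `websearch/core/fetcher/backoff.py`; the snippet below is only an
+ illustrative sketch of the general exponential-backoff pattern with httpx (the helper name and
+ delay values are made up for the example), not the package's actual implementation.
+
+ ```python
+ import asyncio
+ import random
+
+ import httpx
+
+
+ async def fetch_with_backoff(url: str, retries: int = 3, base_delay: float = 0.5) -> httpx.Response:
+     """Retry transient failures, doubling the delay after each attempt."""
+     async with httpx.AsyncClient(follow_redirects=True) as client:
+         for attempt in range(retries + 1):
+             try:
+                 response = await client.get(url)
+                 response.raise_for_status()
+                 return response
+             except (httpx.TransportError, httpx.HTTPStatusError):
+                 if attempt == retries:
+                     raise
+                 # Wait 0.5s, 1s, 2s, ... plus a little jitter before retrying
+                 await asyncio.sleep(base_delay * (2 ** attempt) + random.uniform(0, 0.1))
+     raise AssertionError("unreachable")
+
+
+ asyncio.run(fetch_with_backoff("https://example.com"))
+ ```
+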
+ ## Quick Start
+
+ ```bash
+ # Install
+ uv add better-websearch-cli
+
+ # Fetch a URL as Markdown
+ websearch fetch https://example.com
+
+ # Search the web
+ websearch search "python async tutorial"
+
+ # Fetch with options
+ websearch fetch https://example.com --no-cache --verbose
+ websearch search "python" -n 20 -t news
+ ```
+
+ ## Command Overview
+
+ | Command | Description |
+ |---------|-------------|
+ | `websearch fetch <url>` | Fetch URL and convert to Markdown |
+ | `websearch search <query>` | Search the web |
+ | `websearch ping` | Check if CLI is working |
+
+ ## Fetch Options
+
+ ```
+ --refresh, -r        Skip cache and force fresh fetch
+ --no-cache           Disable caching
+ --no-verify          Skip SSL certificate verification
+ --output, -o PATH    Output file path
+ --verbose, -v        Show verbose output
+ ```
+
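+ For example, to force a fresh fetch and save the Markdown to a file (the URL and path are
+ placeholders):
+
+ ```bash
+ websearch fetch https://example.com/docs --refresh -o page.md
+ ```
+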
+ ## Search Options
+
+ ```
+ --count, -n <n>      Number of results (1-50, default: 10)
+ --type, -t <type>    Result type: web, news, images, videos
+ --output, -o PATH    Output file path
+ --json               Output raw JSON response
+ --no-cache           Disable caching
+ ```
+
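+ For example, to dump 20 news results as raw JSON to a file while bypassing the cache (the query
+ text is arbitrary):
+
+ ```bash
+ websearch search "python 3.14 release" -t news -n 20 --json --no-cache -o results.json
+ ```
+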
+ ## Configuration
+
+ Set your Brave API key:
+
+ ```bash
+ export BRAVE_API_KEY=your_api_key_here
+ ```
+
+ Get your API key at https://brave.com/search/api/
+
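+ The search command uses this key to query Brave's Web Search API. The httpx call below is only a
+ minimal sketch of such a request against Brave's documented public endpoint (the parameters and
+ response fields shown are a small subset); it is not the package's search client.
+
+ ```python
+ import os
+
+ import httpx
+
+ resp = httpx.get(
+     "https://api.search.brave.com/res/v1/web/search",
+     params={"q": "python async tutorial", "count": 10},
+     headers={
+         "Accept": "application/json",
+         "X-Subscription-Token": os.environ["BRAVE_API_KEY"],
+     },
+     timeout=10.0,
+ )
+ resp.raise_for_status()
+ for item in resp.json().get("web", {}).get("results", []):
+     print(item.get("title"), "-", item.get("url"))
+ ```
+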
+ ## Development
+
+ ```bash
+ # Clone and install
+ git clone https://github.com/AliiiBenn/websearch.git
+ cd websearch
+ uv sync --dev
+
+ # Run tests
+ uv run pytest
+
+ # Lint and type-check
+ uv run ruff check websearch/
+ uv run mypy websearch/
+
+ # Try it out
+ uv run websearch fetch https://example.com
+ ```
+
+ ## Contributing
+
+ Contributions are welcome! Feel free to open issues or submit PRs.
+
+ ## License
+
+ MIT - See [LICENSE](LICENSE) for details.
better_websearch_cli-0.0.1.dist-info/RECORD ADDED
@@ -0,0 +1,30 @@
+ better_websearch_cli-0.0.1.dist-info/licenses/LICENSE,sha256=S69ieINMLlsFsxDUi1x2dDdYTJ5pUgg7bFV3UFDra3M,1070
+ websearch/__main__.py,sha256=jJZn0xqgiVJv8ZRITXEltYjcVpHCWwWJgoGmC1Db6lc,40
+ websearch/main.py,sha256=Y6oqX_rSsjXGvx0dxjI4SyXCdnzT9wWpyINRl2XM2c0,5652
+ websearch/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ websearch/core/cache/__init__.py,sha256=L6JdU6OL9P1jWBQV89ZmlH9TcupXr5T-_Dqa3ppJHsk,102
+ websearch/core/cache/cache.py,sha256=yZxzZpLKy8to9z9ZOp8QZGBQJnmLoD98tK7y2zg7DNo,6985
+ websearch/core/cache/key.py,sha256=uOWX4hNdYoryS0WePo1FfhJPfXrn_3SJg7NFKUuX4Fo,2353
+ websearch/core/cache/storage.py,sha256=PU1DPTXF_iBVlc3REtmr0kEzhTrmhq6HEsHEZJFXOLI,5873
+ websearch/core/cache/ttl.py,sha256=ACZEPsgDlwaVnsXxBTrbPvxiM_icrBDfmMAOqpkdkyo,1614
+ websearch/core/converter/__init__.py,sha256=HRyFronkvSTIfv3Ey9Ia8VY33JKWxGzIsIhtO9c2M2E,137
+ websearch/core/converter/converter.py,sha256=eBMOYU2C-B8nQpSZYPYdiHG5YY8kDA9aCMFQnPMDjyM,3110
+ websearch/core/converter/encoding.py,sha256=_vbQ6iYNJEmFmBLmxwqNPBW_DnvFiCiGudbmx1_gc1o,451
+ websearch/core/converter/security.py,sha256=FjYkEtedEKAcuhS6xbPibVA7z02sQeMBGEFxtKTweGE,763
+ websearch/core/fetcher/__init__.py,sha256=SWrxKOnqCz3eP8JvOeF6XibFFe8pfhNYRtcl3sRkGzs,927
+ websearch/core/fetcher/backoff.py,sha256=jdERTOLFVXGHrVDXx1nxWlubg_zL_Nj_EvzyfCKB2Hg,582
+ websearch/core/fetcher/detection.py,sha256=7p4SUFZI71AMG7qKRFMT2yyDiV8fygzDKeMiaIGneBk,1058
+ websearch/core/fetcher/errors.py,sha256=o5Cj-cB1Je6GjPjAb14fApQwHqnN6elzYWNfYjeAUKY,1605
+ websearch/core/fetcher/fetcher.py,sha256=Lor6j9iUHUxd15JF40Yqg49ad61_doawNIm5xyajkM8,7643
+ websearch/core/search/__init__.py,sha256=wflO_nwTh_P7kx7GTnNHmTfXrDWJZfJd2PWgq_ekCxo,503
+ websearch/core/search/client.py,sha256=B3zlaJvEF3ykAOcgFM-bxCUNACSQrZFucCJI16rttSI,4685
+ websearch/core/search/search.py,sha256=CReu47M5rHMuLolru2HLJ02seTht8Uu66by_GMm0Dqw,5677
+ websearch/core/search/types.py,sha256=DX4aXOSFqunwtacFsE-NFyfq1y3QgwGx5RkxOeZgHXc,1102
+ websearch/core/types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ websearch/core/types/maybe.py,sha256=eKJEg3S9W5CKn9fwjf6VHkqwI_kiGfKSdnxnJ6ZNDz4,5090
+ websearch/core/types/result.py,sha256=DPJgrjh05EuTn5ZgEx6qlG6Ozr9qXoXxkKxqj4uzMuw,5754
+ better_websearch_cli-0.0.1.dist-info/METADATA,sha256=g1dnZNT0-ugJ0q6_8L2Qy11DsWKcWHQ8tml3CtyROnE,3758
+ better_websearch_cli-0.0.1.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
+ better_websearch_cli-0.0.1.dist-info/entry_points.txt,sha256=gDCbSYPCmbUYRzHhI7dpvO18VWq2Vnbq54VdNDOXczI,50
+ better_websearch_cli-0.0.1.dist-info/top_level.txt,sha256=D1fdg-HoBORVAOYW9PRiEIq8wqg7uUQMyhmDJv6X3S4,10
+ better_websearch_cli-0.0.1.dist-info/RECORD,,
better_websearch_cli-0.0.1.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: setuptools (82.0.1)
+ Root-Is-Purelib: true
+ Tag: py3-none-any
+
better_websearch_cli-0.0.1.dist-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
+ [console_scripts]
+ websearch = websearch.main:main
better_websearch_cli-0.0.1.dist-info/licenses/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 Websearch CLI
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
better_websearch_cli-0.0.1.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
+ websearch
websearch/__main__.py ADDED
@@ -0,0 +1,3 @@
+ from websearch.main import main
+
+ main()
File without changes
websearch/core/cache/__init__.py ADDED
@@ -0,0 +1,5 @@
+ """URL and search result cache."""
+
+ from websearch.core.cache.cache import Cache
+
+ __all__ = ["Cache"]
websearch/core/cache/cache.py ADDED
@@ -0,0 +1,254 @@
+ """Main cache interface."""
+
+ from __future__ import annotations
+
+ from pathlib import Path
+ from typing import Any
+
+ from websearch.core.cache.storage import CacheStorage
+ from websearch.core.cache.ttl import (
+     DEFAULT_SEARCH_TTL,
+     DEFAULT_URL_TTL,
+     get_search_ttl,
+     get_url_ttl,
+     is_expired,
+ )
+ from websearch.core.types.maybe import Just, Maybe, Nothing
+
+
+ class Cache:
+     """URL and search result cache."""
+
+     def __init__(
+         self,
+         cache_dir: Path | None = None,
+         enabled: bool = True,
+         max_size: int = 500 * 1024 * 1024,  # 500MB
+     ):
+         """Initialize cache.
+
+         Args:
+             cache_dir: Optional custom cache directory
+             enabled: Whether caching is enabled
+             max_size: Maximum cache size in bytes
+         """
+         self.storage = CacheStorage(cache_dir)
+         self.enabled = enabled
+         self.max_size = max_size
+
+     def get_url(self, url: str) -> Maybe[tuple[bytes, dict[str, Any]]]:
+         """Get cached URL content and metadata.
+
+         Args:
+             url: URL to retrieve
+
+         Returns:
+             Just((content, metadata)) on cache hit,
+             Nothing on cache miss or expired
+         """
+         if not self.enabled:
+             return Nothing()
+
+         content, metadata = self.storage.get_url(url)
+
+         if content is None or metadata is None:
+             return Nothing()
+
+         # Check expiration
+         cached_at = metadata.get("cached_at")
+         ttl = metadata.get("ttl", DEFAULT_URL_TTL)
+
+         if cached_at is None:
+             return Nothing()
+
+         from datetime import datetime
+
+         if isinstance(cached_at, str):
+             cached_at = datetime.fromisoformat(cached_at.replace("Z", "+00:00"))
+
+         if is_expired(cached_at, ttl):
+             return Nothing()
+
+         return Just((content, metadata))
+
+     def set_url(
+         self,
+         url: str,
+         content: bytes,
+         metadata: dict[str, Any] | None = None,
+         ttl: float | None = None,
+     ) -> None:
+         """Cache URL content.
+
+         Args:
+             url: URL being cached
+             content: HTML content
+             metadata: Optional additional metadata
+             ttl: Optional TTL override
+         """
+         if not self.enabled:
+             return
+
+         # Check size and evict if needed
+         self._evict_if_needed()
+
+         meta = metadata.copy() if metadata else {}
+         meta["url"] = url
+         meta["ttl"] = ttl or get_url_ttl()
+
+         self.storage.set_url(url, content, meta)
+
+     def get_search(
+         self, query: str, count: int, result_type: str = "web"
+     ) -> Maybe[dict[str, Any]]:
+         """Get cached search results.
+
+         Args:
+             query: Search query
+             count: Number of results
+             result_type: Type of results
+
+         Returns:
+             Just(results) on cache hit, Nothing otherwise
+         """
+         if not self.enabled:
+             return Nothing()
+
+         data = self.storage.get_search(query, count, result_type)
+
+         if data is None:
+             return Nothing()
+
+         metadata = data.get("metadata", {})
+         cached_at = metadata.get("cached_at")
+         ttl = metadata.get("ttl", DEFAULT_SEARCH_TTL)
+
+         if cached_at is None:
+             return Nothing()
+
+         from datetime import datetime
+
+         if isinstance(cached_at, str):
+             cached_at = datetime.fromisoformat(cached_at.replace("Z", "+00:00"))
+
+         if is_expired(cached_at, ttl):
+             return Nothing()
+
+         return Just(data.get("results", {}))
+
+     def set_search(
+         self,
+         query: str,
+         count: int,
+         result_type: str,
+         results: dict[str, Any],
+     ) -> None:
+         """Cache search results.
+
+         Args:
+             query: Search query
+             count: Number of results
+             result_type: Type of results
+             results: Results data
+         """
+         if not self.enabled:
+             return
+
+         ttl = get_search_ttl()
+         self.storage.set_search(query, count, result_type, results, ttl)
+
+     def is_fresh(self, url: str) -> bool:
+         """Check if URL is cached and fresh.
+
+         Args:
+             url: URL to check
+
+         Returns:
+             True if cached and not expired
+         """
+         return self.get_url(url).is_just()
+
+     def invalidate(self, url: str) -> bool:
+         """Invalidate cached URL.
+
+         Args:
+             url: URL to invalidate
+
+         Returns:
+             True if something was deleted
+         """
+         return self.storage.delete(url)
+
+     def clear(self) -> None:
+         """Clear all cache."""
+         self.storage.clear()
+
+     def stats(self) -> dict[str, Any]:
+         """Get cache statistics.
+
+         Returns:
+             Dict with size, url_count, search_count
+         """
+         size = self.storage.get_size()
+
+         url_count = 0
+         search_count = 0
+
+         storage = self.storage
+         if storage.url_dir.exists():
+             url_count = len(list(storage.url_dir.rglob("*.html")))
+
+         if storage.search_dir.exists():
+             search_count = len(list(storage.search_dir.rglob("*.json")))
+
+         return {
+             "size_bytes": size,
+             "size_mb": round(size / (1024 * 1024), 2),
+             "url_count": url_count,
+             "search_count": search_count,
+             "enabled": self.enabled,
+             "max_size_mb": self.max_size / (1024 * 1024),
+         }
+
+     def _evict_if_needed(self) -> None:
+         """Evict LRU entries if cache exceeds max size."""
+         size = self.storage.get_size()
+
+         if size < self.max_size:
+             return
+
+         # Collect all cached files with their last access time
+         files_with_time = []
+         if self.storage.url_dir.exists():
+             for path in self.storage.url_dir.rglob("*"):
+                 if path.is_file():
+                     files_with_time.append((path.stat().st_atime, path))
+
+         if self.storage.search_dir.exists():
+             for path in self.storage.search_dir.rglob("*.json"):
+                 if path.is_file():
+                     files_with_time.append((path.stat().st_atime, path))
+
+         # Sort by access time (oldest first)
+         files_with_time.sort(key=lambda x: x[0])
+
+         # Delete oldest files until under limit
+         current_size = size
+         for _, path in files_with_time:
+             if current_size < self.max_size * 0.9:  # Stop at 90% to avoid frequent eviction
+                 break
+             try:
+                 file_size = path.stat().st_size
+                 path.unlink()
+                 current_size -= file_size
+
+                 # Try to delete parent dirs if empty
+                 parent = path.parent
+                 while parent != self.storage.cache_dir and parent.exists():
+                     try:
+                         parent.rmdir()
+                     except OSError:
+                         break
+                     parent = parent.parent
+             except OSError:
+                 continue
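+
+
+ # Illustrative usage sketch, not part of the published module above; it exercises only the
+ # Cache API defined in this file, and the cache directory and URL are placeholder values.
+ if __name__ == "__main__":
+     cache = Cache(cache_dir=Path("/tmp/websearch-cache"), max_size=10 * 1024 * 1024)
+     cache.set_url("https://example.com", b"<html><body>hi</body></html>", metadata={"status": 200})
+     print(cache.is_fresh("https://example.com"))  # True while the entry is within its TTL
+     print(cache.stats())                          # size, URL/search entry counts, limits
+     cache.invalidate("https://example.com")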
websearch/core/cache/key.py ADDED
@@ -0,0 +1,99 @@
+ """Cache key generation and URL normalization."""
+
+ from __future__ import annotations
+
+ import hashlib
+ from pathlib import Path
+ from urllib.parse import parse_qsl, unquote, urlencode, urlparse
+
+
+ def normalize_url(url: str) -> str:
+     """Normalize URL for consistent caching.
+
+     Args:
+         url: Raw URL
+
+     Returns:
+         Normalized URL with lowercase scheme/domain, decoded path, sorted query
+     """
+     parsed = urlparse(url)
+
+     # Lowercase scheme and netloc
+     scheme = parsed.scheme.lower()
+     netloc = parsed.netloc.lower()
+
+     # Remove default ports
+     if (scheme == "http" and netloc.endswith(":80")) or (
+         scheme == "https" and netloc.endswith(":443")
+     ):
+         netloc = netloc.rsplit(":", 1)[0]
+
+     # Decode path and lowercase it for case-insensitive matching
+     path = unquote(parsed.path).lower()
+     if not path:
+         path = "/"
+
+     # Normalize path (remove trailing slash except for root)
+     if path != "/" and path.endswith("/"):
+         path = path.rstrip("/")
+
+     # Build query with sorted params
+     query = urlencode(sorted(parse_qsl(parsed.query, keep_blank_values=True)))
+
+     # Rebuild URL
+     normalized = f"{scheme}://{netloc}{path}"
+     if query:
+         normalized += f"?{query}"
+
+     return normalized
+
+
+ def get_cache_key(url: str) -> Path:
+     """Get filesystem path for URL cache.
+
+     Args:
+         url: Normalized URL
+
+     Returns:
+         Path relative to cache directory
+     """
+     normalized = normalize_url(url)
+     parsed = urlparse(normalized)
+
+     domain = parsed.netloc
+     path = parsed.path.lstrip("/")
+
+     if not path:
+         path = "index.html"
+     elif not path.endswith(".html"):
+         path = path + "/index.html"
+
+     return Path(domain) / path
+
+
+ def get_url_hash(url: str) -> str:
+     """Get SHA256 hash of URL for search cache keys.
+
+     Args:
+         url: URL to hash
+
+     Returns:
+         Short hex hash (8 characters)
+     """
+     normalized = normalize_url(url)
+     return hashlib.sha256(normalized.encode()).hexdigest()[:8]
+
+
+ def get_search_key(query: str, count: int, result_type: str = "web") -> str:
+     """Get cache filename for search results.
+
+     Args:
+         query: Search query
+         count: Number of results
+         result_type: Type of results (web, news, etc.)
+
+     Returns:
+         Cache filename like "dc9a8f5_10_web.json"
+     """
+     query_hash = get_url_hash(query)
+     return f"{query_hash}_{count}_{result_type}.json"
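+
+
+ # Illustrative usage sketch, not part of the published module above; it shows how the helpers
+ # defined in this file normalize URLs and derive cache locations. The example URL is a
+ # placeholder and the hash prefix varies with the input.
+ if __name__ == "__main__":
+     print(normalize_url("HTTPS://Example.COM:443/Docs/"))  # https://example.com/docs
+     print(get_cache_key("https://example.com/docs"))       # example.com/docs/index.html
+     print(get_search_key("python async tutorial", 10))     # "<8-char hash>_10_web.json"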