better-websearch-cli 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- better_websearch_cli-0.0.1.dist-info/METADATA +148 -0
- better_websearch_cli-0.0.1.dist-info/RECORD +30 -0
- better_websearch_cli-0.0.1.dist-info/WHEEL +5 -0
- better_websearch_cli-0.0.1.dist-info/entry_points.txt +2 -0
- better_websearch_cli-0.0.1.dist-info/licenses/LICENSE +21 -0
- better_websearch_cli-0.0.1.dist-info/top_level.txt +1 -0
- websearch/__main__.py +3 -0
- websearch/core/__init__.py +0 -0
- websearch/core/cache/__init__.py +5 -0
- websearch/core/cache/cache.py +254 -0
- websearch/core/cache/key.py +99 -0
- websearch/core/cache/storage.py +202 -0
- websearch/core/cache/ttl.py +73 -0
- websearch/core/converter/__init__.py +5 -0
- websearch/core/converter/converter.py +104 -0
- websearch/core/converter/encoding.py +20 -0
- websearch/core/converter/security.py +30 -0
- websearch/core/fetcher/__init__.py +41 -0
- websearch/core/fetcher/backoff.py +24 -0
- websearch/core/fetcher/detection.py +52 -0
- websearch/core/fetcher/errors.py +100 -0
- websearch/core/fetcher/fetcher.py +222 -0
- websearch/core/search/__init__.py +23 -0
- websearch/core/search/client.py +164 -0
- websearch/core/search/search.py +196 -0
- websearch/core/search/types.py +50 -0
- websearch/core/types/__init__.py +0 -0
- websearch/core/types/maybe.py +185 -0
- websearch/core/types/result.py +209 -0
- websearch/main.py +174 -0
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: better-websearch-cli
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: CLI tool to fetch URLs and search the web
|
|
5
|
+
Requires-Python: >=3.14
|
|
6
|
+
Description-Content-Type: text/markdown
|
|
7
|
+
License-File: LICENSE
|
|
8
|
+
Requires-Dist: typer[all]>=0.12
|
|
9
|
+
Requires-Dist: httpx>=0.25
|
|
10
|
+
Requires-Dist: selectolax>=0.3
|
|
11
|
+
Requires-Dist: markdownify>=0.12
|
|
12
|
+
Requires-Dist: rich>=13.0
|
|
13
|
+
Provides-Extra: dev
|
|
14
|
+
Requires-Dist: pytest; extra == "dev"
|
|
15
|
+
Requires-Dist: ruff; extra == "dev"
|
|
16
|
+
Requires-Dist: mypy; extra == "dev"
|
|
17
|
+
Provides-Extra: spa
|
|
18
|
+
Requires-Dist: playwright>=1.40; extra == "spa"
|
|
19
|
+
Dynamic: license-file
|
|
20
|
+
|
|
21
|
+
<p align="center">
|
|
22
|
+
<picture>
|
|
23
|
+
<source media="(prefers-color-scheme: dark)" srcset="public/banner.jpg">
|
|
24
|
+
<source media="(prefers-color-scheme: light)" srcset="public/banner.jpg">
|
|
25
|
+
<img src="public/banner.jpg" alt="Websearch CLI Logo" width="100%">
|
|
26
|
+
</picture>
|
|
27
|
+
</p>
|
|
28
|
+
|
|
29
|
+
<h1 align="center">Websearch CLI</h1>
|
|
30
|
+
|
|
31
|
+
<p align="center">
|
|
32
|
+
Fetch URLs and search the web from your terminal. Fast, simple, and extensible.
|
|
33
|
+
</p>
|
|
34
|
+
|
|
35
|
+
<p align="center">
|
|
36
|
+
<a href="https://pypi.org/project/websearch/">
|
|
37
|
+
<img src="https://img.shields.io/pypi/v/websearch" alt="PyPI Version">
|
|
38
|
+
</a>
|
|
39
|
+
<a href="https://github.com/AliiiBenn/websearch">
|
|
40
|
+
<img src="https://img.shields.io/github/license/AliiiBenn/websearch" alt="License">
|
|
41
|
+
</a>
|
|
42
|
+
<a href="https://github.com/AliiiBenn/websearch/actions">
|
|
43
|
+
<img src="https://img.shields.io/github/actions/workflow/status/AliiiBenn/websearch/test" alt="Tests">
|
|
44
|
+
</a>
|
|
45
|
+
<a href="https://www.python.org/">
|
|
46
|
+
<img src="https://img.shields.io/badge/python-3.14+-blue" alt="Python">
|
|
47
|
+
</a>
|
|
48
|
+
</p>
|
|
49
|
+
|
|
50
|
+
Fetch web pages and search the internet - all from your command line.
|
|
51
|
+
|
|
52
|
+
## Why Websearch CLI?
|
|
53
|
+
|
|
54
|
+
- **Fast** - Async HTTP powered by httpx
|
|
55
|
+
- **Smart caching** - Local cache with TTL and automatic eviction
|
|
56
|
+
- **Clean output** - HTML to Markdown conversion with XSS protection
|
|
57
|
+
- **Type-safe** - 100% type-annotated Python
|
|
58
|
+
|
|
59
|
+
## Features
|
|
60
|
+
|
|
61
|
+
- Fetch URLs and convert to Markdown
|
|
62
|
+
- Web search via Brave Search API
|
|
63
|
+
- Local caching with TTL and size limits
|
|
64
|
+
- XSS protection and HTML sanitization
|
|
65
|
+
- SPA (Single Page App) detection
|
|
66
|
+
- Retry with exponential backoff
|
|
67
|
+
|
|
68
|
+
## Quick Start
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
# Install
|
|
72
|
+
uv add better-websearch-cli
|
|
73
|
+
|
|
74
|
+
# Fetch a URL as Markdown
|
|
75
|
+
websearch fetch https://example.com
|
|
76
|
+
|
|
77
|
+
# Search the web
|
|
78
|
+
websearch search "python async tutorial"
|
|
79
|
+
|
|
80
|
+
# Fetch with options
|
|
81
|
+
websearch fetch https://example.com --no-cache --verbose
|
|
82
|
+
websearch search "python" -n 20 -t news
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
## Command Overview
|
|
86
|
+
|
|
87
|
+
| Command | Description |
|
|
88
|
+
|---------|-------------|
|
|
89
|
+
| `websearch fetch <url>` | Fetch URL and convert to Markdown |
|
|
90
|
+
| `websearch search <query>` | Search the web |
|
|
91
|
+
| `websearch ping` | Check if CLI is working |
|
|
92
|
+
|
|
93
|
+
## Fetch Options
|
|
94
|
+
|
|
95
|
+
```
|
|
96
|
+
--refresh, -r Skip cache and force fresh fetch
|
|
97
|
+
--no-cache Disable caching
|
|
98
|
+
--no-verify Skip SSL certificate verification
|
|
99
|
+
--output, -o PATH Output file path
|
|
100
|
+
--verbose, -v Show verbose output
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
## Search Options
|
|
104
|
+
|
|
105
|
+
```
|
|
106
|
+
--count, -n <n> Number of results (1-50, default: 10)
|
|
107
|
+
--type, -t <type> Result type: web, news, images, videos
|
|
108
|
+
--output, -o PATH Output file path
|
|
109
|
+
--json Output raw JSON response
|
|
110
|
+
--no-cache Disable caching
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
## Configuration
|
|
114
|
+
|
|
115
|
+
Set your Brave API key:
|
|
116
|
+
|
|
117
|
+
```bash
|
|
118
|
+
export BRAVE_API_KEY=your_api_key_here
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
Get your API key at https://brave.com/search/api/
|
|
122
|
+
|
|
123
|
+
## Development
|
|
124
|
+
|
|
125
|
+
```bash
|
|
126
|
+
# Clone and install
|
|
127
|
+
git clone https://github.com/AliiiBenn/websearch.git
|
|
128
|
+
cd websearch
|
|
129
|
+
uv sync --dev
|
|
130
|
+
|
|
131
|
+
# Run tests
|
|
132
|
+
uv run pytest
|
|
133
|
+
|
|
134
|
+
# Lint and type-check
|
|
135
|
+
uv run ruff check websearch/
|
|
136
|
+
uv run mypy websearch/
|
|
137
|
+
|
|
138
|
+
# Try it out
|
|
139
|
+
uv run websearch fetch https://example.com
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
## Contributing
|
|
143
|
+
|
|
144
|
+
Contributions are welcome! Feel free to open issues or submit PRs.
|
|
145
|
+
|
|
146
|
+
## License
|
|
147
|
+
|
|
148
|
+
MIT - See [LICENSE](LICENSE) for details.
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
better_websearch_cli-0.0.1.dist-info/licenses/LICENSE,sha256=S69ieINMLlsFsxDUi1x2dDdYTJ5pUgg7bFV3UFDra3M,1070
|
|
2
|
+
websearch/__main__.py,sha256=jJZn0xqgiVJv8ZRITXEltYjcVpHCWwWJgoGmC1Db6lc,40
|
|
3
|
+
websearch/main.py,sha256=Y6oqX_rSsjXGvx0dxjI4SyXCdnzT9wWpyINRl2XM2c0,5652
|
|
4
|
+
websearch/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
5
|
+
websearch/core/cache/__init__.py,sha256=L6JdU6OL9P1jWBQV89ZmlH9TcupXr5T-_Dqa3ppJHsk,102
|
|
6
|
+
websearch/core/cache/cache.py,sha256=yZxzZpLKy8to9z9ZOp8QZGBQJnmLoD98tK7y2zg7DNo,6985
|
|
7
|
+
websearch/core/cache/key.py,sha256=uOWX4hNdYoryS0WePo1FfhJPfXrn_3SJg7NFKUuX4Fo,2353
|
|
8
|
+
websearch/core/cache/storage.py,sha256=PU1DPTXF_iBVlc3REtmr0kEzhTrmhq6HEsHEZJFXOLI,5873
|
|
9
|
+
websearch/core/cache/ttl.py,sha256=ACZEPsgDlwaVnsXxBTrbPvxiM_icrBDfmMAOqpkdkyo,1614
|
|
10
|
+
websearch/core/converter/__init__.py,sha256=HRyFronkvSTIfv3Ey9Ia8VY33JKWxGzIsIhtO9c2M2E,137
|
|
11
|
+
websearch/core/converter/converter.py,sha256=eBMOYU2C-B8nQpSZYPYdiHG5YY8kDA9aCMFQnPMDjyM,3110
|
|
12
|
+
websearch/core/converter/encoding.py,sha256=_vbQ6iYNJEmFmBLmxwqNPBW_DnvFiCiGudbmx1_gc1o,451
|
|
13
|
+
websearch/core/converter/security.py,sha256=FjYkEtedEKAcuhS6xbPibVA7z02sQeMBGEFxtKTweGE,763
|
|
14
|
+
websearch/core/fetcher/__init__.py,sha256=SWrxKOnqCz3eP8JvOeF6XibFFe8pfhNYRtcl3sRkGzs,927
|
|
15
|
+
websearch/core/fetcher/backoff.py,sha256=jdERTOLFVXGHrVDXx1nxWlubg_zL_Nj_EvzyfCKB2Hg,582
|
|
16
|
+
websearch/core/fetcher/detection.py,sha256=7p4SUFZI71AMG7qKRFMT2yyDiV8fygzDKeMiaIGneBk,1058
|
|
17
|
+
websearch/core/fetcher/errors.py,sha256=o5Cj-cB1Je6GjPjAb14fApQwHqnN6elzYWNfYjeAUKY,1605
|
|
18
|
+
websearch/core/fetcher/fetcher.py,sha256=Lor6j9iUHUxd15JF40Yqg49ad61_doawNIm5xyajkM8,7643
|
|
19
|
+
websearch/core/search/__init__.py,sha256=wflO_nwTh_P7kx7GTnNHmTfXrDWJZfJd2PWgq_ekCxo,503
|
|
20
|
+
websearch/core/search/client.py,sha256=B3zlaJvEF3ykAOcgFM-bxCUNACSQrZFucCJI16rttSI,4685
|
|
21
|
+
websearch/core/search/search.py,sha256=CReu47M5rHMuLolru2HLJ02seTht8Uu66by_GMm0Dqw,5677
|
|
22
|
+
websearch/core/search/types.py,sha256=DX4aXOSFqunwtacFsE-NFyfq1y3QgwGx5RkxOeZgHXc,1102
|
|
23
|
+
websearch/core/types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
24
|
+
websearch/core/types/maybe.py,sha256=eKJEg3S9W5CKn9fwjf6VHkqwI_kiGfKSdnxnJ6ZNDz4,5090
|
|
25
|
+
websearch/core/types/result.py,sha256=DPJgrjh05EuTn5ZgEx6qlG6Ozr9qXoXxkKxqj4uzMuw,5754
|
|
26
|
+
better_websearch_cli-0.0.1.dist-info/METADATA,sha256=g1dnZNT0-ugJ0q6_8L2Qy11DsWKcWHQ8tml3CtyROnE,3758
|
|
27
|
+
better_websearch_cli-0.0.1.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
28
|
+
better_websearch_cli-0.0.1.dist-info/entry_points.txt,sha256=gDCbSYPCmbUYRzHhI7dpvO18VWq2Vnbq54VdNDOXczI,50
|
|
29
|
+
better_websearch_cli-0.0.1.dist-info/top_level.txt,sha256=D1fdg-HoBORVAOYW9PRiEIq8wqg7uUQMyhmDJv6X3S4,10
|
|
30
|
+
better_websearch_cli-0.0.1.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Websearch CLI
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
websearch
|
websearch/__main__.py
ADDED
|
File without changes
|
|
@@ -0,0 +1,254 @@
|
|
|
1
|
+
"""Main cache interface."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from websearch.core.cache.storage import CacheStorage
|
|
9
|
+
from websearch.core.cache.ttl import (
|
|
10
|
+
DEFAULT_SEARCH_TTL,
|
|
11
|
+
DEFAULT_URL_TTL,
|
|
12
|
+
get_search_ttl,
|
|
13
|
+
get_url_ttl,
|
|
14
|
+
is_expired,
|
|
15
|
+
)
|
|
16
|
+
from websearch.core.types.maybe import Just, Maybe, Nothing
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class Cache:
    """URL and search-result cache with TTL expiration and LRU size eviction."""

    def __init__(
        self,
        cache_dir: Path | None = None,
        enabled: bool = True,
        max_size: int = 500 * 1024 * 1024,  # 500MB
    ):
        """Initialize cache.

        Args:
            cache_dir: Optional custom cache directory
            enabled: Whether caching is enabled
            max_size: Maximum cache size in bytes
        """
        self.storage = CacheStorage(cache_dir)
        self.enabled = enabled
        self.max_size = max_size

    @staticmethod
    def _entry_expired(cached_at: Any, ttl: float) -> bool:
        """Return True when a cache entry's timestamp is missing or past its TTL.

        Shared by get_url and get_search, which previously duplicated this
        parsing logic inline.

        Args:
            cached_at: Timestamp from entry metadata (datetime, ISO string, or None)
            ttl: Time-to-live in seconds

        Returns:
            True if the entry must be treated as a cache miss
        """
        if cached_at is None:
            # No timestamp recorded: safer to refetch than serve stale data.
            return True
        if isinstance(cached_at, str):
            from datetime import datetime

            # Timestamps may be serialized as ISO strings; rewrite a trailing
            # "Z" (UTC) so datetime.fromisoformat accepts it.
            cached_at = datetime.fromisoformat(cached_at.replace("Z", "+00:00"))
        return is_expired(cached_at, ttl)

    def get_url(self, url: str) -> Maybe[tuple[bytes, dict[str, Any]]]:
        """Get cached URL content and metadata.

        Args:
            url: URL to retrieve

        Returns:
            Just((content, metadata)) on cache hit,
            Nothing on cache miss or expired
        """
        if not self.enabled:
            return Nothing()

        content, metadata = self.storage.get_url(url)
        if content is None or metadata is None:
            return Nothing()

        if self._entry_expired(
            metadata.get("cached_at"), metadata.get("ttl", DEFAULT_URL_TTL)
        ):
            return Nothing()

        return Just((content, metadata))

    def set_url(
        self,
        url: str,
        content: bytes,
        metadata: dict[str, Any] | None = None,
        ttl: float | None = None,
    ) -> None:
        """Cache URL content.

        Args:
            url: URL being cached
            content: HTML content
            metadata: Optional additional metadata
            ttl: Optional TTL override in seconds; None uses the default URL TTL
        """
        if not self.enabled:
            return

        # Make room before writing so the fresh entry is never evicted first.
        self._evict_if_needed()

        meta = metadata.copy() if metadata else {}
        meta["url"] = url
        # `ttl or get_url_ttl()` would silently discard an explicit ttl of 0;
        # compare against None so any caller-supplied value is honored.
        meta["ttl"] = ttl if ttl is not None else get_url_ttl()

        self.storage.set_url(url, content, meta)

    def get_search(
        self, query: str, count: int, result_type: str = "web"
    ) -> Maybe[dict[str, Any]]:
        """Get cached search results.

        Args:
            query: Search query
            count: Number of results
            result_type: Type of results

        Returns:
            Just(results) on cache hit, Nothing otherwise
        """
        if not self.enabled:
            return Nothing()

        data = self.storage.get_search(query, count, result_type)
        if data is None:
            return Nothing()

        metadata = data.get("metadata", {})
        if self._entry_expired(
            metadata.get("cached_at"), metadata.get("ttl", DEFAULT_SEARCH_TTL)
        ):
            return Nothing()

        return Just(data.get("results", {}))

    def set_search(
        self,
        query: str,
        count: int,
        result_type: str,
        results: dict[str, Any],
    ) -> None:
        """Cache search results.

        Args:
            query: Search query
            count: Number of results
            result_type: Type of results
            results: Results data
        """
        if not self.enabled:
            return

        ttl = get_search_ttl()
        self.storage.set_search(query, count, result_type, results, ttl)

    def is_fresh(self, url: str) -> bool:
        """Check if URL is cached and fresh.

        Args:
            url: URL to check

        Returns:
            True if cached and not expired
        """
        return self.get_url(url).is_just()

    def invalidate(self, url: str) -> bool:
        """Invalidate cached URL.

        Args:
            url: URL to invalidate

        Returns:
            True if something was deleted
        """
        return self.storage.delete(url)

    def clear(self) -> None:
        """Clear all cache."""
        self.storage.clear()

    def stats(self) -> dict[str, Any]:
        """Get cache statistics.

        Returns:
            Dict with size, url_count, search_count
        """
        size = self.storage.get_size()

        url_count = 0
        search_count = 0

        storage = self.storage
        if storage.url_dir.exists():
            url_count = len(list(storage.url_dir.rglob("*.html")))

        if storage.search_dir.exists():
            search_count = len(list(storage.search_dir.rglob("*.json")))

        return {
            "size_bytes": size,
            "size_mb": round(size / (1024 * 1024), 2),
            "url_count": url_count,
            "search_count": search_count,
            "enabled": self.enabled,
            "max_size_mb": self.max_size / (1024 * 1024),
        }

    def _evict_if_needed(self) -> None:
        """Evict least-recently-accessed entries while over the size limit."""
        size = self.storage.get_size()

        if size < self.max_size:
            return

        # Collect all cached files with their last access time (st_atime).
        files_with_time: list[tuple[float, Path]] = []
        for root, pattern in (
            (self.storage.url_dir, "*"),
            (self.storage.search_dir, "*.json"),
        ):
            if root.exists():
                for path in root.rglob(pattern):
                    if path.is_file():
                        files_with_time.append((path.stat().st_atime, path))

        # Sort by access time (oldest first).
        files_with_time.sort(key=lambda item: item[0])

        # Delete oldest files until under limit.
        current_size = size
        for _, path in files_with_time:
            if current_size < self.max_size * 0.9:  # Stop at 90% to avoid frequent eviction
                break
            try:
                file_size = path.stat().st_size
                path.unlink()
                current_size -= file_size

                # Prune now-empty parent directories up to the cache root.
                parent = path.parent
                while parent != self.storage.cache_dir and parent.exists():
                    try:
                        parent.rmdir()
                    except OSError:
                        break  # Directory not empty; stop climbing.
                    parent = parent.parent
            except OSError:
                # File vanished or is locked; skip and keep evicting.
                continue
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
"""Cache key generation and URL normalization."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import hashlib
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from urllib.parse import unquote, urlparse
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def normalize_url(url: str) -> str:
    """Normalize URL for consistent caching.

    Args:
        url: Raw URL

    Returns:
        Normalized URL with lowercase scheme/domain, decoded lowercase path,
        default ports removed, and query parameters sorted by key
    """
    from urllib.parse import parse_qsl, urlencode

    parsed = urlparse(url)

    # Lowercase scheme and netloc (both are case-insensitive per RFC 3986).
    scheme = parsed.scheme.lower()
    netloc = parsed.netloc.lower()

    # Remove default ports so "host:80" and "host" share one cache entry.
    if (scheme == "http" and netloc.endswith(":80")) or (
        scheme == "https" and netloc.endswith(":443")
    ):
        netloc = netloc.rsplit(":", 1)[0]

    # Decode path and lowercase it for case-insensitive matching.
    path = unquote(parsed.path).lower()
    if not path:
        path = "/"

    # Normalize path (remove trailing slash except for root).
    if path != "/" and path.endswith("/"):
        path = path.rstrip("/")

    # Sort query parameters so "?b=2&a=1" and "?a=1&b=2" cache identically.
    # (The original code documented sorting but passed the query through as-is.)
    query = parsed.query
    if query:
        pairs = sorted(parse_qsl(query, keep_blank_values=True))
        query = urlencode(pairs)

    # Rebuild URL.
    normalized = f"{scheme}://{netloc}{path}"
    if query:
        normalized += f"?{query}"

    return normalized
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def get_cache_key(url: str) -> Path:
    """Get filesystem path for URL cache.

    Args:
        url: URL (normalized internally)

    Returns:
        Path relative to cache directory

    Note:
        Path segments equal to "." or ".." are dropped so a crafted URL
        (e.g. "https://host/%2e%2e/%2e%2e/etc/x") cannot escape the cache
        directory via traversal after percent-decoding.
    """
    normalized = normalize_url(url)
    parsed = urlparse(normalized)

    domain = parsed.netloc
    # Keep only safe, non-empty segments to prevent directory traversal.
    segments = [s for s in parsed.path.split("/") if s and s not in (".", "..")]
    path = "/".join(segments)

    if not path:
        path = "index.html"
    elif not path.endswith(".html"):
        path = path + "/index.html"

    return Path(domain) / path
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def get_url_hash(url: str) -> str:
    """Get SHA256 hash of URL for search cache keys.

    Args:
        url: URL to hash

    Returns:
        Short hex hash (8 characters)
    """
    digest = hashlib.sha256(normalize_url(url).encode())
    return digest.hexdigest()[:8]
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def get_search_key(query: str, count: int, result_type: str = "web") -> str:
    """Build the cache filename for a search-results entry.

    Args:
        query: Search query
        count: Number of results
        result_type: Type of results (web, news, etc.)

    Returns:
        Cache filename like "dc9a8f5_10_web.json"
    """
    prefix = get_url_hash(query)
    return f"{prefix}_{count}_{result_type}.json"
|