tofu-search 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tofu_search-0.2.0/LICENSE +21 -0
- tofu_search-0.2.0/PKG-INFO +247 -0
- tofu_search-0.2.0/README.md +210 -0
- tofu_search-0.2.0/pyproject.toml +45 -0
- tofu_search-0.2.0/setup.cfg +4 -0
- tofu_search-0.2.0/tofu_search/__init__.py +147 -0
- tofu_search-0.2.0/tofu_search/config.py +112 -0
- tofu_search-0.2.0/tofu_search/fetch/__init__.py +17 -0
- tofu_search-0.2.0/tofu_search/fetch/content_filter.py +226 -0
- tofu_search-0.2.0/tofu_search/fetch/core.py +443 -0
- tofu_search-0.2.0/tofu_search/fetch/html_extract.py +404 -0
- tofu_search-0.2.0/tofu_search/fetch/http.py +306 -0
- tofu_search-0.2.0/tofu_search/fetch/interactive_login.py +151 -0
- tofu_search-0.2.0/tofu_search/fetch/pdf_extract.py +140 -0
- tofu_search-0.2.0/tofu_search/fetch/playwright_pool.py +676 -0
- tofu_search-0.2.0/tofu_search/fetch/utils.py +627 -0
- tofu_search-0.2.0/tofu_search/http_client.py +48 -0
- tofu_search-0.2.0/tofu_search/llm_adapter.py +128 -0
- tofu_search-0.2.0/tofu_search/log.py +36 -0
- tofu_search-0.2.0/tofu_search/providers.py +110 -0
- tofu_search-0.2.0/tofu_search/search/__init__.py +22 -0
- tofu_search-0.2.0/tofu_search/search/_common.py +127 -0
- tofu_search-0.2.0/tofu_search/search/browser_fallback.py +37 -0
- tofu_search-0.2.0/tofu_search/search/dedup.py +92 -0
- tofu_search-0.2.0/tofu_search/search/deepen.py +170 -0
- tofu_search-0.2.0/tofu_search/search/engines/__init__.py +1 -0
- tofu_search-0.2.0/tofu_search/search/engines/bing.py +153 -0
- tofu_search-0.2.0/tofu_search/search/engines/brave.py +81 -0
- tofu_search-0.2.0/tofu_search/search/engines/ddg.py +110 -0
- tofu_search-0.2.0/tofu_search/search/engines/marginalia.py +61 -0
- tofu_search-0.2.0/tofu_search/search/engines/searxng.py +165 -0
- tofu_search-0.2.0/tofu_search/search/engines/xhs.py +138 -0
- tofu_search-0.2.0/tofu_search/search/format.py +61 -0
- tofu_search-0.2.0/tofu_search/search/orchestrator.py +460 -0
- tofu_search-0.2.0/tofu_search/search/rerank.py +184 -0
- tofu_search-0.2.0/tofu_search/search/vertical.py +1053 -0
- tofu_search-0.2.0/tofu_search.egg-info/PKG-INFO +247 -0
- tofu_search-0.2.0/tofu_search.egg-info/SOURCES.txt +39 -0
- tofu_search-0.2.0/tofu_search.egg-info/dependency_links.txt +1 -0
- tofu_search-0.2.0/tofu_search.egg-info/requires.txt +21 -0
- tofu_search-0.2.0/tofu_search.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024-2026 Tofu Team
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: tofu-search
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Multi-engine web search, vertical lookups, and content fetching with optional LLM filtering — standalone library from the Tofu AI assistant
|
|
5
|
+
Author: Tofu Team
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/rangehow/tofu-search
|
|
8
|
+
Project-URL: Documentation, https://github.com/rangehow/tofu-search#readme
|
|
9
|
+
Classifier: Development Status :: 4 - Beta
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
|
|
16
|
+
Requires-Python: >=3.10
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
License-File: LICENSE
|
|
19
|
+
Requires-Dist: requests>=2.28
|
|
20
|
+
Requires-Dist: trafilatura>=1.6
|
|
21
|
+
Requires-Dist: beautifulsoup4>=4.12
|
|
22
|
+
Requires-Dist: python-dateutil>=2.8
|
|
23
|
+
Requires-Dist: urllib3>=1.26
|
|
24
|
+
Requires-Dist: lxml>=4.9
|
|
25
|
+
Provides-Extra: playwright
|
|
26
|
+
Requires-Dist: playwright>=1.40; extra == "playwright"
|
|
27
|
+
Provides-Extra: pdf
|
|
28
|
+
Requires-Dist: pymupdf>=1.23; extra == "pdf"
|
|
29
|
+
Requires-Dist: pymupdf4llm>=0.0.5; extra == "pdf"
|
|
30
|
+
Provides-Extra: all
|
|
31
|
+
Requires-Dist: playwright>=1.40; extra == "all"
|
|
32
|
+
Requires-Dist: pymupdf>=1.23; extra == "all"
|
|
33
|
+
Requires-Dist: pymupdf4llm>=0.0.5; extra == "all"
|
|
34
|
+
Provides-Extra: server
|
|
35
|
+
Requires-Dist: flask>=3.0; extra == "server"
|
|
36
|
+
Dynamic: license-file
|
|
37
|
+
|
|
38
|
+
# 🔍 tofu-search
|
|
39
|
+
|
|
40
|
+
**Multi-engine web search + content fetching with optional LLM filtering** — a
|
|
41
|
+
standalone Python library extracted from the [Tofu AI assistant](https://github.com/rangehow/tofu-search).
|
|
42
|
+
|
|
43
|
+
This is a full re-extraction that keeps **100% of Tofu's current search/fetch
|
|
44
|
+
capabilities**: every engine, the structured "vertical" lookups, one-hop
|
|
45
|
+
deepening, the SPA/bot-protection Playwright fallback, authenticated-source
|
|
46
|
+
fetching, and the host-browser fallback — the last two exposed through
|
|
47
|
+
optional [provider seams](#host-integration-provider-seams) so the library
|
|
48
|
+
stays free of host-application dependencies when used standalone.
|
|
49
|
+
|
|
50
|
+
## Features
|
|
51
|
+
|
|
52
|
+
- **Multi-engine search (parallel)**: DuckDuckGo (HTML + API), Brave, Bing,
|
|
53
|
+
SearXNG, Marginalia — plus Xiaohongshu when an auth-source provider supplies
|
|
54
|
+
a logged-in session.
|
|
55
|
+
- **Vertical / structured search**: auto-detects CVE IDs, arXiv IDs, DOIs,
|
|
56
|
+
stock tickers, PyPI/npm packages, GitHub repos, IP addresses, Hugging Face
|
|
57
|
+
daily papers, and Semantic Scholar related-work — answered from the relevant
|
|
58
|
+
free API alongside web results.
|
|
59
|
+
- **Content deduplication**: Jaccard similarity on shingles (CJK + Latin aware).
|
|
60
|
+
- **Concurrent page fetching**: Race-to-N strategy with SSL fallback + a
|
|
61
|
+
per-domain circuit breaker.
|
|
62
|
+
- **One-hop deepening** *(opt-in)*: follow the best query-relevant outbound
|
|
63
|
+
links one hop deeper, bounded like a crawl budget.
|
|
64
|
+
- **LLM content filter** *(optional)*: relevance verdict + noise removal. When
|
|
65
|
+
no LLM is configured the step is silently skipped (raw text returned as-is).
|
|
66
|
+
- **BM25 reranking**: pure-Python, no external API calls.
|
|
67
|
+
- **SPA / bot-protection support**: optional Playwright fallback for
|
|
68
|
+
JS-rendered and challenge pages.
|
|
69
|
+
- **PDF extraction**: optional pymupdf / pymupdf4llm integration.
|
|
70
|
+
- **Host integration seams**: register a browser provider (fetch/search via a
|
|
71
|
+
real browser the user controls) and an auth-source provider (cookies/proxy
|
|
72
|
+
for login-walled domains) — both no-ops by default.
|
|
73
|
+
|
|
74
|
+
## Quick Start
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
pip install tofu-search
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
### Basic search (no LLM required)
|
|
81
|
+
|
|
82
|
+
```python
|
|
83
|
+
from tofu_search import search
|
|
84
|
+
|
|
85
|
+
results = search("Python asyncio tutorial")
|
|
86
|
+
for r in results:
|
|
87
|
+
print(f"{r['title']}: {r['url']}")
|
|
88
|
+
if r.get('full_content'):
|
|
89
|
+
print(f" {r['full_content'][:200]}...")
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
### With OpenAI content filtering
|
|
93
|
+
|
|
94
|
+
```python
|
|
95
|
+
from tofu_search import search, configure
|
|
96
|
+
|
|
97
|
+
configure(
|
|
98
|
+
llm_api_key="sk-...",
|
|
99
|
+
llm_base_url="https://api.openai.com/v1",
|
|
100
|
+
llm_model="gpt-4o-mini",
|
|
101
|
+
)
|
|
102
|
+
|
|
103
|
+
results = search("Python asyncio tutorial")
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
### With a custom LLM callable
|
|
107
|
+
|
|
108
|
+
```python
|
|
109
|
+
from tofu_search import search, configure
|
|
110
|
+
|
|
111
|
+
def my_llm(messages, **kwargs):
|
|
112
|
+
# Your LLM call — receives OpenAI-format messages.
|
|
113
|
+
# kwargs may include: stop, temperature, timeout
|
|
114
|
+
return "response text"
|
|
115
|
+
|
|
116
|
+
configure(llm_function=my_llm)
|
|
117
|
+
results = search("Python asyncio tutorial")
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
### Fetch a single URL
|
|
121
|
+
|
|
122
|
+
```python
|
|
123
|
+
from tofu_search import fetch_url
|
|
124
|
+
|
|
125
|
+
content = fetch_url("https://example.com")
|
|
126
|
+
if content:
|
|
127
|
+
print(f"Got {len(content)} characters")
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
### Vertical (structured-identifier) search
|
|
131
|
+
|
|
132
|
+
```python
|
|
133
|
+
from tofu_search import detect_vertical_intent, search_vertical
|
|
134
|
+
|
|
135
|
+
domain, identifier, params = detect_vertical_intent("CVE-2021-44228")
|
|
136
|
+
record = search_vertical(domain, identifier, params)
|
|
137
|
+
print(record['content']) # CVSS score, description, references from NVD
|
|
138
|
+
|
|
139
|
+
# Or force a domain-level fan-out (free-text → Hugging Face + Semantic Scholar):
|
|
140
|
+
from tofu_search import search_vertical_domain
|
|
141
|
+
print(search_vertical_domain('academic', 'mamba state space models')['content'])
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
## Host integration (provider seams)
|
|
145
|
+
|
|
146
|
+
The standalone library never imports a host application. To unlock the two
|
|
147
|
+
host-only capabilities, register a provider — dependency points inward (host →
|
|
148
|
+
library), exactly like a plugin.
|
|
149
|
+
|
|
150
|
+
```python
|
|
151
|
+
from tofu_search import (
|
|
152
|
+
BrowserProvider, AuthSourceProvider,
|
|
153
|
+
register_browser_provider, register_auth_source_provider,
|
|
154
|
+
)
|
|
155
|
+
|
|
156
|
+
class MyBrowser(BrowserProvider):
|
|
157
|
+
def is_connected(self): return True
|
|
158
|
+
def fetch_url(self, url, *, max_chars=None, timeout=15): ...
|
|
159
|
+
def search(self, query, *, max_results=8): ...
|
|
160
|
+
|
|
161
|
+
class MyAuth(AuthSourceProvider):
|
|
162
|
+
def match_source(self, url): ... # → {'domain','cookies','proxy',...} | None
|
|
163
|
+
def get_source(self, domain): ...
|
|
164
|
+
|
|
165
|
+
register_browser_provider(MyBrowser()) # last-resort fetch/search fallback
|
|
166
|
+
register_auth_source_provider(MyAuth()) # cookies for login-walled domains
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
When no provider is registered, the browser fallback and authenticated fetch
|
|
170
|
+
paths are inert no-ops — the anonymous HTTP + Playwright pipeline runs as normal.
|
|
171
|
+
|
|
172
|
+
## Configuration
|
|
173
|
+
|
|
174
|
+
```python
|
|
175
|
+
from tofu_search import configure
|
|
176
|
+
|
|
177
|
+
configure(
|
|
178
|
+
# Search / fetch settings
|
|
179
|
+
fetch_top_n=6, # Max results to return
|
|
180
|
+
fetch_timeout=15, # HTTP timeout per request (seconds)
|
|
181
|
+
fetch_max_chars_search=60000, # Max chars per page in search results
|
|
182
|
+
fetch_max_chars_direct=200000, # Max chars for direct fetch_url()
|
|
183
|
+
|
|
184
|
+
# LLM settings (for content filter)
|
|
185
|
+
llm_api_key="sk-...",
|
|
186
|
+
llm_base_url="https://api.openai.com/v1",
|
|
187
|
+
llm_model="gpt-4o-mini",
|
|
188
|
+
# Or a custom callable instead:
|
|
189
|
+
# llm_function=my_callable,
|
|
190
|
+
|
|
191
|
+
# Filter settings
|
|
192
|
+
filter_enabled=True, # Enable/disable LLM filter
|
|
193
|
+
filter_min_chars=3000, # Min chars to trigger LLM filter
|
|
194
|
+
)
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
Many settings also read from environment variables: `FETCH_TOP_N`,
|
|
198
|
+
`FETCH_TIMEOUT`, `FETCH_MAX_CHARS_SEARCH`, `FETCH_MAX_CHARS_DIRECT`,
|
|
199
|
+
`FETCH_MAX_CHARS_PDF`, `FETCH_MAX_BYTES`. One-hop deepening is enabled with
|
|
200
|
+
`SEARCH_DEEPEN_HOPS=1` (or per call: `perform_web_search(..., deepen=True)`).
|
|
201
|
+
Semantic Scholar raises its rate limit with `SEMANTIC_SCHOLAR_API_KEY`.
|
|
202
|
+
|
|
203
|
+
## Pipeline
|
|
204
|
+
|
|
205
|
+
`perform_web_search` runs an overlapping streaming pipeline:
|
|
206
|
+
|
|
207
|
+
1. **Multi-engine search**: engines fire in parallel; each engine's URLs are
|
|
208
|
+
deduped and submitted to the fetch pool the moment they arrive (the first
|
|
209
|
+
page fetch starts before slow engines finish).
|
|
210
|
+
2. **URL dedup**: scheme/trailing-slash-insensitive keys.
|
|
211
|
+
3. **Content dedup**: Jaccard similarity on title+snippet shingles.
|
|
212
|
+
4. **Page fetch**: concurrent HTTP with race-to-N; SSL retry, circuit breaker,
|
|
213
|
+
Playwright fallback for SPA/bot-protection pages.
|
|
214
|
+
- **4b. Deepen** *(opt-in)*: one hop along the best query-relevant links.
|
|
215
|
+
5. **LLM content filter** *(optional)*: relevance verdict + noise removal.
|
|
216
|
+
6. **BM25 rerank**: score documents against the query, select top-N.
|
|
217
|
+
|
|
218
|
+
Step 5 is automatically skipped when no LLM is configured.
|
|
219
|
+
|
|
220
|
+
## Optional Dependencies
|
|
221
|
+
|
|
222
|
+
```bash
|
|
223
|
+
# SPA / JS-rendered page support
|
|
224
|
+
pip install tofu-search[playwright]
|
|
225
|
+
python -m playwright install chromium
|
|
226
|
+
|
|
227
|
+
# PDF extraction
|
|
228
|
+
pip install tofu-search[pdf]
|
|
229
|
+
|
|
230
|
+
# Everything
|
|
231
|
+
pip install tofu-search[all]
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
Or, from a clone of the repository, run `./install.sh` (see below; the script is not shipped in the PyPI package).
|
|
235
|
+
|
|
236
|
+
## Install script
|
|
237
|
+
|
|
238
|
+
```bash
|
|
239
|
+
./install.sh # core deps
|
|
240
|
+
./install.sh --all # core + playwright + pdf, and installs chromium
|
|
241
|
+
./install.sh --playwright
|
|
242
|
+
./install.sh --pdf
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
## License
|
|
246
|
+
|
|
247
|
+
MIT
|
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
# 🔍 tofu-search
|
|
2
|
+
|
|
3
|
+
**Multi-engine web search + content fetching with optional LLM filtering** — a
|
|
4
|
+
standalone Python library extracted from the [Tofu AI assistant](https://github.com/rangehow/tofu-search).
|
|
5
|
+
|
|
6
|
+
This is a full re-extraction that keeps **100% of Tofu's current search/fetch
|
|
7
|
+
capabilities**: every engine, the structured "vertical" lookups, one-hop
|
|
8
|
+
deepening, the SPA/bot-protection Playwright fallback, authenticated-source
|
|
9
|
+
fetching, and the host-browser fallback — the last two exposed through
|
|
10
|
+
optional [provider seams](#host-integration-provider-seams) so the library
|
|
11
|
+
stays free of host-application dependencies when used standalone.
|
|
12
|
+
|
|
13
|
+
## Features
|
|
14
|
+
|
|
15
|
+
- **Multi-engine search (parallel)**: DuckDuckGo (HTML + API), Brave, Bing,
|
|
16
|
+
SearXNG, Marginalia — plus Xiaohongshu when an auth-source provider supplies
|
|
17
|
+
a logged-in session.
|
|
18
|
+
- **Vertical / structured search**: auto-detects CVE IDs, arXiv IDs, DOIs,
|
|
19
|
+
stock tickers, PyPI/npm packages, GitHub repos, IP addresses, Hugging Face
|
|
20
|
+
daily papers, and Semantic Scholar related-work — answered from the relevant
|
|
21
|
+
free API alongside web results.
|
|
22
|
+
- **Content deduplication**: Jaccard similarity on shingles (CJK + Latin aware).
|
|
23
|
+
- **Concurrent page fetching**: Race-to-N strategy with SSL fallback + a
|
|
24
|
+
per-domain circuit breaker.
|
|
25
|
+
- **One-hop deepening** *(opt-in)*: follow the best query-relevant outbound
|
|
26
|
+
links one hop deeper, bounded like a crawl budget.
|
|
27
|
+
- **LLM content filter** *(optional)*: relevance verdict + noise removal. When
|
|
28
|
+
no LLM is configured the step is silently skipped (raw text returned as-is).
|
|
29
|
+
- **BM25 reranking**: pure-Python, no external API calls.
|
|
30
|
+
- **SPA / bot-protection support**: optional Playwright fallback for
|
|
31
|
+
JS-rendered and challenge pages.
|
|
32
|
+
- **PDF extraction**: optional pymupdf / pymupdf4llm integration.
|
|
33
|
+
- **Host integration seams**: register a browser provider (fetch/search via a
|
|
34
|
+
real browser the user controls) and an auth-source provider (cookies/proxy
|
|
35
|
+
for login-walled domains) — both no-ops by default.
|
|
36
|
+
|
|
37
|
+
## Quick Start
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
pip install tofu-search
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
### Basic search (no LLM required)
|
|
44
|
+
|
|
45
|
+
```python
|
|
46
|
+
from tofu_search import search
|
|
47
|
+
|
|
48
|
+
results = search("Python asyncio tutorial")
|
|
49
|
+
for r in results:
|
|
50
|
+
print(f"{r['title']}: {r['url']}")
|
|
51
|
+
if r.get('full_content'):
|
|
52
|
+
print(f" {r['full_content'][:200]}...")
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
### With OpenAI content filtering
|
|
56
|
+
|
|
57
|
+
```python
|
|
58
|
+
from tofu_search import search, configure
|
|
59
|
+
|
|
60
|
+
configure(
|
|
61
|
+
llm_api_key="sk-...",
|
|
62
|
+
llm_base_url="https://api.openai.com/v1",
|
|
63
|
+
llm_model="gpt-4o-mini",
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
results = search("Python asyncio tutorial")
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
### With a custom LLM callable
|
|
70
|
+
|
|
71
|
+
```python
|
|
72
|
+
from tofu_search import search, configure
|
|
73
|
+
|
|
74
|
+
def my_llm(messages, **kwargs):
|
|
75
|
+
# Your LLM call — receives OpenAI-format messages.
|
|
76
|
+
# kwargs may include: stop, temperature, timeout
|
|
77
|
+
return "response text"
|
|
78
|
+
|
|
79
|
+
configure(llm_function=my_llm)
|
|
80
|
+
results = search("Python asyncio tutorial")
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
### Fetch a single URL
|
|
84
|
+
|
|
85
|
+
```python
|
|
86
|
+
from tofu_search import fetch_url
|
|
87
|
+
|
|
88
|
+
content = fetch_url("https://example.com")
|
|
89
|
+
if content:
|
|
90
|
+
print(f"Got {len(content)} characters")
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
### Vertical (structured-identifier) search
|
|
94
|
+
|
|
95
|
+
```python
|
|
96
|
+
from tofu_search import detect_vertical_intent, search_vertical
|
|
97
|
+
|
|
98
|
+
domain, identifier, params = detect_vertical_intent("CVE-2021-44228")
|
|
99
|
+
record = search_vertical(domain, identifier, params)
|
|
100
|
+
print(record['content']) # CVSS score, description, references from NVD
|
|
101
|
+
|
|
102
|
+
# Or force a domain-level fan-out (free-text → Hugging Face + Semantic Scholar):
|
|
103
|
+
from tofu_search import search_vertical_domain
|
|
104
|
+
print(search_vertical_domain('academic', 'mamba state space models')['content'])
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
## Host integration (provider seams)
|
|
108
|
+
|
|
109
|
+
The standalone library never imports a host application. To unlock the two
|
|
110
|
+
host-only capabilities, register a provider — dependency points inward (host →
|
|
111
|
+
library), exactly like a plugin.
|
|
112
|
+
|
|
113
|
+
```python
|
|
114
|
+
from tofu_search import (
|
|
115
|
+
BrowserProvider, AuthSourceProvider,
|
|
116
|
+
register_browser_provider, register_auth_source_provider,
|
|
117
|
+
)
|
|
118
|
+
|
|
119
|
+
class MyBrowser(BrowserProvider):
|
|
120
|
+
def is_connected(self): return True
|
|
121
|
+
def fetch_url(self, url, *, max_chars=None, timeout=15): ...
|
|
122
|
+
def search(self, query, *, max_results=8): ...
|
|
123
|
+
|
|
124
|
+
class MyAuth(AuthSourceProvider):
|
|
125
|
+
def match_source(self, url): ... # → {'domain','cookies','proxy',...} | None
|
|
126
|
+
def get_source(self, domain): ...
|
|
127
|
+
|
|
128
|
+
register_browser_provider(MyBrowser()) # last-resort fetch/search fallback
|
|
129
|
+
register_auth_source_provider(MyAuth()) # cookies for login-walled domains
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
When no provider is registered, the browser fallback and authenticated fetch
|
|
133
|
+
paths are inert no-ops — the anonymous HTTP + Playwright pipeline runs as normal.
|
|
134
|
+
|
|
135
|
+
## Configuration
|
|
136
|
+
|
|
137
|
+
```python
|
|
138
|
+
from tofu_search import configure
|
|
139
|
+
|
|
140
|
+
configure(
|
|
141
|
+
# Search / fetch settings
|
|
142
|
+
fetch_top_n=6, # Max results to return
|
|
143
|
+
fetch_timeout=15, # HTTP timeout per request (seconds)
|
|
144
|
+
fetch_max_chars_search=60000, # Max chars per page in search results
|
|
145
|
+
fetch_max_chars_direct=200000, # Max chars for direct fetch_url()
|
|
146
|
+
|
|
147
|
+
# LLM settings (for content filter)
|
|
148
|
+
llm_api_key="sk-...",
|
|
149
|
+
llm_base_url="https://api.openai.com/v1",
|
|
150
|
+
llm_model="gpt-4o-mini",
|
|
151
|
+
# Or a custom callable instead:
|
|
152
|
+
# llm_function=my_callable,
|
|
153
|
+
|
|
154
|
+
# Filter settings
|
|
155
|
+
filter_enabled=True, # Enable/disable LLM filter
|
|
156
|
+
filter_min_chars=3000, # Min chars to trigger LLM filter
|
|
157
|
+
)
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
Many settings also read from environment variables: `FETCH_TOP_N`,
|
|
161
|
+
`FETCH_TIMEOUT`, `FETCH_MAX_CHARS_SEARCH`, `FETCH_MAX_CHARS_DIRECT`,
|
|
162
|
+
`FETCH_MAX_CHARS_PDF`, `FETCH_MAX_BYTES`. One-hop deepening is enabled with
|
|
163
|
+
`SEARCH_DEEPEN_HOPS=1` (or per call: `perform_web_search(..., deepen=True)`).
|
|
164
|
+
Semantic Scholar raises its rate limit with `SEMANTIC_SCHOLAR_API_KEY`.
|
|
165
|
+
|
|
166
|
+
## Pipeline
|
|
167
|
+
|
|
168
|
+
`perform_web_search` runs an overlapping streaming pipeline:
|
|
169
|
+
|
|
170
|
+
1. **Multi-engine search**: engines fire in parallel; each engine's URLs are
|
|
171
|
+
deduped and submitted to the fetch pool the moment they arrive (the first
|
|
172
|
+
page fetch starts before slow engines finish).
|
|
173
|
+
2. **URL dedup**: scheme/trailing-slash-insensitive keys.
|
|
174
|
+
3. **Content dedup**: Jaccard similarity on title+snippet shingles.
|
|
175
|
+
4. **Page fetch**: concurrent HTTP with race-to-N; SSL retry, circuit breaker,
|
|
176
|
+
Playwright fallback for SPA/bot-protection pages.
|
|
177
|
+
- **4b. Deepen** *(opt-in)*: one hop along the best query-relevant links.
|
|
178
|
+
5. **LLM content filter** *(optional)*: relevance verdict + noise removal.
|
|
179
|
+
6. **BM25 rerank**: score documents against the query, select top-N.
|
|
180
|
+
|
|
181
|
+
Step 5 is automatically skipped when no LLM is configured.
|
|
182
|
+
|
|
183
|
+
## Optional Dependencies
|
|
184
|
+
|
|
185
|
+
```bash
|
|
186
|
+
# SPA / JS-rendered page support
|
|
187
|
+
pip install tofu-search[playwright]
|
|
188
|
+
python -m playwright install chromium
|
|
189
|
+
|
|
190
|
+
# PDF extraction
|
|
191
|
+
pip install tofu-search[pdf]
|
|
192
|
+
|
|
193
|
+
# Everything
|
|
194
|
+
pip install tofu-search[all]
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
Or, from a clone of the repository, run `./install.sh` (see below; the script is not shipped in the PyPI package).
|
|
198
|
+
|
|
199
|
+
## Install script
|
|
200
|
+
|
|
201
|
+
```bash
|
|
202
|
+
./install.sh # core deps
|
|
203
|
+
./install.sh --all # core + playwright + pdf, and installs chromium
|
|
204
|
+
./install.sh --playwright
|
|
205
|
+
./install.sh --pdf
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
## License
|
|
209
|
+
|
|
210
|
+
MIT
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "tofu-search"
|
|
7
|
+
version = "0.2.0"
|
|
8
|
+
description = "Multi-engine web search, vertical lookups, and content fetching with optional LLM filtering — standalone library from the Tofu AI assistant"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = {text = "MIT"}
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "Tofu Team"},
|
|
14
|
+
]
|
|
15
|
+
classifiers = [
|
|
16
|
+
"Development Status :: 4 - Beta",
|
|
17
|
+
"License :: OSI Approved :: MIT License",
|
|
18
|
+
"Programming Language :: Python :: 3",
|
|
19
|
+
"Programming Language :: Python :: 3.10",
|
|
20
|
+
"Programming Language :: Python :: 3.11",
|
|
21
|
+
"Programming Language :: Python :: 3.12",
|
|
22
|
+
"Topic :: Internet :: WWW/HTTP :: Indexing/Search",
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
dependencies = [
|
|
26
|
+
"requests>=2.28",
|
|
27
|
+
"trafilatura>=1.6",
|
|
28
|
+
"beautifulsoup4>=4.12",
|
|
29
|
+
"python-dateutil>=2.8",
|
|
30
|
+
"urllib3>=1.26",
|
|
31
|
+
"lxml>=4.9",
|
|
32
|
+
]
|
|
33
|
+
|
|
34
|
+
[project.optional-dependencies]
|
|
35
|
+
playwright = ["playwright>=1.40"]
|
|
36
|
+
pdf = ["pymupdf>=1.23", "pymupdf4llm>=0.0.5"]
|
|
37
|
+
all = ["playwright>=1.40", "pymupdf>=1.23", "pymupdf4llm>=0.0.5"]
|
|
38
|
+
server = ["flask>=3.0"]
|
|
39
|
+
|
|
40
|
+
[project.urls]
|
|
41
|
+
Homepage = "https://github.com/rangehow/tofu-search"
|
|
42
|
+
Documentation = "https://github.com/rangehow/tofu-search#readme"
|
|
43
|
+
|
|
44
|
+
[tool.setuptools.packages.find]
|
|
45
|
+
include = ["tofu_search*"]
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
"""tofu-search — Standalone multi-engine web search with LLM content filtering.
|
|
2
|
+
|
|
3
|
+
Provides a 7-step search pipeline: multi-engine search, URL dedup,
|
|
4
|
+
content dedup, concurrent page fetch, optional LLM content filter,
|
|
5
|
+
BM25 reranking, and result formatting.
|
|
6
|
+
|
|
7
|
+
Usage::
|
|
8
|
+
|
|
9
|
+
from tofu_search import search, fetch_url, configure, format_results
|
|
10
|
+
|
|
11
|
+
# Basic search (no LLM filter — still useful):
|
|
12
|
+
results = search("Python asyncio tutorial")
|
|
13
|
+
|
|
14
|
+
# With OpenAI-compatible LLM for content filtering:
|
|
15
|
+
configure(
|
|
16
|
+
llm_api_key="sk-...",
|
|
17
|
+
llm_base_url="https://api.openai.com/v1",
|
|
18
|
+
llm_model="gpt-4o-mini",
|
|
19
|
+
)
|
|
20
|
+
results = search("Python asyncio tutorial")
|
|
21
|
+
|
|
22
|
+
# With custom LLM callable:
|
|
23
|
+
def my_llm(messages, **kwargs):
|
|
24
|
+
return "your response"
|
|
25
|
+
configure(llm_function=my_llm)
|
|
26
|
+
results = search("Python asyncio tutorial")
|
|
27
|
+
|
|
28
|
+
# Fetch a single URL:
|
|
29
|
+
content = fetch_url("https://example.com")
|
|
30
|
+
|
|
31
|
+
# Format results for display:
|
|
32
|
+
text = format_results(results)
|
|
33
|
+
"""
|
|
34
|
+
|
|
35
|
+
__version__ = '0.2.0'
|
|
36
|
+
|
|
37
|
+
from tofu_search.config import SearchConfig, configure, get_config
|
|
38
|
+
from tofu_search.fetch.core import (
|
|
39
|
+
extract_urls_from_text,
|
|
40
|
+
fetch_page_content,
|
|
41
|
+
fetch_urls,
|
|
42
|
+
)
|
|
43
|
+
from tofu_search.providers import (
|
|
44
|
+
AuthSourceProvider,
|
|
45
|
+
BrowserProvider,
|
|
46
|
+
register_auth_source_provider,
|
|
47
|
+
register_browser_provider,
|
|
48
|
+
)
|
|
49
|
+
from tofu_search.search.format import format_search_for_tool_response as format_results
|
|
50
|
+
from tofu_search.search.orchestrator import perform_web_search
|
|
51
|
+
from tofu_search.search.vertical import (
|
|
52
|
+
detect_vertical_intent,
|
|
53
|
+
list_domains,
|
|
54
|
+
search_vertical,
|
|
55
|
+
search_vertical_domain,
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
__all__ = [
|
|
59
|
+
'search',
|
|
60
|
+
'fetch_url',
|
|
61
|
+
'configure',
|
|
62
|
+
'get_config',
|
|
63
|
+
'SearchConfig',
|
|
64
|
+
'perform_web_search',
|
|
65
|
+
'format_results',
|
|
66
|
+
'fetch_urls',
|
|
67
|
+
'fetch_page_content',
|
|
68
|
+
'extract_urls_from_text',
|
|
69
|
+
# Vertical (structured-identifier) search
|
|
70
|
+
'detect_vertical_intent',
|
|
71
|
+
'search_vertical',
|
|
72
|
+
'search_vertical_domain',
|
|
73
|
+
'list_domains',
|
|
74
|
+
# Provider seams (host integration)
|
|
75
|
+
'BrowserProvider',
|
|
76
|
+
'AuthSourceProvider',
|
|
77
|
+
'register_browser_provider',
|
|
78
|
+
'register_auth_source_provider',
|
|
79
|
+
]
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def search(query: str, *, max_results: int | None = None,
           user_question: str = '', **kwargs) -> list[dict]:
    """Run a web search for *query* and return the processed result records.

    Thin convenience wrapper around ``perform_web_search``. It returns that
    function's result unchanged, so the records are raw dicts rather than
    display text; pass them to ``format_results`` to render them.

    Args:
        query: Search query string.
        max_results: Maximum number of results to return. Forwarded as-is;
            ``None`` leaves the limit to ``perform_web_search`` (documented
            as 6, adjustable via ``configure(fetch_top_n=N)``).
        user_question: The user's original question, used to help the LLM
            filter judge relevance. When empty, *query* is sent instead.
        **kwargs: ``SearchConfig`` overrides that apply to this call only.

    Returns:
        List of result dicts. Documented keys:
            - title (str): Page title
            - url (str): Page URL
            - snippet (str): Search result snippet
            - source (str): Search engine name
            - full_content (str, optional): Fetched and cleaned page content

    Example::

        results = search("Python asyncio tutorial")
        for r in results:
            print(f"{r['title']}: {r['url']}")
            if r.get('full_content'):
                print(f"  Content: {r['full_content'][:200]}...")
    """
    # A per-call config is derived from the global one only when overrides
    # were supplied; otherwise None is passed through.
    # NOTE(review): presumably perform_web_search falls back to the global
    # config when given None -- confirm against the orchestrator.
    call_config = get_config().copy(**kwargs) if kwargs else None

    # An empty user_question means "judge relevance against the query itself".
    question = user_question if user_question else query

    return perform_web_search(
        query,
        max_results=max_results,
        user_question=question,
        config=call_config,
    )
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def fetch_url(url: str, *, max_chars: int | None = None,
              timeout: int | None = None) -> str | None:
    """Fetch one URL and return its extracted text content.

    Delegates to ``fetch_page_content`` after resolving the character limit.

    Args:
        url: URL to fetch.
        max_chars: Upper bound on the extracted text length. When ``None``,
            the configured ``fetch_max_chars_direct`` value is used
            (documented default: 200,000).
        timeout: Request timeout in seconds, forwarded unchanged
            (documented default: 15).

    Returns:
        Extracted text content string, or None if fetch failed.

    Example::

        content = fetch_url("https://example.com")
        if content:
            print(f"Got {len(content)} chars")
    """
    settings = get_config()
    char_limit = settings.fetch_max_chars_direct if max_chars is None else max_chars

    # NOTE(review): unlike max_chars, a None timeout is NOT resolved from the
    # config here. This relies on fetch_page_content substituting the
    # documented default itself -- confirm, since a raw None timeout on an
    # HTTP call typically means "wait indefinitely".
    return fetch_page_content(url, max_chars=char_limit, timeout=timeout)
|