tofu-search 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. tofu_search-0.2.0/LICENSE +21 -0
  2. tofu_search-0.2.0/PKG-INFO +247 -0
  3. tofu_search-0.2.0/README.md +210 -0
  4. tofu_search-0.2.0/pyproject.toml +45 -0
  5. tofu_search-0.2.0/setup.cfg +4 -0
  6. tofu_search-0.2.0/tofu_search/__init__.py +147 -0
  7. tofu_search-0.2.0/tofu_search/config.py +112 -0
  8. tofu_search-0.2.0/tofu_search/fetch/__init__.py +17 -0
  9. tofu_search-0.2.0/tofu_search/fetch/content_filter.py +226 -0
  10. tofu_search-0.2.0/tofu_search/fetch/core.py +443 -0
  11. tofu_search-0.2.0/tofu_search/fetch/html_extract.py +404 -0
  12. tofu_search-0.2.0/tofu_search/fetch/http.py +306 -0
  13. tofu_search-0.2.0/tofu_search/fetch/interactive_login.py +151 -0
  14. tofu_search-0.2.0/tofu_search/fetch/pdf_extract.py +140 -0
  15. tofu_search-0.2.0/tofu_search/fetch/playwright_pool.py +676 -0
  16. tofu_search-0.2.0/tofu_search/fetch/utils.py +627 -0
  17. tofu_search-0.2.0/tofu_search/http_client.py +48 -0
  18. tofu_search-0.2.0/tofu_search/llm_adapter.py +128 -0
  19. tofu_search-0.2.0/tofu_search/log.py +36 -0
  20. tofu_search-0.2.0/tofu_search/providers.py +110 -0
  21. tofu_search-0.2.0/tofu_search/search/__init__.py +22 -0
  22. tofu_search-0.2.0/tofu_search/search/_common.py +127 -0
  23. tofu_search-0.2.0/tofu_search/search/browser_fallback.py +37 -0
  24. tofu_search-0.2.0/tofu_search/search/dedup.py +92 -0
  25. tofu_search-0.2.0/tofu_search/search/deepen.py +170 -0
  26. tofu_search-0.2.0/tofu_search/search/engines/__init__.py +1 -0
  27. tofu_search-0.2.0/tofu_search/search/engines/bing.py +153 -0
  28. tofu_search-0.2.0/tofu_search/search/engines/brave.py +81 -0
  29. tofu_search-0.2.0/tofu_search/search/engines/ddg.py +110 -0
  30. tofu_search-0.2.0/tofu_search/search/engines/marginalia.py +61 -0
  31. tofu_search-0.2.0/tofu_search/search/engines/searxng.py +165 -0
  32. tofu_search-0.2.0/tofu_search/search/engines/xhs.py +138 -0
  33. tofu_search-0.2.0/tofu_search/search/format.py +61 -0
  34. tofu_search-0.2.0/tofu_search/search/orchestrator.py +460 -0
  35. tofu_search-0.2.0/tofu_search/search/rerank.py +184 -0
  36. tofu_search-0.2.0/tofu_search/search/vertical.py +1053 -0
  37. tofu_search-0.2.0/tofu_search.egg-info/PKG-INFO +247 -0
  38. tofu_search-0.2.0/tofu_search.egg-info/SOURCES.txt +39 -0
  39. tofu_search-0.2.0/tofu_search.egg-info/dependency_links.txt +1 -0
  40. tofu_search-0.2.0/tofu_search.egg-info/requires.txt +21 -0
  41. tofu_search-0.2.0/tofu_search.egg-info/top_level.txt +1 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024-2026 Tofu Team
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,247 @@
1
+ Metadata-Version: 2.4
2
+ Name: tofu-search
3
+ Version: 0.2.0
4
+ Summary: Multi-engine web search, vertical lookups, and content fetching with optional LLM filtering — standalone library from the Tofu AI assistant
5
+ Author: Tofu Team
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/rangehow/tofu-search
8
+ Project-URL: Documentation, https://github.com/rangehow/tofu-search#readme
9
+ Classifier: Development Status :: 4 - Beta
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.10
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
16
+ Requires-Python: >=3.10
17
+ Description-Content-Type: text/markdown
18
+ License-File: LICENSE
19
+ Requires-Dist: requests>=2.28
20
+ Requires-Dist: trafilatura>=1.6
21
+ Requires-Dist: beautifulsoup4>=4.12
22
+ Requires-Dist: python-dateutil>=2.8
23
+ Requires-Dist: urllib3>=1.26
24
+ Requires-Dist: lxml>=4.9
25
+ Provides-Extra: playwright
26
+ Requires-Dist: playwright>=1.40; extra == "playwright"
27
+ Provides-Extra: pdf
28
+ Requires-Dist: pymupdf>=1.23; extra == "pdf"
29
+ Requires-Dist: pymupdf4llm>=0.0.5; extra == "pdf"
30
+ Provides-Extra: all
31
+ Requires-Dist: playwright>=1.40; extra == "all"
32
+ Requires-Dist: pymupdf>=1.23; extra == "all"
33
+ Requires-Dist: pymupdf4llm>=0.0.5; extra == "all"
34
+ Provides-Extra: server
35
+ Requires-Dist: flask>=3.0; extra == "server"
36
+ Dynamic: license-file
37
+
38
+ # 🔍 tofu-search
39
+
40
+ **Multi-engine web search + content fetching with optional LLM filtering** — a
41
+ standalone Python library extracted from the [Tofu AI assistant](https://github.com/rangehow/tofu-search).
42
+
43
+ This is a full re-extraction that keeps **100% of Tofu's current search/fetch
44
+ capabilities**: every engine, the structured "vertical" lookups, one-hop
45
+ deepening, the SPA/bot-protection Playwright fallback, authenticated-source
46
+ fetching, and the host-browser fallback — the last two exposed through
47
+ optional [provider seams](#host-integration-provider-seams) so the library
48
+ stays dependency-free when used standalone.
49
+
50
+ ## Features
51
+
52
+ - **Multi-engine search (parallel)**: DuckDuckGo (HTML + API), Brave, Bing,
53
+ SearXNG, Marginalia — plus Xiaohongshu when an auth-source provider supplies
54
+ a logged-in session.
55
+ - **Vertical / structured search**: auto-detects CVE IDs, arXiv IDs, DOIs,
56
+ stock tickers, PyPI/npm packages, GitHub repos, IP addresses, Hugging Face
57
+ daily papers, and Semantic Scholar related-work — answered from the relevant
58
+ free API alongside web results.
59
+ - **Content deduplication**: Jaccard similarity on shingles (CJK + Latin aware).
60
+ - **Concurrent page fetching**: Race-to-N strategy with SSL fallback + a
61
+ per-domain circuit breaker.
62
+ - **One-hop deepening** *(opt-in)*: follow the best query-relevant outbound
63
+ links one hop deeper, bounded like a crawl budget.
64
+ - **LLM content filter** *(optional)*: relevance verdict + noise removal. When
65
+ no LLM is configured the step is silently skipped (raw text returned as-is).
66
+ - **BM25 reranking**: pure-Python, no external API calls.
67
+ - **SPA / bot-protection support**: optional Playwright fallback for
68
+ JS-rendered and challenge pages.
69
+ - **PDF extraction**: optional pymupdf / pymupdf4llm integration.
70
+ - **Host integration seams**: register a browser provider (fetch/search via a
71
+ real browser the user controls) and an auth-source provider (cookies/proxy
72
+ for login-walled domains) — both no-ops by default.
73
+
74
+ ## Quick Start
75
+
76
+ ```bash
77
+ pip install tofu-search
78
+ ```
79
+
80
+ ### Basic search (no LLM required)
81
+
82
+ ```python
83
+ from tofu_search import search
84
+
85
+ results = search("Python asyncio tutorial")
86
+ for r in results:
87
+ print(f"{r['title']}: {r['url']}")
88
+ if r.get('full_content'):
89
+ print(f" {r['full_content'][:200]}...")
90
+ ```
91
+
92
+ ### With OpenAI content filtering
93
+
94
+ ```python
95
+ from tofu_search import search, configure
96
+
97
+ configure(
98
+ llm_api_key="sk-...",
99
+ llm_base_url="https://api.openai.com/v1",
100
+ llm_model="gpt-4o-mini",
101
+ )
102
+
103
+ results = search("Python asyncio tutorial")
104
+ ```
105
+
106
+ ### With a custom LLM callable
107
+
108
+ ```python
109
+ from tofu_search import search, configure
110
+
111
+ def my_llm(messages, **kwargs):
112
+ # Your LLM call — receives OpenAI-format messages.
113
+ # kwargs may include: stop, temperature, timeout
114
+ return "response text"
115
+
116
+ configure(llm_function=my_llm)
117
+ results = search("Python asyncio tutorial")
118
+ ```
119
+
120
+ ### Fetch a single URL
121
+
122
+ ```python
123
+ from tofu_search import fetch_url
124
+
125
+ content = fetch_url("https://example.com")
126
+ if content:
127
+ print(f"Got {len(content)} characters")
128
+ ```
129
+
130
+ ### Vertical (structured-identifier) search
131
+
132
+ ```python
133
+ from tofu_search import detect_vertical_intent, search_vertical
134
+
135
+ domain, identifier, params = detect_vertical_intent("CVE-2021-44228")
136
+ record = search_vertical(domain, identifier, params)
137
+ print(record['content']) # CVSS score, description, references from NVD
138
+
139
+ # Or force a domain-level fan-out (free-text → Hugging Face + Semantic Scholar):
140
+ from tofu_search import search_vertical_domain
141
+ print(search_vertical_domain('academic', 'mamba state space models')['content'])
142
+ ```
143
+
144
+ ## Host integration (provider seams)
145
+
146
+ The standalone library never imports a host application. To unlock the two
147
+ host-only capabilities, register a provider — dependency points inward (host →
148
+ library), exactly like a plugin.
149
+
150
+ ```python
151
+ from tofu_search import (
152
+ BrowserProvider, AuthSourceProvider,
153
+ register_browser_provider, register_auth_source_provider,
154
+ )
155
+
156
+ class MyBrowser(BrowserProvider):
157
+ def is_connected(self): return True
158
+ def fetch_url(self, url, *, max_chars=None, timeout=15): ...
159
+ def search(self, query, *, max_results=8): ...
160
+
161
+ class MyAuth(AuthSourceProvider):
162
+ def match_source(self, url): ... # → {'domain','cookies','proxy',...} | None
163
+ def get_source(self, domain): ...
164
+
165
+ register_browser_provider(MyBrowser()) # last-resort fetch/search fallback
166
+ register_auth_source_provider(MyAuth()) # cookies for login-walled domains
167
+ ```
168
+
169
+ When no provider is registered, the browser fallback and authenticated fetch
170
+ paths are inert no-ops — the anonymous HTTP + Playwright pipeline runs as normal.
171
+
172
+ ## Configuration
173
+
174
+ ```python
175
+ from tofu_search import configure
176
+
177
+ configure(
178
+ # Search / fetch settings
179
+ fetch_top_n=6, # Max results to return
180
+ fetch_timeout=15, # HTTP timeout per request (seconds)
181
+ fetch_max_chars_search=60000, # Max chars per page in search results
182
+ fetch_max_chars_direct=200000, # Max chars for direct fetch_url()
183
+
184
+ # LLM settings (for content filter)
185
+ llm_api_key="sk-...",
186
+ llm_base_url="https://api.openai.com/v1",
187
+ llm_model="gpt-4o-mini",
188
+ # Or a custom callable instead:
189
+ # llm_function=my_callable,
190
+
191
+ # Filter settings
192
+ filter_enabled=True, # Enable/disable LLM filter
193
+ filter_min_chars=3000, # Min chars to trigger LLM filter
194
+ )
195
+ ```
196
+
197
+ Many settings also read from environment variables: `FETCH_TOP_N`,
198
+ `FETCH_TIMEOUT`, `FETCH_MAX_CHARS_SEARCH`, `FETCH_MAX_CHARS_DIRECT`,
199
+ `FETCH_MAX_CHARS_PDF`, `FETCH_MAX_BYTES`. One-hop deepening is enabled with
200
+ `SEARCH_DEEPEN_HOPS=1` (or per call: `perform_web_search(..., deepen=True)`).
201
+ Semantic Scholar raises its rate limit with `SEMANTIC_SCHOLAR_API_KEY`.
202
+
203
+ ## Pipeline
204
+
205
+ `perform_web_search` runs an overlapping streaming pipeline:
206
+
207
+ 1. **Multi-engine search**: engines fire in parallel; each engine's URLs are
208
+ deduped and submitted to the fetch pool the moment they arrive (the first
209
+ page fetch starts before slow engines finish).
210
+ 2. **URL dedup**: scheme/trailing-slash-insensitive keys.
211
+ 3. **Content dedup**: Jaccard similarity on title+snippet shingles.
212
+ 4. **Page fetch**: concurrent HTTP with race-to-N; SSL retry, circuit breaker,
213
+ Playwright fallback for SPA/bot-protection pages.
214
+ - **4b. Deepen** *(opt-in)*: one hop along the best query-relevant links.
215
+ 5. **LLM content filter** *(optional)*: relevance verdict + noise removal.
216
+ 6. **BM25 rerank**: score documents against the query, select top-N.
217
+
218
+ Step 5 is automatically skipped when no LLM is configured.
219
+
220
+ ## Optional Dependencies
221
+
222
+ ```bash
223
+ # SPA / JS-rendered page support
224
+ pip install tofu-search[playwright]
225
+ python -m playwright install chromium
226
+
227
+ # PDF extraction
228
+ pip install tofu-search[pdf]
229
+
230
+ # Everything
231
+ pip install tofu-search[all]
232
+ ```
233
+
234
+ Or just run `./install.sh` (see below).
235
+
236
+ ## Install script
237
+
238
+ ```bash
239
+ ./install.sh # core deps
240
+ ./install.sh --all # core + playwright + pdf, and installs chromium
241
+ ./install.sh --playwright
242
+ ./install.sh --pdf
243
+ ```
244
+
245
+ ## License
246
+
247
+ MIT
@@ -0,0 +1,210 @@
1
+ # 🔍 tofu-search
2
+
3
+ **Multi-engine web search + content fetching with optional LLM filtering** — a
4
+ standalone Python library extracted from the [Tofu AI assistant](https://github.com/rangehow/tofu-search).
5
+
6
+ This is a full re-extraction that keeps **100% of Tofu's current search/fetch
7
+ capabilities**: every engine, the structured "vertical" lookups, one-hop
8
+ deepening, the SPA/bot-protection Playwright fallback, authenticated-source
9
+ fetching, and the host-browser fallback — the last two exposed through
10
+ optional [provider seams](#host-integration-provider-seams) so the library
11
+ stays free of host-application dependencies when used standalone.
12
+
13
+ ## Features
14
+
15
+ - **Multi-engine search (parallel)**: DuckDuckGo (HTML + API), Brave, Bing,
16
+ SearXNG, Marginalia — plus Xiaohongshu when an auth-source provider supplies
17
+ a logged-in session.
18
+ - **Vertical / structured search**: auto-detects CVE IDs, arXiv IDs, DOIs,
19
+ stock tickers, PyPI/npm packages, GitHub repos, IP addresses, Hugging Face
20
+ daily papers, and Semantic Scholar related-work — answered from the relevant
21
+ free API alongside web results.
22
+ - **Content deduplication**: Jaccard similarity on shingles (CJK + Latin aware).
23
+ - **Concurrent page fetching**: Race-to-N strategy with SSL fallback + a
24
+ per-domain circuit breaker.
25
+ - **One-hop deepening** *(opt-in)*: follow the best query-relevant outbound
26
+ links one hop deeper, bounded like a crawl budget.
27
+ - **LLM content filter** *(optional)*: relevance verdict + noise removal. When
28
+ no LLM is configured the step is silently skipped (raw text returned as-is).
29
+ - **BM25 reranking**: pure-Python, no external API calls.
30
+ - **SPA / bot-protection support**: optional Playwright fallback for
31
+ JS-rendered and challenge pages.
32
+ - **PDF extraction**: optional pymupdf / pymupdf4llm integration.
33
+ - **Host integration seams**: register a browser provider (fetch/search via a
34
+ real browser the user controls) and an auth-source provider (cookies/proxy
35
+ for login-walled domains) — both no-ops by default.
36
+
37
+ ## Quick Start
38
+
39
+ ```bash
40
+ pip install tofu-search
41
+ ```
42
+
43
+ ### Basic search (no LLM required)
44
+
45
+ ```python
46
+ from tofu_search import search
47
+
48
+ results = search("Python asyncio tutorial")
49
+ for r in results:
50
+     print(f"{r['title']}: {r['url']}")
51
+     if r.get('full_content'):
52
+         print(f" {r['full_content'][:200]}...")
53
+ ```
54
+
55
+ ### With OpenAI content filtering
56
+
57
+ ```python
58
+ from tofu_search import search, configure
59
+
60
+ configure(
61
+ llm_api_key="sk-...",
62
+ llm_base_url="https://api.openai.com/v1",
63
+ llm_model="gpt-4o-mini",
64
+ )
65
+
66
+ results = search("Python asyncio tutorial")
67
+ ```
68
+
69
+ ### With a custom LLM callable
70
+
71
+ ```python
72
+ from tofu_search import search, configure
73
+
74
+ def my_llm(messages, **kwargs):
75
+     # Your LLM call — receives OpenAI-format messages.
76
+     # kwargs may include: stop, temperature, timeout
77
+     return "response text"
78
+
79
+ configure(llm_function=my_llm)
80
+ results = search("Python asyncio tutorial")
81
+ ```
82
+
83
+ ### Fetch a single URL
84
+
85
+ ```python
86
+ from tofu_search import fetch_url
87
+
88
+ content = fetch_url("https://example.com")
89
+ if content:
90
+     print(f"Got {len(content)} characters")
91
+ ```
92
+
93
+ ### Vertical (structured-identifier) search
94
+
95
+ ```python
96
+ from tofu_search import detect_vertical_intent, search_vertical
97
+
98
+ domain, identifier, params = detect_vertical_intent("CVE-2021-44228")
99
+ record = search_vertical(domain, identifier, params)
100
+ print(record['content']) # CVSS score, description, references from NVD
101
+
102
+ # Or force a domain-level fan-out (free-text → Hugging Face + Semantic Scholar):
103
+ from tofu_search import search_vertical_domain
104
+ print(search_vertical_domain('academic', 'mamba state space models')['content'])
105
+ ```
106
+
107
+ ## Host integration (provider seams)
108
+
109
+ The standalone library never imports a host application. To unlock the two
110
+ host-only capabilities, register a provider — dependency points inward (host →
111
+ library), exactly like a plugin.
112
+
113
+ ```python
114
+ from tofu_search import (
115
+ BrowserProvider, AuthSourceProvider,
116
+ register_browser_provider, register_auth_source_provider,
117
+ )
118
+
119
+ class MyBrowser(BrowserProvider):
120
+     def is_connected(self): return True
121
+     def fetch_url(self, url, *, max_chars=None, timeout=15): ...
122
+     def search(self, query, *, max_results=8): ...
123
+
124
+ class MyAuth(AuthSourceProvider):
125
+     def match_source(self, url): ... # → {'domain','cookies','proxy',...} | None
126
+     def get_source(self, domain): ...
127
+
128
+ register_browser_provider(MyBrowser()) # last-resort fetch/search fallback
129
+ register_auth_source_provider(MyAuth()) # cookies for login-walled domains
130
+ ```
131
+
132
+ When no provider is registered, the browser fallback and authenticated fetch
133
+ paths are inert no-ops — the anonymous HTTP + Playwright pipeline runs as normal.
134
+
135
+ ## Configuration
136
+
137
+ ```python
138
+ from tofu_search import configure
139
+
140
+ configure(
141
+ # Search / fetch settings
142
+ fetch_top_n=6, # Max results to return
143
+ fetch_timeout=15, # HTTP timeout per request (seconds)
144
+ fetch_max_chars_search=60000, # Max chars per page in search results
145
+ fetch_max_chars_direct=200000, # Max chars for direct fetch_url()
146
+
147
+ # LLM settings (for content filter)
148
+ llm_api_key="sk-...",
149
+ llm_base_url="https://api.openai.com/v1",
150
+ llm_model="gpt-4o-mini",
151
+ # Or a custom callable instead:
152
+ # llm_function=my_callable,
153
+
154
+ # Filter settings
155
+ filter_enabled=True, # Enable/disable LLM filter
156
+ filter_min_chars=3000, # Min chars to trigger LLM filter
157
+ )
158
+ ```
159
+
160
+ Many settings also read from environment variables: `FETCH_TOP_N`,
161
+ `FETCH_TIMEOUT`, `FETCH_MAX_CHARS_SEARCH`, `FETCH_MAX_CHARS_DIRECT`,
162
+ `FETCH_MAX_CHARS_PDF`, `FETCH_MAX_BYTES`. One-hop deepening is enabled with
163
+ `SEARCH_DEEPEN_HOPS=1` (or per call: `perform_web_search(..., deepen=True)`).
164
+ Setting `SEMANTIC_SCHOLAR_API_KEY` raises the Semantic Scholar API rate limit.
165
+
166
+ ## Pipeline
167
+
168
+ `perform_web_search` runs an overlapping streaming pipeline:
169
+
170
+ 1. **Multi-engine search**: engines fire in parallel; each engine's URLs are
171
+ deduped and submitted to the fetch pool the moment they arrive (the first
172
+ page fetch starts before slow engines finish).
173
+ 2. **URL dedup**: scheme/trailing-slash-insensitive keys.
174
+ 3. **Content dedup**: Jaccard similarity on title+snippet shingles.
175
+ 4. **Page fetch**: concurrent HTTP with race-to-N; SSL retry, circuit breaker,
176
+ Playwright fallback for SPA/bot-protection pages.
177
+ - **4b. Deepen** *(opt-in)*: one hop along the best query-relevant links.
178
+ 5. **LLM content filter** *(optional)*: relevance verdict + noise removal.
179
+ 6. **BM25 rerank**: score documents against the query, select top-N.
180
+
181
+ Step 5 is automatically skipped when no LLM is configured.
182
+
183
+ ## Optional Dependencies
184
+
185
+ ```bash
186
+ # SPA / JS-rendered page support
187
+ pip install tofu-search[playwright]
188
+ python -m playwright install chromium
189
+
190
+ # PDF extraction
191
+ pip install tofu-search[pdf]
192
+
193
+ # Playwright + PDF together (the `server` extra is not part of `all`)
194
+ pip install tofu-search[all]
195
+ ```
196
+
197
+ Or, from a checkout of the source repository, run `./install.sh` (see below; the script is not shipped in the PyPI package).
198
+
199
+ ## Install script
200
+
201
+ ```bash
202
+ ./install.sh # core deps
203
+ ./install.sh --all # core + playwright + pdf, and installs chromium
204
+ ./install.sh --playwright
205
+ ./install.sh --pdf
206
+ ```
207
+
208
+ ## License
209
+
210
+ MIT
@@ -0,0 +1,45 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "tofu-search"
7
+ version = "0.2.0"
8
+ description = "Multi-engine web search, vertical lookups, and content fetching with optional LLM filtering — standalone library from the Tofu AI assistant"
9
+ readme = "README.md"
10
+ license = {text = "MIT"}
11
+ requires-python = ">=3.10"
12
+ authors = [
13
+ {name = "Tofu Team"},
14
+ ]
15
+ classifiers = [
16
+ "Development Status :: 4 - Beta",
17
+ "License :: OSI Approved :: MIT License",
18
+ "Programming Language :: Python :: 3",
19
+ "Programming Language :: Python :: 3.10",
20
+ "Programming Language :: Python :: 3.11",
21
+ "Programming Language :: Python :: 3.12",
22
+ "Topic :: Internet :: WWW/HTTP :: Indexing/Search",
23
+ ]
24
+
25
+ dependencies = [
26
+ "requests>=2.28",
27
+ "trafilatura>=1.6",
28
+ "beautifulsoup4>=4.12",
29
+ "python-dateutil>=2.8",
30
+ "urllib3>=1.26",
31
+ "lxml>=4.9",
32
+ ]
33
+
34
+ [project.optional-dependencies]
35
+ playwright = ["playwright>=1.40"]
36
+ pdf = ["pymupdf>=1.23", "pymupdf4llm>=0.0.5"]
37
+ all = ["playwright>=1.40", "pymupdf>=1.23", "pymupdf4llm>=0.0.5"]
38
+ server = ["flask>=3.0"]
39
+
40
+ [project.urls]
41
+ Homepage = "https://github.com/rangehow/tofu-search"
42
+ Documentation = "https://github.com/rangehow/tofu-search#readme"
43
+
44
+ [tool.setuptools.packages.find]
45
+ include = ["tofu_search*"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,147 @@
1
+ """tofu-search — Standalone multi-engine web search with LLM content filtering.
2
+
3
+ Provides a 7-step search pipeline: multi-engine search, URL dedup,
4
+ content dedup, concurrent page fetch, optional LLM content filter,
5
+ BM25 reranking, and result formatting.
6
+
7
+ Usage::
8
+
9
+ from tofu_search import search, fetch_url, configure, format_results
10
+
11
+ # Basic search (no LLM filter — still useful):
12
+ results = search("Python asyncio tutorial")
13
+
14
+ # With OpenAI-compatible LLM for content filtering:
15
+ configure(
16
+ llm_api_key="sk-...",
17
+ llm_base_url="https://api.openai.com/v1",
18
+ llm_model="gpt-4o-mini",
19
+ )
20
+ results = search("Python asyncio tutorial")
21
+
22
+ # With custom LLM callable:
23
+ def my_llm(messages, **kwargs):
24
+ return "your response"
25
+ configure(llm_function=my_llm)
26
+ results = search("Python asyncio tutorial")
27
+
28
+ # Fetch a single URL:
29
+ content = fetch_url("https://example.com")
30
+
31
+ # Format results for display:
32
+ text = format_results(results)
33
+ """
34
+
35
+ __version__ = '0.2.0'
36
+
37
+ from tofu_search.config import SearchConfig, configure, get_config
38
+ from tofu_search.fetch.core import (
39
+ extract_urls_from_text,
40
+ fetch_page_content,
41
+ fetch_urls,
42
+ )
43
+ from tofu_search.providers import (
44
+ AuthSourceProvider,
45
+ BrowserProvider,
46
+ register_auth_source_provider,
47
+ register_browser_provider,
48
+ )
49
+ from tofu_search.search.format import format_search_for_tool_response as format_results
50
+ from tofu_search.search.orchestrator import perform_web_search
51
+ from tofu_search.search.vertical import (
52
+ detect_vertical_intent,
53
+ list_domains,
54
+ search_vertical,
55
+ search_vertical_domain,
56
+ )
57
+
58
+ __all__ = [
59
+ 'search',
60
+ 'fetch_url',
61
+ 'configure',
62
+ 'get_config',
63
+ 'SearchConfig',
64
+ 'perform_web_search',
65
+ 'format_results',
66
+ 'fetch_urls',
67
+ 'fetch_page_content',
68
+ 'extract_urls_from_text',
69
+ # Vertical (structured-identifier) search
70
+ 'detect_vertical_intent',
71
+ 'search_vertical',
72
+ 'search_vertical_domain',
73
+ 'list_domains',
74
+ # Provider seams (host integration)
75
+ 'BrowserProvider',
76
+ 'AuthSourceProvider',
77
+ 'register_browser_provider',
78
+ 'register_auth_source_provider',
79
+ ]
80
+
81
+
82
def search(query: str, *, max_results: int | None = None,
           user_question: str = '', **kwargs) -> list[dict]:
    """Run a web search and return the processed result records.

    Convenience entry point that delegates all of the work to
    ``perform_web_search``. The only logic here is building an optional
    per-call configuration and choosing the question text handed to the
    pipeline.

    Args:
        query: Search query string.
        max_results: Maximum number of results to return. Forwarded
            unchanged; ``None`` leaves the choice to ``perform_web_search``
            (the package documents a default of 6, adjustable with
            ``configure(fetch_top_n=N)``).
        user_question: The user's original question, intended to help the
            optional LLM filter judge relevance. When empty, ``query`` is
            used in its place.
        **kwargs: SearchConfig overrides for this call only. When any are
            given they are passed to ``get_config().copy(...)`` and the
            resulting config is handed to the pipeline; otherwise
            ``config=None`` is passed.

    Returns:
        The value returned by ``perform_web_search`` — documented as a list
        of result dicts with the keys:
            - title (str): Page title
            - url (str): Page URL
            - snippet (str): Search result snippet
            - source (str): Search engine name
            - full_content (str, optional): Fetched and cleaned page content

    Example::

        results = search("Python asyncio tutorial")
        for r in results:
            print(f"{r['title']}: {r['url']}")
            if r.get('full_content'):
                print(f"  Content: {r['full_content'][:200]}...")
    """
    # Only derive a per-call config when the caller actually overrides
    # something; None tells the pipeline to use its own default lookup.
    call_config = get_config().copy(**kwargs) if kwargs else None

    # An empty user_question falls back to the raw query text.
    question = user_question if user_question else query

    return perform_web_search(
        query,
        max_results=max_results,
        user_question=question,
        config=call_config,
    )
124
+
125
+
126
def fetch_url(url: str, *, max_chars: int | None = None,
              timeout: int | None = None) -> str | None:
    """Fetch one URL and return its extracted text.

    Thin wrapper around ``fetch_page_content`` that fills in the
    direct-fetch character limit from the active configuration.

    Args:
        url: URL to fetch.
        max_chars: Maximum characters of extracted text. When ``None``, the
            configured ``fetch_max_chars_direct`` value is used.
        timeout: Request timeout in seconds. Forwarded unchanged, so
            ``None`` defers to whatever ``fetch_page_content`` applies
            (the package documents 15 seconds — NOTE(review): confirm that
            ``fetch_page_content`` resolves ``None`` itself).

    Returns:
        The result of ``fetch_page_content`` — documented as the extracted
        text, or ``None`` if the fetch failed.

    Example::

        content = fetch_url("https://example.com")
        if content:
            print(f"Got {len(content)} chars")
    """
    settings = get_config()
    char_limit = settings.fetch_max_chars_direct if max_chars is None else max_chars
    return fetch_page_content(url, max_chars=char_limit, timeout=timeout)