@staticn0va/wigolo 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +142 -345
- package/dist/agent/pipeline.d.ts.map +1 -1
- package/dist/agent/pipeline.js +35 -5
- package/dist/agent/pipeline.js.map +1 -1
- package/dist/cache/store.d.ts +1 -0
- package/dist/cache/store.d.ts.map +1 -1
- package/dist/cache/store.js +4 -2
- package/dist/cache/store.js.map +1 -1
- package/dist/cli/doctor.d.ts.map +1 -1
- package/dist/cli/doctor.js +43 -17
- package/dist/cli/doctor.js.map +1 -1
- package/dist/cli/shutdown.d.ts +2 -0
- package/dist/cli/shutdown.d.ts.map +1 -0
- package/dist/cli/shutdown.js +26 -0
- package/dist/cli/shutdown.js.map +1 -0
- package/dist/extraction/v1/local-llm.d.ts.map +1 -1
- package/dist/extraction/v1/local-llm.js +13 -37
- package/dist/extraction/v1/local-llm.js.map +1 -1
- package/dist/fetch/error-describe.d.ts +7 -0
- package/dist/fetch/error-describe.d.ts.map +1 -0
- package/dist/fetch/error-describe.js +37 -0
- package/dist/fetch/error-describe.js.map +1 -0
- package/dist/fetch/router.d.ts.map +1 -1
- package/dist/fetch/router.js +4 -2
- package/dist/fetch/router.js.map +1 -1
- package/dist/index.js +17 -12
- package/dist/index.js.map +1 -1
- package/dist/integrations/cloud/llm/model-select.d.ts +5 -0
- package/dist/integrations/cloud/llm/model-select.d.ts.map +1 -0
- package/dist/integrations/cloud/llm/model-select.js +32 -0
- package/dist/integrations/cloud/llm/model-select.js.map +1 -0
- package/dist/integrations/cloud/llm/run.d.ts +27 -0
- package/dist/integrations/cloud/llm/run.d.ts.map +1 -0
- package/dist/integrations/cloud/llm/run.js +99 -0
- package/dist/integrations/cloud/llm/run.js.map +1 -0
- package/dist/integrations/cloud/llm/text-adapters.d.ts +19 -0
- package/dist/integrations/cloud/llm/text-adapters.d.ts.map +1 -0
- package/dist/integrations/cloud/llm/text-adapters.js +103 -0
- package/dist/integrations/cloud/llm/text-adapters.js.map +1 -0
- package/dist/providers/rerank-provider.d.ts +1 -0
- package/dist/providers/rerank-provider.d.ts.map +1 -1
- package/dist/providers/rerank-provider.js +13 -0
- package/dist/providers/rerank-provider.js.map +1 -1
- package/dist/research/brief.d.ts +1 -0
- package/dist/research/brief.d.ts.map +1 -1
- package/dist/research/brief.js +8 -4
- package/dist/research/brief.js.map +1 -1
- package/dist/research/pipeline.js +1 -1
- package/dist/research/pipeline.js.map +1 -1
- package/dist/research/synthesis-local.d.ts +3 -0
- package/dist/research/synthesis-local.d.ts.map +1 -1
- package/dist/research/synthesis-local.js +18 -29
- package/dist/research/synthesis-local.js.map +1 -1
- package/dist/search/filters.d.ts.map +1 -1
- package/dist/search/filters.js +11 -1
- package/dist/search/filters.js.map +1 -1
- package/dist/search/reranker/transformers-rerank-provider.d.ts +1 -0
- package/dist/search/reranker/transformers-rerank-provider.d.ts.map +1 -1
- package/dist/search/reranker/transformers-rerank-provider.js +16 -0
- package/dist/search/reranker/transformers-rerank-provider.js.map +1 -1
- package/dist/tools/cache.d.ts.map +1 -1
- package/dist/tools/cache.js +4 -2
- package/dist/tools/cache.js.map +1 -1
- package/dist/tools/fetch.d.ts.map +1 -1
- package/dist/tools/fetch.js +17 -4
- package/dist/tools/fetch.js.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -1,394 +1,191 @@
|
|
|
1
|
-
<div align="center">
|
|
2
|
-
|
|
3
1
|
# wigolo
|
|
4
2
|
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
Search, fetch, crawl, cache, and extract — ML reranking, semantic embeddings, persistent local cache. Zero API keys, zero cloud, zero cost.
|
|
8
|
-
|
|
9
|
-
[](LICENSE)
|
|
10
|
-
[](https://nodejs.org)
|
|
11
|
-
[](https://www.typescriptlang.org/)
|
|
12
|
-
|
|
13
|
-
[Quick Start](#quick-start) · [Features](#features) · [Why wigolo?](#why-wigolo)
|
|
14
|
-
|
|
15
|
-
</div>
|
|
16
|
-
|
|
17
|
-
```
|
|
18
|
-
$ npx @staticn0va/wigolo init
|
|
19
|
-
```
|
|
3
|
+
Local-first web intelligence MCP server. 8 tools: `search`, `fetch`, `crawl`, `cache`, `extract`, `find_similar`, `research`, `agent`. Runs on Node 20+. No API keys required for the core path.
|
|
20
4
|
|
|
21
|
-
|
|
5
|
+
> **Status:** `v0.1.1` — early release. The v1 retrieval engine has shipped but is still **opt-in** (`WIGOLO_SEARCH=v1`); default uses the legacy SearXNG backend. Full v1.0 will land after the cross-tool benchmark + default flip.
|
|
22
6
|
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
## What is this?
|
|
26
|
-
|
|
27
|
-
wigolo gives AI coding agents (Claude Code, Cursor, Gemini CLI, Codex, Windsurf, Zed, OpenCode) web search, page fetching, site crawling, content extraction, and a local knowledge cache. It runs entirely on your machine. No API keys, no cloud, no cost — works out of the box with `npx`.
|
|
28
|
-
|
|
29
|
-
## Quick Start
|
|
30
|
-
|
|
31
|
-
### Option A: Interactive setup (recommended)
|
|
7
|
+
## Install
|
|
32
8
|
|
|
33
9
|
```bash
|
|
34
10
|
npx @staticn0va/wigolo init
|
|
35
11
|
```
|
|
36
12
|
|
|
37
|
-
The
|
|
38
|
-
1. **System check** — verifies Node.js, Python, Docker, disk space
|
|
39
|
-
2. **Browser selection** — Lightpanda (fast headless), Chromium, or Firefox
|
|
40
|
-
3. **Install** — search engine, browser, content extractor, ML reranker, embeddings
|
|
41
|
-
4. **Verify** — starts search engine, checks all components
|
|
42
|
-
5. **Agent config** — detects and configures MCP for your AI tools
|
|
43
|
-
6. **Skill install** — writes tool documentation to each agent's instruction system
|
|
44
|
-
|
|
45
|
-
For ongoing use, install globally:
|
|
46
|
-
```bash
|
|
47
|
-
npm i -g @staticn0va/wigolo
|
|
48
|
-
wigolo init # re-run setup
|
|
49
|
-
wigolo doctor # system diagnostics
|
|
50
|
-
wigolo status # quick health check
|
|
51
|
-
wigolo shell # interactive REPL
|
|
52
|
-
```
|
|
53
|
-
|
|
54
|
-
### Option B: Manual setup
|
|
55
|
-
|
|
56
|
-
**1. Warm up:**
|
|
57
|
-
|
|
58
|
-
```bash
|
|
59
|
-
npx @staticn0va/wigolo warmup --all
|
|
60
|
-
```
|
|
13
|
+
The init flow runs a system check, downloads the embedding + reranker models, bootstraps SearXNG, detects installed AI coding agents (Claude Code, Cursor, Gemini CLI, Codex, Windsurf, Zed, OpenCode), and writes MCP config + skill docs for each one.
|
|
61
14
|
|
|
62
|
-
|
|
15
|
+
Or wire it yourself in any MCP client:
|
|
63
16
|
|
|
64
|
-
```bash
|
|
65
|
-
npx @staticn0va/wigolo warmup # browser engine + search engine only
|
|
66
|
-
npx @staticn0va/wigolo warmup --all # + reranker + trafilatura + embeddings + lightpanda + verify
|
|
67
|
-
npx @staticn0va/wigolo warmup --reranker # Install ML reranker
|
|
68
|
-
npx @staticn0va/wigolo warmup --trafilatura # Install content extractor
|
|
69
|
-
npx @staticn0va/wigolo warmup --embeddings # Install semantic embeddings
|
|
70
|
-
npx @staticn0va/wigolo warmup --verify # Start search engine, test all components
|
|
71
|
-
npx @staticn0va/wigolo warmup --force # Wipe search engine state/install/locks and re-bootstrap
|
|
72
|
-
```
|
|
73
|
-
|
|
74
|
-
**2. Connect your agent:**
|
|
75
|
-
|
|
76
|
-
**Claude Code:**
|
|
77
|
-
```bash
|
|
78
|
-
claude mcp add wigolo -- npx @staticn0va/wigolo
|
|
79
|
-
```
|
|
80
|
-
|
|
81
|
-
**Cursor / VS Code / any MCP client:**
|
|
82
17
|
```json
|
|
83
18
|
{
|
|
84
19
|
"mcpServers": {
|
|
85
20
|
"wigolo": {
|
|
86
21
|
"command": "npx",
|
|
87
|
-
"args": ["@staticn0va/wigolo"]
|
|
22
|
+
"args": ["-y", "@staticn0va/wigolo"]
|
|
88
23
|
}
|
|
89
24
|
}
|
|
90
25
|
}
|
|
91
26
|
```
|
|
92
27
|
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
## Diagnostics
|
|
96
|
-
|
|
97
|
-
```bash
|
|
98
|
-
wigolo doctor # full component health check
|
|
99
|
-
wigolo status # quick overview
|
|
100
|
-
```
|
|
101
|
-
|
|
102
|
-
Or via npx: `npx @staticn0va/wigolo doctor`. Reports the state of every component. Exits 0 when healthy, 1 when degraded. Usable in scripts: `wigolo doctor && my-agent`.
|
|
103
|
-
|
|
104
|
-
## Daemon Mode
|
|
105
|
-
|
|
106
|
-
Run wigolo as a persistent HTTP server for lower latency and shared infrastructure:
|
|
107
|
-
|
|
108
|
-
### Start the daemon
|
|
28
|
+
Global install for repeated CLI use:
|
|
109
29
|
|
|
110
30
|
```bash
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
```
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
### What works without Python?
|
|
152
|
-
|
|
153
|
-
Everything except the embedded search engine. Without Python, search falls back to direct scraping of Bing, DuckDuckGo, and Startpage — functional but less reliable. All other tools (fetch, crawl, cache, extract) work fully with just Node.js.
|
|
154
|
-
|
|
155
|
-
## Features
|
|
156
|
-
|
|
157
|
-
### search
|
|
158
|
-
|
|
159
|
-
Search the web and get full markdown content in one call — not snippets.
|
|
160
|
-
|
|
161
|
-
```
|
|
162
|
-
search("React Server Components best practices", { max_results: 5 })
|
|
163
|
-
→ titles, URLs, relevance scores, and full extracted markdown per result
|
|
164
|
-
```
|
|
165
|
-
|
|
166
|
-
- Domain filtering: `include_domains: ["react.dev"]`, `exclude_domains: ["medium.com"]`
|
|
167
|
-
- Date filtering: `from_date: "2024-01-01"`, `to_date: "2025-01-01"`
|
|
168
|
-
- Category search: `general`, `news`, `code`, `docs`, `papers`
|
|
169
|
-
- ML reranking when installed
|
|
170
|
-
- Falls back to direct engine scraping when search engine is unavailable
|
|
171
|
-
|
|
172
|
-
### fetch
|
|
173
|
-
|
|
174
|
-
Fetch any URL and get clean markdown. The page-fetching engine behind `search`.
|
|
175
|
-
|
|
176
|
-
```
|
|
177
|
-
fetch("https://docs.react.dev/reference/react/useState")
|
|
178
|
-
→ clean markdown, links, images, metadata, cached for future use
|
|
179
|
-
```
|
|
180
|
-
|
|
181
|
-
- Smart routing: HTTP first, browser engine fallback for JS-rendered pages (auto-detected)
|
|
182
|
-
- Section targeting: `section: "Parameters"` extracts content under that heading
|
|
183
|
-
- Authenticated browsing: `use_auth: true` with stored session or Chrome profile
|
|
184
|
-
- PDF support: text extraction via pdf-parse
|
|
185
|
-
|
|
186
|
-
### crawl
|
|
187
|
-
|
|
188
|
-
Crawl a site from a seed URL — documentation sites, wikis, anything.
|
|
189
|
-
|
|
190
|
-
```
|
|
191
|
-
crawl("https://docs.example.com", { strategy: "sitemap", max_pages: 50 })
|
|
192
|
-
→ array of pages with titles, markdown, depth
|
|
193
|
-
```
|
|
194
|
-
|
|
195
|
-
- Strategies: `bfs`, `dfs`, `sitemap`, `map` (URL discovery only — no content, faster)
|
|
196
|
-
- URL filtering with include/exclude patterns (regex)
|
|
197
|
-
- robots.txt compliance
|
|
198
|
-
- Cross-page content deduplication (strips repeated nav/header/footer)
|
|
199
|
-
- Total character budget to prevent context overflow
|
|
200
|
-
|
|
201
|
-
### cache
|
|
202
|
-
|
|
203
|
-
Query previously fetched content without hitting the network.
|
|
204
|
-
|
|
205
|
-
```
|
|
206
|
-
cache({ query: "React hooks", url_pattern: "*react.dev*" })
|
|
207
|
-
→ matching cached pages with full markdown
|
|
208
|
-
```
|
|
209
|
-
|
|
210
|
-
- Full-text search over all cached content
|
|
211
|
-
- Combined filters: text query + URL pattern + date range
|
|
212
|
-
- Cache stats and selective clearing
|
|
213
|
-
|
|
214
|
-
### extract
|
|
215
|
-
|
|
216
|
-
Structured data extraction from any URL or HTML.
|
|
217
|
-
|
|
218
|
-
```
|
|
219
|
-
extract("https://example.com/product", { mode: "schema", schema: { price: "string", name: "string" } })
|
|
220
|
-
→ { price: "$29.99", name: "Widget Pro" }
|
|
221
|
-
```
|
|
222
|
-
|
|
223
|
-
Modes:
|
|
224
|
-
- `selector` — CSS selector → text content
|
|
225
|
-
- `tables` — HTML tables → structured row objects
|
|
226
|
-
- `metadata` — title, description, author, date, JSON-LD
|
|
227
|
-
- `schema` — JSON Schema → heuristic field matching from page content
|
|
228
|
-
|
|
229
|
-
## Why wigolo?
|
|
230
|
-
|
|
231
|
-
| | wigolo | Tavily | Firecrawl | Exa |
|
|
232
|
-
|---|---|---|---|---|
|
|
233
|
-
| Cost | Free | $30–500/mo | $16–500/mo | $7/1K queries |
|
|
234
|
-
| API key required | None | Yes | Yes | Yes |
|
|
235
|
-
| Authenticated browsing | Yes | No | No | No |
|
|
236
|
-
| Localhost access | Yes | No | No | No |
|
|
237
|
-
| Local cache + FTS | Yes | No | No | No |
|
|
238
|
-
| Search + extract unified | Yes | Yes | Partial | Partial |
|
|
239
|
-
| ML reranking | Local | Proprietary | No | Neural index |
|
|
240
|
-
| Rate limits | None | Tiered | Tiered | Tiered |
|
|
241
|
-
|
|
242
|
-
## Configuration
|
|
243
|
-
|
|
244
|
-
wigolo works with zero configuration. For advanced use:
|
|
31
|
+
npm i -g @staticn0va/wigolo
|
|
32
|
+
wigolo --help
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
## CLI
|
|
36
|
+
|
|
37
|
+
| Command | What it does |
|
|
38
|
+
|---|---|
|
|
39
|
+
| `wigolo` (no args) | Boot MCP server on stdio (used by MCP clients) |
|
|
40
|
+
| `wigolo init` | Interactive onboarding (browser pick, agent detect, MCP config) |
|
|
41
|
+
| `wigolo warmup [--all] [--embeddings] [--reranker]` | Pre-fetch models + bootstrap SearXNG |
|
|
42
|
+
| `wigolo doctor` | Diagnostic: Python, browsers, models, SearXNG, RSS feeds, telemetry |
|
|
43
|
+
| `wigolo health` | Quick OK/degraded exit code |
|
|
44
|
+
| `wigolo serve [--port N]` | Run as HTTP daemon |
|
|
45
|
+
| `wigolo shell` | Interactive REPL against the 8 tools |
|
|
46
|
+
| `wigolo backfill [--dry-run] [--limit N]` | Embed cached pages missing vectors |
|
|
47
|
+
| `wigolo setup mcp` | Wire MCP config into installed agents |
|
|
48
|
+
| `wigolo status` | Show running daemon status |
|
|
49
|
+
| `wigolo plugin <subcommand>` | Manage plugins |
|
|
50
|
+
| `wigolo uninstall` | Remove wigolo install |
|
|
51
|
+
| `wigolo --help` / `wigolo --version` | Help + version |
|
|
52
|
+
|
|
53
|
+
## The 8 MCP tools
|
|
54
|
+
|
|
55
|
+
| Tool | Use when |
|
|
56
|
+
|---|---|
|
|
57
|
+
| `search` | Need info on a topic, no URL yet. Pass query string or array of 3-5 keyword variants for breadth. |
|
|
58
|
+
| `fetch` | Have a specific URL. Returns clean markdown + metadata. JS rendering auto-detected. |
|
|
59
|
+
| `crawl` | Need many pages from one site. Strategies: `bfs`, `dfs`, `sitemap`, `map`. |
|
|
60
|
+
| `cache` | Check the local store before going to the network. FTS5 + optional vec hybrid. |
|
|
61
|
+
| `extract` | Specific data points (tables, metadata, schema-shaped fields). Modes: `selector`, `tables`, `metadata`, `schema`, `structured`. |
|
|
62
|
+
| `find_similar` | "More like this" given a URL or concept. Hybrid FTS + embeddings + web expansion. |
|
|
63
|
+
| `research` | Multi-step investigation: decomposition → parallel search → synthesis with citations. |
|
|
64
|
+
| `agent` | Natural-language data gathering across multiple sources with optional JSON schema. |
|
|
65
|
+
|
|
66
|
+
Each tool surfaces a per-session instruction block (~2 KB) plus a `wigolo://docs/usage` resource with the full routing guide.
|
|
67
|
+
|
|
68
|
+
## Engine selection
|
|
69
|
+
|
|
70
|
+
Two retrieval paths today; toggled by env var:
|
|
245
71
|
|
|
246
72
|
```bash
|
|
247
|
-
#
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
# Authenticated browsing — export browser session state
|
|
251
|
-
WIGOLO_AUTH_STATE_PATH=~/.wigolo/auth.json
|
|
252
|
-
|
|
253
|
-
# Or use your Chrome profile directly (close Chrome first)
|
|
254
|
-
WIGOLO_CHROME_PROFILE_PATH=~/.config/google-chrome/Default
|
|
255
|
-
|
|
256
|
-
# ML reranking (install with: npx @staticn0va/wigolo warmup --reranker)
|
|
257
|
-
WIGOLO_RERANKER=onnx
|
|
258
|
-
|
|
259
|
-
# Tune extraction — auto/always/never
|
|
260
|
-
WIGOLO_TRAFILATURA=auto
|
|
261
|
-
|
|
262
|
-
# Logging
|
|
263
|
-
LOG_LEVEL=info # debug, info, warn, error
|
|
264
|
-
LOG_FORMAT=json # json, text
|
|
73
|
+
WIGOLO_SEARCH=v1 # new path: 11 direct engines, intent-routed verticals, RRF, RSS, recency boost
|
|
74
|
+
WIGOLO_SEARCH=searxng # current default: SearXNG aggregator (legacy)
|
|
265
75
|
```
|
|
266
76
|
|
|
267
|
-
|
|
77
|
+
The v1 engine ships:
|
|
78
|
+
- Direct engines per vertical — general (HN Algolia, lobste.rs, DuckDuckGo, Bing, Startpage), news (HN Algolia, lobste.rs, Bing News), code (GitHub Code, StackOverflow), docs (MDN, DevDocs), papers (arXiv, Semantic Scholar)
|
|
79
|
+
- Intent router + weighted RRF orchestrator
|
|
80
|
+
- Date-range intent classifier + recency boost
|
|
81
|
+
- Opt-in RSS feed engine (`WIGOLO_RSS_FEEDS=url1,url2`)
|
|
82
|
+
- `agent_context.recent_urls` dedup with case-insensitive path matching for IIS / archive.org / Microsoft docs
|
|
268
83
|
|
|
269
|
-
|
|
270
|
-
|---|---|---|
|
|
271
|
-
| `SEARXNG_URL` | *(auto)* | External search engine URL |
|
|
272
|
-
| `SEARXNG_MODE` | `native` | `native` or `docker` |
|
|
273
|
-
| `SEARXNG_PORT` | `8888` | Port for embedded search engine |
|
|
274
|
-
| `WIGOLO_DATA_DIR` | `~/.wigolo` | Data + cache directory |
|
|
275
|
-
| `WIGOLO_AUTH_STATE_PATH` | — | Browser session state JSON |
|
|
276
|
-
| `WIGOLO_CHROME_PROFILE_PATH` | — | Chrome user data directory |
|
|
277
|
-
| `WIGOLO_RERANKER` | `onnx` | ML reranker: `onnx` or `none` (`flashrank` accepted as legacy alias) |
|
|
278
|
-
| `WIGOLO_TRAFILATURA` | `auto` | Content extractor: `auto`, `always`, or `never` |
|
|
279
|
-
| `MAX_BROWSERS` | `3` | Concurrent browser contexts |
|
|
280
|
-
| `FETCH_TIMEOUT_MS` | `10000` | HTTP fetch timeout |
|
|
281
|
-
| `CRAWL_CONCURRENCY` | `2` | Concurrent crawl requests |
|
|
282
|
-
| `RESPECT_ROBOTS_TXT` | `true` | Honor robots.txt |
|
|
283
|
-
| `WIGOLO_BOOTSTRAP_MAX_ATTEMPTS` | `3` | Cap on search engine bootstrap auto-retries |
|
|
284
|
-
| `WIGOLO_BOOTSTRAP_BACKOFF_SECONDS` | `30,3600,86400` | Backoff seconds for retry attempts 1, 2, 3 |
|
|
285
|
-
| `WIGOLO_HEALTH_PROBE_INTERVAL_MS` | `30000` | Interval between search engine health probes |
|
|
286
|
-
| `WIGOLO_DAEMON_PORT` | `3333` | HTTP server port for daemon mode |
|
|
287
|
-
| `WIGOLO_DAEMON_HOST` | `127.0.0.1` | HTTP server bind address for daemon mode |
|
|
288
|
-
|
|
289
|
-
## How it works
|
|
84
|
+
Default stays on SearXNG until the cross-tool benchmark gate clears (Phase 16). Flip with `WIGOLO_SEARCH=v1` to try the new path now.
|
|
290
85
|
|
|
291
|
-
|
|
292
|
-
search query
|
|
293
|
-
→ search engine (70+ engines) or fallback engines (Bing/DDG/Startpage)
|
|
294
|
-
→ deduplicate by URL
|
|
295
|
-
→ domain/date/category filters
|
|
296
|
-
→ ML reranking (optional)
|
|
297
|
-
→ link validation
|
|
298
|
-
→ fetch + extract top N results in parallel
|
|
299
|
-
→ return markdown
|
|
300
|
-
|
|
301
|
-
Each step degrades gracefully:
|
|
302
|
-
Search engine down? → fallback engine scraping
|
|
303
|
-
Page needs JS? → auto-detected, browser rendering used transparently
|
|
304
|
-
Extractor fails? → ensemble pipeline (site-specific → primary → content → fallback → converter)
|
|
305
|
-
Already fetched? → served from local cache
|
|
306
|
-
```
|
|
86
|
+
## Local stack
|
|
307
87
|
|
|
308
|
-
|
|
88
|
+
- **Search aggregator (legacy path):** SearXNG — bootstrapped to `~/.wigolo/searxng/` on first run (native venv preferred, Docker fallback)
|
|
89
|
+
- **Browser:** Playwright Chromium / Firefox / WebKit; Lightpanda available as a fast JS-renderer alternative
|
|
90
|
+
- **Content extraction:** Defuddle with content-type routing for news / recipe / product / paper / event JSON-LD, plus a Mozilla Readability fallback
|
|
91
|
+
- **Embeddings:** `fastembed` running ONNX `BGE-small-en-v1.5` (384-dim) — cached under `~/.wigolo/fastembed/`
|
|
92
|
+
- **Reranker:** `@huggingface/transformers` cross-encoder `Xenova/ms-marco-MiniLM-L-6-v2` — cached under `~/.wigolo/transformers/`
|
|
93
|
+
- **Cache:** SQLite WAL + FTS5; optional vector hybrid via `sqlite-vec` when the extension is loadable on your platform
|
|
94
|
+
- **Process model:** stdio MCP server by default; HTTP daemon (`wigolo serve`) and REPL (`wigolo shell`) also available
|
|
309
95
|
|
|
310
|
-
|
|
311
|
-
1. Site-specific extractors (GitHub, Stack Overflow, MDN, docs frameworks)
|
|
312
|
-
2. Primary extractor — markdown-aware, site-adaptive
|
|
313
|
-
3. Content extraction engine — high-precision article extraction (optional, Python)
|
|
314
|
-
4. Fallback extractor — battle-tested browser-compat algorithm
|
|
315
|
-
5. HTML-to-markdown converter — last resort
|
|
96
|
+
## LLM extraction fallback (optional)
|
|
316
97
|
|
|
317
|
-
|
|
98
|
+
`extract` with `mode: "schema"` falls back to an LLM when heuristics miss. Set one of:
|
|
318
99
|
|
|
319
|
-
## Discovery
|
|
320
|
-
|
|
321
|
-
wigolo is listed on MCP server registries for agent discovery:
|
|
322
|
-
|
|
323
|
-
- **SKILL.md** — machine-readable tool description at repo root, auto-installed to each agent's instruction system by `wigolo init`
|
|
324
|
-
- **npm** — `npm info @staticn0va/wigolo` or search for `mcp-server` keyword
|
|
325
|
-
|
|
326
|
-
The `init` TUI automatically configures MCP and installs SKILL.md for all selected agents. Manual setup:
|
|
327
100
|
```bash
|
|
328
|
-
|
|
101
|
+
export ANTHROPIC_API_KEY=...
|
|
102
|
+
export OPENAI_API_KEY=...
|
|
103
|
+
export GOOGLE_API_KEY=...
|
|
104
|
+
export GROQ_API_KEY=...
|
|
105
|
+
# optional: pin which provider
|
|
106
|
+
export WIGOLO_LLM_PROVIDER=anthropic   # one of: anthropic, openai, gemini, groq
|
|
329
107
|
```
|
|
330
108
|
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
Start with `npx @staticn0va/wigolo doctor` — it reports the state of every component and is the fastest way to find the cause.
|
|
334
|
-
|
|
335
|
-
**First search is slow or returns odd results**
|
|
336
|
-
Search engine is still bootstrapping in the background. Either wait a minute, or (recommended) run `npx @staticn0va/wigolo warmup --all` before connecting your agent.
|
|
337
|
-
|
|
338
|
-
**ML reranker / content extractor / embeddings "not installed"**
|
|
339
|
-
These are optional Python extras. Install them with `npx @staticn0va/wigolo warmup --all` (or per-component: `--reranker`, `--trafilatura`, `--embeddings`). wigolo uses a private venv under `~/.wigolo/searxng/venv` so your system Python stays untouched.
|
|
340
|
-
|
|
341
|
-
**Search engine won't start**
|
|
342
|
-
Make sure `python3` is on your PATH and version 3.8+. Check with `python3 --version`. If bootstrap got interrupted, `npx @staticn0va/wigolo warmup --force` wipes the state and reinstalls. Alternatively, set `SEARXNG_MODE=docker` if Docker is available.
|
|
109
|
+
If no key is set, the fallback is skipped — `extract` still works through the heuristic path. Calls are cached (default 7 days) and rate-limited per request.
|
|
343
110
|
|
|
344
|
-
|
|
345
|
-
That's expected when you haven't made a search yet — the process starts on-demand when the MCP server needs it. Doctor only marks it degraded if the install is broken.
|
|
111
|
+
## Local LLM fallback (research synthesis)
|
|
346
112
|
|
|
347
|
-
|
|
348
|
-
Run `npx @staticn0va/wigolo warmup` to download Chromium. This is done automatically on first use but can fail behind corporate proxies.
|
|
113
|
+
Set `WIGOLO_LLM_PROVIDER=openai-compatible` plus `WIGOLO_LLM_ENDPOINT=http://localhost:11434/v1` to let `research` use a local model (e.g. Ollama) when the host MCP client doesn't support sampling.
|
|
349
114
|
|
|
350
|
-
|
|
351
|
-
If all search engines fail, check your network connection. Behind a proxy? Set `PROXY_URL=http://your-proxy:port`.
|
|
115
|
+
## Config flags worth knowing
|
|
352
116
|
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
117
|
+
| Env var | Default | What it does |
|
|
118
|
+
|---|---|---|
|
|
119
|
+
| `WIGOLO_SEARCH` | `searxng` | `v1` to use the new direct-engine path |
|
|
120
|
+
| `WIGOLO_RSS_FEEDS` | unset | Comma-separated feed URLs; v1 news vertical picks them up |
|
|
121
|
+
| `WIGOLO_DEDUP_CASE_INSENSITIVE_HOSTS` | unset | Comma-separated hostnames where `/A` == `/a` for dedup |
|
|
122
|
+
| `WIGOLO_CRAWL_INDEX` | `0` | `1` to fire-and-forget upsert crawled pages into the vector store |
|
|
123
|
+
| `WIGOLO_EAGER_WARMUP` | `0` | `1` warms embed + rerank at MCP server start (non-blocking) |
|
|
124
|
+
| `WIGOLO_TELEMETRY` | `0` | `1` enables opt-in NDJSON telemetry; off by default |
|
|
125
|
+
| `WIGOLO_DATA_DIR` | `~/.wigolo` | Override data dir for cache, models, SearXNG state |
|
|
126
|
+
| `WIGOLO_LOG_LEVEL` | `info` | `debug`, `info`, `warn`, `error` |
|
|
127
|
+
| `WIGOLO_LOG_FORMAT` | `json` | `json` or `text`; both write to stderr |
|
|
128
|
+
|
|
129
|
+
## What's known to work
|
|
130
|
+
|
|
131
|
+
- 8 MCP tools, full test suite passing (3500+ unit + integration tests on macOS arm64)
|
|
132
|
+
- `init` flow on macOS for Claude Code, Cursor, Gemini CLI, Codex, Windsurf, Zed, OpenCode
|
|
133
|
+
- `WIGOLO_SEARCH=v1` runs end-to-end: intent routing, direct engines, RRF, recency boost, agent_context
|
|
134
|
+
- SQLite + sqlite-vec hybrid on macOS arm64; FTS5-only graceful degradation on alpine/musl (sqlite-vec extension absent)
|
|
135
|
+
- Defuddle extraction with content-type routing (news/recipe/product/paper/event JSON-LD)
|
|
136
|
+
- Conditional GET with true 304 short-circuit through SmartRouter (saves bandwidth on revisits)
|
|
137
|
+
- Doctor + warmup + backfill CLIs
|
|
138
|
+
- Opt-in eager warmup, telemetry, RSS feeds
|
|
139
|
+
|
|
140
|
+
## What's still gated / not done
|
|
141
|
+
|
|
142
|
+
- **Phase 16** — 5-way blind bench (wigolo-v1 vs wigolo-legacy vs Tavily vs Exa vs Firecrawl) not yet captured at the current SHA. Smoke `--subset 2` on this build: `wigolo-v1=10, wigolo-legacy=9.5, exa=10, firecrawl=10, tavily=8`. Full run needed before flipping the default.
|
|
143
|
+
- **Phase 17 default flip** — `WIGOLO_SEARCH` will stay `searxng` until Phase 16 passes. Set `WIGOLO_SEARCH=v1` to opt in now.
|
|
144
|
+
- **Phase 18 v1.0 release** — pinned to bench-gated default flip. This `0.1.1` build is the engine-overhaul snapshot.
|
|
145
|
+
- **Onboarding for engine/RSS/LLM in Ink TUI** — plain `init` asks the new prompts; the Ink TUI phase machine still needs a matching screen.
|
|
146
|
+
- **Bench numbers** — Phase 6/7/8/11/12/13/15 perf + extraction benches are scaffolded but their numbers haven't been captured to `benchmarks/*/output/`.
|
|
147
|
+
|
|
148
|
+
## Architecture in one glance
|
|
149
|
+
|
|
150
|
+
```
|
|
151
|
+
src/
|
|
152
|
+
index.ts CLI router
|
|
153
|
+
server.ts MCP server (8 tools + 1 resource)
|
|
154
|
+
config.ts 52+ env vars
|
|
155
|
+
cli/ warmup, doctor, health, auth, plugin, shell, init, status, backfill, setup-mcp
|
|
156
|
+
tools/ thin MCP handlers (one per tool, delegate to domain)
|
|
157
|
+
fetch/ SmartRouter (HTTP-first → Playwright), browser pool, auth, Lightpanda
|
|
158
|
+
extraction/ Defuddle + content-type routing + named schemas + LLM fallback
|
|
159
|
+
search/ SearXNG client + direct engines + dedup + rerank + RRF + multi-query + answer synth
|
|
160
|
+
search/v1/ v1 engine: intent router + verticals + orchestrator + RSS + recency + context-rank
|
|
161
|
+
crawl/ BFS/DFS/sitemap/map + robots.txt + ETag-incremental
|
|
162
|
+
cache/ SQLite FTS5 + sqlite-vec hybrid + migrations + backfill
|
|
163
|
+
embedding/ fastembed (BGE-small-en-v1.5)
|
|
164
|
+
research/ decomposition → parallel search → synthesis + citation graph
|
|
165
|
+
agent/ plan → execute → synthesize
|
|
166
|
+
searxng/ process + Docker management + bootstrap retry
|
|
167
|
+
providers/ embed, rerank, extract, vector-store, search interfaces
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
## Common pitfalls
|
|
171
|
+
|
|
172
|
+
- **First run is slow** — model downloads (~250 MB combined) + SearXNG bootstrap (~30 s). Running `wigolo warmup --all` up front moves that cost out of the first MCP request.
|
|
173
|
+
- **`wigolo doctor` shows `ML reranker: not installed`** — run `wigolo warmup --reranker` to fetch the cross-encoder model (~22 MB).
|
|
174
|
+
- **`category: 'images'` rejected on `WIGOLO_SEARCH=v1`** — v1 has no images vertical (yet). Use the legacy path (`WIGOLO_SEARCH=searxng`) or omit `category`.
|
|
175
|
+
- **`sqlite-vec extension failed to load`** — your platform (e.g. alpine/musl) has no prebuilt `sqlite-vec` binaries. The cache still works via FTS5; vector search is disabled.
|
|
176
|
+
|
|
177
|
+
## Development
|
|
365
178
|
|
|
366
179
|
```bash
|
|
367
180
|
git clone https://github.com/KnockOutEZ/wigolo
|
|
368
181
|
cd wigolo
|
|
369
182
|
npm install
|
|
370
|
-
npm
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
Releases are triggered by pushing a version tag. CI handles the rest.
|
|
376
|
-
|
|
377
|
-
```bash
|
|
378
|
-
# on main, all changes committed and pushed
|
|
379
|
-
make release-patch # or: release-minor / release-major
|
|
183
|
+
npm run build # tsup → dist/, then tsc → dist/*.d.ts
|
|
184
|
+
npm test # full vitest suite
|
|
185
|
+
npm run lint # tsc --noEmit
|
|
186
|
+
npm run dev # tsx src/index.ts
|
|
380
187
|
```
|
|
381
188
|
|
|
382
|
-
Run `make help` for all targets, or `make release-dry-run` to preview the npm tarball.
|
|
383
|
-
|
|
384
|
-
The `release` workflow will:
|
|
385
|
-
1. Build a clean `dist/`
|
|
386
|
-
2. Verify the tag matches `package.json` version
|
|
387
|
-
3. Publish to npm with provenance
|
|
388
|
-
4. Create a GitHub Release with auto-generated notes
|
|
389
|
-
|
|
390
|
-
Requires the `NPM_TOKEN` repository secret (npm automation token with publish scope).
|
|
391
|
-
|
|
392
189
|
## License
|
|
393
190
|
|
|
394
|
-
|
|
191
|
+
BUSL-1.1 — see `LICENSE`.
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"pipeline.d.ts","sourceRoot":"","sources":["../../src/agent/pipeline.ts"],"names":[],"mappings":"AAIA,OAAO,EACL,KAAK,qBAAqB,EAG3B,MAAM,uBAAuB,CAAC;
|
|
1
|
+
{"version":3,"file":"pipeline.d.ts","sourceRoot":"","sources":["../../src/agent/pipeline.ts"],"names":[],"mappings":"AAIA,OAAO,EACL,KAAK,qBAAqB,EAG3B,MAAM,uBAAuB,CAAC;AAE/B,OAAO,KAAK,EACV,UAAU,EACV,WAAW,EAGX,YAAY,EACb,MAAM,aAAa,CAAC;AACrB,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,oBAAoB,CAAC;AAQtD,wBAAsB,gBAAgB,CACpC,KAAK,EAAE,UAAU,EACjB,OAAO,EAAE,YAAY,EAAE,EACvB,MAAM,EAAE,WAAW,EACnB,MAAM,CAAC,EAAE,qBAAqB,GAC7B,OAAO,CAAC,WAAW,CAAC,CA+FtB"}
|
package/dist/agent/pipeline.js
CHANGED
|
@@ -6,6 +6,7 @@ import {
|
|
|
6
6
|
requestSampling,
|
|
7
7
|
checkSamplingSupport
|
|
8
8
|
} from "../search/sampling.js";
|
|
9
|
+
import { isLlmConfigured, runLlmText } from "../integrations/cloud/llm/run.js";
|
|
9
10
|
const log = createLogger("agent");
|
|
10
11
|
const DEFAULT_MAX_PAGES = 10;
|
|
11
12
|
const DEFAULT_MAX_TIME_MS = 6e4;
|
|
@@ -56,10 +57,12 @@ async function runAgentPipeline(input, engines, router, server) {
|
|
|
56
57
|
}
|
|
57
58
|
}
|
|
58
59
|
const synthStart = Date.now();
|
|
59
|
-
const result = await synthesizeResult(input.prompt, sources, server);
|
|
60
|
+
const { result, samplingUsed, llmUsed } = await synthesizeResult(input.prompt, sources, server);
|
|
61
|
+
const resultLen = typeof result === "string" ? result.length : JSON.stringify(result).length;
|
|
62
|
+
const synthPath = samplingUsed ? " (via sampling)" : llmUsed ? " (via configured LLM)" : " (evidence fallback)";
|
|
60
63
|
steps.push({
|
|
61
64
|
action: "synthesize",
|
|
62
|
-
detail: `Produced ${
|
|
65
|
+
detail: `Produced ${resultLen} char result${synthPath}`,
|
|
63
66
|
time_ms: Date.now() - synthStart
|
|
64
67
|
});
|
|
65
68
|
return {
|
|
@@ -120,19 +123,46 @@ function applySchemaExtraction(sources, schema) {
|
|
|
120
123
|
async function synthesizeResult(prompt, sources, server) {
|
|
121
124
|
const fetchedSources = sources.filter((s) => s.fetched && s.markdown_content.length > 0);
|
|
122
125
|
if (fetchedSources.length === 0) {
|
|
123
|
-
return "No data could be gathered for this request.";
|
|
126
|
+
return { result: "No data could be gathered for this request.", samplingUsed: false };
|
|
124
127
|
}
|
|
125
128
|
if (server) {
|
|
126
129
|
try {
|
|
127
130
|
const result = await synthesizeWithSampling(prompt, fetchedSources, server);
|
|
128
|
-
if (result) return result;
|
|
131
|
+
if (result) return { result, samplingUsed: true };
|
|
129
132
|
} catch (err) {
|
|
130
133
|
log.warn("sampling synthesis failed, using fallback", {
|
|
131
134
|
error: err instanceof Error ? err.message : String(err)
|
|
132
135
|
});
|
|
133
136
|
}
|
|
134
137
|
}
|
|
135
|
-
|
|
138
|
+
if (isLlmConfigured()) {
|
|
139
|
+
try {
|
|
140
|
+
const result = await synthesizeViaLlmRunner(prompt, fetchedSources);
|
|
141
|
+
if (result) return { result, samplingUsed: false, llmUsed: true };
|
|
142
|
+
} catch (err) {
|
|
143
|
+
log.warn("llm runner synthesis failed, using evidence fallback", {
|
|
144
|
+
error: err instanceof Error ? err.message : String(err)
|
|
145
|
+
});
|
|
146
|
+
}
|
|
147
|
+
}
|
|
148
|
+
return { result: buildFallbackSynthesis(prompt, fetchedSources), samplingUsed: false };
|
|
149
|
+
}
|
|
150
|
+
async function synthesizeViaLlmRunner(prompt, sources) {
|
|
151
|
+
const maxCharsPerSource = 3e3;
|
|
152
|
+
const sourceBlocks = sources.map((s, i) => {
|
|
153
|
+
const content = s.markdown_content.slice(0, maxCharsPerSource);
|
|
154
|
+
return `[${i + 1}] ${s.title} (${s.url})
|
|
155
|
+
${content}`;
|
|
156
|
+
});
|
|
157
|
+
const truncated = sourceBlocks.join("\n\n").slice(0, 4e4);
|
|
158
|
+
const fullPrompt = `You are a data gathering assistant. Based on the user request and the gathered sources, synthesize a clear, well-organized response. Cite sources as [1], [2], etc.
|
|
159
|
+
|
|
160
|
+
User request: ${prompt}
|
|
161
|
+
|
|
162
|
+
Sources:
|
|
163
|
+
${truncated}`;
|
|
164
|
+
const r = await runLlmText({ prompt: fullPrompt, maxTokens: 2e3 });
|
|
165
|
+
return r.text && r.text.trim().length > 0 ? r.text.trim() : null;
|
|
136
166
|
}
|
|
137
167
|
async function synthesizeWithSampling(prompt, sources, server) {
|
|
138
168
|
try {
|