@staticn0va/wigolo 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +74 -0
- package/README.md +272 -0
- package/dist/cache/db.d.ts +5 -0
- package/dist/cache/db.d.ts.map +1 -0
- package/dist/cache/db.js +97 -0
- package/dist/cache/db.js.map +1 -0
- package/dist/cache/store.d.ts +26 -0
- package/dist/cache/store.d.ts.map +1 -0
- package/dist/cache/store.js +214 -0
- package/dist/cache/store.js.map +1 -0
- package/dist/cli/daemon.d.ts +2 -0
- package/dist/cli/daemon.d.ts.map +1 -0
- package/dist/cli/daemon.js +5 -0
- package/dist/cli/daemon.js.map +1 -0
- package/dist/cli/health.d.ts +2 -0
- package/dist/cli/health.d.ts.map +1 -0
- package/dist/cli/health.js +5 -0
- package/dist/cli/health.js.map +1 -0
- package/dist/cli/index.d.ts +7 -0
- package/dist/cli/index.d.ts.map +1 -0
- package/dist/cli/index.js +9 -0
- package/dist/cli/index.js.map +1 -0
- package/dist/cli/warmup.d.ts +11 -0
- package/dist/cli/warmup.d.ts.map +1 -0
- package/dist/cli/warmup.js +107 -0
- package/dist/cli/warmup.js.map +1 -0
- package/dist/config.d.ts +41 -0
- package/dist/config.d.ts.map +1 -0
- package/dist/config.js +66 -0
- package/dist/config.js.map +1 -0
- package/dist/crawl/crawler.d.ts +18 -0
- package/dist/crawl/crawler.d.ts.map +1 -0
- package/dist/crawl/crawler.js +228 -0
- package/dist/crawl/crawler.js.map +1 -0
- package/dist/crawl/dedup.d.ts +15 -0
- package/dist/crawl/dedup.d.ts.map +1 -0
- package/dist/crawl/dedup.js +93 -0
- package/dist/crawl/dedup.js.map +1 -0
- package/dist/crawl/mapper.d.ts +17 -0
- package/dist/crawl/mapper.d.ts.map +1 -0
- package/dist/crawl/mapper.js +178 -0
- package/dist/crawl/mapper.js.map +1 -0
- package/dist/crawl/rate-limiter.d.ts +10 -0
- package/dist/crawl/rate-limiter.d.ts.map +1 -0
- package/dist/crawl/rate-limiter.js +72 -0
- package/dist/crawl/rate-limiter.js.map +1 -0
- package/dist/crawl/robots.d.ts +9 -0
- package/dist/crawl/robots.d.ts.map +1 -0
- package/dist/crawl/robots.js +63 -0
- package/dist/crawl/robots.js.map +1 -0
- package/dist/crawl/sitemap.d.ts +4 -0
- package/dist/crawl/sitemap.d.ts.map +1 -0
- package/dist/crawl/sitemap.js +38 -0
- package/dist/crawl/sitemap.js.map +1 -0
- package/dist/crawl/url-utils.d.ts +3 -0
- package/dist/crawl/url-utils.d.ts.map +1 -0
- package/dist/crawl/url-utils.js +41 -0
- package/dist/crawl/url-utils.js.map +1 -0
- package/dist/extraction/defuddle.d.ts +3 -0
- package/dist/extraction/defuddle.d.ts.map +1 -0
- package/dist/extraction/defuddle.js +26 -0
- package/dist/extraction/defuddle.js.map +1 -0
- package/dist/extraction/extract.d.ts +5 -0
- package/dist/extraction/extract.d.ts.map +1 -0
- package/dist/extraction/extract.js +83 -0
- package/dist/extraction/extract.js.map +1 -0
- package/dist/extraction/jsonld.d.ts +4 -0
- package/dist/extraction/jsonld.d.ts.map +1 -0
- package/dist/extraction/jsonld.js +64 -0
- package/dist/extraction/jsonld.js.map +1 -0
- package/dist/extraction/markdown.d.ts +10 -0
- package/dist/extraction/markdown.d.ts.map +1 -0
- package/dist/extraction/markdown.js +107 -0
- package/dist/extraction/markdown.js.map +1 -0
- package/dist/extraction/pipeline.d.ts +11 -0
- package/dist/extraction/pipeline.d.ts.map +1 -0
- package/dist/extraction/pipeline.js +95 -0
- package/dist/extraction/pipeline.js.map +1 -0
- package/dist/extraction/readability.d.ts +3 -0
- package/dist/extraction/readability.d.ts.map +1 -0
- package/dist/extraction/readability.js +32 -0
- package/dist/extraction/readability.js.map +1 -0
- package/dist/extraction/schema.d.ts +7 -0
- package/dist/extraction/schema.d.ts.map +1 -0
- package/dist/extraction/schema.js +86 -0
- package/dist/extraction/schema.js.map +1 -0
- package/dist/extraction/site-extractors/docs-generic.d.ts +3 -0
- package/dist/extraction/site-extractors/docs-generic.d.ts.map +1 -0
- package/dist/extraction/site-extractors/docs-generic.js +104 -0
- package/dist/extraction/site-extractors/docs-generic.js.map +1 -0
- package/dist/extraction/site-extractors/github.d.ts +3 -0
- package/dist/extraction/site-extractors/github.d.ts.map +1 -0
- package/dist/extraction/site-extractors/github.js +107 -0
- package/dist/extraction/site-extractors/github.js.map +1 -0
- package/dist/extraction/site-extractors/mdn.d.ts +3 -0
- package/dist/extraction/site-extractors/mdn.d.ts.map +1 -0
- package/dist/extraction/site-extractors/mdn.js +58 -0
- package/dist/extraction/site-extractors/mdn.js.map +1 -0
- package/dist/extraction/site-extractors/stackoverflow.d.ts +3 -0
- package/dist/extraction/site-extractors/stackoverflow.d.ts.map +1 -0
- package/dist/extraction/site-extractors/stackoverflow.js +88 -0
- package/dist/extraction/site-extractors/stackoverflow.js.map +1 -0
- package/dist/extraction/trafilatura.d.ts +6 -0
- package/dist/extraction/trafilatura.d.ts.map +1 -0
- package/dist/extraction/trafilatura.js +105 -0
- package/dist/extraction/trafilatura.js.map +1 -0
- package/dist/fetch/auth.d.ts +8 -0
- package/dist/fetch/auth.d.ts.map +1 -0
- package/dist/fetch/auth.js +32 -0
- package/dist/fetch/auth.js.map +1 -0
- package/dist/fetch/browser-pool.d.ts +28 -0
- package/dist/fetch/browser-pool.d.ts.map +1 -0
- package/dist/fetch/browser-pool.js +138 -0
- package/dist/fetch/browser-pool.js.map +1 -0
- package/dist/fetch/content-check.d.ts +2 -0
- package/dist/fetch/content-check.d.ts.map +1 -0
- package/dist/fetch/content-check.js +62 -0
- package/dist/fetch/content-check.js.map +1 -0
- package/dist/fetch/http-client.d.ts +15 -0
- package/dist/fetch/http-client.d.ts.map +1 -0
- package/dist/fetch/http-client.js +146 -0
- package/dist/fetch/http-client.js.map +1 -0
- package/dist/fetch/router.d.ts +45 -0
- package/dist/fetch/router.d.ts.map +1 -0
- package/dist/fetch/router.js +89 -0
- package/dist/fetch/router.js.map +1 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +22 -0
- package/dist/index.js.map +1 -0
- package/dist/logger.d.ts +10 -0
- package/dist/logger.d.ts.map +1 -0
- package/dist/logger.js +39 -0
- package/dist/logger.js.map +1 -0
- package/dist/search/dedup.d.ts +10 -0
- package/dist/search/dedup.d.ts.map +1 -0
- package/dist/search/dedup.js +35 -0
- package/dist/search/dedup.js.map +1 -0
- package/dist/search/engines/bing.d.ts +7 -0
- package/dist/search/engines/bing.d.ts.map +1 -0
- package/dist/search/engines/bing.js +48 -0
- package/dist/search/engines/bing.js.map +1 -0
- package/dist/search/engines/duckduckgo.d.ts +7 -0
- package/dist/search/engines/duckduckgo.d.ts.map +1 -0
- package/dist/search/engines/duckduckgo.js +50 -0
- package/dist/search/engines/duckduckgo.js.map +1 -0
- package/dist/search/engines/startpage.d.ts +7 -0
- package/dist/search/engines/startpage.d.ts.map +1 -0
- package/dist/search/engines/startpage.js +50 -0
- package/dist/search/engines/startpage.js.map +1 -0
- package/dist/search/filters.d.ts +16 -0
- package/dist/search/filters.d.ts.map +1 -0
- package/dist/search/filters.js +63 -0
- package/dist/search/filters.js.map +1 -0
- package/dist/search/flashrank.d.ts +12 -0
- package/dist/search/flashrank.d.ts.map +1 -0
- package/dist/search/flashrank.js +63 -0
- package/dist/search/flashrank.js.map +1 -0
- package/dist/search/query.d.ts +2 -0
- package/dist/search/query.d.ts.map +1 -0
- package/dist/search/query.js +41 -0
- package/dist/search/query.js.map +1 -0
- package/dist/search/rerank.d.ts +3 -0
- package/dist/search/rerank.d.ts.map +1 -0
- package/dist/search/rerank.js +40 -0
- package/dist/search/rerank.js.map +1 -0
- package/dist/search/searxng.d.ts +8 -0
- package/dist/search/searxng.d.ts.map +1 -0
- package/dist/search/searxng.js +87 -0
- package/dist/search/searxng.js.map +1 -0
- package/dist/search/validator.d.ts +6 -0
- package/dist/search/validator.d.ts.map +1 -0
- package/dist/search/validator.js +35 -0
- package/dist/search/validator.js.map +1 -0
- package/dist/searxng/bootstrap.d.ts +18 -0
- package/dist/searxng/bootstrap.d.ts.map +1 -0
- package/dist/searxng/bootstrap.js +136 -0
- package/dist/searxng/bootstrap.js.map +1 -0
- package/dist/searxng/docker.d.ts +9 -0
- package/dist/searxng/docker.d.ts.map +1 -0
- package/dist/searxng/docker.js +67 -0
- package/dist/searxng/docker.js.map +1 -0
- package/dist/searxng/process.d.ts +23 -0
- package/dist/searxng/process.d.ts.map +1 -0
- package/dist/searxng/process.js +188 -0
- package/dist/searxng/process.js.map +1 -0
- package/dist/server.d.ts +2 -0
- package/dist/server.d.ts.map +1 -0
- package/dist/server.js +311 -0
- package/dist/server.js.map +1 -0
- package/dist/tools/cache.d.ts +3 -0
- package/dist/tools/cache.d.ts.map +1 -0
- package/dist/tools/cache.js +50 -0
- package/dist/tools/cache.js.map +1 -0
- package/dist/tools/crawl.d.ts +6 -0
- package/dist/tools/crawl.d.ts.map +1 -0
- package/dist/tools/crawl.js +97 -0
- package/dist/tools/crawl.js.map +1 -0
- package/dist/tools/extract.d.ts +4 -0
- package/dist/tools/extract.d.ts.map +1 -0
- package/dist/tools/extract.js +69 -0
- package/dist/tools/extract.js.map +1 -0
- package/dist/tools/fetch.d.ts +4 -0
- package/dist/tools/fetch.d.ts.map +1 -0
- package/dist/tools/fetch.js +76 -0
- package/dist/tools/fetch.js.map +1 -0
- package/dist/tools/search.d.ts +4 -0
- package/dist/tools/search.d.ts.map +1 -0
- package/dist/tools/search.js +160 -0
- package/dist/tools/search.js.map +1 -0
- package/dist/types.d.ts +222 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +2 -0
- package/dist/types.js.map +1 -0
- package/package.json +61 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
Business Source License 1.1
|
|
2
|
+
|
|
3
|
+
License text copyright (c) 2017 MariaDB Corporation Ab, All Rights Reserved.
|
|
4
|
+
"Business Source License" is a trademark of MariaDB Corporation Ab.
|
|
5
|
+
|
|
6
|
+
Parameters
|
|
7
|
+
|
|
8
|
+
Licensor: Towhid Khan
|
|
9
|
+
Licensed Work: wigolo
|
|
10
|
+
The Licensed Work is (c) 2026 Towhid Khan
|
|
11
|
+
Additional Use Grant: You may make production use of the Licensed Work,
|
|
12
|
+
provided that such use does not include offering
|
|
13
|
+
the Licensed Work to third parties on a hosted or
|
|
14
|
+
embedded basis which competes with the Licensor's
|
|
15
|
+
paid offerings, and provided that your organization's
|
|
16
|
+
total annual revenue does not exceed US $1,000,000,
|
|
17
|
+
or you are using the Licensed Work for personal,
|
|
18
|
+
educational, or non-commercial open source purposes.
|
|
19
|
+
|
|
20
|
+
Change Date: 2029-04-12
|
|
21
|
+
|
|
22
|
+
Change License: MIT License
|
|
23
|
+
|
|
24
|
+
For information about alternative licensing arrangements, contact:
|
|
25
|
+
ktowhid20@gmail.com
|
|
26
|
+
|
|
27
|
+
Notice
|
|
28
|
+
|
|
29
|
+
The Business Source License (this document, or the "License") is not an Open
|
|
30
|
+
Source license. However, the Licensed Work will eventually be made available
|
|
31
|
+
under an Open Source License, as stated in this License.
|
|
32
|
+
|
|
33
|
+
License terms copyright (c) 2017 MariaDB Corporation Ab, All Rights Reserved.
|
|
34
|
+
|
|
35
|
+
Terms
|
|
36
|
+
|
|
37
|
+
The Licensor hereby grants you the right to copy, modify, create derivative
|
|
38
|
+
works, redistribute, and make non-production use of the Licensed Work. The
|
|
39
|
+
Licensor may make an Additional Use Grant, above, permitting limited production
|
|
40
|
+
use.
|
|
41
|
+
|
|
42
|
+
Effective on the Change Date, or the fourth anniversary of the first publicly
|
|
43
|
+
available distribution of a specific version of the Licensed Work under this
|
|
44
|
+
License, whichever comes first, the Licensor hereby grants you rights under the
|
|
45
|
+
terms of the Change License, and the rights granted in the paragraph above
|
|
46
|
+
terminate.
|
|
47
|
+
|
|
48
|
+
If your use of the Licensed Work does not comply with the requirements
|
|
49
|
+
currently in effect as described in this License, you must purchase a
|
|
50
|
+
commercial license from the Licensor, its affiliated entities, or authorized
|
|
51
|
+
resellers, or you must refrain from using the Licensed Work.
|
|
52
|
+
|
|
53
|
+
All copies of the original and modified Licensed Work, and derivative works
|
|
54
|
+
of the Licensed Work, are subject to this License. This License applies
|
|
55
|
+
separately for each version of the Licensed Work and the Change Date may vary
|
|
56
|
+
for each version of the Licensed Work released by Licensor.
|
|
57
|
+
|
|
58
|
+
You must conspicuously display this License on each original or modified copy
|
|
59
|
+
of the Licensed Work. If you receive the Licensed Work in original or modified
|
|
60
|
+
form from a third party, the terms and conditions set forth in this License
|
|
61
|
+
apply to your use of that work.
|
|
62
|
+
|
|
63
|
+
Any use of the Licensed Work in violation of this License will automatically
|
|
64
|
+
terminate your rights under this License for the current and all other versions
|
|
65
|
+
of the Licensed Work.
|
|
66
|
+
|
|
67
|
+
This License does not grant you any right in any trademark or logo of Licensor
|
|
68
|
+
or its affiliates (provided that you may use a trademark or logo of Licensor as
|
|
69
|
+
expressly required by this License).
|
|
70
|
+
|
|
71
|
+
TO THE EXTENT PERMITTED BY APPLICABLE LAW, THE LICENSED WORK IS PROVIDED ON AN
|
|
72
|
+
"AS IS" BASIS. LICENSOR HEREBY DISCLAIMS ALL WARRANTIES AND CONDITIONS, EXPRESS
|
|
73
|
+
OR IMPLIED, INCLUDING (WITHOUT LIMITATION) WARRANTIES OF MERCHANTABILITY,
|
|
74
|
+
FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, AND TITLE.
|
package/README.md
ADDED
|
@@ -0,0 +1,272 @@
|
|
|
1
|
+
<div align="center">
|
|
2
|
+
|
|
3
|
+
# wigolo
|
|
4
|
+
|
|
5
|
+
**Local-first web search MCP server for AI coding agents.**
|
|
6
|
+
|
|
7
|
+
Search, fetch, crawl, cache, and extract — zero API keys, zero cloud, zero cost.
|
|
8
|
+
|
|
9
|
+
[License: BSL 1.1](LICENSE)
|
|
10
|
+
[Node.js 20+](https://nodejs.org)
|
|
11
|
+
[TypeScript](https://www.typescriptlang.org/)
|
|
12
|
+
|
|
13
|
+
[Quick Start](#quick-start) · [Features](#features) · [Why wigolo?](#why-wigolo) · [Roadmap](#roadmap)
|
|
14
|
+
|
|
15
|
+
</div>
|
|
16
|
+
|
|
17
|
+
```
|
|
18
|
+
$ claude mcp add wigolo -- npx wigolo
|
|
19
|
+
Added MCP server wigolo
|
|
20
|
+
|
|
21
|
+
$ # That's it. Your agent now has web search.
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## What is this?
|
|
25
|
+
|
|
26
|
+
wigolo gives AI coding agents (Claude Code, Cursor, Gemini CLI, Codex, Windsurf) web search, page fetching, site crawling, content extraction, and a local knowledge cache. It runs entirely on your machine. No API keys, no cloud, no cost — works out of the box with `npx`.
|
|
27
|
+
|
|
28
|
+
## Quick Start
|
|
29
|
+
|
|
30
|
+
**Claude Code:**
|
|
31
|
+
```bash
|
|
32
|
+
claude mcp add wigolo -- npx wigolo
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
**Cursor / VS Code / any MCP client:**
|
|
36
|
+
```json
|
|
37
|
+
{
|
|
38
|
+
"mcpServers": {
|
|
39
|
+
"wigolo": {
|
|
40
|
+
"command": "npx",
|
|
41
|
+
"args": ["wigolo"]
|
|
42
|
+
}
|
|
43
|
+
}
|
|
44
|
+
}
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
**Optional warmup (improves quality on first use):**
|
|
48
|
+
```bash
|
|
49
|
+
npx wigolo warmup # Downloads Playwright + SearXNG
|
|
50
|
+
npx wigolo warmup --all # + ML reranking + Trafilatura extraction
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Prerequisites
|
|
54
|
+
|
|
55
|
+
- **Node.js 20+** — [Download](https://nodejs.org/) or `brew install node` (macOS) / `winget install OpenJS.NodeJS` (Windows) / `sudo apt install nodejs` (Ubuntu/Debian)
|
|
56
|
+
- **Python 3.8+** *(recommended)* — [Download](https://python.org/) or `brew install python3` (macOS) / `winget install Python.Python.3` (Windows) / `sudo apt install python3` (Ubuntu/Debian)
|
|
57
|
+
- **Docker** *(optional)* — Alternative to Python for running SearXNG.
|
|
58
|
+
|
|
59
|
+
Everything else (Playwright, SearXNG) is downloaded automatically on first use or via `npx wigolo warmup`.
|
|
60
|
+
|
|
61
|
+
### What works without Python?
|
|
62
|
+
|
|
63
|
+
Everything except embedded SearXNG. Without Python, search falls back to direct scraping of Bing, DuckDuckGo, and Startpage — functional but less reliable. All other tools (fetch, crawl, cache, extract) work fully with just Node.js.
|
|
64
|
+
|
|
65
|
+
## Features
|
|
66
|
+
|
|
67
|
+
### search
|
|
68
|
+
|
|
69
|
+
Search the web and get full markdown content in one call — not snippets.
|
|
70
|
+
|
|
71
|
+
```
|
|
72
|
+
search("React Server Components best practices", { max_results: 5 })
|
|
73
|
+
→ titles, URLs, relevance scores, and full extracted markdown per result
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
- Domain filtering: `include_domains: ["react.dev"]`, `exclude_domains: ["medium.com"]`
|
|
77
|
+
- Date filtering: `from_date: "2024-01-01"`, `to_date: "2025-01-01"`
|
|
78
|
+
- Category search: `general`, `news`, `code`, `docs`, `papers`
|
|
79
|
+
- ML reranking with FlashRank when installed
|
|
80
|
+
- Falls back to direct engine scraping when SearXNG is unavailable
|
|
81
|
+
|
|
82
|
+
### fetch
|
|
83
|
+
|
|
84
|
+
Fetch any URL and get clean markdown. The page-fetching engine behind `search`.
|
|
85
|
+
|
|
86
|
+
```
|
|
87
|
+
fetch("https://react.dev/reference/react/useState")
|
|
88
|
+
→ clean markdown, links, images, metadata, cached for future use
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
- Smart routing: HTTP first, Playwright fallback for JS-rendered pages (auto-detected)
|
|
92
|
+
- Section targeting: `section: "Parameters"` extracts content under that heading
|
|
93
|
+
- Authenticated browsing: `use_auth: true` with stored session or Chrome profile
|
|
94
|
+
- PDF support: text extraction via pdf-parse
|
|
95
|
+
|
|
96
|
+
### crawl
|
|
97
|
+
|
|
98
|
+
Crawl a site from a seed URL — documentation sites, wikis, anything.
|
|
99
|
+
|
|
100
|
+
```
|
|
101
|
+
crawl("https://docs.example.com", { strategy: "sitemap", max_pages: 50 })
|
|
102
|
+
→ array of pages with titles, markdown, depth
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
- Strategies: `bfs`, `dfs`, `sitemap`, `map` (URL discovery only — no content, faster)
|
|
106
|
+
- URL filtering with include/exclude patterns (regex)
|
|
107
|
+
- robots.txt compliance
|
|
108
|
+
- Cross-page content deduplication (strips repeated nav/header/footer)
|
|
109
|
+
- Total character budget to prevent context overflow
|
|
110
|
+
|
|
111
|
+
### cache
|
|
112
|
+
|
|
113
|
+
Query previously fetched content without hitting the network.
|
|
114
|
+
|
|
115
|
+
```
|
|
116
|
+
cache({ query: "React hooks", url_pattern: "*react.dev*" })
|
|
117
|
+
→ matching cached pages with full markdown
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
- SQLite FTS5 full-text search over all cached content
|
|
121
|
+
- Combined filters: text query + URL pattern + date range
|
|
122
|
+
- Cache stats and selective clearing
|
|
123
|
+
|
|
124
|
+
### extract
|
|
125
|
+
|
|
126
|
+
Structured data extraction from any URL or HTML.
|
|
127
|
+
|
|
128
|
+
```
|
|
129
|
+
extract("https://example.com/product", { mode: "schema", schema: { price: "string", name: "string" } })
|
|
130
|
+
→ { price: "$29.99", name: "Widget Pro" }
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
Modes:
|
|
134
|
+
- `selector` — CSS selector → text content
|
|
135
|
+
- `tables` — HTML tables → structured row objects
|
|
136
|
+
- `metadata` — title, description, author, date, JSON-LD
|
|
137
|
+
- `schema` — JSON Schema → heuristic field matching from page content
|
|
138
|
+
|
|
139
|
+
## Why wigolo?
|
|
140
|
+
|
|
141
|
+
| | wigolo | Tavily | Firecrawl | Exa |
|
|
142
|
+
|---|---|---|---|---|
|
|
143
|
+
| Cost | Free | $30–500/mo | $16–500/mo | $7/1K queries |
|
|
144
|
+
| API key required | None | Yes | Yes | Yes |
|
|
145
|
+
| Authenticated browsing | Yes | No | No | No |
|
|
146
|
+
| Localhost access | Yes | No | No | No |
|
|
147
|
+
| Local cache + FTS | Yes | No | No | No |
|
|
148
|
+
| Search + extract unified | Yes | Yes | Partial | Partial |
|
|
149
|
+
| ML reranking | Local | Proprietary | No | Neural index |
|
|
150
|
+
| Rate limits | None | Tiered | Tiered | Tiered |
|
|
151
|
+
|
|
152
|
+
## Configuration
|
|
153
|
+
|
|
154
|
+
wigolo works with zero configuration. For advanced use:
|
|
155
|
+
|
|
156
|
+
```bash
|
|
157
|
+
# Use an existing SearXNG instance instead of the embedded one
|
|
158
|
+
SEARXNG_URL=http://localhost:8888
|
|
159
|
+
|
|
160
|
+
# Authenticated browsing — export session state via Playwright
|
|
161
|
+
WIGOLO_AUTH_STATE_PATH=~/.wigolo/auth.json
|
|
162
|
+
|
|
163
|
+
# Or use your Chrome profile directly (close Chrome first)
|
|
164
|
+
WIGOLO_CHROME_PROFILE_PATH=~/.config/google-chrome/Default
|
|
165
|
+
|
|
166
|
+
# ML reranking (install with: npx wigolo warmup --reranker)
|
|
167
|
+
WIGOLO_RERANKER=flashrank
|
|
168
|
+
|
|
169
|
+
# Tune extraction — auto/always/never
|
|
170
|
+
WIGOLO_TRAFILATURA=auto
|
|
171
|
+
|
|
172
|
+
# Logging
|
|
173
|
+
LOG_LEVEL=info # debug, info, warn, error
|
|
174
|
+
LOG_FORMAT=json # json, text
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
Environment variables (in addition to `LOG_LEVEL` and `LOG_FORMAT` shown above, and `PROXY_URL` under Troubleshooting):
|
|
178
|
+
|
|
179
|
+
| Variable | Default | Description |
|
|
180
|
+
|---|---|---|
|
|
181
|
+
| `SEARXNG_URL` | *(auto)* | External SearXNG URL |
|
|
182
|
+
| `SEARXNG_MODE` | `native` | `native` or `docker` |
|
|
183
|
+
| `SEARXNG_PORT` | `8888` | Port for embedded SearXNG |
|
|
184
|
+
| `WIGOLO_DATA_DIR` | `~/.wigolo` | Data + cache directory |
|
|
185
|
+
| `WIGOLO_AUTH_STATE_PATH` | — | Playwright storage state JSON |
|
|
186
|
+
| `WIGOLO_CHROME_PROFILE_PATH` | — | Chrome user data directory |
|
|
187
|
+
| `WIGOLO_RERANKER` | `none` | `flashrank` or `none` |
|
|
188
|
+
| `WIGOLO_TRAFILATURA` | `auto` | `auto`, `always`, or `never` |
|
|
189
|
+
| `MAX_BROWSERS` | `3` | Concurrent Playwright contexts |
|
|
190
|
+
| `FETCH_TIMEOUT_MS` | `10000` | HTTP fetch timeout |
|
|
191
|
+
| `CRAWL_CONCURRENCY` | `2` | Concurrent crawl requests |
|
|
192
|
+
| `RESPECT_ROBOTS_TXT` | `true` | Honor robots.txt |
|
|
193
|
+
|
|
194
|
+
## How it works
|
|
195
|
+
|
|
196
|
+
```
|
|
197
|
+
search query
|
|
198
|
+
→ SearXNG (70+ engines) or direct scraping (Bing/DDG/Startpage)
|
|
199
|
+
→ deduplicate by URL
|
|
200
|
+
→ domain/date/category filters
|
|
201
|
+
→ ML reranking (FlashRank, optional)
|
|
202
|
+
→ link validation
|
|
203
|
+
→ fetch + extract top N results in parallel
|
|
204
|
+
→ return markdown
|
|
205
|
+
|
|
206
|
+
Each step degrades gracefully:
|
|
207
|
+
SearXNG down? → direct scraping fallback
|
|
208
|
+
Page needs JS? → auto-detected, Playwright used transparently
|
|
209
|
+
Extractor fails? → ensemble: site-specific → Defuddle → Trafilatura → Readability → Turndown
|
|
210
|
+
Already fetched? → served from SQLite cache with FTS5
|
|
211
|
+
```
|
|
212
|
+
|
|
213
|
+
**Extraction ensemble** — every page runs through multiple extractors in order, falling back if content is below threshold:
|
|
214
|
+
1. Site-specific extractors (GitHub, Stack Overflow, MDN, docs frameworks)
|
|
215
|
+
2. Defuddle — markdown-aware, site-adaptive
|
|
216
|
+
3. Trafilatura — high-precision article extraction (Python, optional)
|
|
217
|
+
4. Readability.js — battle-tested Mozilla algorithm
|
|
218
|
+
5. Raw Turndown — last resort HTML-to-markdown
|
|
219
|
+
|
|
220
|
+
## Roadmap
|
|
221
|
+
|
|
222
|
+
### v2.1 — Next
|
|
223
|
+
- [ ] Daemon mode — persistent HTTP server, zero startup latency
|
|
224
|
+
- [ ] Browser interaction — click, type, scroll before extraction
|
|
225
|
+
- [ ] Content change detection — diff monitoring for cached pages
|
|
226
|
+
- [ ] CDP session discovery — attach to running Chrome for seamless auth
|
|
227
|
+
- [ ] Plugin system — community extractors and search engines
|
|
228
|
+
|
|
229
|
+
### v2.2
|
|
230
|
+
- [ ] Multi-browser pool — Chromium + Firefox for fingerprint diversity
|
|
231
|
+
- [ ] Interactive REPL (`wigolo shell`)
|
|
232
|
+
- [ ] Agent skill distribution — MCP registry listings, `SKILL.md`
|
|
233
|
+
|
|
234
|
+
### v3 — The Knowledge Engine
|
|
235
|
+
- [ ] Answer synthesis — search + LLM = direct answers with citations (bring your own key)
|
|
236
|
+
- [ ] Semantic search — local vector embeddings over cached content (`findSimilar`)
|
|
237
|
+
- [ ] Agent endpoint — describe what you need, no URLs required
|
|
238
|
+
- [ ] Streaming answers — real-time generation as results come in
|
|
239
|
+
- [ ] Knowledge graph — entity and relationship extraction from crawled content
|
|
240
|
+
- [ ] Auto re-crawl scheduler — keep documentation fresh automatically
|
|
241
|
+
- [ ] Lightpanda browser — optional ultra-lightweight headless browser (11x less RAM than Chrome)
|
|
242
|
+
- [ ] Cloud sync — share cache across machines via rclone (S3, Drive, Dropbox)
|
|
243
|
+
- [ ] Team knowledge base — shared indexed content across team members
|
|
244
|
+
|
|
245
|
+
## Troubleshooting
|
|
246
|
+
|
|
247
|
+
**SearXNG won't start**
|
|
248
|
+
Make sure `python3` is on your PATH and version 3.8+. Check with `python3 --version`. Alternatively, set `SEARXNG_MODE=docker` if Docker is available.
|
|
249
|
+
|
|
250
|
+
**Playwright browser not found**
|
|
251
|
+
Run `npx wigolo warmup` to download Chromium. This is done automatically on first use but can fail behind corporate proxies.
|
|
252
|
+
|
|
253
|
+
**Search returns no results**
|
|
254
|
+
If SearXNG and all fallback engines fail, check your network connection. Behind a proxy? Set `PROXY_URL=http://your-proxy:port`.
|
|
255
|
+
|
|
256
|
+
**Permission errors on `~/.wigolo/`**
|
|
257
|
+
wigolo stores its cache and SearXNG installation in `~/.wigolo/`. Ensure your user has write access. Override with `WIGOLO_DATA_DIR=/your/path`.
|
|
258
|
+
|
|
259
|
+
## Contributing
|
|
260
|
+
|
|
261
|
+
PRs welcome. Open an issue first to discuss what you'd like to change.
|
|
262
|
+
|
|
263
|
+
```bash
|
|
264
|
+
git clone https://github.com/KnockOutEZ/wigolo
|
|
265
|
+
cd wigolo
|
|
266
|
+
npm install
|
|
267
|
+
npm test
|
|
268
|
+
```
|
|
269
|
+
|
|
270
|
+
## License
|
|
271
|
+
|
|
272
|
+
[BSL 1.1](LICENSE) — free for individuals, small teams (under $1M revenue), education, and open source. Converts to MIT on 2029-04-12.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"db.d.ts","sourceRoot":"","sources":["../../src/cache/db.ts"],"names":[],"mappings":"AAAA,OAAO,QAAQ,MAAM,gBAAgB,CAAC;AAItC,wBAAgB,YAAY,CAAC,MAAM,EAAE,MAAM,GAAG,QAAQ,CAAC,QAAQ,CAqF9D;AAED,wBAAgB,WAAW,IAAI,QAAQ,CAAC,QAAQ,CAK/C;AAED,wBAAgB,aAAa,IAAI,IAAI,CAKpC"}
|
package/dist/cache/db.js
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
import Database from 'better-sqlite3';
|
|
2
|
+
let instance = null;
|
|
3
|
+
/**
 * Open (or re-open) the SQLite cache database at `dbPath` and install it as
 * the module-wide singleton handed out by `getDatabase()`.
 *
 * Any handle opened by an earlier call is closed first. The new connection is
 * switched to WAL journaling with `synchronous = NORMAL`, foreign keys are
 * enabled, and the schema (cache tables, the FTS5 index over `url_cache`, and
 * the triggers that keep that index in sync) is created when missing.
 *
 * @param dbPath Path handed unchanged to the better-sqlite3 `Database` constructor.
 * @returns The newly opened database handle.
 * @throws Rethrows whatever the constructor, a pragma, or the schema script
 *   raises. When the failure happens after the file was opened, the handle is
 *   closed before rethrowing so it is not leaked, and the singleton stays unset.
 */
export function initDatabase(dbPath) {
  if (instance) {
    instance.close();
    instance = null;
  }
  const db = new Database(dbPath);
  try {
    db.pragma('journal_mode = WAL');
    db.pragma('synchronous = NORMAL');
    db.pragma('foreign_keys = ON');
    // NOTE(review): the url_cache_au trigger below only removes the old row
    // from the FTS index; no trigger in this script re-adds the updated row.
    // Confirm the code that UPDATEs url_cache re-indexes the row itself,
    // otherwise updated pages silently drop out of full-text search.
    db.exec(`
    CREATE TABLE IF NOT EXISTS url_cache (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      url TEXT UNIQUE NOT NULL,
      normalized_url TEXT NOT NULL,
      title TEXT,
      markdown TEXT,
      raw_html TEXT,
      metadata TEXT,
      links TEXT,
      images TEXT,
      fetch_method TEXT,
      extractor_used TEXT,
      content_hash TEXT,
      fetched_at TEXT NOT NULL,
      expires_at TEXT,
      created_at TEXT NOT NULL DEFAULT (datetime('now')),
      updated_at TEXT NOT NULL DEFAULT (datetime('now'))
    );

    CREATE UNIQUE INDEX IF NOT EXISTS idx_url_cache_normalized ON url_cache(normalized_url);

    CREATE VIRTUAL TABLE IF NOT EXISTS url_cache_fts USING fts5(
      title,
      markdown,
      url,
      content='url_cache',
      content_rowid='id'
    );

    CREATE TRIGGER IF NOT EXISTS url_cache_ai AFTER INSERT ON url_cache BEGIN
      INSERT INTO url_cache_fts(rowid, title, markdown, url)
      VALUES (new.id, new.title, new.markdown, new.url);
    END;

    CREATE TRIGGER IF NOT EXISTS url_cache_ad BEFORE DELETE ON url_cache BEGIN
      INSERT INTO url_cache_fts(url_cache_fts, rowid, title, markdown, url)
      VALUES ('delete', old.id, old.title, old.markdown, old.url);
    END;

    CREATE TRIGGER IF NOT EXISTS url_cache_au BEFORE UPDATE ON url_cache BEGIN
      INSERT INTO url_cache_fts(url_cache_fts, rowid, title, markdown, url)
      VALUES ('delete', old.id, old.title, old.markdown, old.url);
    END;

    CREATE TABLE IF NOT EXISTS search_cache (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      query TEXT NOT NULL,
      query_hash TEXT UNIQUE NOT NULL,
      results TEXT NOT NULL,
      engines_used TEXT,
      searched_at TEXT NOT NULL DEFAULT (datetime('now')),
      expires_at TEXT
    );

    CREATE TABLE IF NOT EXISTS domain_routing (
      domain TEXT PRIMARY KEY,
      prefer_playwright INTEGER DEFAULT 0,
      http_failures INTEGER DEFAULT 0,
      last_updated TEXT
    );

    CREATE TABLE IF NOT EXISTS domain_boilerplate (
      domain TEXT NOT NULL,
      block_hash TEXT NOT NULL,
      sample_text TEXT,
      discovered_at TEXT NOT NULL DEFAULT (datetime('now')),
      PRIMARY KEY (domain, block_hash)
    );
  `);
  } catch (err) {
    // Setup failed after the file was opened: release the handle so the
    // database file is not left open, then surface the original error.
    try {
      db.close();
    } catch {
      // Ignore close failures; the setup error is the one callers need.
    }
    throw err;
  }
  // Publish the handle only once it is fully configured.
  instance = db;
  return db;
}
|
|
85
|
+
/**
 * Return the shared database handle.
 *
 * @returns The handle stored in the module-level singleton.
 * @throws Error when no handle is currently stored, i.e. `initDatabase()` has
 *   not been called yet or `closeDatabase()` has since cleared it.
 */
export function getDatabase() {
  if (instance) {
    return instance;
  }
  throw new Error('Database not initialized. Call initDatabase() first.');
}
|
|
91
|
+
/**
 * Close the shared database handle, if one is open, and clear the singleton.
 * Calling this when nothing is open is a no-op.
 */
export function closeDatabase() {
  if (!instance) {
    return;
  }
  instance.close();
  instance = null;
}
|
|
97
|
+
//# sourceMappingURL=db.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"db.js","sourceRoot":"","sources":["../../src/cache/db.ts"],"names":[],"mappings":"AAAA,OAAO,QAAQ,MAAM,gBAAgB,CAAC;AAEtC,IAAI,QAAQ,GAA6B,IAAI,CAAC;AAE9C,MAAM,UAAU,YAAY,CAAC,MAAc;IACzC,IAAI,QAAQ,EAAE,CAAC;QACb,QAAQ,CAAC,KAAK,EAAE,CAAC;QACjB,QAAQ,GAAG,IAAI,CAAC;IAClB,CAAC;IAED,MAAM,EAAE,GAAG,IAAI,QAAQ,CAAC,MAAM,CAAC,CAAC;IAEhC,EAAE,CAAC,MAAM,CAAC,oBAAoB,CAAC,CAAC;IAChC,EAAE,CAAC,MAAM,CAAC,sBAAsB,CAAC,CAAC;IAClC,EAAE,CAAC,MAAM,CAAC,mBAAmB,CAAC,CAAC;IAE/B,EAAE,CAAC,IAAI,CAAC;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAqEP,CAAC,CAAC;IAEH,QAAQ,GAAG,EAAE,CAAC;IACd,OAAO,EAAE,CAAC;AACZ,CAAC;AAED,MAAM,UAAU,WAAW;IACzB,IAAI,CAAC,QAAQ,EAAE,CAAC;QACd,MAAM,IAAI,KAAK,CAAC,sDAAsD,CAAC,CAAC;IAC1E,CAAC;IACD,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED,MAAM,UAAU,aAAa;IAC3B,IAAI,QAAQ,EAAE,CAAC;QACb,QAAQ,CAAC,KAAK,EAAE,CAAC;QACjB,QAAQ,GAAG,IAAI,CAAC;IAClB,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import type { RawFetchResult, ExtractionResult, CachedContent, SearchResultItem, CacheStats } from '../types.js';
|
|
2
|
+
export declare function normalizeUrl(url: string): string;
|
|
3
|
+
export declare function cacheContent(result: RawFetchResult, extraction: ExtractionResult): void;
|
|
4
|
+
export declare function getCachedContent(url: string): CachedContent | null;
|
|
5
|
+
export declare function isExpired(cached: CachedContent): boolean;
|
|
6
|
+
export declare function searchCache(query: string): CachedContent[];
|
|
7
|
+
/**
 * A stored search response, as returned by `getCachedSearchResults()`.
 *
 * The fields parallel the arguments of `cacheSearchResults(query, results,
 * enginesUsed)`. `searched_at` is a string timestamp; its exact format is not
 * visible in this declaration file (presumably the SQLite `datetime('now')`
 * text form used by the `search_cache` table — TODO confirm).
 */
export interface CachedSearchResult {
|
|
8
|
+
query: string;
|
|
9
|
+
results: SearchResultItem[];
|
|
10
|
+
engines_used: string[];
|
|
11
|
+
searched_at: string;
|
|
12
|
+
}
|
|
13
|
+
export declare function cacheSearchResults(query: string, results: SearchResultItem[], enginesUsed: string[]): void;
|
|
14
|
+
export declare function getCachedSearchResults(query: string): CachedSearchResult | null;
|
|
15
|
+
export declare function searchCacheFiltered(options: {
|
|
16
|
+
query?: string;
|
|
17
|
+
urlPattern?: string;
|
|
18
|
+
since?: string;
|
|
19
|
+
}): CachedContent[];
|
|
20
|
+
export declare function clearCacheEntries(options: {
|
|
21
|
+
query?: string;
|
|
22
|
+
urlPattern?: string;
|
|
23
|
+
since?: string;
|
|
24
|
+
}): number;
|
|
25
|
+
export declare function getCacheStats(): CacheStats;
|
|
26
|
+
//# sourceMappingURL=store.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"store.d.ts","sourceRoot":"","sources":["../../src/cache/store.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,cAAc,EAAE,gBAAgB,EAAE,aAAa,EAAE,gBAAgB,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AAgBjH,wBAAgB,YAAY,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM,CA0BhD;AAMD,wBAAgB,YAAY,CAAC,MAAM,EAAE,cAAc,EAAE,UAAU,EAAE,gBAAgB,GAAG,IAAI,CAsCvF;AAsCD,wBAAgB,gBAAgB,CAAC,GAAG,EAAE,MAAM,GAAG,aAAa,GAAG,IAAI,CASlE;AAED,wBAAgB,SAAS,CAAC,MAAM,EAAE,aAAa,GAAG,OAAO,CAGxD;AAED,wBAAgB,WAAW,CAAC,KAAK,EAAE,MAAM,GAAG,aAAa,EAAE,CAY1D;AAED,MAAM,WAAW,kBAAkB;IACjC,KAAK,EAAE,MAAM,CAAC;IACd,OAAO,EAAE,gBAAgB,EAAE,CAAC;IAC5B,YAAY,EAAE,MAAM,EAAE,CAAC;IACvB,WAAW,EAAE,MAAM,CAAC;CACrB;AAED,wBAAgB,kBAAkB,CAChC,KAAK,EAAE,MAAM,EACb,OAAO,EAAE,gBAAgB,EAAE,EAC3B,WAAW,EAAE,MAAM,EAAE,GACpB,IAAI,CAqBN;AAED,wBAAgB,sBAAsB,CAAC,KAAK,EAAE,MAAM,GAAG,kBAAkB,GAAG,IAAI,CAgB/E;AAED,wBAAgB,mBAAmB,CAAC,OAAO,EAAE;IAC3C,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB,GAAG,aAAa,EAAE,CA4BlB;AAED,wBAAgB,iBAAiB,CAAC,OAAO,EAAE;IACzC,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB,GAAG,MAAM,CA0BT;AAED,wBAAgB,aAAa,IAAI,UAAU,CAiB1C"}
|