@staticn0va/wigolo 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (215) hide show
  1. package/LICENSE +74 -0
  2. package/README.md +272 -0
  3. package/dist/cache/db.d.ts +5 -0
  4. package/dist/cache/db.d.ts.map +1 -0
  5. package/dist/cache/db.js +97 -0
  6. package/dist/cache/db.js.map +1 -0
  7. package/dist/cache/store.d.ts +26 -0
  8. package/dist/cache/store.d.ts.map +1 -0
  9. package/dist/cache/store.js +214 -0
  10. package/dist/cache/store.js.map +1 -0
  11. package/dist/cli/daemon.d.ts +2 -0
  12. package/dist/cli/daemon.d.ts.map +1 -0
  13. package/dist/cli/daemon.js +5 -0
  14. package/dist/cli/daemon.js.map +1 -0
  15. package/dist/cli/health.d.ts +2 -0
  16. package/dist/cli/health.d.ts.map +1 -0
  17. package/dist/cli/health.js +5 -0
  18. package/dist/cli/health.js.map +1 -0
  19. package/dist/cli/index.d.ts +7 -0
  20. package/dist/cli/index.d.ts.map +1 -0
  21. package/dist/cli/index.js +9 -0
  22. package/dist/cli/index.js.map +1 -0
  23. package/dist/cli/warmup.d.ts +11 -0
  24. package/dist/cli/warmup.d.ts.map +1 -0
  25. package/dist/cli/warmup.js +107 -0
  26. package/dist/cli/warmup.js.map +1 -0
  27. package/dist/config.d.ts +41 -0
  28. package/dist/config.d.ts.map +1 -0
  29. package/dist/config.js +66 -0
  30. package/dist/config.js.map +1 -0
  31. package/dist/crawl/crawler.d.ts +18 -0
  32. package/dist/crawl/crawler.d.ts.map +1 -0
  33. package/dist/crawl/crawler.js +228 -0
  34. package/dist/crawl/crawler.js.map +1 -0
  35. package/dist/crawl/dedup.d.ts +15 -0
  36. package/dist/crawl/dedup.d.ts.map +1 -0
  37. package/dist/crawl/dedup.js +93 -0
  38. package/dist/crawl/dedup.js.map +1 -0
  39. package/dist/crawl/mapper.d.ts +17 -0
  40. package/dist/crawl/mapper.d.ts.map +1 -0
  41. package/dist/crawl/mapper.js +178 -0
  42. package/dist/crawl/mapper.js.map +1 -0
  43. package/dist/crawl/rate-limiter.d.ts +10 -0
  44. package/dist/crawl/rate-limiter.d.ts.map +1 -0
  45. package/dist/crawl/rate-limiter.js +72 -0
  46. package/dist/crawl/rate-limiter.js.map +1 -0
  47. package/dist/crawl/robots.d.ts +9 -0
  48. package/dist/crawl/robots.d.ts.map +1 -0
  49. package/dist/crawl/robots.js +63 -0
  50. package/dist/crawl/robots.js.map +1 -0
  51. package/dist/crawl/sitemap.d.ts +4 -0
  52. package/dist/crawl/sitemap.d.ts.map +1 -0
  53. package/dist/crawl/sitemap.js +38 -0
  54. package/dist/crawl/sitemap.js.map +1 -0
  55. package/dist/crawl/url-utils.d.ts +3 -0
  56. package/dist/crawl/url-utils.d.ts.map +1 -0
  57. package/dist/crawl/url-utils.js +41 -0
  58. package/dist/crawl/url-utils.js.map +1 -0
  59. package/dist/extraction/defuddle.d.ts +3 -0
  60. package/dist/extraction/defuddle.d.ts.map +1 -0
  61. package/dist/extraction/defuddle.js +26 -0
  62. package/dist/extraction/defuddle.js.map +1 -0
  63. package/dist/extraction/extract.d.ts +5 -0
  64. package/dist/extraction/extract.d.ts.map +1 -0
  65. package/dist/extraction/extract.js +83 -0
  66. package/dist/extraction/extract.js.map +1 -0
  67. package/dist/extraction/jsonld.d.ts +4 -0
  68. package/dist/extraction/jsonld.d.ts.map +1 -0
  69. package/dist/extraction/jsonld.js +64 -0
  70. package/dist/extraction/jsonld.js.map +1 -0
  71. package/dist/extraction/markdown.d.ts +10 -0
  72. package/dist/extraction/markdown.d.ts.map +1 -0
  73. package/dist/extraction/markdown.js +107 -0
  74. package/dist/extraction/markdown.js.map +1 -0
  75. package/dist/extraction/pipeline.d.ts +11 -0
  76. package/dist/extraction/pipeline.d.ts.map +1 -0
  77. package/dist/extraction/pipeline.js +95 -0
  78. package/dist/extraction/pipeline.js.map +1 -0
  79. package/dist/extraction/readability.d.ts +3 -0
  80. package/dist/extraction/readability.d.ts.map +1 -0
  81. package/dist/extraction/readability.js +32 -0
  82. package/dist/extraction/readability.js.map +1 -0
  83. package/dist/extraction/schema.d.ts +7 -0
  84. package/dist/extraction/schema.d.ts.map +1 -0
  85. package/dist/extraction/schema.js +86 -0
  86. package/dist/extraction/schema.js.map +1 -0
  87. package/dist/extraction/site-extractors/docs-generic.d.ts +3 -0
  88. package/dist/extraction/site-extractors/docs-generic.d.ts.map +1 -0
  89. package/dist/extraction/site-extractors/docs-generic.js +104 -0
  90. package/dist/extraction/site-extractors/docs-generic.js.map +1 -0
  91. package/dist/extraction/site-extractors/github.d.ts +3 -0
  92. package/dist/extraction/site-extractors/github.d.ts.map +1 -0
  93. package/dist/extraction/site-extractors/github.js +107 -0
  94. package/dist/extraction/site-extractors/github.js.map +1 -0
  95. package/dist/extraction/site-extractors/mdn.d.ts +3 -0
  96. package/dist/extraction/site-extractors/mdn.d.ts.map +1 -0
  97. package/dist/extraction/site-extractors/mdn.js +58 -0
  98. package/dist/extraction/site-extractors/mdn.js.map +1 -0
  99. package/dist/extraction/site-extractors/stackoverflow.d.ts +3 -0
  100. package/dist/extraction/site-extractors/stackoverflow.d.ts.map +1 -0
  101. package/dist/extraction/site-extractors/stackoverflow.js +88 -0
  102. package/dist/extraction/site-extractors/stackoverflow.js.map +1 -0
  103. package/dist/extraction/trafilatura.d.ts +6 -0
  104. package/dist/extraction/trafilatura.d.ts.map +1 -0
  105. package/dist/extraction/trafilatura.js +105 -0
  106. package/dist/extraction/trafilatura.js.map +1 -0
  107. package/dist/fetch/auth.d.ts +8 -0
  108. package/dist/fetch/auth.d.ts.map +1 -0
  109. package/dist/fetch/auth.js +32 -0
  110. package/dist/fetch/auth.js.map +1 -0
  111. package/dist/fetch/browser-pool.d.ts +28 -0
  112. package/dist/fetch/browser-pool.d.ts.map +1 -0
  113. package/dist/fetch/browser-pool.js +138 -0
  114. package/dist/fetch/browser-pool.js.map +1 -0
  115. package/dist/fetch/content-check.d.ts +2 -0
  116. package/dist/fetch/content-check.d.ts.map +1 -0
  117. package/dist/fetch/content-check.js +62 -0
  118. package/dist/fetch/content-check.js.map +1 -0
  119. package/dist/fetch/http-client.d.ts +15 -0
  120. package/dist/fetch/http-client.d.ts.map +1 -0
  121. package/dist/fetch/http-client.js +146 -0
  122. package/dist/fetch/http-client.js.map +1 -0
  123. package/dist/fetch/router.d.ts +45 -0
  124. package/dist/fetch/router.d.ts.map +1 -0
  125. package/dist/fetch/router.js +89 -0
  126. package/dist/fetch/router.js.map +1 -0
  127. package/dist/index.d.ts +3 -0
  128. package/dist/index.d.ts.map +1 -0
  129. package/dist/index.js +22 -0
  130. package/dist/index.js.map +1 -0
  131. package/dist/logger.d.ts +10 -0
  132. package/dist/logger.d.ts.map +1 -0
  133. package/dist/logger.js +39 -0
  134. package/dist/logger.js.map +1 -0
  135. package/dist/search/dedup.d.ts +10 -0
  136. package/dist/search/dedup.d.ts.map +1 -0
  137. package/dist/search/dedup.js +35 -0
  138. package/dist/search/dedup.js.map +1 -0
  139. package/dist/search/engines/bing.d.ts +7 -0
  140. package/dist/search/engines/bing.d.ts.map +1 -0
  141. package/dist/search/engines/bing.js +48 -0
  142. package/dist/search/engines/bing.js.map +1 -0
  143. package/dist/search/engines/duckduckgo.d.ts +7 -0
  144. package/dist/search/engines/duckduckgo.d.ts.map +1 -0
  145. package/dist/search/engines/duckduckgo.js +50 -0
  146. package/dist/search/engines/duckduckgo.js.map +1 -0
  147. package/dist/search/engines/startpage.d.ts +7 -0
  148. package/dist/search/engines/startpage.d.ts.map +1 -0
  149. package/dist/search/engines/startpage.js +50 -0
  150. package/dist/search/engines/startpage.js.map +1 -0
  151. package/dist/search/filters.d.ts +16 -0
  152. package/dist/search/filters.d.ts.map +1 -0
  153. package/dist/search/filters.js +63 -0
  154. package/dist/search/filters.js.map +1 -0
  155. package/dist/search/flashrank.d.ts +12 -0
  156. package/dist/search/flashrank.d.ts.map +1 -0
  157. package/dist/search/flashrank.js +63 -0
  158. package/dist/search/flashrank.js.map +1 -0
  159. package/dist/search/query.d.ts +2 -0
  160. package/dist/search/query.d.ts.map +1 -0
  161. package/dist/search/query.js +41 -0
  162. package/dist/search/query.js.map +1 -0
  163. package/dist/search/rerank.d.ts +3 -0
  164. package/dist/search/rerank.d.ts.map +1 -0
  165. package/dist/search/rerank.js +40 -0
  166. package/dist/search/rerank.js.map +1 -0
  167. package/dist/search/searxng.d.ts +8 -0
  168. package/dist/search/searxng.d.ts.map +1 -0
  169. package/dist/search/searxng.js +87 -0
  170. package/dist/search/searxng.js.map +1 -0
  171. package/dist/search/validator.d.ts +6 -0
  172. package/dist/search/validator.d.ts.map +1 -0
  173. package/dist/search/validator.js +35 -0
  174. package/dist/search/validator.js.map +1 -0
  175. package/dist/searxng/bootstrap.d.ts +18 -0
  176. package/dist/searxng/bootstrap.d.ts.map +1 -0
  177. package/dist/searxng/bootstrap.js +136 -0
  178. package/dist/searxng/bootstrap.js.map +1 -0
  179. package/dist/searxng/docker.d.ts +9 -0
  180. package/dist/searxng/docker.d.ts.map +1 -0
  181. package/dist/searxng/docker.js +67 -0
  182. package/dist/searxng/docker.js.map +1 -0
  183. package/dist/searxng/process.d.ts +23 -0
  184. package/dist/searxng/process.d.ts.map +1 -0
  185. package/dist/searxng/process.js +188 -0
  186. package/dist/searxng/process.js.map +1 -0
  187. package/dist/server.d.ts +2 -0
  188. package/dist/server.d.ts.map +1 -0
  189. package/dist/server.js +311 -0
  190. package/dist/server.js.map +1 -0
  191. package/dist/tools/cache.d.ts +3 -0
  192. package/dist/tools/cache.d.ts.map +1 -0
  193. package/dist/tools/cache.js +50 -0
  194. package/dist/tools/cache.js.map +1 -0
  195. package/dist/tools/crawl.d.ts +6 -0
  196. package/dist/tools/crawl.d.ts.map +1 -0
  197. package/dist/tools/crawl.js +97 -0
  198. package/dist/tools/crawl.js.map +1 -0
  199. package/dist/tools/extract.d.ts +4 -0
  200. package/dist/tools/extract.d.ts.map +1 -0
  201. package/dist/tools/extract.js +69 -0
  202. package/dist/tools/extract.js.map +1 -0
  203. package/dist/tools/fetch.d.ts +4 -0
  204. package/dist/tools/fetch.d.ts.map +1 -0
  205. package/dist/tools/fetch.js +76 -0
  206. package/dist/tools/fetch.js.map +1 -0
  207. package/dist/tools/search.d.ts +4 -0
  208. package/dist/tools/search.d.ts.map +1 -0
  209. package/dist/tools/search.js +160 -0
  210. package/dist/tools/search.js.map +1 -0
  211. package/dist/types.d.ts +222 -0
  212. package/dist/types.d.ts.map +1 -0
  213. package/dist/types.js +2 -0
  214. package/dist/types.js.map +1 -0
  215. package/package.json +61 -0
package/LICENSE ADDED
@@ -0,0 +1,74 @@
1
+ Business Source License 1.1
2
+
3
+ License text copyright (c) 2017 MariaDB Corporation Ab, All Rights Reserved.
4
+ "Business Source License" is a trademark of MariaDB Corporation Ab.
5
+
6
+ Parameters
7
+
8
+ Licensor: Towhid Khan
9
+ Licensed Work: wigolo
10
+ The Licensed Work is (c) 2026 Towhid Khan
11
+ Additional Use Grant: You may make production use of the Licensed Work,
12
+ provided that such use does not include offering
13
+ the Licensed Work to third parties on a hosted or
14
+ embedded basis which competes with the Licensor's
15
+ paid offerings, and provided that your organization's
16
+ total annual revenue does not exceed US $1,000,000,
17
+ or you are using the Licensed Work for personal,
18
+ educational, or non-commercial open source purposes.
19
+
20
+ Change Date: 2029-04-12
21
+
22
+ Change License: MIT License
23
+
24
+ For information about alternative licensing arrangements, contact:
25
+ ktowhid20@gmail.com
26
+
27
+ Notice
28
+
29
+ The Business Source License (this document, or the "License") is not an Open
30
+ Source license. However, the Licensed Work will eventually be made available
31
+ under an Open Source License, as stated in this License.
32
+
33
+ License terms copyright (c) 2017 MariaDB Corporation Ab, All Rights Reserved.
34
+
35
+ Terms
36
+
37
+ The Licensor hereby grants you the right to copy, modify, create derivative
38
+ works, redistribute, and make non-production use of the Licensed Work. The
39
+ Licensor may make an Additional Use Grant, above, permitting limited production
40
+ use.
41
+
42
+ Effective on the Change Date, or the fourth anniversary of the first publicly
43
+ available distribution of a specific version of the Licensed Work under this
44
+ License, whichever comes first, the Licensor hereby grants you rights under the
45
+ terms of the Change License, and the rights granted in the paragraph above
46
+ terminate.
47
+
48
+ If your use of the Licensed Work does not comply with the requirements
49
+ currently in effect as described in this License, you must purchase a
50
+ commercial license from the Licensor, its affiliated entities, or authorized
51
+ resellers, or you must refrain from using the Licensed Work.
52
+
53
+ All copies of the original and modified Licensed Work, and derivative works
54
+ of the Licensed Work, are subject to this License. This License applies
55
+ separately for each version of the Licensed Work and the Change Date may vary
56
+ for each version of the Licensed Work released by Licensor.
57
+
58
+ You must conspicuously display this License on each original or modified copy
59
+ of the Licensed Work. If you receive the Licensed Work in original or modified
60
+ form from a third party, the terms and conditions set forth in this License
61
+ apply to your use of that work.
62
+
63
+ Any use of the Licensed Work in violation of this License will automatically
64
+ terminate your rights under this License for the current and all other versions
65
+ of the Licensed Work.
66
+
67
+ This License does not grant you any right in any trademark or logo of Licensor
68
+ or its affiliates (provided that you may use a trademark or logo of Licensor as
69
+ expressly required by this License).
70
+
71
+ TO THE EXTENT PERMITTED BY APPLICABLE LAW, THE LICENSED WORK IS PROVIDED ON AN
72
+ "AS IS" BASIS. LICENSOR HEREBY DISCLAIMS ALL WARRANTIES AND CONDITIONS, EXPRESS
73
+ OR IMPLIED, INCLUDING (WITHOUT LIMITATION) WARRANTIES OF MERCHANTABILITY,
74
+ FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, AND TITLE.
package/README.md ADDED
@@ -0,0 +1,272 @@
1
+ <div align="center">
2
+
3
+ # wigolo
4
+
5
+ **Local-first web search MCP server for AI coding agents.**
6
+
7
+ Search, fetch, crawl, cache, and extract — zero API keys, zero cloud, zero cost.
8
+
9
+ [![License: BSL 1.1](https://img.shields.io/badge/License-BSL_1.1-blue.svg)](LICENSE)
10
+ [![Node.js](https://img.shields.io/badge/node-%3E%3D20-brightgreen)](https://nodejs.org)
11
+ [![TypeScript](https://img.shields.io/badge/TypeScript-5.x-blue)](https://www.typescriptlang.org/)
12
+
13
+ [Quick Start](#quick-start) · [Features](#features) · [Why wigolo?](#why-wigolo) · [Roadmap](#roadmap)
14
+
15
+ </div>
16
+
17
+ ```
18
+ $ claude mcp add wigolo -- npx wigolo
19
+ Added MCP server wigolo
20
+
21
+ $ # That's it. Your agent now has web search.
22
+ ```
23
+
24
+ ## What is this?
25
+
26
+ wigolo gives AI coding agents (Claude Code, Cursor, Gemini CLI, Codex, Windsurf) web search, page fetching, site crawling, content extraction, and a local knowledge cache. It runs entirely on your machine. No API keys, no cloud, no cost — works out of the box with `npx`.
27
+
28
+ ## Quick Start
29
+
30
+ **Claude Code:**
31
+ ```bash
32
+ claude mcp add wigolo -- npx wigolo
33
+ ```
34
+
35
+ **Cursor / VS Code / any MCP client:**
36
+ ```json
37
+ {
38
+ "mcpServers": {
39
+ "wigolo": {
40
+ "command": "npx",
41
+ "args": ["wigolo"]
42
+ }
43
+ }
44
+ }
45
+ ```
46
+
47
+ **Optional warmup (improves quality on first use):**
48
+ ```bash
49
+ npx wigolo warmup # Downloads Playwright + SearXNG
50
+ npx wigolo warmup --all # + ML reranking + Trafilatura extraction
51
+ ```
52
+
53
+ ## Prerequisites
54
+
55
+ - **Node.js 20+** — [Download](https://nodejs.org/) or `brew install node` (macOS) / `winget install OpenJS.NodeJS` (Windows) / `sudo apt install nodejs` (Ubuntu/Debian)
56
+ - **Python 3.8+** *(recommended)* — [Download](https://python.org/) or `brew install python3` (macOS) / `winget install Python.Python.3` (Windows) / `sudo apt install python3` (Ubuntu/Debian)
57
+ - **Docker** *(optional)* — Alternative to Python for running SearXNG.
58
+
59
+ Everything else (Playwright, SearXNG) is downloaded automatically on first use or via `npx wigolo warmup`.
60
+
61
+ ### What works without Python?
62
+
63
+ Everything except embedded SearXNG. Without Python (and without Docker as an alternative way to run SearXNG), search falls back to direct scraping of Bing, DuckDuckGo, and Startpage — functional but less reliable. All other tools (fetch, crawl, cache, extract) work fully with just Node.js.
64
+
65
+ ## Features
66
+
67
+ ### search
68
+
69
+ Search the web and get full markdown content in one call — not snippets.
70
+
71
+ ```
72
+ search("React Server Components best practices", { max_results: 5 })
73
+ → titles, URLs, relevance scores, and full extracted markdown per result
74
+ ```
75
+
76
+ - Domain filtering: `include_domains: ["react.dev"]`, `exclude_domains: ["medium.com"]`
77
+ - Date filtering: `from_date: "2024-01-01"`, `to_date: "2025-01-01"`
78
+ - Category search: `general`, `news`, `code`, `docs`, `papers`
79
+ - ML reranking with FlashRank when installed
80
+ - Falls back to direct engine scraping when SearXNG is unavailable
81
+
82
+ ### fetch
83
+
84
+ Fetch any URL and get clean markdown. The page-fetching engine behind `search`.
85
+
86
+ ```
87
+ fetch("https://docs.react.dev/reference/react/useState")
88
+ → clean markdown, links, images, metadata, cached for future use
89
+ ```
90
+
91
+ - Smart routing: HTTP first, Playwright fallback for JS-rendered pages (auto-detected)
92
+ - Section targeting: `section: "Parameters"` extracts content under that heading
93
+ - Authenticated browsing: `use_auth: true` with stored session or Chrome profile
94
+ - PDF support: text extraction via pdf-parse
95
+
96
+ ### crawl
97
+
98
+ Crawl a site from a seed URL — documentation sites, wikis, anything.
99
+
100
+ ```
101
+ crawl("https://docs.example.com", { strategy: "sitemap", max_pages: 50 })
102
+ → array of pages with titles, markdown, depth
103
+ ```
104
+
105
+ - Strategies: `bfs`, `dfs`, `sitemap`, `map` (URL discovery only — no content, faster)
106
+ - URL filtering with include/exclude patterns (regex)
107
+ - robots.txt compliance
108
+ - Cross-page content deduplication (strips repeated nav/header/footer)
109
+ - Total character budget to prevent context overflow
110
+
111
+ ### cache
112
+
113
+ Query previously fetched content without hitting the network.
114
+
115
+ ```
116
+ cache({ query: "React hooks", url_pattern: "*react.dev*" })
117
+ → matching cached pages with full markdown
118
+ ```
119
+
120
+ - SQLite FTS5 full-text search over all cached content
121
+ - Combined filters: text query + URL pattern + date range
122
+ - Cache stats and selective clearing
123
+
124
+ ### extract
125
+
126
+ Structured data extraction from any URL or HTML.
127
+
128
+ ```
129
+ extract("https://example.com/product", { mode: "schema", schema: { price: "string", name: "string" } })
130
+ → { price: "$29.99", name: "Widget Pro" }
131
+ ```
132
+
133
+ Modes:
134
+ - `selector` — CSS selector → text content
135
+ - `tables` — HTML tables → structured row objects
136
+ - `metadata` — title, description, author, date, JSON-LD
137
+ - `schema` — JSON Schema → heuristic field matching from page content
138
+
139
+ ## Why wigolo?
140
+
141
+ | | wigolo | Tavily | Firecrawl | Exa |
142
+ |---|---|---|---|---|
143
+ | Cost | Free | $30–500/mo | $16–500/mo | $7/1K queries |
144
+ | API key required | None | Yes | Yes | Yes |
145
+ | Authenticated browsing | Yes | No | No | No |
146
+ | Localhost access | Yes | No | No | No |
147
+ | Local cache + FTS | Yes | No | No | No |
148
+ | Search + extract unified | Yes | Yes | Partial | Partial |
149
+ | ML reranking | Local | Proprietary | No | Neural index |
150
+ | Rate limits | None | Tiered | Tiered | Tiered |
151
+
152
+ ## Configuration
153
+
154
+ wigolo works with zero configuration. For advanced use:
155
+
156
+ ```bash
157
+ # Use an existing SearXNG instance instead of the embedded one
158
+ SEARXNG_URL=http://localhost:8888
159
+
160
+ # Authenticated browsing — export session state via Playwright
161
+ WIGOLO_AUTH_STATE_PATH=~/.wigolo/auth.json
162
+
163
+ # Or use your Chrome profile directly (close Chrome first)
164
+ WIGOLO_CHROME_PROFILE_PATH=~/.config/google-chrome/Default
165
+
166
+ # ML reranking (install with: npx wigolo warmup --reranker)
167
+ WIGOLO_RERANKER=flashrank
168
+
169
+ # Tune extraction — auto/always/never
170
+ WIGOLO_TRAFILATURA=auto
171
+
172
+ # Logging
173
+ LOG_LEVEL=info # debug, info, warn, error
174
+ LOG_FORMAT=json # json, text
175
+ ```
176
+
177
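+ Environment variables (in addition to `LOG_LEVEL` and `LOG_FORMAT` shown above, and `PROXY_URL` described under Troubleshooting):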
178
+
179
+ | Variable | Default | Description |
180
+ |---|---|---|
181
+ | `SEARXNG_URL` | *(auto)* | External SearXNG URL |
182
+ | `SEARXNG_MODE` | `native` | `native` or `docker` |
183
+ | `SEARXNG_PORT` | `8888` | Port for embedded SearXNG |
184
+ | `WIGOLO_DATA_DIR` | `~/.wigolo` | Data + cache directory |
185
+ | `WIGOLO_AUTH_STATE_PATH` | — | Playwright storage state JSON |
186
+ | `WIGOLO_CHROME_PROFILE_PATH` | — | Chrome user data directory |
187
+ | `WIGOLO_RERANKER` | `none` | `flashrank` or `none` |
188
+ | `WIGOLO_TRAFILATURA` | `auto` | `auto`, `always`, or `never` |
189
+ | `MAX_BROWSERS` | `3` | Concurrent Playwright contexts |
190
+ | `FETCH_TIMEOUT_MS` | `10000` | HTTP fetch timeout |
191
+ | `CRAWL_CONCURRENCY` | `2` | Concurrent crawl requests |
192
+ | `RESPECT_ROBOTS_TXT` | `true` | Honor robots.txt |
193
+
194
+ ## How it works
195
+
196
+ ```
197
+ search query
198
+ → SearXNG (70+ engines) or direct scraping (Bing/DDG/Startpage)
199
+ → deduplicate by URL
200
+ → domain/date/category filters
201
+ → ML reranking (FlashRank, optional)
202
+ → link validation
203
+ → fetch + extract top N results in parallel
204
+ → return markdown
205
+
206
+ Each step degrades gracefully:
207
+ SearXNG down? → direct scraping fallback
208
+ Page needs JS? → auto-detected, Playwright used transparently
209
+ Extractor fails? → ensemble: site-specific → Defuddle → Trafilatura → Readability → Turndown
210
+ Already fetched? → served from SQLite cache with FTS5
211
+ ```
212
+
213
+ **Extraction ensemble** — each page is tried against the extractors below in order, falling back to the next one whenever the extracted content is below threshold:
214
+ 1. Site-specific extractors (GitHub, Stack Overflow, MDN, docs frameworks)
215
+ 2. Defuddle — markdown-aware, site-adaptive
216
+ 3. Trafilatura — high-precision article extraction (Python, optional)
217
+ 4. Readability.js — battle-tested Mozilla algorithm
218
+ 5. Raw Turndown — last resort HTML-to-markdown
219
+
220
+ ## Roadmap
221
+
222
+ ### v2.1 — Next
223
+ - [ ] Daemon mode — persistent HTTP server, zero startup latency
224
+ - [ ] Browser interaction — click, type, scroll before extraction
225
+ - [ ] Content change detection — diff monitoring for cached pages
226
+ - [ ] CDP session discovery — attach to running Chrome for seamless auth
227
+ - [ ] Plugin system — community extractors and search engines
228
+
229
+ ### v2.2
230
+ - [ ] Multi-browser pool — Chromium + Firefox for fingerprint diversity
231
+ - [ ] Interactive REPL (`wigolo shell`)
232
+ - [ ] Agent skill distribution — MCP registry listings, `SKILL.md`
233
+
234
+ ### v3 — The Knowledge Engine
235
+ - [ ] Answer synthesis — search + LLM = direct answers with citations (bring your own key)
236
+ - [ ] Semantic search — local vector embeddings over cached content (`findSimilar`)
237
+ - [ ] Agent endpoint — describe what you need, no URLs required
238
+ - [ ] Streaming answers — real-time generation as results come in
239
+ - [ ] Knowledge graph — entity and relationship extraction from crawled content
240
+ - [ ] Auto re-crawl scheduler — keep documentation fresh automatically
241
+ - [ ] Lightpanda browser — optional ultra-lightweight headless browser (11x less RAM than Chrome)
242
+ - [ ] Cloud sync — share cache across machines via rclone (S3, Drive, Dropbox)
243
+ - [ ] Team knowledge base — shared indexed content across team members
244
+
245
+ ## Troubleshooting
246
+
247
+ **SearXNG won't start**
248
+ Make sure `python3` is on your PATH and version 3.8+. Check with `python3 --version`. Alternatively, set `SEARXNG_MODE=docker` if Docker is available.
249
+
250
+ **Playwright browser not found**
251
+ Run `npx wigolo warmup` to download Chromium. This is done automatically on first use but can fail behind corporate proxies.
252
+
253
+ **Search returns no results**
254
+ If SearXNG and all fallback engines fail, check your network connection. Behind a proxy? Set `PROXY_URL=http://your-proxy:port`.
255
+
256
+ **Permission errors on `~/.wigolo/`**
257
+ wigolo stores its cache and SearXNG installation in `~/.wigolo/`. Ensure your user has write access. Override with `WIGOLO_DATA_DIR=/your/path`.
258
+
259
+ ## Contributing
260
+
261
+ PRs welcome. Open an issue first to discuss what you'd like to change.
262
+
263
+ ```bash
264
+ git clone https://github.com/KnockOutEZ/wigolo
265
+ cd wigolo
266
+ npm install
267
+ npm test
268
+ ```
269
+
270
+ ## License
271
+
272
+ [BSL 1.1](LICENSE) — free for individuals, small teams (total annual revenue up to US $1M), education, and non-commercial open source. Converts to MIT on 2029-04-12.
@@ -0,0 +1,5 @@
1
+ import Database from 'better-sqlite3';
2
+ export declare function initDatabase(dbPath: string): Database.Database;
3
+ export declare function getDatabase(): Database.Database;
4
+ export declare function closeDatabase(): void;
5
+ //# sourceMappingURL=db.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"db.d.ts","sourceRoot":"","sources":["../../src/cache/db.ts"],"names":[],"mappings":"AAAA,OAAO,QAAQ,MAAM,gBAAgB,CAAC;AAItC,wBAAgB,YAAY,CAAC,MAAM,EAAE,MAAM,GAAG,QAAQ,CAAC,QAAQ,CAqF9D;AAED,wBAAgB,WAAW,IAAI,QAAQ,CAAC,QAAQ,CAK/C;AAED,wBAAgB,aAAa,IAAI,IAAI,CAKpC"}
@@ -0,0 +1,97 @@
1
+ import Database from 'better-sqlite3';
2
+ let instance = null;
3
+ export function initDatabase(dbPath) {
4
+ if (instance) {
5
+ instance.close();
6
+ instance = null;
7
+ }
8
+ const db = new Database(dbPath);
9
+ db.pragma('journal_mode = WAL');
10
+ db.pragma('synchronous = NORMAL');
11
+ db.pragma('foreign_keys = ON');
12
+ db.exec(`
13
+ CREATE TABLE IF NOT EXISTS url_cache (
14
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
15
+ url TEXT UNIQUE NOT NULL,
16
+ normalized_url TEXT NOT NULL,
17
+ title TEXT,
18
+ markdown TEXT,
19
+ raw_html TEXT,
20
+ metadata TEXT,
21
+ links TEXT,
22
+ images TEXT,
23
+ fetch_method TEXT,
24
+ extractor_used TEXT,
25
+ content_hash TEXT,
26
+ fetched_at TEXT NOT NULL,
27
+ expires_at TEXT,
28
+ created_at TEXT NOT NULL DEFAULT (datetime('now')),
29
+ updated_at TEXT NOT NULL DEFAULT (datetime('now'))
30
+ );
31
+
32
+ CREATE UNIQUE INDEX IF NOT EXISTS idx_url_cache_normalized ON url_cache(normalized_url);
33
+
34
+ CREATE VIRTUAL TABLE IF NOT EXISTS url_cache_fts USING fts5(
35
+ title,
36
+ markdown,
37
+ url,
38
+ content='url_cache',
39
+ content_rowid='id'
40
+ );
41
+
42
+ CREATE TRIGGER IF NOT EXISTS url_cache_ai AFTER INSERT ON url_cache BEGIN
43
+ INSERT INTO url_cache_fts(rowid, title, markdown, url)
44
+ VALUES (new.id, new.title, new.markdown, new.url);
45
+ END;
46
+
47
+ CREATE TRIGGER IF NOT EXISTS url_cache_ad BEFORE DELETE ON url_cache BEGIN
48
+ INSERT INTO url_cache_fts(url_cache_fts, rowid, title, markdown, url)
49
+ VALUES ('delete', old.id, old.title, old.markdown, old.url);
50
+ END;
51
+
52
+ CREATE TRIGGER IF NOT EXISTS url_cache_au BEFORE UPDATE ON url_cache BEGIN
53
+ INSERT INTO url_cache_fts(url_cache_fts, rowid, title, markdown, url)
54
+ VALUES ('delete', old.id, old.title, old.markdown, old.url);
55
+ END;
56
+
57
+ CREATE TABLE IF NOT EXISTS search_cache (
58
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
59
+ query TEXT NOT NULL,
60
+ query_hash TEXT UNIQUE NOT NULL,
61
+ results TEXT NOT NULL,
62
+ engines_used TEXT,
63
+ searched_at TEXT NOT NULL DEFAULT (datetime('now')),
64
+ expires_at TEXT
65
+ );
66
+
67
+ CREATE TABLE IF NOT EXISTS domain_routing (
68
+ domain TEXT PRIMARY KEY,
69
+ prefer_playwright INTEGER DEFAULT 0,
70
+ http_failures INTEGER DEFAULT 0,
71
+ last_updated TEXT
72
+ );
73
+
74
+ CREATE TABLE IF NOT EXISTS domain_boilerplate (
75
+ domain TEXT NOT NULL,
76
+ block_hash TEXT NOT NULL,
77
+ sample_text TEXT,
78
+ discovered_at TEXT NOT NULL DEFAULT (datetime('now')),
79
+ PRIMARY KEY (domain, block_hash)
80
+ );
81
+ `);
82
+ instance = db;
83
+ return db;
84
+ }
85
+ export function getDatabase() {
86
+ if (!instance) {
87
+ throw new Error('Database not initialized. Call initDatabase() first.');
88
+ }
89
+ return instance;
90
+ }
91
+ export function closeDatabase() {
92
+ if (instance) {
93
+ instance.close();
94
+ instance = null;
95
+ }
96
+ }
97
+ //# sourceMappingURL=db.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"db.js","sourceRoot":"","sources":["../../src/cache/db.ts"],"names":[],"mappings":"AAAA,OAAO,QAAQ,MAAM,gBAAgB,CAAC;AAEtC,IAAI,QAAQ,GAA6B,IAAI,CAAC;AAE9C,MAAM,UAAU,YAAY,CAAC,MAAc;IACzC,IAAI,QAAQ,EAAE,CAAC;QACb,QAAQ,CAAC,KAAK,EAAE,CAAC;QACjB,QAAQ,GAAG,IAAI,CAAC;IAClB,CAAC;IAED,MAAM,EAAE,GAAG,IAAI,QAAQ,CAAC,MAAM,CAAC,CAAC;IAEhC,EAAE,CAAC,MAAM,CAAC,oBAAoB,CAAC,CAAC;IAChC,EAAE,CAAC,MAAM,CAAC,sBAAsB,CAAC,CAAC;IAClC,EAAE,CAAC,MAAM,CAAC,mBAAmB,CAAC,CAAC;IAE/B,EAAE,CAAC,IAAI,CAAC;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAqEP,CAAC,CAAC;IAEH,QAAQ,GAAG,EAAE,CAAC;IACd,OAAO,EAAE,CAAC;AACZ,CAAC;AAED,MAAM,UAAU,WAAW;IACzB,IAAI,CAAC,QAAQ,EAAE,CAAC;QACd,MAAM,IAAI,KAAK,CAAC,sDAAsD,CAAC,CAAC;IAC1E,CAAC;IACD,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED,MAAM,UAAU,aAAa;IAC3B,IAAI,QAAQ,EAAE,CAAC;QACb,QAAQ,CAAC,KAAK,EAAE,CAAC;QACjB,QAAQ,GAAG,IAAI,CAAC;IAClB,CAAC;AACH,CAAC"}
@@ -0,0 +1,26 @@
1
+ import type { RawFetchResult, ExtractionResult, CachedContent, SearchResultItem, CacheStats } from '../types.js';
2
+ export declare function normalizeUrl(url: string): string;
3
+ export declare function cacheContent(result: RawFetchResult, extraction: ExtractionResult): void;
4
+ export declare function getCachedContent(url: string): CachedContent | null;
5
+ export declare function isExpired(cached: CachedContent): boolean;
6
+ export declare function searchCache(query: string): CachedContent[];
7
+ export interface CachedSearchResult {
8
+ query: string;
9
+ results: SearchResultItem[];
10
+ engines_used: string[];
11
+ searched_at: string;
12
+ }
13
+ export declare function cacheSearchResults(query: string, results: SearchResultItem[], enginesUsed: string[]): void;
14
+ export declare function getCachedSearchResults(query: string): CachedSearchResult | null;
15
+ export declare function searchCacheFiltered(options: {
16
+ query?: string;
17
+ urlPattern?: string;
18
+ since?: string;
19
+ }): CachedContent[];
20
+ export declare function clearCacheEntries(options: {
21
+ query?: string;
22
+ urlPattern?: string;
23
+ since?: string;
24
+ }): number;
25
+ export declare function getCacheStats(): CacheStats;
26
+ //# sourceMappingURL=store.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"store.d.ts","sourceRoot":"","sources":["../../src/cache/store.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,cAAc,EAAE,gBAAgB,EAAE,aAAa,EAAE,gBAAgB,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AAgBjH,wBAAgB,YAAY,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM,CA0BhD;AAMD,wBAAgB,YAAY,CAAC,MAAM,EAAE,cAAc,EAAE,UAAU,EAAE,gBAAgB,GAAG,IAAI,CAsCvF;AAsCD,wBAAgB,gBAAgB,CAAC,GAAG,EAAE,MAAM,GAAG,aAAa,GAAG,IAAI,CASlE;AAED,wBAAgB,SAAS,CAAC,MAAM,EAAE,aAAa,GAAG,OAAO,CAGxD;AAED,wBAAgB,WAAW,CAAC,KAAK,EAAE,MAAM,GAAG,aAAa,EAAE,CAY1D;AAED,MAAM,WAAW,kBAAkB;IACjC,KAAK,EAAE,MAAM,CAAC;IACd,OAAO,EAAE,gBAAgB,EAAE,CAAC;IAC5B,YAAY,EAAE,MAAM,EAAE,CAAC;IACvB,WAAW,EAAE,MAAM,CAAC;CACrB;AAED,wBAAgB,kBAAkB,CAChC,KAAK,EAAE,MAAM,EACb,OAAO,EAAE,gBAAgB,EAAE,EAC3B,WAAW,EAAE,MAAM,EAAE,GACpB,IAAI,CAqBN;AAED,wBAAgB,sBAAsB,CAAC,KAAK,EAAE,MAAM,GAAG,kBAAkB,GAAG,IAAI,CAgB/E;AAED,wBAAgB,mBAAmB,CAAC,OAAO,EAAE;IAC3C,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB,GAAG,aAAa,EAAE,CA4BlB;AAED,wBAAgB,iBAAiB,CAAC,OAAO,EAAE;IACzC,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB,GAAG,MAAM,CA0BT;AAED,wBAAgB,aAAa,IAAI,UAAU,CAiB1C"}