@staticn0va/wigolo 0.6.6 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cache/store.d.ts +9 -1
- package/dist/cache/store.d.ts.map +1 -1
- package/dist/cache/store.js +30 -4
- package/dist/cache/store.js.map +1 -1
- package/dist/cli/doctor.d.ts.map +1 -1
- package/dist/cli/doctor.js +56 -2
- package/dist/cli/doctor.js.map +1 -1
- package/dist/cli/status.js +1 -1
- package/dist/cli/status.js.map +1 -1
- package/dist/cli/tui/hooks/useInstall.js +1 -1
- package/dist/cli/tui/hooks/useInstall.js.map +1 -1
- package/dist/cli/tui/hooks/useVerify.js +1 -1
- package/dist/cli/tui/hooks/useVerify.js.map +1 -1
- package/dist/cli/tui/status-format.d.ts +1 -1
- package/dist/cli/tui/status-format.d.ts.map +1 -1
- package/dist/cli/tui/status-format.js +1 -1
- package/dist/cli/tui/status-format.js.map +1 -1
- package/dist/cli/tui/status-python.d.ts +1 -1
- package/dist/cli/tui/status-python.d.ts.map +1 -1
- package/dist/cli/tui/status-python.js +17 -1
- package/dist/cli/tui/status-python.js.map +1 -1
- package/dist/cli/tui/verify-suggestions.d.ts +1 -1
- package/dist/cli/tui/verify-suggestions.d.ts.map +1 -1
- package/dist/cli/tui/verify-suggestions.js +3 -3
- package/dist/cli/tui/verify-suggestions.js.map +1 -1
- package/dist/cli/tui/verify.d.ts +2 -2
- package/dist/cli/tui/verify.d.ts.map +1 -1
- package/dist/cli/tui/verify.js +32 -6
- package/dist/cli/tui/verify.js.map +1 -1
- package/dist/cli/warmup.d.ts.map +1 -1
- package/dist/cli/warmup.js +16 -12
- package/dist/cli/warmup.js.map +1 -1
- package/dist/config.d.ts +6 -1
- package/dist/config.d.ts.map +1 -1
- package/dist/config.js +15 -2
- package/dist/config.js.map +1 -1
- package/dist/crawl/dedup.d.ts +1 -0
- package/dist/crawl/dedup.d.ts.map +1 -1
- package/dist/crawl/dedup.js +47 -1
- package/dist/crawl/dedup.js.map +1 -1
- package/dist/extraction/boilerplate.d.ts +15 -0
- package/dist/extraction/boilerplate.d.ts.map +1 -0
- package/dist/extraction/boilerplate.js +49 -0
- package/dist/extraction/boilerplate.js.map +1 -0
- package/dist/extraction/defuddle.d.ts.map +1 -1
- package/dist/extraction/defuddle.js +7 -3
- package/dist/extraction/defuddle.js.map +1 -1
- package/dist/extraction/jsonld.js +1 -1
- package/dist/extraction/jsonld.js.map +1 -1
- package/dist/extraction/lang-hints.d.ts +2 -0
- package/dist/extraction/lang-hints.d.ts.map +1 -0
- package/dist/extraction/lang-hints.js +28 -0
- package/dist/extraction/lang-hints.js.map +1 -0
- package/dist/extraction/llm/anthropic.d.ts +3 -0
- package/dist/extraction/llm/anthropic.d.ts.map +1 -0
- package/dist/extraction/llm/anthropic.js +33 -0
- package/dist/extraction/llm/anthropic.js.map +1 -0
- package/dist/extraction/llm/cache.d.ts +5 -0
- package/dist/extraction/llm/cache.d.ts.map +1 -0
- package/dist/extraction/llm/cache.js +35 -0
- package/dist/extraction/llm/cache.js.map +1 -0
- package/dist/extraction/llm/gemini.d.ts +3 -0
- package/dist/extraction/llm/gemini.d.ts.map +1 -0
- package/dist/extraction/llm/gemini.js +35 -0
- package/dist/extraction/llm/gemini.js.map +1 -0
- package/dist/extraction/llm/groq.d.ts +3 -0
- package/dist/extraction/llm/groq.d.ts.map +1 -0
- package/dist/extraction/llm/groq.js +63 -0
- package/dist/extraction/llm/groq.js.map +1 -0
- package/dist/extraction/llm/hash.d.ts +3 -0
- package/dist/extraction/llm/hash.d.ts.map +1 -0
- package/dist/extraction/llm/hash.js +22 -0
- package/dist/extraction/llm/hash.js.map +1 -0
- package/dist/extraction/llm/openai.d.ts +3 -0
- package/dist/extraction/llm/openai.d.ts.map +1 -0
- package/dist/extraction/llm/openai.js +38 -0
- package/dist/extraction/llm/openai.js.map +1 -0
- package/dist/extraction/llm/select.d.ts +5 -0
- package/dist/extraction/llm/select.d.ts.map +1 -0
- package/dist/extraction/llm/select.js +27 -0
- package/dist/extraction/llm/select.js.map +1 -0
- package/dist/extraction/llm/types.d.ts +24 -0
- package/dist/extraction/llm/types.d.ts.map +1 -0
- package/dist/extraction/llm/types.js +2 -0
- package/dist/extraction/llm/types.js.map +1 -0
- package/dist/extraction/llm/validate.d.ts +6 -0
- package/dist/extraction/llm/validate.d.ts.map +1 -0
- package/dist/extraction/llm/validate.js +63 -0
- package/dist/extraction/llm/validate.js.map +1 -0
- package/dist/extraction/llm-fallback.d.ts +17 -0
- package/dist/extraction/llm-fallback.d.ts.map +1 -0
- package/dist/extraction/llm-fallback.js +129 -0
- package/dist/extraction/llm-fallback.js.map +1 -0
- package/dist/extraction/markdown.d.ts +9 -0
- package/dist/extraction/markdown.d.ts.map +1 -1
- package/dist/extraction/markdown.js +52 -3
- package/dist/extraction/markdown.js.map +1 -1
- package/dist/extraction/pipeline.d.ts.map +1 -1
- package/dist/extraction/pipeline.js +17 -5
- package/dist/extraction/pipeline.js.map +1 -1
- package/dist/extraction/readability.d.ts.map +1 -1
- package/dist/extraction/readability.js +2 -3
- package/dist/extraction/readability.js.map +1 -1
- package/dist/extraction/schema.d.ts +12 -0
- package/dist/extraction/schema.d.ts.map +1 -1
- package/dist/extraction/schema.js +81 -11
- package/dist/extraction/schema.js.map +1 -1
- package/dist/extraction/site-extractors/docs-generic.d.ts.map +1 -1
- package/dist/extraction/site-extractors/docs-generic.js +2 -3
- package/dist/extraction/site-extractors/docs-generic.js.map +1 -1
- package/dist/extraction/site-extractors/github.d.ts.map +1 -1
- package/dist/extraction/site-extractors/github.js +4 -5
- package/dist/extraction/site-extractors/github.js.map +1 -1
- package/dist/extraction/site-extractors/mdn.d.ts.map +1 -1
- package/dist/extraction/site-extractors/mdn.js +2 -3
- package/dist/extraction/site-extractors/mdn.js.map +1 -1
- package/dist/extraction/site-extractors/stackoverflow.d.ts.map +1 -1
- package/dist/extraction/site-extractors/stackoverflow.js +3 -4
- package/dist/extraction/site-extractors/stackoverflow.js.map +1 -1
- package/dist/extraction/structured-data.d.ts +4 -0
- package/dist/extraction/structured-data.d.ts.map +1 -0
- package/dist/extraction/structured-data.js +203 -0
- package/dist/extraction/structured-data.js.map +1 -0
- package/dist/fetch/router.d.ts +2 -1
- package/dist/fetch/router.d.ts.map +1 -1
- package/dist/fetch/router.js +19 -1
- package/dist/fetch/router.js.map +1 -1
- package/dist/instructions.d.ts +7 -7
- package/dist/instructions.d.ts.map +1 -1
- package/dist/instructions.js +43 -36
- package/dist/instructions.js.map +1 -1
- package/dist/logger.d.ts +1 -1
- package/dist/logger.d.ts.map +1 -1
- package/dist/research/brief.js +1 -1
- package/dist/research/brief.js.map +1 -1
- package/dist/search/evidence.d.ts +25 -0
- package/dist/search/evidence.d.ts.map +1 -0
- package/dist/search/evidence.js +260 -0
- package/dist/search/evidence.js.map +1 -0
- package/dist/search/highlights.d.ts +11 -2
- package/dist/search/highlights.d.ts.map +1 -1
- package/dist/search/highlights.js +131 -48
- package/dist/search/highlights.js.map +1 -1
- package/dist/search/multi-query.d.ts +1 -0
- package/dist/search/multi-query.d.ts.map +1 -1
- package/dist/search/multi-query.js +13 -0
- package/dist/search/multi-query.js.map +1 -1
- package/dist/search/rerank.d.ts +3 -2
- package/dist/search/rerank.d.ts.map +1 -1
- package/dist/search/rerank.js +16 -44
- package/dist/search/rerank.js.map +1 -1
- package/dist/search/reranker/download.d.ts +9 -0
- package/dist/search/reranker/download.d.ts.map +1 -0
- package/dist/search/reranker/download.js +77 -0
- package/dist/search/reranker/download.js.map +1 -0
- package/dist/search/reranker/models.d.ts +14 -0
- package/dist/search/reranker/models.d.ts.map +1 -0
- package/dist/search/reranker/models.js +37 -0
- package/dist/search/reranker/models.js.map +1 -0
- package/dist/search/reranker/onnx.d.ts +13 -0
- package/dist/search/reranker/onnx.d.ts.map +1 -0
- package/dist/search/reranker/onnx.js +70 -0
- package/dist/search/reranker/onnx.js.map +1 -0
- package/dist/search/reranker/recency-boost.d.ts +3 -0
- package/dist/search/reranker/recency-boost.d.ts.map +1 -0
- package/dist/search/reranker/recency-boost.js +12 -0
- package/dist/search/reranker/recency-boost.js.map +1 -0
- package/dist/search/reranker/recency.d.ts +3 -0
- package/dist/search/reranker/recency.d.ts.map +1 -0
- package/dist/search/reranker/recency.js +26 -0
- package/dist/search/reranker/recency.js.map +1 -0
- package/dist/search/reranker/tokenizer.d.ts +30 -0
- package/dist/search/reranker/tokenizer.d.ts.map +1 -0
- package/dist/search/reranker/tokenizer.js +49 -0
- package/dist/search/reranker/tokenizer.js.map +1 -0
- package/dist/search/tokens.d.ts +3 -0
- package/dist/search/tokens.d.ts.map +1 -0
- package/dist/search/tokens.js +38 -0
- package/dist/search/tokens.js.map +1 -0
- package/dist/search/truncate.d.ts +4 -0
- package/dist/search/truncate.d.ts.map +1 -1
- package/dist/search/truncate.js +13 -0
- package/dist/search/truncate.js.map +1 -1
- package/dist/server/tool-schemas.d.ts +503 -0
- package/dist/server/tool-schemas.d.ts.map +1 -0
- package/dist/server/tool-schemas.js +425 -0
- package/dist/server/tool-schemas.js.map +1 -0
- package/dist/server.d.ts.map +1 -1
- package/dist/server.js +1 -326
- package/dist/server.js.map +1 -1
- package/dist/tools/agent.d.ts.map +1 -1
- package/dist/tools/agent.js +36 -0
- package/dist/tools/agent.js.map +1 -1
- package/dist/tools/crawl.d.ts.map +1 -1
- package/dist/tools/crawl.js +37 -2
- package/dist/tools/crawl.js.map +1 -1
- package/dist/tools/extract.d.ts.map +1 -1
- package/dist/tools/extract.js +19 -3
- package/dist/tools/extract.js.map +1 -1
- package/dist/tools/fetch.d.ts.map +1 -1
- package/dist/tools/fetch.js +44 -7
- package/dist/tools/fetch.js.map +1 -1
- package/dist/tools/find-similar.d.ts.map +1 -1
- package/dist/tools/find-similar.js +32 -1
- package/dist/tools/find-similar.js.map +1 -1
- package/dist/tools/research.d.ts.map +1 -1
- package/dist/tools/research.js +34 -1
- package/dist/tools/research.js.map +1 -1
- package/dist/tools/search.d.ts.map +1 -1
- package/dist/tools/search.js +101 -55
- package/dist/tools/search.js.map +1 -1
- package/dist/types.d.ts +65 -1
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js +1 -1
- package/dist/types.js.map +1 -1
- package/dist/util/mode.d.ts +4 -0
- package/dist/util/mode.d.ts.map +1 -0
- package/dist/util/mode.js +13 -0
- package/dist/util/mode.js.map +1 -0
- package/package.json +9 -1
- package/dist/search/flashrank.d.ts +0 -12
- package/dist/search/flashrank.d.ts.map +0 -1
- package/dist/search/flashrank.js +0 -64
- package/dist/search/flashrank.js.map +0 -1
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
import { parseHTML } from 'linkedom';
|
|
2
|
+
import { createLogger } from '../logger.js';
|
|
3
|
+
// Module-level logger labelled 'structured-data'; used in this file to warn
// about JSON-LD script blocks whose text cannot be parsed as JSON.
const log = createLogger('structured-data');
|
|
4
|
+
// Type names singled out by this module. The set is only re-exported (as
// KNOWN_SCHEMA_TYPES); none of the extractors in this file filter by it.
const KNOWN_TYPES = new Set(['Article', 'Product', 'Recipe', 'BreadcrumbList', 'Organization', 'Person']);
|
|
12
|
+
/**
 * Extracts every structured-data record found in an HTML string.
 *
 * The markup is parsed once and handed to the three format-specific
 * extractors; their results are concatenated in a fixed order:
 * JSON-LD first, then microdata, then RDFa.
 *
 * @param html Raw HTML source. Any falsy value (empty string, null,
 *   undefined) short-circuits to an empty result without parsing.
 * @returns A new array of `{ provenance, type, fields }` records.
 */
export function extractStructuredData(html) {
    if (html) {
        const page = parseHTML(html).document;
        return [
            ...extractJsonLdBlocks(page),
            ...extractMicrodataBlocks(page),
            ...extractRdfaBlocks(page),
        ];
    }
    return [];
}
|
|
22
|
+
/**
 * Reads every `<script type="application/ld+json">` element of the document
 * and turns each typed JSON-LD node into a record.
 *
 * - Scripts with empty/whitespace-only text are skipped.
 * - Scripts whose text is not valid JSON are skipped after logging a warning.
 * - Parsed values are expanded with flattenJsonLd (arrays and `@graph`), and
 *   nodes without a usable `@type` are dropped.
 * - Keys starting with `@` (JSON-LD keywords) are excluded from `fields`.
 *
 * @param doc Parsed document exposing `querySelectorAll`.
 * @returns Array of `{ provenance: 'json-ld', type, fields }` records.
 */
function extractJsonLdBlocks(doc) {
    const out = [];
    const scripts = doc.querySelectorAll('script[type="application/ld+json"]');
    for (const script of scripts) {
        const text = script.textContent?.trim();
        if (!text)
            continue;
        let parsed;
        try {
            parsed = JSON.parse(text);
        }
        catch (err) {
            log.warn('Failed to parse JSON-LD block', { error: String(err) });
            continue;
        }
        for (const node of flattenJsonLd(parsed)) {
            const type = normalizeType(node['@type']);
            if (!type)
                continue;
            // Object.fromEntries creates own data properties, so a page-supplied
            // "__proto__" key is stored as a plain field. Plain assignment
            // (`fields[k] = v`) would instead invoke the prototype setter and let
            // untrusted JSON replace the record's prototype.
            const fields = Object.fromEntries(Object.entries(node).filter(([key]) => !key.startsWith('@')));
            out.push({ provenance: 'json-ld', type, fields });
        }
    }
    return out;
}
|
|
52
|
+
/**
 * Expands a parsed JSON-LD value into a flat list of node objects.
 *
 * - Non-objects and null produce no nodes.
 * - Arrays are expanded element by element (recursively).
 * - An object whose `@graph` is an array is replaced by the expansion of
 *   that array; the wrapper object itself is not emitted.
 * - Any other object is emitted as a single node.
 *
 * @param value Result of JSON.parse, of any shape.
 * @returns Node objects in document order.
 */
function flattenJsonLd(value) {
    if (value === null || typeof value !== 'object')
        return [];
    let children = null;
    if (Array.isArray(value)) {
        children = value;
    }
    else if (Array.isArray(value['@graph'])) {
        children = value['@graph'];
    }
    if (children === null)
        return [value];
    const nodes = [];
    for (const child of children) {
        nodes.push(...flattenJsonLd(child));
    }
    return nodes;
}
|
|
62
|
+
/**
 * Reduces a JSON-LD `@type` value to a bare type name.
 *
 * A string is trimmed and cut down to the segment after the last `/`, `#`
 * or `:`, so absolute IRIs (`https://schema.org/Article`), fragment IRIs
 * (`http://schema.org#Article`) and compact IRIs (`schema:Article`) all
 * yield `Article`. This matches the prefix handling the RDFa extractor in
 * this file applies to `typeof`. For an array, the first entry that
 * normalizes to a non-empty name wins.
 *
 * @param raw The `@type` value: string, array of candidates, or anything else.
 * @returns The local type name, or null when none can be derived (non-string
 *   input, empty string, or a value ending in a separator).
 */
function normalizeType(raw) {
    if (typeof raw === 'string') {
        const tail = raw.trim().split(/[/#:]/).pop();
        // Empty tail (e.g. "https://schema.org/") means there is no type name.
        return tail || null;
    }
    if (Array.isArray(raw)) {
        for (const candidate of raw) {
            const norm = normalizeType(candidate);
            if (norm)
                return norm;
        }
    }
    return null;
}
|
|
76
|
+
/**
 * Builds one record per top-level microdata item in the document.
 *
 * Only `[itemscope]` elements with no `[itemscope]` ancestor are treated as
 * roots; nested items are reached while reading their parent's properties.
 * Roots for which readMicrodataNode yields nothing are left out.
 *
 * @param doc Parsed document exposing `querySelectorAll`.
 * @returns Array of microdata records in document order.
 */
function extractMicrodataBlocks(doc) {
    return Array.from(doc.querySelectorAll('[itemscope]'))
        .filter((candidate) => !hasItemscopeAncestor(candidate))
        .map((root) => readMicrodataNode(root))
        .filter((record) => Boolean(record));
}
|
|
89
|
+
/**
 * Tells whether any ancestor of `el` carries the `itemscope` attribute.
 * The element itself is not inspected, only its `parentElement` chain.
 *
 * @param el Element to test.
 * @returns true if an enclosing microdata item exists, false otherwise.
 */
function hasItemscopeAncestor(el) {
    for (let ancestor = el.parentElement; ancestor; ancestor = ancestor.parentElement) {
        if (ancestor.hasAttribute('itemscope'))
            return true;
    }
    return false;
}
|
|
98
|
+
/**
 * Converts one microdata item element into a record.
 *
 * The type is the text after the last `/` of the `itemtype` attribute.
 * Items without `itemtype`, or whose `itemtype` ends in `/`, are rejected.
 * Properties are gathered by collectItemprops, which represents nested
 * items as nested objects.
 *
 * @param el Element carrying `itemscope`.
 * @returns `{ provenance: 'microdata', type, fields }`, or null when no
 *   type name can be derived.
 */
function readMicrodataNode(el) {
    const segments = (el.getAttribute('itemtype') ?? '').split('/');
    const type = segments[segments.length - 1];
    if (!type)
        return null;
    const fields = {};
    collectItemprops(el, fields);
    return { provenance: 'microdata', type, fields };
}
|
|
108
|
+
/**
 * Collects the microdata properties of one item into `target`.
 *
 * Descendants of `root` are visited breadth-first. For every element with
 * an `itemprop` attribute a value is recorded under each of its names:
 * - if the element is itself an item (`itemscope`), the value is a nested
 *   object filled by a recursive call;
 * - otherwise the first present of the `content`, `href`, `src` attributes,
 *   falling back to the trimmed text content.
 * `itemprop` is a space-separated token list, so `itemprop="name headline"`
 * stores the same value under both `name` and `headline` (duplicates within
 * the attribute are ignored). Repeated names accumulate via mergeProp.
 *
 * @param root Item element whose descendants are scanned (root itself is not).
 * @param target Object that receives the properties; mutated in place.
 */
function collectItemprops(root, target) {
    // FIFO queue walked with a cursor: same visiting order as repeatedly
    // shifting the head, without the per-element cost of Array#shift.
    const queue = Array.from(root.children);
    for (let cursor = 0; cursor < queue.length; cursor++) {
        const el = queue[cursor];
        const isItem = el.hasAttribute('itemscope');
        const names = new Set((el.getAttribute('itemprop') ?? '').split(/\s+/).filter(Boolean));
        if (names.size > 0) {
            let value;
            if (isItem) {
                value = {};
                collectItemprops(el, value);
            }
            else {
                value =
                    el.getAttribute('content') ??
                        el.getAttribute('href') ??
                        el.getAttribute('src') ??
                        (el.textContent ?? '').trim();
            }
            for (const name of names)
                mergeProp(target, name, value);
        }
        // Always stop at any itemscope: it is an independent item, regardless of
        // whether it carries an itemprop. Otherwise its descendants' itemprops
        // would leak into the parent record.
        if (isItem)
            continue;
        for (const child of el.children)
            queue.push(child);
    }
}
|
|
138
|
+
/**
 * Stores `value` under `prop`, accumulating repeated properties.
 *
 * - First occurrence: stored as-is.
 * - Second occurrence: the slot becomes `[first, second]`.
 * - Later occurrences: appended to that array.
 *
 * Property names come from page markup, so only OWN properties of `target`
 * count as "already present". Names such as `constructor` or `toString`
 * would otherwise be mistaken for existing values through the prototype
 * chain. The first write uses Object.defineProperty so that a property
 * literally named `__proto__` becomes ordinary data instead of replacing the
 * object's prototype.
 *
 * @param target Object being filled; mutated in place.
 * @param prop Property name.
 * @param value Value to store or append.
 */
function mergeProp(target, prop, value) {
    const existing = Object.prototype.hasOwnProperty.call(target, prop)
        ? target[prop]
        : undefined;
    if (existing === undefined) {
        Object.defineProperty(target, prop, {
            value,
            writable: true,
            enumerable: true,
            configurable: true,
        });
        return;
    }
    if (Array.isArray(existing)) {
        existing.push(value);
        return;
    }
    // `prop` is an own property here, so plain assignment cannot reach a setter
    // inherited from Object.prototype.
    target[prop] = [existing, value];
}
|
|
149
|
+
/**
 * Builds one record per top-level RDFa resource in the document.
 *
 * Only `[typeof]` elements with no `[typeof]` ancestor are treated as roots;
 * nested typed resources are reached while reading their parent's
 * properties. The record type is taken from the FIRST whitespace-separated
 * token of `typeof`, reduced to the part after its last `:` or `/`
 * (`schema:Person` and `http://schema.org/Person` both give `Person`).
 * The attribute is trimmed first so that leading whitespace does not
 * produce an empty first token and silently drop the resource. Elements
 * for which no type name can be derived are skipped.
 *
 * @param doc Parsed document exposing `querySelectorAll`.
 * @returns Array of `{ provenance: 'rdfa', type, fields }` records.
 */
function extractRdfaBlocks(doc) {
    const out = [];
    const all = Array.from(doc.querySelectorAll('[typeof]'));
    const tops = all.filter((el) => !hasTypeofAncestor(el));
    for (const el of tops) {
        const firstToken = (el.getAttribute('typeof') ?? '').trim().split(/\s+/)[0] ?? '';
        const type = firstToken.split(/[:/]/).pop() ?? '';
        if (!type)
            continue;
        const fields = {};
        collectRdfaProps(el, fields);
        out.push({ provenance: 'rdfa', type, fields });
    }
    return out;
}
|
|
164
|
+
/**
 * Tells whether any ancestor of `el` carries the RDFa `typeof` attribute.
 * The element itself is not inspected, only its `parentElement` chain.
 *
 * @param el Element to test.
 * @returns true if an enclosing typed resource exists, false otherwise.
 */
function hasTypeofAncestor(el) {
    let node = el;
    // Step to the parent first, then test it; stops when the chain runs out.
    while ((node = node.parentElement)) {
        if (node.hasAttribute('typeof'))
            return true;
    }
    return false;
}
|
|
173
|
+
/**
 * Collects the RDFa properties of one typed resource into `target`.
 *
 * Descendants of `root` are visited breadth-first. For every element with a
 * `property` attribute a value is recorded:
 * - if the element also has `typeof`, the value is a nested object filled
 *   by a recursive call;
 * - otherwise the first present of the `content`, `href`, `resource`
 *   attributes, falling back to the trimmed text content.
 * `property` is a space-separated list; each token is reduced to the part
 * after its last `:` or `/` (`og:title` becomes `title`) and the value is
 * stored under every distinct resulting name. If a token ends in a
 * separator and so has no local part, the whole token is used as the key.
 * Repeated names accumulate via mergeProp.
 *
 * @param root Typed element whose descendants are scanned (root itself is not).
 * @param target Object that receives the properties; mutated in place.
 */
function collectRdfaProps(root, target) {
    // FIFO queue walked with a cursor: same visiting order as repeatedly
    // shifting the head, without the per-element cost of Array#shift.
    const queue = Array.from(root.children);
    for (let cursor = 0; cursor < queue.length; cursor++) {
        const el = queue[cursor];
        const isTyped = el.hasAttribute('typeof');
        const tokens = (el.getAttribute('property') ?? '').split(/\s+/).filter(Boolean);
        if (tokens.length > 0) {
            let value;
            if (isTyped) {
                value = {};
                collectRdfaProps(el, value);
            }
            else {
                value =
                    el.getAttribute('content') ??
                        el.getAttribute('href') ??
                        el.getAttribute('resource') ??
                        (el.textContent ?? '').trim();
            }
            // `||` rather than `??`: pop() on a split result is never nullish, but
            // it is '' when the token ends in a separator.
            const names = new Set(tokens.map((token) => token.split(/[:/]/).pop() || token));
            for (const name of names)
                mergeProp(target, name, value);
        }
        // Always stop at any nested typeof, regardless of property — independent item.
        if (isTyped)
            continue;
        for (const child of el.children)
            queue.push(child);
    }
}
|
|
202
|
+
// Public alias of the module-private KNOWN_TYPES set. It is the same Set
// instance (not a copy), so a mutation made by an importer is visible here too.
// The extractors in this file do not filter records by this set.
export const KNOWN_SCHEMA_TYPES = KNOWN_TYPES;
|
|
203
|
+
//# sourceMappingURL=structured-data.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"structured-data.js","sourceRoot":"","sources":["../../src/extraction/structured-data.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AACrC,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AAG5C,MAAM,GAAG,GAAG,YAAY,CAAC,iBAAiB,CAAC,CAAC;AAE5C,MAAM,WAAW,GAAG,IAAI,GAAG,CAAC;IAC1B,SAAS;IACT,SAAS;IACT,QAAQ;IACR,gBAAgB;IAChB,cAAc;IACd,QAAQ;CACT,CAAC,CAAC;AAEH,MAAM,UAAU,qBAAqB,CAAC,IAAY;IAChD,IAAI,CAAC,IAAI;QAAE,OAAO,EAAE,CAAC;IACrB,MAAM,EAAE,QAAQ,EAAE,GAAG,EAAE,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;IAC1C,MAAM,GAAG,GAA2B,EAAE,CAAC;IACvC,GAAG,CAAC,IAAI,CAAC,GAAG,mBAAmB,CAAC,GAAG,CAAC,CAAC,CAAC;IACtC,GAAG,CAAC,IAAI,CAAC,GAAG,sBAAsB,CAAC,GAAG,CAAC,CAAC,CAAC;IACzC,GAAG,CAAC,IAAI,CAAC,GAAG,iBAAiB,CAAC,GAAG,CAAC,CAAC,CAAC;IACpC,OAAO,GAAG,CAAC;AACb,CAAC;AAED,SAAS,mBAAmB,CAAC,GAAa;IACxC,MAAM,GAAG,GAA2B,EAAE,CAAC;IACvC,MAAM,OAAO,GAAG,GAAG,CAAC,gBAAgB,CAAC,oCAAoC,CAAC,CAAC;IAC3E,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE,CAAC;QAC7B,MAAM,IAAI,GAAG,MAAM,CAAC,WAAW,EAAE,IAAI,EAAE,CAAC;QACxC,IAAI,CAAC,IAAI;YAAE,SAAS;QACpB,IAAI,MAAe,CAAC;QACpB,IAAI,CAAC;YACH,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;QAC5B,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACb,GAAG,CAAC,IAAI,CAAC,+BAA+B,EAAE,EAAE,KAAK,EAAE,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;YAClE,SAAS;QACX,CAAC;QACD,KAAK,MAAM,IAAI,IAAI,aAAa,CAAC,MAAM,CAAC,EAAE,CAAC;YACzC,MAAM,IAAI,GAAG,aAAa,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,CAAC;YAC1C,IAAI,CAAC,IAAI;gBAAE,SAAS;YACpB,MAAM,MAAM,GAA4B,EAAE,CAAC;YAC3C,KAAK,MAAM,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,IAAI,CAAC,EAAE,CAAC;gBAC1C,IAAI,CAAC,CAAC,UAAU,CAAC,GAAG,CAAC;oBAAE,SAAS;gBAChC,MAAM,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC;YAChB,CAAC;YACD,GAAG,CAAC,IAAI,CAAC,EAAE,UAAU,EAAE,SAAS,EAAE,IAAI,EAAE,MAAM,EAAE,CAAC,CAAC;QACpD,CAAC;IACH,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED,SAAS,aAAa,CAAC,KAAc;IACnC,IAAI,CAAC,KAAK,IAAI,OAAO,KAAK,KAAK,QAAQ;QAAE,OAAO,EAAE,CAAC;IACnD,IAAI,KAAK,CAAC,OAAO,CAAC,KAAK,CAAC;QAAE,OAAO,KAAK,CAAC,OAAO,CAAC,aAAa,CAAC,CAAC;IAC9D,MAAM,GAAG,GAAG,KAAgC,CAAC;IAC7C,IAAI,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,
QAAQ,CAAC,CAAC;QAAE,OAAO,GAAG,CAAC,QAAQ,CAAC,CAAC,OAAO,CAAC,aAAa,CAAC,CAAC;IAC9E,OAAO,CAAC,GAAG,CAAC,CAAC;AACf,CAAC;AAED,SAAS,aAAa,CAAC,GAAY;IACjC,IAAI,OAAO,GAAG,KAAK,QAAQ,EAAE,CAAC;QAC5B,MAAM,IAAI,GAAG,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,EAAG,CAAC;QACnC,OAAO,IAAI,IAAI,IAAI,CAAC;IACtB,CAAC;IACD,IAAI,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC;QACvB,KAAK,MAAM,CAAC,IAAI,GAAG,EAAE,CAAC;YACpB,MAAM,IAAI,GAAG,aAAa,CAAC,CAAC,CAAC,CAAC;YAC9B,IAAI,IAAI;gBAAE,OAAO,IAAI,CAAC;QACxB,CAAC;IACH,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC;AAED,SAAS,sBAAsB,CAAC,GAAa;IAC3C,MAAM,GAAG,GAA2B,EAAE,CAAC;IACvC,2EAA2E;IAC3E,MAAM,GAAG,GAAG,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,gBAAgB,CAAC,aAAa,CAAC,CAAC,CAAC;IAC5D,MAAM,IAAI,GAAG,GAAG,CAAC,MAAM,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,CAAC,oBAAoB,CAAC,EAAE,CAAC,CAAC,CAAC;IAC3D,KAAK,MAAM,EAAE,IAAI,IAAI,EAAE,CAAC;QACtB,MAAM,IAAI,GAAG,iBAAiB,CAAC,EAAE,CAAC,CAAC;QACnC,IAAI,CAAC,IAAI;YAAE,SAAS;QACpB,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACjB,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED,SAAS,oBAAoB,CAAC,EAAW;IACvC,IAAI,GAAG,GAAG,EAAE,CAAC,aAAa,CAAC;IAC3B,OAAO,GAAG,EAAE,CAAC;QACX,IAAI,GAAG,CAAC,YAAY,CAAC,WAAW,CAAC;YAAE,OAAO,IAAI,CAAC;QAC/C,GAAG,GAAG,GAAG,CAAC,aAAa,CAAC;IAC1B,CAAC;IACD,OAAO,KAAK,CAAC;AACf,CAAC;AAED,SAAS,iBAAiB,CAAC,EAAW;IACpC,MAAM,QAAQ,GAAG,EAAE,CAAC,YAAY,CAAC,UAAU,CAAC,IAAI,EAAE,CAAC;IACnD,MAAM,IAAI,GAAG,QAAQ,CAAC,CAAC,CAAC,QAAQ,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,EAAG,CAAC,CAAC,CAAC,EAAE,CAAC;IACxD,IAAI,CAAC,IAAI;QAAE,OAAO,IAAI,CAAC;IACvB,MAAM,MAAM,GAA4B,EAAE,CAAC;IAC3C,4FAA4F;IAC5F,gBAAgB,CAAC,EAAE,EAAE,MAAM,CAAC,CAAC;IAC7B,OAAO,EAAE,UAAU,EAAE,WAAW,EAAE,IAAI,EAAE,MAAM,EAAE,CAAC;AACnD,CAAC;AAED,SAAS,gBAAgB,CAAC,IAAa,EAAE,MAA+B;IACtE,MAAM,KAAK,GAAc,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;IACnD,OAAO,KAAK,CAAC,MAAM,EAAE,CAAC;QACpB,MAAM,EAAE,GAAG,KAAK,CAAC,KAAK,EAAG,CAAC;QAC1B,MAAM,IAAI,GAAG,EAAE,CAAC,YAAY,CAAC,UAAU,CAAC,CAAC;QACzC,IAAI,IAAI,EAAE,CAAC;YACT,IAAI,KAAc,CAAC;YACnB,IAAI,EAAE,CAAC,YAAY,CAAC,WAAW,CAAC,EAAE,CAAC;gBACjC,MAAM,MAAM,GAA4B,EAAE,CAAC;gBAC3C,gBAAg
B,CAAC,EAAE,EAAE,MAAM,CAAC,CAAC;gBAC7B,KAAK,GAAG,MAAM,CAAC;YACjB,CAAC;iBAAM,CAAC;gBACN,KAAK;oBACH,EAAE,CAAC,YAAY,CAAC,SAAS,CAAC;wBAC1B,EAAE,CAAC,YAAY,CAAC,MAAM,CAAC;wBACvB,EAAE,CAAC,YAAY,CAAC,KAAK,CAAC;wBACtB,CAAC,EAAE,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;YAClC,CAAC;YACD,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,KAAK,CAAC,CAAC;QACjC,CAAC;QACD,yEAAyE;QACzE,uEAAuE;QACvE,qCAAqC;QACrC,IAAI,EAAE,CAAC,YAAY,CAAC,WAAW,CAAC;YAAE,SAAS;QAC3C,KAAK,MAAM,CAAC,IAAI,EAAE,CAAC,QAAQ;YAAE,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAC7C,CAAC;AACH,CAAC;AAED,SAAS,SAAS,CAAC,MAA+B,EAAE,IAAY,EAAE,KAAc;IAC9E,IAAI,MAAM,CAAC,IAAI,CAAC,KAAK,SAAS,EAAE,CAAC;QAC/B,MAAM,CAAC,IAAI,CAAC,GAAG,KAAK,CAAC;QACrB,OAAO;IACT,CAAC;IACD,IAAI,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,EAAE,CAAC;QAC/B,MAAM,CAAC,IAAI,CAAe,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACxC,OAAO;IACT,CAAC;IACD,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,IAAI,CAAC,EAAE,KAAK,CAAC,CAAC;AACvC,CAAC;AAED,SAAS,iBAAiB,CAAC,GAAa;IACtC,MAAM,GAAG,GAA2B,EAAE,CAAC;IACvC,MAAM,GAAG,GAAG,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,gBAAgB,CAAC,UAAU,CAAC,CAAC,CAAC;IACzD,MAAM,IAAI,GAAG,GAAG,CAAC,MAAM,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,CAAC,iBAAiB,CAAC,EAAE,CAAC,CAAC,CAAC;IACxD,KAAK,MAAM,EAAE,IAAI,IAAI,EAAE,CAAC;QACtB,MAAM,QAAQ,GAAG,EAAE,CAAC,YAAY,CAAC,QAAQ,CAAC,IAAI,EAAE,CAAC;QACjD,MAAM,IAAI,GAAG,QAAQ,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,IAAI,EAAE,CAAC;QACjE,IAAI,CAAC,IAAI;YAAE,SAAS;QACpB,MAAM,MAAM,GAA4B,EAAE,CAAC;QAC3C,gBAAgB,CAAC,EAAE,EAAE,MAAM,CAAC,CAAC;QAC7B,GAAG,CAAC,IAAI,CAAC,EAAE,UAAU,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,CAAC,CAAC;IACjD,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED,SAAS,iBAAiB,CAAC,EAAW;IACpC,IAAI,GAAG,GAAG,EAAE,CAAC,aAAa,CAAC;IAC3B,OAAO,GAAG,EAAE,CAAC;QACX,IAAI,GAAG,CAAC,YAAY,CAAC,QAAQ,CAAC;YAAE,OAAO,IAAI,CAAC;QAC5C,GAAG,GAAG,GAAG,CAAC,aAAa,CAAC;IAC1B,CAAC;IACD,OAAO,KAAK,CAAC;AACf,CAAC;AAED,SAAS,gBAAgB,CAAC,IAAa,EAAE,MAA+B;IACtE,MAAM,KAAK,GAAc,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;IACnD,OAAO,KAAK,CAAC,MAAM,EAAE,CAAC;QACpB,MAAM,EAA
E,GAAG,KAAK,CAAC,KAAK,EAAG,CAAC;QAC1B,MAAM,IAAI,GAAG,EAAE,CAAC,YAAY,CAAC,UAAU,CAAC,CAAC;QACzC,IAAI,IAAI,EAAE,CAAC;YACT,MAAM,QAAQ,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,IAAI,IAAI,CAAC;YAClD,IAAI,KAAc,CAAC;YACnB,IAAI,EAAE,CAAC,YAAY,CAAC,QAAQ,CAAC,EAAE,CAAC;gBAC9B,MAAM,MAAM,GAA4B,EAAE,CAAC;gBAC3C,gBAAgB,CAAC,EAAE,EAAE,MAAM,CAAC,CAAC;gBAC7B,KAAK,GAAG,MAAM,CAAC;YACjB,CAAC;iBAAM,CAAC;gBACN,KAAK;oBACH,EAAE,CAAC,YAAY,CAAC,SAAS,CAAC;wBAC1B,EAAE,CAAC,YAAY,CAAC,MAAM,CAAC;wBACvB,EAAE,CAAC,YAAY,CAAC,UAAU,CAAC;wBAC3B,CAAC,EAAE,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;YAClC,CAAC;YACD,SAAS,CAAC,MAAM,EAAE,QAAQ,EAAE,KAAK,CAAC,CAAC;QACrC,CAAC;QACD,+EAA+E;QAC/E,IAAI,EAAE,CAAC,YAAY,CAAC,QAAQ,CAAC;YAAE,SAAS;QACxC,KAAK,MAAM,CAAC,IAAI,EAAE,CAAC,QAAQ;YAAE,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAC7C,CAAC;AACH,CAAC;AAED,MAAM,CAAC,MAAM,kBAAkB,GAAwB,WAAW,CAAC"}
|
package/dist/fetch/router.d.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import type { RawFetchResult, BrowserAction } from '../types.js';
|
|
1
|
+
import type { RawFetchResult, BrowserAction, Mode } from '../types.js';
|
|
2
2
|
export interface RouterFetchOptions {
|
|
3
3
|
renderJs?: 'auto' | 'always' | 'never';
|
|
4
4
|
useAuth?: boolean;
|
|
@@ -6,6 +6,7 @@ export interface RouterFetchOptions {
|
|
|
6
6
|
screenshot?: boolean;
|
|
7
7
|
actions?: BrowserAction[];
|
|
8
8
|
force_refresh?: boolean;
|
|
9
|
+
mode?: Mode;
|
|
9
10
|
}
|
|
10
11
|
export interface HttpClient {
|
|
11
12
|
fetch(url: string, options?: {
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"router.d.ts","sourceRoot":"","sources":["../../src/fetch/router.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAE,cAAc,EAAE,aAAa,EAAE,MAAM,aAAa,CAAC;
|
|
1
|
+
{"version":3,"file":"router.d.ts","sourceRoot":"","sources":["../../src/fetch/router.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAE,cAAc,EAAE,aAAa,EAAE,IAAI,EAAE,MAAM,aAAa,CAAC;AAEvE,MAAM,WAAW,kBAAkB;IACjC,QAAQ,CAAC,EAAE,MAAM,GAAG,QAAQ,GAAG,OAAO,CAAC;IACvC,OAAO,CAAC,EAAE,OAAO,CAAC;IAClB,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACjC,UAAU,CAAC,EAAE,OAAO,CAAC;IACrB,OAAO,CAAC,EAAE,aAAa,EAAE,CAAC;IAC1B,aAAa,CAAC,EAAE,OAAO,CAAC;IACxB,IAAI,CAAC,EAAE,IAAI,CAAC;CACb;AAED,MAAM,WAAW,UAAU;IACzB,KAAK,CACH,GAAG,EAAE,MAAM,EACX,OAAO,CAAC,EAAE;QAAE,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;QAAC,SAAS,CAAC,EAAE,MAAM,CAAA;KAAE,GACjE,OAAO,CAAC;QACT,GAAG,EAAE,MAAM,CAAC;QACZ,QAAQ,EAAE,MAAM,CAAC;QACjB,IAAI,EAAE,MAAM,CAAC;QACb,WAAW,EAAE,MAAM,CAAC;QACpB,UAAU,EAAE,MAAM,CAAC;QACnB,OAAO,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;QAChC,SAAS,CAAC,EAAE,MAAM,CAAC;KACpB,CAAC,CAAC;CACJ;AAED,MAAM,WAAW,oBAAoB;IACnC,gBAAgB,CACd,GAAG,EAAE,MAAM,EACX,OAAO,CAAC,EAAE;QAAE,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;QAAC,gBAAgB,CAAC,EAAE,MAAM,CAAC;QAAC,WAAW,CAAC,EAAE,MAAM,CAAC;QAAC,UAAU,CAAC,EAAE,OAAO,CAAC;QAAC,OAAO,CAAC,EAAE,aAAa,EAAE,CAAC;QAAC,MAAM,CAAC,EAAE,MAAM,CAAA;KAAE,GAChK,OAAO,CAAC,cAAc,CAAC,CAAC;CAC5B;AAED,UAAU,WAAW;IACnB,YAAY,EAAE,MAAM,CAAC;IACrB,gBAAgB,EAAE,OAAO,CAAC;CAC3B;AAED,qBAAa,WAAW;IAIpB,OAAO,CAAC,QAAQ,CAAC,UAAU;IAC3B,OAAO,CAAC,QAAQ,CAAC,WAAW;IAJ9B,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAkC;gBAGzC,UAAU,EAAE,UAAU,EACtB,WAAW,EAAE,oBAAoB;IAG9C,KAAK,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,GAAE,kBAAuB,GAAG,OAAO,CAAC,cAAc,CAAC;IAuFnF,cAAc,CAAC,MAAM,EAAE,MAAM,GAAG,WAAW,GAAG,SAAS;IAIvD,OAAO,CAAC,WAAW;IASnB,OAAO,CAAC,gBAAgB;CAczB"}
|
package/dist/fetch/router.js
CHANGED
|
@@ -11,11 +11,29 @@ export class SmartRouter {
|
|
|
11
11
|
this.browserPool = browserPool;
|
|
12
12
|
}
|
|
13
13
|
async fetch(url, options = {}) {
|
|
14
|
-
const { renderJs = 'auto', useAuth = false, headers, screenshot, actions } = options;
|
|
14
|
+
const { renderJs = 'auto', useAuth = false, headers, screenshot, actions, mode } = options;
|
|
15
15
|
const config = getConfig();
|
|
16
16
|
const logger = createLogger('fetch');
|
|
17
17
|
const threshold = config.browserFallbackThreshold;
|
|
18
18
|
const domain = new URL(url).hostname;
|
|
19
|
+
// Fast mode: HTTP-only with tight timeout, never escalates to a browser.
|
|
20
|
+
if (mode === 'fast') {
|
|
21
|
+
if (actions && actions.length > 0) {
|
|
22
|
+
logger.warn('mode=fast ignores browser actions; switch to balanced/deep to execute them', {
|
|
23
|
+
url,
|
|
24
|
+
actionCount: actions.length,
|
|
25
|
+
});
|
|
26
|
+
}
|
|
27
|
+
logger.debug('routing to http (fast)', { url });
|
|
28
|
+
const result = await this.httpClient.fetch(url, {
|
|
29
|
+
headers,
|
|
30
|
+
timeoutMs: config.fastTimeoutMs,
|
|
31
|
+
});
|
|
32
|
+
this.ensureStats(domain);
|
|
33
|
+
const raw = this.toRawFetchResult(result);
|
|
34
|
+
raw.jsRequired = contentAppearsEmpty(result.html);
|
|
35
|
+
return raw;
|
|
36
|
+
}
|
|
19
37
|
// Actions always force Playwright --- actions need a live browser page
|
|
20
38
|
if (actions && actions.length > 0) {
|
|
21
39
|
const authOptions = useAuth ? (await getAuthOptions() ?? {}) : {};
|
package/dist/fetch/router.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"router.js","sourceRoot":"","sources":["../../src/fetch/router.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,cAAc,CAAC;AACzC,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AAC5C,OAAO,EAAE,mBAAmB,EAAE,MAAM,oBAAoB,CAAC;AACzD,OAAO,EAAE,cAAc,EAAE,MAAM,WAAW,CAAC;
|
|
1
|
+
{"version":3,"file":"router.js","sourceRoot":"","sources":["../../src/fetch/router.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,cAAc,CAAC;AACzC,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AAC5C,OAAO,EAAE,mBAAmB,EAAE,MAAM,oBAAoB,CAAC;AACzD,OAAO,EAAE,cAAc,EAAE,MAAM,WAAW,CAAC;AAwC3C,MAAM,OAAO,WAAW;IAIH;IACA;IAJF,SAAS,GAAG,IAAI,GAAG,EAAuB,CAAC;IAE5D,YACmB,UAAsB,EACtB,WAAiC;QADjC,eAAU,GAAV,UAAU,CAAY;QACtB,gBAAW,GAAX,WAAW,CAAsB;IACjD,CAAC;IAEJ,KAAK,CAAC,KAAK,CAAC,GAAW,EAAE,UAA8B,EAAE;QACvD,MAAM,EAAE,QAAQ,GAAG,MAAM,EAAE,OAAO,GAAG,KAAK,EAAE,OAAO,EAAE,UAAU,EAAE,OAAO,EAAE,IAAI,EAAE,GAAG,OAAO,CAAC;QAC3F,MAAM,MAAM,GAAG,SAAS,EAAE,CAAC;QAC3B,MAAM,MAAM,GAAG,YAAY,CAAC,OAAO,CAAC,CAAC;QACrC,MAAM,SAAS,GAAG,MAAM,CAAC,wBAAwB,CAAC;QAClD,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC;QAErC,yEAAyE;QACzE,IAAI,IAAI,KAAK,MAAM,EAAE,CAAC;YACpB,IAAI,OAAO,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAClC,MAAM,CAAC,IAAI,CAAC,4EAA4E,EAAE;oBACxF,GAAG;oBACH,WAAW,EAAE,OAAO,CAAC,MAAM;iBAC5B,CAAC,CAAC;YACL,CAAC;YACD,MAAM,CAAC,KAAK,CAAC,wBAAwB,EAAE,EAAE,GAAG,EAAE,CAAC,CAAC;YAChD,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,UAAU,CAAC,KAAK,CAAC,GAAG,EAAE;gBAC9C,OAAO;gBACP,SAAS,EAAE,MAAM,CAAC,aAAa;aAChC,CAAC,CAAC;YACH,IAAI,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC;YACzB,MAAM,GAAG,GAAG,IAAI,CAAC,gBAAgB,CAAC,MAAM,CAAC,CAAC;YAC1C,GAAG,CAAC,UAAU,GAAG,mBAAmB,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;YAClD,OAAO,GAAG,CAAC;QACb,CAAC;QAED,uEAAuE;QACvE,IAAI,OAAO,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAClC,MAAM,WAAW,GAAG,OAAO,CAAC,CAAC,CAAC,CAAC,MAAM,cAAc,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;YAClE,MAAM,CAAC,KAAK,CAAC,uBAAuB,EAAE,EAAE,GAAG,EAAE,MAAM,EAAE,iBAAiB,EAAE,CAAC,CAAC;YAC1E,OAAO,IAAI,CAAC,WAAW,CAAC,gBAAgB,CAAC,GAAG,EAAE,EAAE,OAAO,EAAE,UAAU,EAAE,OAAO,EAAE,GAAG,WAAW,EAAE,CAAC,CAAC;QAClG,CAAC;QAED,kDAAkD;QAClD,IAAI,QAAQ,KAAK,QAAQ,IAAI,OAAO,EAAE,CAAC;YACrC,MAAM,WAAW,GAAG,OAAO,CAAC,CAAC,CAAC,CAAC,MAAM,cAAc,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;YAClE,MAAM,CAAC,KAAK,CAAC,uBAAuB,EAAE,EAAE,GAAG,EAAE,MAAM,EAAE,OAAO,CAAC,CAAC,CAAC,MAA
M,CAAC,CAAC,CAAC,kBAAkB,EAAE,CAAC,CAAC;YAC9F,OAAO,IAAI,CAAC,WAAW,CAAC,gBAAgB,CAAC,GAAG,EAAE,EAAE,OAAO,EAAE,UAAU,EAAE,GAAG,WAAW,EAAE,CAAC,CAAC;QACzF,CAAC;QAED,yBAAyB;QACzB,IAAI,QAAQ,KAAK,OAAO,EAAE,CAAC;YACzB,MAAM,CAAC,KAAK,CAAC,yBAAyB,EAAE,EAAE,GAAG,EAAE,CAAC,CAAC;YACjD,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,UAAU,CAAC,KAAK,CAAC,GAAG,EAAE,EAAE,OAAO,EAAE,CAAC,CAAC;YAC7D,IAAI,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC;YACzB,OAAO,IAAI,CAAC,gBAAgB,CAAC,MAAM,CAAC,CAAC;QACvC,CAAC;QAED,yDAAyD;QACzD,MAAM,KAAK,GAAG,IAAI,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC;QAEvC,IAAI,KAAK,CAAC,gBAAgB,EAAE,CAAC;YAC3B,MAAM,CAAC,KAAK,CAAC,uCAAuC,EAAE,EAAE,GAAG,EAAE,MAAM,EAAE,CAAC,CAAC;YACvE,OAAO,IAAI,CAAC,WAAW,CAAC,gBAAgB,CAAC,GAAG,EAAE,EAAE,OAAO,EAAE,UAAU,EAAE,CAAC,CAAC;QACzE,CAAC;QAED,iBAAiB;QACjB,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,UAAU,CAAC,KAAK,CAAC,GAAG,EAAE,EAAE,OAAO,EAAE,CAAC,CAAC;YAE7D,sCAAsC;YACtC,IAAI,mBAAmB,CAAC,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC;gBACrC,MAAM,CAAC,IAAI,CAAC,mDAAmD,EAAE,EAAE,GAAG,EAAE,MAAM,EAAE,CAAC,CAAC;gBAClF,KAAK,CAAC,gBAAgB,GAAG,IAAI,CAAC;gBAC9B,OAAO,IAAI,CAAC,WAAW,CAAC,gBAAgB,CAAC,GAAG,EAAE,EAAE,OAAO,EAAE,UAAU,EAAE,CAAC,CAAC;YACzE,CAAC;YAED,OAAO,IAAI,CAAC,gBAAgB,CAAC,MAAM,CAAC,CAAC;QACvC,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACb,KAAK,CAAC,YAAY,EAAE,CAAC;YACrB,MAAM,CAAC,IAAI,CAAC,mBAAmB,EAAE;gBAC/B,GAAG;gBACH,MAAM;gBACN,YAAY,EAAE,KAAK,CAAC,YAAY;gBAChC,KAAK,EAAE,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC;aACxD,CAAC,CAAC;YAEH,IAAI,KAAK,CAAC,YAAY,IAAI,SAAS,EAAE,CAAC;gBACpC,MAAM,CAAC,IAAI,CAAC,0DAA0D,EAAE,EAAE,GAAG,EAAE,MAAM,EAAE,SAAS,EAAE,CAAC,CAAC;gBACpG,KAAK,CAAC,gBAAgB,GAAG,IAAI,CAAC;gBAC9B,OAAO,IAAI,CAAC,WAAW,CAAC,gBAAgB,CAAC,GAAG,EAAE,EAAE,OAAO,EAAE,UAAU,EAAE,CAAC,CAAC;YACzE,CAAC;YAED,MAAM,GAAG,CAAC;QACZ,CAAC;IACH,CAAC;IAED,cAAc,CAAC,MAAc;QAC3B,OAAO,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC;IACpC,CAAC;IAEO,WAAW,CAAC,MAAc;QAChC,IAAI,KAAK,GAAG,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC;QACvC,IAAI,CAAC,KAAK,EAAE,CAAC;YACX,KAAK,GAAG,EAAE,YAAY,EAAE,CAAC,EAAE,
gBAAgB,EAAE,KAAK,EAAE,CAAC;YACrD,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,MAAM,EAAE,KAAK,CAAC,CAAC;QACpC,CAAC;QACD,OAAO,KAAK,CAAC;IACf,CAAC;IAEO,gBAAgB,CACtB,MAAgD;QAEhD,OAAO;YACL,GAAG,EAAE,MAAM,CAAC,GAAG;YACf,QAAQ,EAAE,MAAM,CAAC,QAAQ;YACzB,IAAI,EAAE,MAAM,CAAC,IAAI;YACjB,WAAW,EAAE,MAAM,CAAC,WAAW;YAC/B,UAAU,EAAE,MAAM,CAAC,UAAU;YAC7B,MAAM,EAAE,MAAM;YACd,OAAO,EAAE,MAAM,CAAC,OAAO;YACvB,SAAS,EAAE,MAAM,CAAC,SAAS;SAC5B,CAAC;IACJ,CAAC;CACF"}
|
package/dist/instructions.d.ts
CHANGED
|
@@ -14,16 +14,16 @@
|
|
|
14
14
|
* Parameter schemas (types, enums, required/optional) belong on the JSON
|
|
15
15
|
* Schema, not here. Installation/configuration is for humans, not LLMs.
|
|
16
16
|
*/
|
|
17
|
-
export declare const WIGOLO_INSTRUCTIONS = "Wigolo is a local-first web access layer: search the open web, fetch pages, crawl sites, extract structured data, find related content, run multi-step research, and execute agent-driven data gathering. All results land in a local knowledge cache that persists across sessions.\n\n## Host-LLM synthesis pattern (read this first)\n\nWigolo has no internal LLM. It returns *structured evidence* so YOU (the host LLM) write the final answer. Fold structure into your reply:\n\n- `search` `format:
|
|
17
|
+
export declare const WIGOLO_INSTRUCTIONS = "Wigolo is a local-first web access layer: search the open web, fetch pages, crawl sites, extract structured data, find related content, run multi-step research, and execute agent-driven data gathering. All results land in a local knowledge cache that persists across sessions.\n\n## Host-LLM synthesis pattern (read this first)\n\nWigolo has no internal LLM. It returns *structured evidence* so YOU (the host LLM) write the final answer. Fold structure into your reply:\n\n- `search` \u2192 evidence (title/url/section_heading/excerpt/score/citation_id/source_span) + citations. Quote [N] or {citation_id}.\n- `format: 'answer'|'stream_answer'` \u2192 LLM synthesis when sampling supported; else evidence fallback.\n- `max_tokens_out` caps total output (cl100k-base, ~5-15% drift on non-OpenAI). `include_full_markdown: true` restores full body. `citation_format`: `'numbered'`|`'json'`|`'anthropic_tags'`.\n- `research` \u2192 `brief` with `topics`, `highlights`, `key_findings`, `sections` when sampling unavailable. Use `sections.overview.cross_references` for corroborated findings, `sections.gaps` for coverage limits, `sections.comparison` for entity-vs-entity analysis. `query_type` indicates decomposition strategy used.\n- `find_similar` \u2192 `cold_start` string when local signals are weak. Pass to user verbatim.\n- `extract` `mode: \"structured\"` \u2192 tables + definitions + jsonld + chart_hints + key_value_pairs in one call.\n- `fetch` metadata \u2192 `og_type`, `canonical_url`, `og_image` when present.\n\n## When to use which tool\n\n- `search` -- you need information on a topic but do not have a URL yet. 
Pass a query string or an array of 3-5 semantically varied keyword forms for broader coverage.\n- `fetch` -- you already have a specific URL to read.\n- `crawl` -- you need multiple pages from the same site (docs, wikis, references).\n- `cache` -- you want to know if the content is already on disk from an earlier read.\n- `extract` -- you need specific data points (tables, metadata, schema-shaped fields) rather than a whole page as markdown.\n- `find_similar` -- you have a URL or concept and want related content from the cache or web. Useful for \"more like this\" discovery.\n- `research` -- you have a complex question that needs multi-step investigation: question decomposition, parallel search, source synthesis into a report. Set `depth` to control thoroughness.\n- `agent` -- you need to gather structured or unstructured data from multiple sources based on a natural-language prompt. Provides full step transparency.\n\n## Routing by intent\n\n| Intent | Tool | Key parameters |\n|--------|------|----------------|\n| Documentation lookup | `search` | `include_domains: [\"react.dev\", \"nextjs.org\"]` -- scope to the project's official site, do not rely on `category: \"docs\"` alone |\n| Error debugging | `search` | exact error string as query, `category: \"code\"` (no domain scoping -- errors appear everywhere) |\n| Library research | `crawl` | seed URL of docs site, `strategy: \"sitemap\"`, then `cache` for later queries |\n| Related content | `find_similar` | `url` of a known good page, or `concept` as free text |\n| Evidence excerpt | `search` | default output; cite [N] or {citation_id} from each evidence item |\n| Direct answer | `search` | `format: \"answer\"` if client supports sampling, else falls back to evidence |\n| Comprehensive research | `research` | `depth: \"comprehensive\"`, optional `include_domains` to scope |\n| Data gathering | `agent` | natural-language `prompt`, optional `schema` for structured output |\n| Structured extraction | `extract` | 
`mode: \"structured\"` (tables + dl + JSON-LD + chart hints + kv pairs), or `mode: \"schema\"` with a JSON Schema |\n| Site inventory | `crawl` | `strategy: \"map\"` for URL-only discovery, no content fetched |\n\n## Rapidly changing content\n\nFor news, prices, status pages, or release notes, bypass the cache with `force_refresh: true`:\n\n search({ query: \"...\", force_refresh: true })\n fetch({ url: \"...\", force_refresh: true })\n\nFor docs, tutorials, and reference pages, let the cache work -- much faster.\n\n## Check the cache before going to the network\n\nBefore every `search` or `fetch`, consider a `cache` call. Pages read this session or earlier return instantly with full markdown -- no network. `research` and `agent` check the cache internally.\n\n## Multi-query search strategy\n\nFor broad queries, pass an array of 3-5 semantically varied keyword forms rather than one natural-language question. Example: instead of \"how does React handle state management\", pass `[\"react state management\", \"useState useReducer\", \"react hooks state\", \"react context vs redux\"]`. Sub-queries are deduplicated automatically.\n\n## Pick the right strategy\n\n- For docs sites, prefer `crawl` with `strategy: \"sitemap\"` -- faster and more complete than BFS.\n- For URL discovery only, use `crawl` with `strategy: \"map\"` -- URLs only, no content. Follow with targeted `fetch` calls.\n- For structured data (prices, specs, table rows), use `extract` with `mode: \"schema\"` or `mode: \"tables\"`. Use `fetch` only when you want the whole page as markdown.\n- For multi-source synthesis, use `research` instead of chaining `search` + `fetch` manually.\n- For natural-language data gathering, use `agent` with optional `schema`.\n- `crawl` accepts regex `include_patterns` and `exclude_patterns` to stay inside a section of a large site.\n\n## Scope searches by domain\n\nFor library/framework/SDK queries, **always pass `include_domains`** with official sites. 
Unscoped queries return generic noise. `category: \"docs\"` alone returns generic portals -- pair with `include_domains` or omit. Skip domain scoping for error strings, broad exploration, and news.\n\n## Performance\n\n- `max_results: 3` for focused lookups; `5` default; `10+` only for broad research.\n- `max_tokens_out` caps total response size (cl100k-base BPE); prefer this over `max_chars` for budget-aware agents. When both are set, `max_tokens_out` wins.\n- `max_content_chars: 3000` remains a legitimate per-page budget \u2014 smart-truncates each result's markdown at a paragraph/heading boundary with a `[... content truncated]` marker.\n- `fetch` with `section: \"Heading Name\"` returns content under that heading -- cheaper than the whole page.\n- Repeated fetches of the same URL are free (local cache).\n- `research` with `depth: \"quick\"` (~15s) suits most factual questions; reserve `\"comprehensive\"` for deep investigation.\n- `agent` respects `max_pages` (default 10) and `max_time_ms` (default 60s).\n\n## Extras\n\n- Localhost URLs (`localhost:3000`, `127.0.0.1:8080`) work for local dev servers.\n- `use_auth: true` on `fetch`/`crawl` reuses browser session for logged-in pages.\n- `cache` supports full-text search syntax (`AND`, `OR`, `NOT`, `\"phrase\"`).\n- `research`/`agent` use MCP sampling when supported; fall back to structured data for host-LLM synthesis.";
|
|
18
18
|
export declare const TOOL_DESCRIPTIONS: {
|
|
19
|
-
readonly fetch: "Fetch a single URL and return clean markdown. Use when you have a specific URL to read. Automatically detects if JavaScript rendering is needed.\n\nKey parameters:\n- section: extract content under a specific heading (e.g., section: \"API Reference\") -- faster than reading the whole page\n- max_content_chars: smart-truncate markdown at a paragraph/heading boundary with a `[... content truncated]` marker (e.g., 3000 for compact context). Preferred over max_chars for AI agents.\n- use_auth: true to use stored browser session for authenticated/private pages\n- render_js: \"auto\" (default, detects JS need), \"always\" (force browser), \"never\" (HTTP only, fastest)\n- headers: custom HTTP headers if needed\n- force_refresh: true to bypass cache and fetch fresh content from the network\n\nReturns title, markdown, links, images, metadata (og_image, og_type, canonical_url, keywords).
|
|
20
|
-
readonly search: "Search the web and return
|
|
21
|
-
readonly crawl: "Crawl a website starting from a URL and return content from multiple pages. Use for indexing documentation sites, wikis, or any multi-page resource.\n\nKey parameters:\n- strategy: \"bfs\" (breadth-first, default), \"dfs\" (depth-first), \"sitemap\" (use sitemap.xml -- fastest for doc sites), \"map\" (URL discovery only, no content -- fastest for scoping a site)\n- max_depth: how many links deep to follow (default 2)\n- max_pages: maximum pages to fetch (default 20)\n- include_patterns/exclude_patterns: regex filters on URLs\n\nReturns an array of pages with title,
|
|
19
|
+
readonly fetch: "Fetch a single URL and return clean markdown. Use when you have a specific URL to read. Automatically detects if JavaScript rendering is needed.\n\nKey parameters:\n- section: extract content under a specific heading (e.g., section: \"API Reference\") -- faster than reading the whole page\n- max_content_chars: smart-truncate markdown at a paragraph/heading boundary with a `[... content truncated]` marker (e.g., 3000 for compact context). Preferred over max_chars for AI agents.\n- max_tokens_out: token-budget cap on total output (cl100k-base BPE). Takes precedence over max_chars when both are set.\n- include_full_markdown: default false. Set true to include the full markdown body in addition to evidence excerpts.\n- citation_format: 'numbered' (default) | 'json' | 'anthropic_tags'.\n- use_auth: true to use stored browser session for authenticated/private pages\n- render_js: \"auto\" (default, detects JS need), \"always\" (force browser), \"never\" (HTTP only, fastest)\n- headers: custom HTTP headers if needed\n- force_refresh: true to bypass cache and fetch fresh content from the network\n- mode: 'fast' | 'balanced' (default) | 'deep'. fast=HTTP-only, accepts cache up to 24h stale. deep=full render + freshness.\n\nReturns title, markdown, links, images, metadata (og_image, og_type, canonical_url, keywords). Cached locally; repeat fetches are instant. Localhost URLs work.";
|
|
20
|
+
readonly search: "Search the web and return scored evidence excerpts (title/url/section_heading/excerpt/score/citation_id/source_span) plus citations. Default shape is evidence-only — no full markdown body.\n\nKey parameters:\n- query: string or string[] array (3-5 keyword variants; deduplicated automatically)\n- include_domains/exclude_domains: scope to specific sites. ALWAYS scope library/framework queries.\n- category: \"general\" | \"news\" | \"code\" | \"docs\" | \"papers\" — coarse filter, pair with include_domains.\n- from_date/to_date: ISO YYYY-MM-DD for time-bounded queries\n- max_results: default 5; use 3 for focused, 10+ for research\n- format: omit for default evidence shape. 'answer'/'stream_answer' = sampling synthesis (falls back to evidence). Retired values 'full'/'context'/'highlights' reject with a migration error.\n- max_tokens_out: token-budget cap on total output (cl100k-base; wins over max_chars).\n- include_full_markdown: true to restore full markdown body alongside evidence (default false).\n- citation_format: 'numbered' (default) | 'json' | 'anthropic_tags'.\n- max_content_chars: smart-truncate per-page markdown at paragraph boundary (e.g., 3000)\n- force_refresh: true to bypass all caches\n- mode: 'fast' | 'balanced' (default) | 'deep'. fast=single-engine, no rerank, 24h-stale cache. deep=multi-query expansion + full-body top-K.\n\nQuote [N] or {citation_id} from the evidence list.";
|
|
21
|
+
readonly crawl: "Crawl a website starting from a URL and return content from multiple pages. Use for indexing documentation sites, wikis, or any multi-page resource.\n\nKey parameters:\n- strategy: \"bfs\" (breadth-first, default), \"dfs\" (depth-first), \"sitemap\" (use sitemap.xml -- fastest for doc sites), \"map\" (URL discovery only, no content -- fastest for scoping a site)\n- max_depth: how many links deep to follow (default 2)\n- max_pages: maximum pages to fetch (default 20)\n- include_patterns/exclude_patterns: regex filters on URLs\n- max_tokens_out: token-budget cap on total output (cl100k-base; wins over max_chars).\n- include_full_markdown: default false — pages return evidence excerpts; set true for full bodies.\n- citation_format: 'numbered' (default) | 'json' | 'anthropic_tags'.\n\nReturns an array of pages with title, evidence, and depth. Content is deduplicated across pages. All pages are cached for later cache queries.";
|
|
22
22
|
readonly cache: "Search previously fetched content without hitting the network. Use before searching the web -- if relevant content was already fetched or crawled, this returns it instantly.\n\nKey parameters:\n- query: full-text search over cached markdown and titles (supports AND, OR, NOT, \"phrase match\")\n- url_pattern: glob filter on URLs (e.g., \"*example.com*\")\n- since: ISO date -- only results cached after this date\n- stats: true to get cache size, entry count, oldest/newest dates\n- clear: true to delete matching entries\n\nReturns matching cached pages with full markdown content. Cache persists across sessions locally.";
|
|
23
23
|
readonly extract: "Extract structured data from a URL or raw HTML. Use when you need specific data points, tables, or metadata rather than full page markdown.\n\nKey parameters:\n- mode: \"selector\" (CSS selector -> text), \"tables\" (HTML tables only), \"metadata\" (title/author/date/description/og_* + JSON-LD), \"schema\" (JSON Schema -> heuristic field extraction), \"structured\" (ONE-SHOT: tables + <dl> definitions + JSON-LD + chart hints from SVG/figure + microdata/data-attr/grid key-value pairs)\n- css_selector: required for mode=\"selector\" -- any valid CSS selector\n- schema: for mode=\"schema\", a JSON Schema object describing the fields to extract\n- multiple: true to return array of all matches (mode=\"selector\" only)\n\nPrefer mode=\"structured\" over chaining multiple extract calls — it returns every structured pattern on the page in one response:\n { tables, definitions, jsonld, chart_hints, key_value_pairs }\n\nchart_hints surfaces SVG titles, aria-labels, and figcaptions — host LLMs use these to describe data visualizations even when the underlying data is rendered by JavaScript.\n\nFor mode=\"tables\", returns array of table objects with headers and row data. For mode=\"schema\", pass { price: \"string\", name: \"string\" } and get structured fields extracted from the page.";
|
|
24
|
-
readonly find_similar: "Find content related to a URL or concept. Use when you have a known-good page or topic and want to discover similar resources from the cache or web.\n\nKey parameters:\n- url: a URL to find content similar to. The page's content and embeddings are used for similarity matching.\n- concept: free-text description of what you want similar content for. Use when you do not have a specific URL.\n- max_results: number of similar items to return (default 5)\n- include_cached: true (default) to search the local cache first, false to skip cache and search the web only\n- threshold: minimum similarity score (0-1, default 0.5)
|
|
25
|
-
readonly research: "Run multi-step research on a complex question. Decomposes the question into sub-queries, searches in parallel, fetches top sources, and synthesizes a report with citations.\n\nKey parameters:\n- question: the research question to investigate\n- depth: \"quick\" (~15s, 2 sub-queries, 5-8 sources), \"standard\" (~40s, 4 sub-queries, 10-15 sources, default), \"comprehensive\" (~80s, 7 sub-queries, 20-25 sources)\n- max_sources: override the default source count for the chosen depth\n- include_domains/exclude_domains: scope research to specific sites\n- schema: optional JSON Schema --
|
|
26
|
-
readonly agent: "Execute a natural-language data gathering task. Plans search queries and URLs from a prompt, executes them in parallel, and synthesizes results. Full step transparency.\n\nKey parameters:\n- prompt: natural-language description of what data to gather (e.g., \"find pricing for the top 5 CRM tools\")\n- urls: optional array of specific URLs to include in the gathering\n- schema: optional JSON Schema -- if provided, extracts structured data matching the schema from each page and merges results\n- max_pages: maximum pages to fetch (default 10)\n- max_time_ms: maximum execution time in milliseconds (default 60000)\n- stream: true to receive progress notifications as each step completes\n
|
|
24
|
+
readonly find_similar: "Find content related to a URL or concept. Use when you have a known-good page or topic and want to discover similar resources from the cache or web.\n\nKey parameters:\n- url: a URL to find content similar to. The page's content and embeddings are used for similarity matching.\n- concept: free-text description of what you want similar content for. Use when you do not have a specific URL.\n- max_results: number of similar items to return (default 5)\n- include_cached: true (default) to search the local cache first, false to skip cache and search the web only\n- threshold: minimum similarity score (0-1, default 0.5)\n- max_tokens_out: token-budget cap on total output (cl100k-base; wins over max_chars).\n- include_full_markdown: default false — results return evidence excerpts; set true for full bodies.\n- citation_format: 'numbered' (default) | 'json' | 'anthropic_tags'.\n\nProvide either url or concept. Results fuse three signals via 3-way RRF: keyword match, semantic embeddings, and (if local hits sparse) live web search. Each result carries `match_signals` with `embedding_rank`, `fts5_rank`, and `fused_score`.\n\nThe response may include a `cold_start` string when local signals are weak. Pass this verbatim to the user.\n\nReturns results array, method used (\"hybrid\" | \"embedding\" | \"fts5\" | \"search\"), cache_hits, search_hits, embedding_available, and total_time_ms.";
|
|
25
|
+
readonly research: "Run multi-step research on a complex question. Decomposes the question into sub-queries, searches in parallel, fetches top sources, and synthesizes a report with citations.\n\nKey parameters:\n- question: the research question to investigate\n- depth: \"quick\" (~15s, 2 sub-queries, 5-8 sources), \"standard\" (~40s, 4 sub-queries, 10-15 sources, default), \"comprehensive\" (~80s, 7 sub-queries, 20-25 sources)\n- max_sources: override the default source count for the chosen depth\n- include_domains/exclude_domains: scope research to specific sites\n- schema: optional JSON Schema -- structures the report to extract matching fields\n- stream: true to receive progress notifications as each phase completes\n- max_tokens_out: token-budget cap on total output (cl100k-base; wins over max_chars).\n- include_full_markdown: default false — sources return evidence excerpts; set true for full bodies.\n- citation_format: 'numbered' (default) | 'json' | 'anthropic_tags'.\n\nReturns report (markdown with [N] citations), citations array, sources, sub_queries, depth, total_time_ms, sampling_supported, and brief (topics, highlights, key_findings, sections.overview/comparison/gaps).";
|
|
26
|
+
readonly agent: "Execute a natural-language data gathering task. Plans search queries and URLs from a prompt, executes them in parallel, and synthesizes results. Full step transparency.\n\nKey parameters:\n- prompt: natural-language description of what data to gather (e.g., \"find pricing for the top 5 CRM tools\")\n- urls: optional array of specific URLs to include in the gathering\n- schema: optional JSON Schema -- if provided, extracts structured data matching the schema from each page and merges results\n- max_pages: maximum pages to fetch (default 10)\n- max_time_ms: maximum execution time in milliseconds (default 60000)\n- stream: true to receive progress notifications as each step completes\n- max_tokens_out: token-budget cap on total output (cl100k-base; wins over max_chars).\n- include_full_markdown: default false — pages return evidence excerpts; set true for full bodies.\n- citation_format: 'numbered' (default) | 'json' | 'anthropic_tags'.\n\nPipeline: (1) plan, (2) execute search+fetch in parallel within budget, (3) optional schema extraction, (4) synthesize. The steps array exposes every action with timing.\n\nUses MCP requestSampling for planning and synthesis. Without sampling support, uses keyword extraction.\n\nReturns result, sources array, pages_fetched count, steps array, total_time_ms, sampling_supported.";
|
|
27
27
|
};
|
|
28
28
|
export type ToolName = keyof typeof TOOL_DESCRIPTIONS;
|
|
29
29
|
//# sourceMappingURL=instructions.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"instructions.d.ts","sourceRoot":"","sources":["../src/instructions.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;GAeG;AAEH,eAAO,MAAM,mBAAmB,
|
|
1
|
+
{"version":3,"file":"instructions.d.ts","sourceRoot":"","sources":["../src/instructions.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;GAeG;AAEH,eAAO,MAAM,mBAAmB,01NAqF+E,CAAC;AAEhH,eAAO,MAAM,iBAAiB;;;;;;;;;CA6HpB,CAAC;AAEX,MAAM,MAAM,QAAQ,GAAG,MAAM,OAAO,iBAAiB,CAAC"}
|
package/dist/instructions.js
CHANGED
|
@@ -20,7 +20,9 @@ export const WIGOLO_INSTRUCTIONS = `Wigolo is a local-first web access layer: se
|
|
|
20
20
|
|
|
21
21
|
Wigolo has no internal LLM. It returns *structured evidence* so YOU (the host LLM) write the final answer. Fold structure into your reply:
|
|
22
22
|
|
|
23
|
-
- \`search\`
|
|
23
|
+
- \`search\` → evidence (title/url/section_heading/excerpt/score/citation_id/source_span) + citations. Quote [N] or {citation_id}.
|
|
24
|
+
- \`format: 'answer'|'stream_answer'\` → LLM synthesis when sampling supported; else evidence fallback.
|
|
25
|
+
- \`max_tokens_out\` caps total output (cl100k-base, ~5-15% drift on non-OpenAI). \`include_full_markdown: true\` restores full body. \`citation_format\`: \`'numbered'\`|\`'json'\`|\`'anthropic_tags'\`.
|
|
24
26
|
- \`research\` → \`brief\` with \`topics\`, \`highlights\`, \`key_findings\`, \`sections\` when sampling unavailable. Use \`sections.overview.cross_references\` for corroborated findings, \`sections.gaps\` for coverage limits, \`sections.comparison\` for entity-vs-entity analysis. \`query_type\` indicates decomposition strategy used.
|
|
25
27
|
- \`find_similar\` → \`cold_start\` string when local signals are weak. Pass to user verbatim.
|
|
26
28
|
- \`extract\` \`mode: "structured"\` → tables + definitions + jsonld + chart_hints + key_value_pairs in one call.
|
|
@@ -45,8 +47,8 @@ Wigolo has no internal LLM. It returns *structured evidence* so YOU (the host LL
|
|
|
45
47
|
| Error debugging | \`search\` | exact error string as query, \`category: "code"\` (no domain scoping -- errors appear everywhere) |
|
|
46
48
|
| Library research | \`crawl\` | seed URL of docs site, \`strategy: "sitemap"\`, then \`cache\` for later queries |
|
|
47
49
|
| Related content | \`find_similar\` | \`url\` of a known good page, or \`concept\` as free text |
|
|
48
|
-
|
|
|
49
|
-
| Direct answer | \`search\` | \`format: "answer"\` if client supports sampling, else falls back to
|
|
50
|
+
| Evidence excerpt | \`search\` | default output; cite [N] or {citation_id} from each evidence item |
|
|
51
|
+
| Direct answer | \`search\` | \`format: "answer"\` if client supports sampling, else falls back to evidence |
|
|
50
52
|
| Comprehensive research | \`research\` | \`depth: "comprehensive"\`, optional \`include_domains\` to scope |
|
|
51
53
|
| Data gathering | \`agent\` | natural-language \`prompt\`, optional \`schema\` for structured output |
|
|
52
54
|
| Structured extraction | \`extract\` | \`mode: "structured"\` (tables + dl + JSON-LD + chart hints + kv pairs), or \`mode: "schema"\` with a JSON Schema |
|
|
@@ -85,7 +87,8 @@ For library/framework/SDK queries, **always pass \`include_domains\`** with offi
|
|
|
85
87
|
## Performance
|
|
86
88
|
|
|
87
89
|
- \`max_results: 3\` for focused lookups; \`5\` default; \`10+\` only for broad research.
|
|
88
|
-
- \`
|
|
90
|
+
- \`max_tokens_out\` caps total response size (cl100k-base BPE); prefer this over \`max_chars\` for budget-aware agents. When both are set, \`max_tokens_out\` wins.
|
|
91
|
+
- \`max_content_chars: 3000\` remains a legitimate per-page budget — smart-truncates each result's markdown at a paragraph/heading boundary with a \`[... content truncated]\` marker.
|
|
89
92
|
- \`fetch\` with \`section: "Heading Name"\` returns content under that heading -- cheaper than the whole page.
|
|
90
93
|
- Repeated fetches of the same URL are free (local cache).
|
|
91
94
|
- \`research\` with \`depth: "quick"\` (~15s) suits most factual questions; reserve \`"comprehensive"\` for deep investigation.
|
|
@@ -103,28 +106,33 @@ export const TOOL_DESCRIPTIONS = {
|
|
|
103
106
|
Key parameters:
|
|
104
107
|
- section: extract content under a specific heading (e.g., section: "API Reference") -- faster than reading the whole page
|
|
105
108
|
- max_content_chars: smart-truncate markdown at a paragraph/heading boundary with a \`[... content truncated]\` marker (e.g., 3000 for compact context). Preferred over max_chars for AI agents.
|
|
109
|
+
- max_tokens_out: token-budget cap on total output (cl100k-base BPE). Takes precedence over max_chars when both are set.
|
|
110
|
+
- include_full_markdown: default false. Set true to include the full markdown body in addition to evidence excerpts.
|
|
111
|
+
- citation_format: 'numbered' (default) | 'json' | 'anthropic_tags'.
|
|
106
112
|
- use_auth: true to use stored browser session for authenticated/private pages
|
|
107
113
|
- render_js: "auto" (default, detects JS need), "always" (force browser), "never" (HTTP only, fastest)
|
|
108
114
|
- headers: custom HTTP headers if needed
|
|
109
115
|
- force_refresh: true to bypass cache and fetch fresh content from the network
|
|
116
|
+
- mode: 'fast' | 'balanced' (default) | 'deep'. fast=HTTP-only, accepts cache up to 24h stale. deep=full render + freshness.
|
|
110
117
|
|
|
111
|
-
Returns title, markdown, links, images, metadata (og_image, og_type, canonical_url, keywords).
|
|
112
|
-
|
|
113
|
-
Use force_refresh: true for frequently changing content. Default serves from cache.`,
|
|
114
|
-
search: `Search the web and return full markdown content from top results. Returns extracted page content, not just snippets.
|
|
118
|
+
Returns title, markdown, links, images, metadata (og_image, og_type, canonical_url, keywords). Cached locally; repeat fetches are instant. Localhost URLs work.`,
|
|
119
|
+
search: `Search the web and return scored evidence excerpts (title/url/section_heading/excerpt/score/citation_id/source_span) plus citations. Default shape is evidence-only — no full markdown body.
|
|
115
120
|
|
|
116
121
|
Key parameters:
|
|
117
|
-
- query: string or string[] array (3-5 keyword variants
|
|
122
|
+
- query: string or string[] array (3-5 keyword variants; deduplicated automatically)
|
|
118
123
|
- include_domains/exclude_domains: scope to specific sites. ALWAYS scope library/framework queries.
|
|
119
124
|
- category: "general" | "news" | "code" | "docs" | "papers" — coarse filter, pair with include_domains.
|
|
120
125
|
- from_date/to_date: ISO YYYY-MM-DD for time-bounded queries
|
|
121
126
|
- max_results: default 5; use 3 for focused, 10+ for research
|
|
122
|
-
- format:
|
|
123
|
-
-
|
|
124
|
-
-
|
|
127
|
+
- format: omit for default evidence shape. 'answer'/'stream_answer' = sampling synthesis (falls back to evidence). Retired values 'full'/'context'/'highlights' reject with a migration error.
|
|
128
|
+
- max_tokens_out: token-budget cap on total output (cl100k-base; wins over max_chars).
|
|
129
|
+
- include_full_markdown: true to restore full markdown body alongside evidence (default false).
|
|
130
|
+
- citation_format: 'numbered' (default) | 'json' | 'anthropic_tags'.
|
|
131
|
+
- max_content_chars: smart-truncate per-page markdown at paragraph boundary (e.g., 3000)
|
|
125
132
|
- force_refresh: true to bypass all caches
|
|
133
|
+
- mode: 'fast' | 'balanced' (default) | 'deep'. fast=single-engine, no rerank, 24h-stale cache. deep=multi-query expansion + full-body top-K.
|
|
126
134
|
|
|
127
|
-
|
|
135
|
+
Quote [N] or {citation_id} from the evidence list.`,
|
|
128
136
|
crawl: `Crawl a website starting from a URL and return content from multiple pages. Use for indexing documentation sites, wikis, or any multi-page resource.
|
|
129
137
|
|
|
130
138
|
Key parameters:
|
|
@@ -132,8 +140,11 @@ Key parameters:
|
|
|
132
140
|
- max_depth: how many links deep to follow (default 2)
|
|
133
141
|
- max_pages: maximum pages to fetch (default 20)
|
|
134
142
|
- include_patterns/exclude_patterns: regex filters on URLs
|
|
143
|
+
- max_tokens_out: token-budget cap on total output (cl100k-base; wins over max_chars).
|
|
144
|
+
- include_full_markdown: default false — pages return evidence excerpts; set true for full bodies.
|
|
145
|
+
- citation_format: 'numbered' (default) | 'json' | 'anthropic_tags'.
|
|
135
146
|
|
|
136
|
-
Returns an array of pages with title,
|
|
147
|
+
Returns an array of pages with title, evidence, and depth. Content is deduplicated across pages. All pages are cached for later cache queries.`,
|
|
137
148
|
cache: `Search previously fetched content without hitting the network. Use before searching the web -- if relevant content was already fetched or crawled, this returns it instantly.
|
|
138
149
|
|
|
139
150
|
Key parameters:
|
|
@@ -165,11 +176,14 @@ Key parameters:
|
|
|
165
176
|
- concept: free-text description of what you want similar content for. Use when you do not have a specific URL.
|
|
166
177
|
- max_results: number of similar items to return (default 5)
|
|
167
178
|
- include_cached: true (default) to search the local cache first, false to skip cache and search the web only
|
|
168
|
-
- threshold: minimum similarity score (0-1, default 0.5)
|
|
179
|
+
- threshold: minimum similarity score (0-1, default 0.5)
|
|
180
|
+
- max_tokens_out: token-budget cap on total output (cl100k-base; wins over max_chars).
|
|
181
|
+
- include_full_markdown: default false — results return evidence excerpts; set true for full bodies.
|
|
182
|
+
- citation_format: 'numbered' (default) | 'json' | 'anthropic_tags'.
|
|
169
183
|
|
|
170
|
-
Provide either url or concept
|
|
184
|
+
Provide either url or concept. Results fuse three signals via 3-way RRF: keyword match, semantic embeddings, and (if local hits sparse) live web search. Each result carries \`match_signals\` with \`embedding_rank\`, \`fts5_rank\`, and \`fused_score\`.
|
|
171
185
|
|
|
172
|
-
The response may include a \`cold_start\` string when local signals are weak
|
|
186
|
+
The response may include a \`cold_start\` string when local signals are weak. Pass this verbatim to the user.
|
|
173
187
|
|
|
174
188
|
Returns results array, method used ("hybrid" | "embedding" | "fts5" | "search"), cache_hits, search_hits, embedding_available, and total_time_ms.`,
|
|
175
189
|
research: `Run multi-step research on a complex question. Decomposes the question into sub-queries, searches in parallel, fetches top sources, and synthesizes a report with citations.
|
|
@@ -179,21 +193,13 @@ Key parameters:
|
|
|
179
193
|
- depth: "quick" (~15s, 2 sub-queries, 5-8 sources), "standard" (~40s, 4 sub-queries, 10-15 sources, default), "comprehensive" (~80s, 7 sub-queries, 20-25 sources)
|
|
180
194
|
- max_sources: override the default source count for the chosen depth
|
|
181
195
|
- include_domains/exclude_domains: scope research to specific sites
|
|
182
|
-
- schema: optional JSON Schema --
|
|
183
|
-
- stream: true to receive progress notifications as each
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
Uses MCP requestSampling for intelligent decomposition and synthesis when available. Without sampling support (the common case), the output includes a \`brief\` with:
|
|
188
|
-
- \`topics\`, \`highlights\` (ML-scored), \`key_findings\` (per-source, by relevance)
|
|
189
|
-
- \`query_type\`: "comparison" | "how-to" | "concept" | "general"
|
|
190
|
-
- \`sections.overview\`: top findings + cross_references (corroborated by 2+ sources)
|
|
191
|
-
- \`sections.comparison\`: entities + comparison_points (comparison queries only)
|
|
192
|
-
- \`sections.gaps\`: sub-queries with limited source coverage
|
|
196
|
+
- schema: optional JSON Schema -- structures the report to extract matching fields
|
|
197
|
+
- stream: true to receive progress notifications as each phase completes
|
|
198
|
+
- max_tokens_out: token-budget cap on total output (cl100k-base; wins over max_chars).
|
|
199
|
+
- include_full_markdown: default false — sources return evidence excerpts; set true for full bodies.
|
|
200
|
+
- citation_format: 'numbered' (default) | 'json' | 'anthropic_tags'.
|
|
193
201
|
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
Returns report (markdown), citations array, sources with full content, sub_queries used, depth level, total_time_ms, sampling_supported flag, and optional brief.`,
|
|
202
|
+
Returns report (markdown with [N] citations), citations array, sources, sub_queries, depth, total_time_ms, sampling_supported, and brief (topics, highlights, key_findings, sections.overview/comparison/gaps).`,
|
|
197
203
|
agent: `Execute a natural-language data gathering task. Plans search queries and URLs from a prompt, executes them in parallel, and synthesizes results. Full step transparency.
|
|
198
204
|
|
|
199
205
|
Key parameters:
|
|
@@ -203,13 +209,14 @@ Key parameters:
|
|
|
203
209
|
- max_pages: maximum pages to fetch (default 10)
|
|
204
210
|
- max_time_ms: maximum execution time in milliseconds (default 60000)
|
|
205
211
|
- stream: true to receive progress notifications as each step completes
|
|
212
|
+
- max_tokens_out: token-budget cap on total output (cl100k-base; wins over max_chars).
|
|
213
|
+
- include_full_markdown: default false — pages return evidence excerpts; set true for full bodies.
|
|
214
|
+
- citation_format: 'numbered' (default) | 'json' | 'anthropic_tags'.
|
|
206
215
|
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
The steps array in the output provides full transparency into every action taken (plan, search, fetch, extract, synthesize) with timing. This differentiates from black-box alternatives.
|
|
216
|
+
Pipeline: (1) plan, (2) execute search+fetch in parallel within budget, (3) optional schema extraction, (4) synthesize. The steps array exposes every action with timing.
|
|
210
217
|
|
|
211
|
-
Uses MCP requestSampling for planning and synthesis. Without sampling support, uses keyword extraction
|
|
218
|
+
Uses MCP requestSampling for planning and synthesis. Without sampling support, uses keyword extraction.
|
|
212
219
|
|
|
213
|
-
Returns result
|
|
220
|
+
Returns result, sources array, pages_fetched count, steps array, total_time_ms, sampling_supported.`,
|
|
214
221
|
};
|
|
215
222
|
//# sourceMappingURL=instructions.js.map
|
package/dist/instructions.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"instructions.js","sourceRoot":"","sources":["../src/instructions.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;GAeG;AAEH,MAAM,CAAC,MAAM,mBAAmB,GAAG
|
|
1
|
+
{"version":3,"file":"instructions.js","sourceRoot":"","sources":["../src/instructions.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;GAeG;AAEH,MAAM,CAAC,MAAM,mBAAmB,GAAG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;+GAqF4E,CAAC;AAEhH,MAAM,CAAC,MAAM,iBAAiB,GAAG;IAC/B,KAAK,EAAE;;;;;;;;;;;;;;gKAcuJ;IAE9J,MAAM,EAAE;;;;;;;;;;;;;;;;mDAgByC;IAEjD,KAAK,EAAE;;;;;;;;;;;+IAWsI;IAE7I,KAAK,EAAE;;;;;;;;;kGASyF;IAEhG,OAAO,EAAE;;;;;;;;;;;;;4LAaiL;IAE1L,YAAY,EAAE;;;;;;;;;;;;;;;;kJAgBkI;IAEhJ,QAAQ,EAAE;;;;;;;;;;;;;gNAaoM;IAE9M,KAAK,EAAE;;;;;;;;;;;;;;;;;oGAiB2F;CAC1F,CAAC"}
|
package/dist/logger.d.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
type Module = 'fetch' | 'search' | 'crawl' | 'cache' | 'extract' | 'searxng' | 'server' | 'cli' | 'jsonld' | 'repl' | 'embedding' | 'research' | 'agent';
|
|
1
|
+
type Module = 'fetch' | 'search' | 'crawl' | 'cache' | 'extract' | 'searxng' | 'server' | 'cli' | 'jsonld' | 'repl' | 'embedding' | 'research' | 'agent' | 'structured-data' | 'reranker';
|
|
2
2
|
export interface Logger {
|
|
3
3
|
debug(msg: string, data?: Record<string, unknown>): void;
|
|
4
4
|
info(msg: string, data?: Record<string, unknown>): void;
|
package/dist/logger.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"logger.d.ts","sourceRoot":"","sources":["../src/logger.ts"],"names":[],"mappings":"AAKA,KAAK,MAAM,GAAG,OAAO,GAAG,QAAQ,GAAG,OAAO,GAAG,OAAO,GAAG,SAAS,GAAG,SAAS,GAAG,QAAQ,GAAG,KAAK,GAAG,QAAQ,GAAG,MAAM,GAAG,WAAW,GAAG,UAAU,GAAG,OAAO,CAAC;
|
|
1
|
+
{"version":3,"file":"logger.d.ts","sourceRoot":"","sources":["../src/logger.ts"],"names":[],"mappings":"AAKA,KAAK,MAAM,GAAG,OAAO,GAAG,QAAQ,GAAG,OAAO,GAAG,OAAO,GAAG,SAAS,GAAG,SAAS,GAAG,QAAQ,GAAG,KAAK,GAAG,QAAQ,GAAG,MAAM,GAAG,WAAW,GAAG,UAAU,GAAG,OAAO,GAAG,iBAAiB,GAAG,UAAU,CAAC;AAS1L,MAAM,WAAW,MAAM;IACrB,KAAK,CAAC,GAAG,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,IAAI,CAAC;IACzD,IAAI,CAAC,GAAG,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,IAAI,CAAC;IACxD,IAAI,CAAC,GAAG,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,IAAI,CAAC;IACxD,KAAK,CAAC,GAAG,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,IAAI,CAAC;CAC1D;AA+CD,wBAAgB,YAAY,CAAC,MAAM,EAAE,MAAM,GAAG,MAAM,CAiBnD"}
|
package/dist/research/brief.js
CHANGED
|
@@ -9,7 +9,7 @@ const MIN_PHRASE_LEN = 4;
|
|
|
9
9
|
// shape to produce the final report without needing to re-read raw sources.
|
|
10
10
|
export async function buildResearchBrief(question, sources, subQueries, perSourceCharCap, totalSourcesCharCap, queryType = 'general', comparisonEntities = []) {
|
|
11
11
|
const fetched = sources.filter((s) => s.fetched && s.markdown_content.length > 0);
|
|
12
|
-
// Highlights reuse the
|
|
12
|
+
// Highlights reuse the ONNX-reranker-or-paragraph scorer so briefs align with
|
|
13
13
|
// whatever format='highlights' produces for single-query searches.
|
|
14
14
|
const searchItems = fetched.map((s) => ({
|
|
15
15
|
title: s.title,
|