@staticn0va/wigolo 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +74 -0
- package/README.md +272 -0
- package/dist/cache/db.d.ts +5 -0
- package/dist/cache/db.d.ts.map +1 -0
- package/dist/cache/db.js +97 -0
- package/dist/cache/db.js.map +1 -0
- package/dist/cache/store.d.ts +26 -0
- package/dist/cache/store.d.ts.map +1 -0
- package/dist/cache/store.js +214 -0
- package/dist/cache/store.js.map +1 -0
- package/dist/cli/daemon.d.ts +2 -0
- package/dist/cli/daemon.d.ts.map +1 -0
- package/dist/cli/daemon.js +5 -0
- package/dist/cli/daemon.js.map +1 -0
- package/dist/cli/health.d.ts +2 -0
- package/dist/cli/health.d.ts.map +1 -0
- package/dist/cli/health.js +5 -0
- package/dist/cli/health.js.map +1 -0
- package/dist/cli/index.d.ts +7 -0
- package/dist/cli/index.d.ts.map +1 -0
- package/dist/cli/index.js +9 -0
- package/dist/cli/index.js.map +1 -0
- package/dist/cli/warmup.d.ts +11 -0
- package/dist/cli/warmup.d.ts.map +1 -0
- package/dist/cli/warmup.js +107 -0
- package/dist/cli/warmup.js.map +1 -0
- package/dist/config.d.ts +41 -0
- package/dist/config.d.ts.map +1 -0
- package/dist/config.js +66 -0
- package/dist/config.js.map +1 -0
- package/dist/crawl/crawler.d.ts +18 -0
- package/dist/crawl/crawler.d.ts.map +1 -0
- package/dist/crawl/crawler.js +228 -0
- package/dist/crawl/crawler.js.map +1 -0
- package/dist/crawl/dedup.d.ts +15 -0
- package/dist/crawl/dedup.d.ts.map +1 -0
- package/dist/crawl/dedup.js +93 -0
- package/dist/crawl/dedup.js.map +1 -0
- package/dist/crawl/mapper.d.ts +17 -0
- package/dist/crawl/mapper.d.ts.map +1 -0
- package/dist/crawl/mapper.js +178 -0
- package/dist/crawl/mapper.js.map +1 -0
- package/dist/crawl/rate-limiter.d.ts +10 -0
- package/dist/crawl/rate-limiter.d.ts.map +1 -0
- package/dist/crawl/rate-limiter.js +72 -0
- package/dist/crawl/rate-limiter.js.map +1 -0
- package/dist/crawl/robots.d.ts +9 -0
- package/dist/crawl/robots.d.ts.map +1 -0
- package/dist/crawl/robots.js +63 -0
- package/dist/crawl/robots.js.map +1 -0
- package/dist/crawl/sitemap.d.ts +4 -0
- package/dist/crawl/sitemap.d.ts.map +1 -0
- package/dist/crawl/sitemap.js +38 -0
- package/dist/crawl/sitemap.js.map +1 -0
- package/dist/crawl/url-utils.d.ts +3 -0
- package/dist/crawl/url-utils.d.ts.map +1 -0
- package/dist/crawl/url-utils.js +41 -0
- package/dist/crawl/url-utils.js.map +1 -0
- package/dist/extraction/defuddle.d.ts +3 -0
- package/dist/extraction/defuddle.d.ts.map +1 -0
- package/dist/extraction/defuddle.js +26 -0
- package/dist/extraction/defuddle.js.map +1 -0
- package/dist/extraction/extract.d.ts +5 -0
- package/dist/extraction/extract.d.ts.map +1 -0
- package/dist/extraction/extract.js +83 -0
- package/dist/extraction/extract.js.map +1 -0
- package/dist/extraction/jsonld.d.ts +4 -0
- package/dist/extraction/jsonld.d.ts.map +1 -0
- package/dist/extraction/jsonld.js +64 -0
- package/dist/extraction/jsonld.js.map +1 -0
- package/dist/extraction/markdown.d.ts +10 -0
- package/dist/extraction/markdown.d.ts.map +1 -0
- package/dist/extraction/markdown.js +107 -0
- package/dist/extraction/markdown.js.map +1 -0
- package/dist/extraction/pipeline.d.ts +11 -0
- package/dist/extraction/pipeline.d.ts.map +1 -0
- package/dist/extraction/pipeline.js +95 -0
- package/dist/extraction/pipeline.js.map +1 -0
- package/dist/extraction/readability.d.ts +3 -0
- package/dist/extraction/readability.d.ts.map +1 -0
- package/dist/extraction/readability.js +32 -0
- package/dist/extraction/readability.js.map +1 -0
- package/dist/extraction/schema.d.ts +7 -0
- package/dist/extraction/schema.d.ts.map +1 -0
- package/dist/extraction/schema.js +86 -0
- package/dist/extraction/schema.js.map +1 -0
- package/dist/extraction/site-extractors/docs-generic.d.ts +3 -0
- package/dist/extraction/site-extractors/docs-generic.d.ts.map +1 -0
- package/dist/extraction/site-extractors/docs-generic.js +104 -0
- package/dist/extraction/site-extractors/docs-generic.js.map +1 -0
- package/dist/extraction/site-extractors/github.d.ts +3 -0
- package/dist/extraction/site-extractors/github.d.ts.map +1 -0
- package/dist/extraction/site-extractors/github.js +107 -0
- package/dist/extraction/site-extractors/github.js.map +1 -0
- package/dist/extraction/site-extractors/mdn.d.ts +3 -0
- package/dist/extraction/site-extractors/mdn.d.ts.map +1 -0
- package/dist/extraction/site-extractors/mdn.js +58 -0
- package/dist/extraction/site-extractors/mdn.js.map +1 -0
- package/dist/extraction/site-extractors/stackoverflow.d.ts +3 -0
- package/dist/extraction/site-extractors/stackoverflow.d.ts.map +1 -0
- package/dist/extraction/site-extractors/stackoverflow.js +88 -0
- package/dist/extraction/site-extractors/stackoverflow.js.map +1 -0
- package/dist/extraction/trafilatura.d.ts +6 -0
- package/dist/extraction/trafilatura.d.ts.map +1 -0
- package/dist/extraction/trafilatura.js +105 -0
- package/dist/extraction/trafilatura.js.map +1 -0
- package/dist/fetch/auth.d.ts +8 -0
- package/dist/fetch/auth.d.ts.map +1 -0
- package/dist/fetch/auth.js +32 -0
- package/dist/fetch/auth.js.map +1 -0
- package/dist/fetch/browser-pool.d.ts +28 -0
- package/dist/fetch/browser-pool.d.ts.map +1 -0
- package/dist/fetch/browser-pool.js +138 -0
- package/dist/fetch/browser-pool.js.map +1 -0
- package/dist/fetch/content-check.d.ts +2 -0
- package/dist/fetch/content-check.d.ts.map +1 -0
- package/dist/fetch/content-check.js +62 -0
- package/dist/fetch/content-check.js.map +1 -0
- package/dist/fetch/http-client.d.ts +15 -0
- package/dist/fetch/http-client.d.ts.map +1 -0
- package/dist/fetch/http-client.js +146 -0
- package/dist/fetch/http-client.js.map +1 -0
- package/dist/fetch/router.d.ts +45 -0
- package/dist/fetch/router.d.ts.map +1 -0
- package/dist/fetch/router.js +89 -0
- package/dist/fetch/router.js.map +1 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +22 -0
- package/dist/index.js.map +1 -0
- package/dist/logger.d.ts +10 -0
- package/dist/logger.d.ts.map +1 -0
- package/dist/logger.js +39 -0
- package/dist/logger.js.map +1 -0
- package/dist/search/dedup.d.ts +10 -0
- package/dist/search/dedup.d.ts.map +1 -0
- package/dist/search/dedup.js +35 -0
- package/dist/search/dedup.js.map +1 -0
- package/dist/search/engines/bing.d.ts +7 -0
- package/dist/search/engines/bing.d.ts.map +1 -0
- package/dist/search/engines/bing.js +48 -0
- package/dist/search/engines/bing.js.map +1 -0
- package/dist/search/engines/duckduckgo.d.ts +7 -0
- package/dist/search/engines/duckduckgo.d.ts.map +1 -0
- package/dist/search/engines/duckduckgo.js +50 -0
- package/dist/search/engines/duckduckgo.js.map +1 -0
- package/dist/search/engines/startpage.d.ts +7 -0
- package/dist/search/engines/startpage.d.ts.map +1 -0
- package/dist/search/engines/startpage.js +50 -0
- package/dist/search/engines/startpage.js.map +1 -0
- package/dist/search/filters.d.ts +16 -0
- package/dist/search/filters.d.ts.map +1 -0
- package/dist/search/filters.js +63 -0
- package/dist/search/filters.js.map +1 -0
- package/dist/search/flashrank.d.ts +12 -0
- package/dist/search/flashrank.d.ts.map +1 -0
- package/dist/search/flashrank.js +63 -0
- package/dist/search/flashrank.js.map +1 -0
- package/dist/search/query.d.ts +2 -0
- package/dist/search/query.d.ts.map +1 -0
- package/dist/search/query.js +41 -0
- package/dist/search/query.js.map +1 -0
- package/dist/search/rerank.d.ts +3 -0
- package/dist/search/rerank.d.ts.map +1 -0
- package/dist/search/rerank.js +40 -0
- package/dist/search/rerank.js.map +1 -0
- package/dist/search/searxng.d.ts +8 -0
- package/dist/search/searxng.d.ts.map +1 -0
- package/dist/search/searxng.js +87 -0
- package/dist/search/searxng.js.map +1 -0
- package/dist/search/validator.d.ts +6 -0
- package/dist/search/validator.d.ts.map +1 -0
- package/dist/search/validator.js +35 -0
- package/dist/search/validator.js.map +1 -0
- package/dist/searxng/bootstrap.d.ts +18 -0
- package/dist/searxng/bootstrap.d.ts.map +1 -0
- package/dist/searxng/bootstrap.js +136 -0
- package/dist/searxng/bootstrap.js.map +1 -0
- package/dist/searxng/docker.d.ts +9 -0
- package/dist/searxng/docker.d.ts.map +1 -0
- package/dist/searxng/docker.js +67 -0
- package/dist/searxng/docker.js.map +1 -0
- package/dist/searxng/process.d.ts +23 -0
- package/dist/searxng/process.d.ts.map +1 -0
- package/dist/searxng/process.js +188 -0
- package/dist/searxng/process.js.map +1 -0
- package/dist/server.d.ts +2 -0
- package/dist/server.d.ts.map +1 -0
- package/dist/server.js +311 -0
- package/dist/server.js.map +1 -0
- package/dist/tools/cache.d.ts +3 -0
- package/dist/tools/cache.d.ts.map +1 -0
- package/dist/tools/cache.js +50 -0
- package/dist/tools/cache.js.map +1 -0
- package/dist/tools/crawl.d.ts +6 -0
- package/dist/tools/crawl.d.ts.map +1 -0
- package/dist/tools/crawl.js +97 -0
- package/dist/tools/crawl.js.map +1 -0
- package/dist/tools/extract.d.ts +4 -0
- package/dist/tools/extract.d.ts.map +1 -0
- package/dist/tools/extract.js +69 -0
- package/dist/tools/extract.js.map +1 -0
- package/dist/tools/fetch.d.ts +4 -0
- package/dist/tools/fetch.d.ts.map +1 -0
- package/dist/tools/fetch.js +76 -0
- package/dist/tools/fetch.js.map +1 -0
- package/dist/tools/search.d.ts +4 -0
- package/dist/tools/search.d.ts.map +1 -0
- package/dist/tools/search.js +160 -0
- package/dist/tools/search.js.map +1 -0
- package/dist/types.d.ts +222 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +2 -0
- package/dist/types.js.map +1 -0
- package/package.json +61 -0
package/dist/server.js
ADDED
|
@@ -0,0 +1,311 @@
|
|
|
1
|
+
import { join } from 'node:path';
|
|
2
|
+
import { mkdirSync } from 'node:fs';
|
|
3
|
+
import { Server } from '@modelcontextprotocol/sdk/server/index.js';
|
|
4
|
+
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
|
|
5
|
+
import { ListToolsRequestSchema, CallToolRequestSchema, } from '@modelcontextprotocol/sdk/types.js';
|
|
6
|
+
import { SmartRouter } from './fetch/router.js';
|
|
7
|
+
import { BrowserPool } from './fetch/browser-pool.js';
|
|
8
|
+
import { httpFetch } from './fetch/http-client.js';
|
|
9
|
+
import { initDatabase, closeDatabase } from './cache/db.js';
|
|
10
|
+
import { handleFetch } from './tools/fetch.js';
|
|
11
|
+
import { handleSearch } from './tools/search.js';
|
|
12
|
+
import { handleCrawl } from './tools/crawl.js';
|
|
13
|
+
import { handleCache } from './tools/cache.js';
|
|
14
|
+
import { handleExtract } from './tools/extract.js';
|
|
15
|
+
import { SearxngClient } from './search/searxng.js';
|
|
16
|
+
import { DuckDuckGoEngine } from './search/engines/duckduckgo.js';
|
|
17
|
+
import { BingEngine } from './search/engines/bing.js';
|
|
18
|
+
import { StartpageEngine } from './search/engines/startpage.js';
|
|
19
|
+
import { resolveSearchBackend } from './searxng/bootstrap.js';
|
|
20
|
+
import { SearxngProcess } from './searxng/process.js';
|
|
21
|
+
import { DockerSearxng } from './searxng/docker.js';
|
|
22
|
+
import { getConfig } from './config.js';
|
|
23
|
+
import { createLogger } from './logger.js';
|
|
24
|
+
// Module-level logger for this file, created with the 'server' label.
const log = createLogger('server');
|
|
25
|
+
/**
 * JSON Schema describing the arguments of the `fetch` tool.
 * Advertised to clients as that tool's `inputSchema`; `url` is the only
 * mandatory argument.
 */
const FETCH_TOOL_SCHEMA = {
    type: 'object',
    properties: {
        // Target page.
        url: {
            type: 'string',
            description: 'URL to fetch',
        },
        // How the page is retrieved.
        render_js: { type: 'string', enum: ['auto', 'always', 'never'], description: 'JavaScript rendering mode (default: auto)' },
        use_auth: { type: 'boolean', description: 'Use stored auth credentials (default: false)' },
        // How much of the page is returned.
        max_chars: { type: 'number', description: 'Maximum characters to return' },
        section: { type: 'string', description: 'Extract a specific section by heading text' },
        section_index: { type: 'number', description: 'Index of the section match (default: 0)' },
        // Extras.
        screenshot: { type: 'boolean', description: 'Capture a screenshot (default: false)' },
        headers: { type: 'object', description: 'Additional HTTP headers', additionalProperties: { type: 'string' } },
    },
    required: ['url'],
};
|
|
62
|
+
/**
 * JSON Schema describing the arguments of the `search` tool.
 * Advertised to clients as that tool's `inputSchema`; `query` is the only
 * mandatory argument.
 */
const SEARCH_TOOL_SCHEMA = {
    type: 'object',
    properties: {
        query: {
            type: 'string',
            description: 'Search query',
        },
        // Result sizing.
        max_results: {
            type: 'number',
            description: 'Max results to return (default 5, max 20)',
        },
        include_content: {
            type: 'boolean',
            description: 'Fetch full content for results (default true)',
        },
        content_max_chars: {
            type: 'number',
            description: 'Max chars per result content (default 30000)',
        },
        max_total_chars: {
            type: 'number',
            description: 'Max total chars across all results (default 50000)',
        },
        // Engine-level options.
        time_range: {
            type: 'string',
            enum: ['day', 'week', 'month', 'year'],
            description: 'Time range filter',
        },
        search_engines: {
            type: 'array',
            items: { type: 'string' },
            description: 'Override engine selection',
        },
        language: {
            type: 'string',
            description: 'Language preference',
        },
        // Domain and date filters.
        include_domains: { type: 'array', items: { type: 'string' }, description: 'Only return results from these domains (e.g. ["react.dev", "github.com"])' },
        exclude_domains: { type: 'array', items: { type: 'string' }, description: 'Never return results from these domains' },
        from_date: { type: 'string', description: 'ISO date (YYYY-MM-DD) — only return results published after this date' },
        to_date: { type: 'string', description: 'ISO date (YYYY-MM-DD) — only return results published before this date' },
        category: { type: 'string', enum: ['general', 'news', 'code', 'docs', 'papers', 'images'], description: 'Category of search (general, news, code, docs, papers, images)' },
    },
    required: ['query'],
};
|
|
99
|
+
/**
 * JSON Schema describing the arguments of the `crawl` tool.
 * Advertised to clients as that tool's `inputSchema`; `url` (the seed) is the
 * only mandatory argument.
 */
const CRAWL_TOOL_SCHEMA = {
    type: 'object',
    properties: {
        url: {
            type: 'string',
            description: 'Seed URL to start crawling from',
        },
        // Crawl limits.
        max_depth: {
            type: 'number',
            description: 'Maximum link depth from seed (default: 2)',
        },
        max_pages: {
            type: 'number',
            description: 'Maximum pages to crawl (default: 20)',
        },
        // Traversal strategy.
        strategy: { type: 'string', enum: ['bfs', 'dfs', 'sitemap', 'map'], description: 'Crawl strategy: bfs (breadth-first), dfs (depth-first), sitemap (use sitemap.xml), map (URL-only discovery — returns list of URLs without content, faster than full crawl)' },
        // URL filters, expressed as regular expressions.
        include_patterns: { type: 'array', items: { type: 'string' }, description: 'URL regex whitelist — only crawl matching URLs' },
        exclude_patterns: { type: 'array', items: { type: 'string' }, description: 'URL regex blacklist — skip matching URLs' },
        // Fetch and output options.
        use_auth: {
            type: 'boolean',
            description: 'Use stored auth credentials (default: false)',
        },
        extract_links: {
            type: 'boolean',
            description: 'Return link graph between pages (default: false)',
        },
        max_total_chars: {
            type: 'number',
            description: 'Max total chars across all pages (default: 100000)',
        },
    },
    required: ['url'],
};
|
|
126
|
+
/**
 * JSON Schema describing the arguments of the `cache` tool.
 * Advertised to clients as that tool's `inputSchema`; the schema declares no
 * required arguments.
 */
const CACHE_TOOL_SCHEMA = {
    type: 'object',
    properties: {
        // Filters.
        query: {
            type: 'string',
            description: 'Full-text search over cached content',
        },
        url_pattern: { type: 'string', description: 'Filter by URL glob pattern (e.g., "*example.com*")' },
        since: { type: 'string', description: 'ISO date — only results cached after this date' },
        // Alternative actions.
        clear: { type: 'boolean', description: 'Clear matching cache entries (requires at least one filter: query, url_pattern, or since)' },
        stats: { type: 'boolean', description: 'Return cache statistics (total URLs, size, date range)' },
    },
};
|
|
148
|
+
/**
 * JSON Schema describing the arguments of the `extract` tool.
 * Advertised to clients as that tool's `inputSchema`; the schema declares no
 * required arguments (per-mode requirements are stated in the descriptions).
 */
const EXTRACT_TOOL_SCHEMA = {
    type: 'object',
    properties: {
        // Input document.
        url: {
            type: 'string',
            description: 'URL to fetch and extract from',
        },
        html: {
            type: 'string',
            description: 'Raw HTML to extract from (url takes priority if both provided)',
        },
        // Extraction mode.
        mode: { type: 'string', enum: ['selector', 'tables', 'metadata', 'schema'], description: 'Extraction mode: selector (CSS), tables (HTML tables), metadata (meta tags + JSON-LD), schema (extract fields matching a JSON Schema via heuristic matching)' },
        // Options for mode="selector".
        css_selector: { type: 'string', description: 'CSS selector to match (required when mode="selector")' },
        multiple: { type: 'boolean', description: 'Return array of all matches instead of first (default: false, only for mode="selector")' },
        // Option for mode="schema".
        schema: { type: 'object', description: 'JSON Schema defining fields to extract. Field names are matched against page content via CSS classes, ARIA labels, microdata, and JSON-LD. Required when mode="schema".' },
    },
};
|
|
172
|
+
/**
 * Boots the wigolo MCP server on a stdio transport.
 *
 * In order, this:
 *   1. creates the data directory and opens the cache database inside it;
 *   2. builds the fetch router from the HTTP client and a browser pool;
 *   3. resolves the search backend, starting a native or Docker SearXNG
 *      instance when requested, then appends the direct scraping engines;
 *   4. registers the ListTools and CallTool request handlers;
 *   5. connects the transport and installs SIGINT/SIGTERM handlers that tear
 *      everything down and exit the process.
 *
 * @returns {Promise<void>} Resolves once the transport is connected and the
 *   signal handlers are installed.
 */
export async function startServer() {
    const config = getConfig();
    mkdirSync(config.dataDir, { recursive: true });
    initDatabase(join(config.dataDir, 'wigolo.db'));

    const httpClient = {
        fetch: (url, options) => httpFetch(url, options),
    };
    const browserPool = new BrowserPool();
    const router = new SmartRouter(httpClient, browserPool);

    // --- Search backend initialization ---
    const backend = await resolveSearchBackend();
    const searchEngines = [];
    // Kept in the outer scope so the shutdown routine can stop whichever one was started.
    let searxngProcess = null;
    let dockerSearxng = null;
    if (backend.type === 'external' && backend.url) {
        searchEngines.push(new SearxngClient(backend.url));
    }
    else if (backend.type === 'native' && backend.searxngPath) {
        searxngProcess = new SearxngProcess(backend.searxngPath, config.dataDir);
        const url = await searxngProcess.start();
        if (url) {
            searchEngines.push(new SearxngClient(url));
        }
        else {
            log.warn('SearXNG failed to start, using direct scraping fallback');
        }
    }
    else if (backend.type === 'docker') {
        dockerSearxng = new DockerSearxng();
        const url = await dockerSearxng.start();
        if (url) {
            searchEngines.push(new SearxngClient(url));
        }
        else {
            log.warn('Docker SearXNG failed to start, using direct scraping fallback');
        }
    }
    // The direct engines are always registered, after any SearXNG client.
    searchEngines.push(new BingEngine(), new DuckDuckGoEngine(), new StartpageEngine());

    const server = new Server({ name: 'wigolo', version: '0.1.0' }, { capabilities: { tools: {} } });

    server.setRequestHandler(ListToolsRequestSchema, async () => ({
        tools: [
            {
                name: 'fetch',
                description: 'Fetch a web page and return its content as clean markdown. ' +
                    'Supports JavaScript rendering, auth, section extraction, and caching.',
                inputSchema: FETCH_TOOL_SCHEMA,
            },
            {
                name: 'search',
                description: 'Search the web and return results with optional full content extraction. ' +
                    'One call: query in, clean markdown out.',
                inputSchema: SEARCH_TOOL_SCHEMA,
            },
            {
                name: 'crawl',
                description: 'Crawl a website starting from a seed URL. Supports BFS, DFS, and sitemap strategies ' +
                    'with depth/page limits, URL filtering, and cross-page content deduplication.',
                inputSchema: CRAWL_TOOL_SCHEMA,
            },
            {
                name: 'cache',
                description: 'Query the local knowledge base of previously fetched content. ' +
                    'Search cached pages by full-text query, URL pattern, or date. ' +
                    'Can also return cache statistics or clear entries.',
                inputSchema: CACHE_TOOL_SCHEMA,
            },
            {
                name: 'extract',
                description: 'Extract structured data from a web page. Supports CSS selector extraction, ' +
                    'table-to-JSON conversion, metadata extraction (title, author, date, JSON-LD), ' +
                    'and schema-based extraction (provide a JSON Schema to heuristically extract matching fields).',
                inputSchema: EXTRACT_TOOL_SCHEMA,
            },
        ],
    }));

    // Tool name -> handler. A Map (rather than a plain object) so that names
    // such as "toString" can never resolve to an inherited property.
    const toolHandlers = new Map([
        ['fetch', (input) => handleFetch(input, router)],
        ['search', (input) => handleSearch(input, searchEngines, router)],
        ['crawl', (input) => handleCrawl(input, router)],
        ['cache', (input) => handleCache(input)],
        ['extract', (input) => handleExtract(input, router)],
    ]);

    server.setRequestHandler(CallToolRequestSchema, async (request) => {
        const { name, arguments: args } = request.params;
        const handler = toolHandlers.get(name);
        if (!handler) {
            return {
                content: [{ type: 'text', text: `Unknown tool: ${name}` }],
                isError: true,
            };
        }
        // Missing arguments are passed to the handler as an empty object.
        const result = await handler(args ?? {});
        return {
            content: [{ type: 'text', text: JSON.stringify(result, null, 2) }],
            // Handlers signal failure through an `error` property on their result.
            isError: Boolean(result.error),
        };
    });

    const transport = new StdioServerTransport();
    await server.connect(transport);
    log.info('MCP server started');

    // Guards against a second signal re-running teardown while the first is in flight.
    let shuttingDown = false;
    const shutdown = async () => {
        if (shuttingDown) {
            return;
        }
        shuttingDown = true;
        log.info('Shutting down');

        // Same order as before. Each step is isolated so that one failing
        // step cannot prevent the remaining resources from being released.
        const steps = [
            ['searxng process', () => searxngProcess?.stop()],
            ['docker searxng', () => dockerSearxng?.stop()],
            ['browser pool', () => browserPool.shutdown()],
            ['database', () => closeDatabase()],
            ['server', () => server.close()],
        ];
        let failed = false;
        for (const [step, run] of steps) {
            try {
                await run();
            }
            catch (err) {
                failed = true;
                log.error('Shutdown step failed', { step, error: String(err) });
            }
        }
        // Exit code stays 0 for a clean teardown; non-zero only if a step failed.
        process.exit(failed ? 1 : 0);
    };
    process.on('SIGINT', shutdown);
    process.on('SIGTERM', shutdown);
}
|
|
311
|
+
//# sourceMappingURL=server.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"server.js","sourceRoot":"","sources":["../src/server.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,SAAS,EAAE,MAAM,SAAS,CAAC;AACpC,OAAO,EAAE,MAAM,EAAE,MAAM,2CAA2C,CAAC;AACnE,OAAO,EAAE,oBAAoB,EAAE,MAAM,2CAA2C,CAAC;AACjF,OAAO,EACL,sBAAsB,EACtB,qBAAqB,GACtB,MAAM,oCAAoC,CAAC;AAC5C,OAAO,EAAE,WAAW,EAAmB,MAAM,mBAAmB,CAAC;AACjE,OAAO,EAAE,WAAW,EAAE,MAAM,yBAAyB,CAAC;AACtD,OAAO,EAAE,SAAS,EAAE,MAAM,wBAAwB,CAAC;AACnD,OAAO,EAAE,YAAY,EAAE,aAAa,EAAE,MAAM,eAAe,CAAC;AAC5D,OAAO,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAC;AAC/C,OAAO,EAAE,YAAY,EAAE,MAAM,mBAAmB,CAAC;AACjD,OAAO,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAC;AAC/C,OAAO,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAC;AAC/C,OAAO,EAAE,aAAa,EAAE,MAAM,oBAAoB,CAAC;AACnD,OAAO,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AACpD,OAAO,EAAE,gBAAgB,EAAE,MAAM,gCAAgC,CAAC;AAClE,OAAO,EAAE,UAAU,EAAE,MAAM,0BAA0B,CAAC;AACtD,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,oBAAoB,EAAE,MAAM,wBAAwB,CAAC;AAC9D,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AACpD,OAAO,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AACxC,OAAO,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAG3C,MAAM,GAAG,GAAG,YAAY,CAAC,QAAQ,CAAC,CAAC;AAEnC,MAAM,iBAAiB,GAAG;IACxB,IAAI,EAAE,QAAiB;IACvB,UAAU,EAAE;QACV,GAAG,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,WAAW,EAAE,cAAc,EAAE;QACpD,SAAS,EAAE;YACT,IAAI,EAAE,QAAQ;YACd,IAAI,EAAE,CAAC,MAAM,EAAE,QAAQ,EAAE,OAAO,CAAC;YACjC,WAAW,EAAE,2CAA2C;SACzD;QACD,QAAQ,EAAE;YACR,IAAI,EAAE,SAAS;YACf,WAAW,EAAE,8CAA8C;SAC5D;QACD,SAAS,EAAE;YACT,IAAI,EAAE,QAAQ;YACd,WAAW,EAAE,8BAA8B;SAC5C;QACD,OAAO,EAAE;YACP,IAAI,EAAE,QAAQ;YACd,WAAW,EAAE,4CAA4C;SAC1D;QACD,aAAa,EAAE;YACb,IAAI,EAAE,QAAQ;YACd,WAAW,EAAE,yCAAyC;SACvD;QACD,UAAU,EAAE;YACV,IAAI,EAAE,SAAS;YACf,WAAW,EAAE,uCAAuC;SACrD;QACD,OAAO,EAAE;YACP,IAAI,EAAE,QAAQ;YACd,WAAW,EAAE,yBAAyB;YACtC,oBAAoB,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE;SACzC;KACF;IACD,QAAQ,EAAE,CAAC,KAAK,CAAC;CAClB,CAAC;AAEF,MAAM,kBAAkB,GAAG;IACzB,IAAI,EAAE,QAAiB;IACvB,UAAU,EAAE;QACV,KAAK,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,WAAW,EAAE,cAAc,EAAE;QACtD,WAAW,EAAE,EA
AE,IAAI,EAAE,QAAQ,EAAE,WAAW,EAAE,2CAA2C,EAAE;QACzF,eAAe,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE,WAAW,EAAE,+CAA+C,EAAE;QAClG,iBAAiB,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,WAAW,EAAE,8CAA8C,EAAE;QAClG,eAAe,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,WAAW,EAAE,oDAAoD,EAAE;QACtG,UAAU,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,IAAI,EAAE,CAAC,KAAK,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,CAAC,EAAE,WAAW,EAAE,mBAAmB,EAAE;QACxG,cAAc,EAAE,EAAE,IAAI,EAAE,OAAO,EAAE,KAAK,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,EAAE,WAAW,EAAE,2BAA2B,EAAE;QACtG,QAAQ,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,WAAW,EAAE,qBAAqB,EAAE;QAChE,eAAe,EAAE;YACf,IAAI,EAAE,OAAO;YACb,KAAK,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE;YACzB,WAAW,EAAE,2EAA2E;SACzF;QACD,eAAe,EAAE;YACf,IAAI,EAAE,OAAO;YACb,KAAK,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE;YACzB,WAAW,EAAE,yCAAyC;SACvD;QACD,SAAS,EAAE;YACT,IAAI,EAAE,QAAQ;YACd,WAAW,EAAE,uEAAuE;SACrF;QACD,OAAO,EAAE;YACP,IAAI,EAAE,QAAQ;YACd,WAAW,EAAE,wEAAwE;SACtF;QACD,QAAQ,EAAE;YACR,IAAI,EAAE,QAAQ;YACd,IAAI,EAAE,CAAC,SAAS,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,QAAQ,EAAE,QAAQ,CAAC;YAC7D,WAAW,EAAE,gEAAgE;SAC9E;KACF;IACD,QAAQ,EAAE,CAAC,OAAO,CAAC;CACpB,CAAC;AAEF,MAAM,iBAAiB,GAAG;IACxB,IAAI,EAAE,QAAiB;IACvB,UAAU,EAAE;QACV,GAAG,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,WAAW,EAAE,iCAAiC,EAAE;QACvE,SAAS,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,WAAW,EAAE,2CAA2C,EAAE;QACvF,SAAS,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,WAAW,EAAE,sCAAsC,EAAE;QAClF,QAAQ,EAAE;YACR,IAAI,EAAE,QAAQ;YACd,IAAI,EAAE,CAAC,KAAK,EAAE,KAAK,EAAE,SAAS,EAAE,KAAK,CAAC;YACtC,WAAW,EAAE,4KAA4K;SAC1L;QACD,gBAAgB,EAAE;YAChB,IAAI,EAAE,OAAO;YACb,KAAK,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE;YACzB,WAAW,EAAE,gDAAgD;SAC9D;QACD,gBAAgB,EAAE;YAChB,IAAI,EAAE,OAAO;YACb,KAAK,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE;YACzB,WAAW,EAAE,0CAA0C;SACxD;QACD,QAAQ,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE,WAAW,EAAE,8CAA8C,EAAE;QAC1F,aAAa,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE,WAAW,EAAE,kDAAkD,EAAE;QACnG,eAAe,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,WAAW,EAAE,oDAAoD,EAAE;KACvG;IACD,QAAQ,EAAE,CAAC,KAAK,CAAC;CAClB,CAAC;AAEF,MAAM,iBAAiB,GAAG;IACxB,IAAI,EAAE,QAAiB;IACvB,UAAU,EAAE;QACV,KAAK,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,WAAW,EAAE,sCAAsC,EAAE;QAC9E,
WAAW,EAAE;YACX,IAAI,EAAE,QAAQ;YACd,WAAW,EAAE,oDAAoD;SAClE;QACD,KAAK,EAAE;YACL,IAAI,EAAE,QAAQ;YACd,WAAW,EAAE,gDAAgD;SAC9D;QACD,KAAK,EAAE;YACL,IAAI,EAAE,SAAS;YACf,WAAW,EAAE,2FAA2F;SACzG;QACD,KAAK,EAAE;YACL,IAAI,EAAE,SAAS;YACf,WAAW,EAAE,wDAAwD;SACtE;KACF;CACF,CAAC;AAEF,MAAM,mBAAmB,GAAG;IAC1B,IAAI,EAAE,QAAiB;IACvB,UAAU,EAAE;QACV,GAAG,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,WAAW,EAAE,+BAA+B,EAAE;QACrE,IAAI,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,WAAW,EAAE,gEAAgE,EAAE;QACvG,IAAI,EAAE;YACJ,IAAI,EAAE,QAAQ;YACd,IAAI,EAAE,CAAC,UAAU,EAAE,QAAQ,EAAE,UAAU,EAAE,QAAQ,CAAC;YAClD,WAAW,EAAE,8JAA8J;SAC5K;QACD,YAAY,EAAE;YACZ,IAAI,EAAE,QAAQ;YACd,WAAW,EAAE,uDAAuD;SACrE;QACD,QAAQ,EAAE;YACR,IAAI,EAAE,SAAS;YACf,WAAW,EAAE,yFAAyF;SACvG;QACD,MAAM,EAAE;YACN,IAAI,EAAE,QAAQ;YACd,WAAW,EAAE,yKAAyK;SACvL;KACF;CACF,CAAC;AAEF,MAAM,CAAC,KAAK,UAAU,WAAW;IAC/B,MAAM,MAAM,GAAG,SAAS,EAAE,CAAC;IAE3B,SAAS,CAAC,MAAM,CAAC,OAAO,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAC/C,YAAY,CAAC,IAAI,CAAC,MAAM,CAAC,OAAO,EAAE,WAAW,CAAC,CAAC,CAAC;IAEhD,MAAM,UAAU,GAAe;QAC7B,KAAK,EAAE,CAAC,GAAG,EAAE,OAAO,EAAE,EAAE,CAAC,SAAS,CAAC,GAAG,EAAE,OAAO,CAAC;KACjD,CAAC;IACF,MAAM,WAAW,GAAG,IAAI,WAAW,EAAE,CAAC;IACtC,MAAM,MAAM,GAAG,IAAI,WAAW,CAAC,UAAU,EAAE,WAAW,CAAC,CAAC;IAExD,wCAAwC;IACxC,MAAM,OAAO,GAAG,MAAM,oBAAoB,EAAE,CAAC;IAC7C,MAAM,aAAa,GAAmB,EAAE,CAAC;IACzC,IAAI,cAAc,GAA0B,IAAI,CAAC;IACjD,IAAI,aAAa,GAAyB,IAAI,CAAC;IAE/C,IAAI,OAAO,CAAC,IAAI,KAAK,UAAU,IAAI,OAAO,CAAC,GAAG,EAAE,CAAC;QAC/C,aAAa,CAAC,IAAI,CAAC,IAAI,aAAa,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC;IACrD,CAAC;SAAM,IAAI,OAAO,CAAC,IAAI,KAAK,QAAQ,IAAI,OAAO,CAAC,WAAW,EAAE,CAAC;QAC5D,cAAc,GAAG,IAAI,cAAc,CAAC,OAAO,CAAC,WAAW,EAAE,MAAM,CAAC,OAAO,CAAC,CAAC;QACzE,MAAM,GAAG,GAAG,MAAM,cAAc,CAAC,KAAK,EAAE,CAAC;QACzC,IAAI,GAAG,EAAE,CAAC;YACR,aAAa,CAAC,IAAI,CAAC,IAAI,aAAa,CAAC,GAAG,CAAC,CAAC,CAAC;QAC7C,CAAC;aAAM,CAAC;YACN,GAAG,CAAC,IAAI,CAAC,yDAAyD,CAAC,CAAC;QACtE,CAAC;IACH,CAAC;SAAM,IAAI,OAAO,CAAC,IAAI,KAAK,QAAQ,EAAE,CAAC;QACrC,aAAa,GAAG,IAAI,aAAa,EAAE,CAAC;QACpC,MAAM,GAAG,GAAG,MAAM,aAAa,CAAC,KAAK,EAAE,CAAC;QACxC,IAAI,GA
AG,EAAE,CAAC;YACR,aAAa,CAAC,IAAI,CAAC,IAAI,aAAa,CAAC,GAAG,CAAC,CAAC,CAAC;QAC7C,CAAC;aAAM,CAAC;YACN,GAAG,CAAC,IAAI,CAAC,gEAAgE,CAAC,CAAC;QAC7E,CAAC;IACH,CAAC;IAED,aAAa,CAAC,IAAI,CAAC,IAAI,UAAU,EAAE,EAAE,IAAI,gBAAgB,EAAE,EAAE,IAAI,eAAe,EAAE,CAAC,CAAC;IAEpF,MAAM,MAAM,GAAG,IAAI,MAAM,CACvB,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,OAAO,EAAE,EACpC,EAAE,YAAY,EAAE,EAAE,KAAK,EAAE,EAAE,EAAE,EAAE,CAChC,CAAC;IAEF,MAAM,CAAC,iBAAiB,CAAC,sBAAsB,EAAE,KAAK,IAAI,EAAE,CAAC,CAAC;QAC5D,KAAK,EAAE;YACL;gBACE,IAAI,EAAE,OAAO;gBACb,WAAW,EACT,6DAA6D;oBAC7D,uEAAuE;gBACzE,WAAW,EAAE,iBAAiB;aAC/B;YACD;gBACE,IAAI,EAAE,QAAQ;gBACd,WAAW,EACT,2EAA2E;oBAC3E,yCAAyC;gBAC3C,WAAW,EAAE,kBAAkB;aAChC;YACD;gBACE,IAAI,EAAE,OAAO;gBACb,WAAW,EACT,sFAAsF;oBACtF,8EAA8E;gBAChF,WAAW,EAAE,iBAAiB;aAC/B;YACD;gBACE,IAAI,EAAE,OAAO;gBACb,WAAW,EACT,gEAAgE;oBAChE,gEAAgE;oBAChE,oDAAoD;gBACtD,WAAW,EAAE,iBAAiB;aAC/B;YACD;gBACE,IAAI,EAAE,SAAS;gBACf,WAAW,EACT,6EAA6E;oBAC7E,gFAAgF;oBAChF,+FAA+F;gBACjG,WAAW,EAAE,mBAAmB;aACjC;SACF;KACF,CAAC,CAAC,CAAC;IAEJ,MAAM,CAAC,iBAAiB,CAAC,qBAAqB,EAAE,KAAK,EAAE,OAAO,EAAE,EAAE;QAChE,MAAM,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE,GAAG,OAAO,CAAC,MAAM,CAAC;QAEjD,IAAI,IAAI,KAAK,OAAO,EAAE,CAAC;YACrB,MAAM,KAAK,GAAG,CAAC,IAAI,IAAI,EAAE,CAA0B,CAAC;YACpD,MAAM,MAAM,GAAG,MAAM,WAAW,CAAC,KAAK,EAAE,MAAM,CAAC,CAAC;YAChD,OAAO;gBACL,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,EAAE,CAAC;gBAClE,OAAO,EAAE,CAAC,CAAC,MAAM,CAAC,KAAK;aACxB,CAAC;QACJ,CAAC;QAED,IAAI,IAAI,KAAK,QAAQ,EAAE,CAAC;YACtB,MAAM,KAAK,GAAG,CAAC,IAAI,IAAI,EAAE,CAA2B,CAAC;YACrD,MAAM,MAAM,GAAG,MAAM,YAAY,CAAC,KAAK,EAAE,aAAa,EAAE,MAAM,CAAC,CAAC;YAChE,OAAO;gBACL,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,EAAE,CAAC;gBAClE,OAAO,EAAE,CAAC,CAAC,MAAM,CAAC,KAAK;aACxB,CAAC;QACJ,CAAC;QAED,IAAI,IAAI,KAAK,OAAO,EAAE,CAAC;YACrB,MAAM,KAAK,GAAG,CAAC,IAAI,IAAI,EAAE,CAA0B,CAAC;YACpD,MAAM,MAAM,GAAG,MAAM,WAAW,CAAC,KAAK,EAAE,MAAM,CAAC,CAAC;YAChD,OAAO;gBACL,OAAO,EAAE,CAAC,EAAE,IA
AI,EAAE,MAAM,EAAE,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,EAAE,CAAC;gBAClE,OAAO,EAAE,CAAC,CAAC,MAAM,CAAC,KAAK;aACxB,CAAC;QACJ,CAAC;QAED,IAAI,IAAI,KAAK,OAAO,EAAE,CAAC;YACrB,MAAM,KAAK,GAAG,CAAC,IAAI,IAAI,EAAE,CAA0B,CAAC;YACpD,MAAM,MAAM,GAAG,WAAW,CAAC,KAAK,CAAC,CAAC;YAClC,OAAO;gBACL,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,EAAE,CAAC;gBAClE,OAAO,EAAE,CAAC,CAAC,MAAM,CAAC,KAAK;aACxB,CAAC;QACJ,CAAC;QAED,IAAI,IAAI,KAAK,SAAS,EAAE,CAAC;YACvB,MAAM,KAAK,GAAG,CAAC,IAAI,IAAI,EAAE,CAA4B,CAAC;YACtD,MAAM,MAAM,GAAG,MAAM,aAAa,CAAC,KAAK,EAAE,MAAM,CAAC,CAAC;YAClD,OAAO;gBACL,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,EAAE,CAAC;gBAClE,OAAO,EAAE,CAAC,CAAC,MAAM,CAAC,KAAK;aACxB,CAAC;QACJ,CAAC;QAED,OAAO;YACL,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,iBAAiB,IAAI,EAAE,EAAE,CAAC;YAC1D,OAAO,EAAE,IAAI;SACd,CAAC;IACJ,CAAC,CAAC,CAAC;IAEH,MAAM,SAAS,GAAG,IAAI,oBAAoB,EAAE,CAAC;IAC7C,MAAM,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC;IAChC,GAAG,CAAC,IAAI,CAAC,oBAAoB,CAAC,CAAC;IAE/B,MAAM,QAAQ,GAAG,KAAK,IAAI,EAAE;QAC1B,GAAG,CAAC,IAAI,CAAC,eAAe,CAAC,CAAC;QAC1B,IAAI,cAAc;YAAE,MAAM,cAAc,CAAC,IAAI,EAAE,CAAC;QAChD,IAAI,aAAa;YAAE,MAAM,aAAa,CAAC,IAAI,EAAE,CAAC;QAC9C,MAAM,WAAW,CAAC,QAAQ,EAAE,CAAC;QAC7B,aAAa,EAAE,CAAC;QAChB,MAAM,MAAM,CAAC,KAAK,EAAE,CAAC;QACrB,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC,CAAC;IAEF,OAAO,CAAC,EAAE,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC;IAC/B,OAAO,CAAC,EAAE,CAAC,SAAS,EAAE,QAAQ,CAAC,CAAC;AAClC,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cache.d.ts","sourceRoot":"","sources":["../../src/tools/cache.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,UAAU,EAAE,WAAW,EAAE,MAAM,aAAa,CAAC;AAI3D,wBAAgB,WAAW,CAAC,KAAK,EAAE,UAAU,GAAG,WAAW,CA+C1D"}
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
import { searchCacheFiltered, getCacheStats, clearCacheEntries } from '../cache/store.js';
|
|
2
|
+
import { createLogger } from '../logger.js';
|
|
3
|
+
// Module-level logger for the cache tool, created with the 'cache' tag.
const log = createLogger('cache');
|
|
4
|
+
/**
 * Entry point of the cache tool. Depending on the flags on `input` it either
 * reports cache statistics, clears matching entries, or searches the cache.
 *
 * - `input.stats` truthy: returns `{ stats }` from `getCacheStats()`.
 * - `input.clear` truthy: requires at least one of `query`, `url_pattern`
 *   or `since`; returns `{ cleared }` with the value from `clearCacheEntries`,
 *   or `{ error }` when no filter was supplied.
 * - otherwise: returns `{ results }`, one entry per row returned by
 *   `searchCacheFiltered`, with `fetchedAt` exposed as `fetched_at`.
 *
 * Never throws for errors raised inside the handler: they are logged and
 * reported as `{ error: message }`.
 *
 * @param {object} input - Tool arguments (`stats`, `clear`, `query`, `url_pattern`, `since`).
 * @returns {object} One of `{ stats }`, `{ cleared }`, `{ results }` or `{ error }`.
 */
export function handleCache(input) {
  // Builds a fresh filter object on every call so the logger and the store
  // never share the same instance.
  const filtersOf = (source) => ({
    query: source.query,
    urlPattern: source.url_pattern,
    since: source.since,
  });

  // Projects a cached row onto the snake_case shape returned to the caller.
  const toResult = (entry) => ({
    url: entry.url,
    title: entry.title,
    markdown: entry.markdown,
    fetched_at: entry.fetchedAt,
  });

  try {
    if (input.stats) {
      log.debug('Cache stats requested');
      const stats = getCacheStats();
      return { stats };
    }

    if (!input.clear) {
      // Default action: filtered search.
      log.debug('Cache search', filtersOf(input));
      const matches = searchCacheFiltered(filtersOf(input));
      return { results: matches.map(toResult) };
    }

    // Clearing without any filter is refused.
    const hasFilter = input.query || input.url_pattern || input.since;
    if (!hasFilter) {
      return { error: 'clear requires at least one filter (query, url_pattern, or since)' };
    }

    log.info('Clearing cache entries', filtersOf(input));
    const cleared = clearCacheEntries(filtersOf(input));
    return { cleared };
  } catch (err) {
    log.error('Cache tool error', { error: String(err) });
    const message = err instanceof Error ? err.message : String(err);
    return { error: message };
  }
}
|
|
50
|
+
//# sourceMappingURL=cache.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cache.js","sourceRoot":"","sources":["../../src/tools/cache.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,mBAAmB,EAAE,aAAa,EAAE,iBAAiB,EAAE,MAAM,mBAAmB,CAAC;AAC1F,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AAG5C,MAAM,GAAG,GAAG,YAAY,CAAC,OAAO,CAAC,CAAC;AAElC,MAAM,UAAU,WAAW,CAAC,KAAiB;IAC3C,IAAI,CAAC;QACH,IAAI,KAAK,CAAC,KAAK,EAAE,CAAC;YAChB,GAAG,CAAC,KAAK,CAAC,uBAAuB,CAAC,CAAC;YACnC,OAAO,EAAE,KAAK,EAAE,aAAa,EAAE,EAAE,CAAC;QACpC,CAAC;QAED,IAAI,KAAK,CAAC,KAAK,EAAE,CAAC;YAChB,IAAI,CAAC,KAAK,CAAC,KAAK,IAAI,CAAC,KAAK,CAAC,WAAW,IAAI,CAAC,KAAK,CAAC,KAAK,EAAE,CAAC;gBACvD,OAAO,EAAE,KAAK,EAAE,mEAAmE,EAAE,CAAC;YACxF,CAAC;YACD,GAAG,CAAC,IAAI,CAAC,wBAAwB,EAAE;gBACjC,KAAK,EAAE,KAAK,CAAC,KAAK;gBAClB,UAAU,EAAE,KAAK,CAAC,WAAW;gBAC7B,KAAK,EAAE,KAAK,CAAC,KAAK;aACnB,CAAC,CAAC;YACH,MAAM,KAAK,GAAG,iBAAiB,CAAC;gBAC9B,KAAK,EAAE,KAAK,CAAC,KAAK;gBAClB,UAAU,EAAE,KAAK,CAAC,WAAW;gBAC7B,KAAK,EAAE,KAAK,CAAC,KAAK;aACnB,CAAC,CAAC;YACH,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,CAAC;QAC5B,CAAC;QAED,GAAG,CAAC,KAAK,CAAC,cAAc,EAAE;YACxB,KAAK,EAAE,KAAK,CAAC,KAAK;YAClB,UAAU,EAAE,KAAK,CAAC,WAAW;YAC7B,KAAK,EAAE,KAAK,CAAC,KAAK;SACnB,CAAC,CAAC;QACH,MAAM,OAAO,GAAG,mBAAmB,CAAC;YAClC,KAAK,EAAE,KAAK,CAAC,KAAK;YAClB,UAAU,EAAE,KAAK,CAAC,WAAW;YAC7B,KAAK,EAAE,KAAK,CAAC,KAAK;SACnB,CAAC,CAAC;QAEH,OAAO;YACL,OAAO,EAAE,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;gBAC3B,GAAG,EAAE,CAAC,CAAC,GAAG;gBACV,KAAK,EAAE,CAAC,CAAC,KAAK;gBACd,QAAQ,EAAE,CAAC,CAAC,QAAQ;gBACpB,UAAU,EAAE,CAAC,CAAC,SAAS;aACxB,CAAC,CAAC;SACJ,CAAC;IACJ,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,GAAG,CAAC,KAAK,CAAC,kBAAkB,EAAE,EAAE,KAAK,EAAE,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;QACtD,OAAO,EAAE,KAAK,EAAE,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC;IACrE,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
import type { CrawlInput, CrawlOutput, MapOutput } from '../types.js';
|
|
2
|
+
import type { SmartRouter } from '../fetch/router.js';
|
|
3
|
+
export declare function handleCrawl(input: CrawlInput, router: SmartRouter): Promise<CrawlOutput | (MapOutput & {
|
|
4
|
+
crawled: number;
|
|
5
|
+
})>;
|
|
6
|
+
//# sourceMappingURL=crawl.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"crawl.d.ts","sourceRoot":"","sources":["../../src/tools/crawl.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,UAAU,EAAE,WAAW,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AACtE,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,oBAAoB,CAAC;AAWtD,wBAAsB,WAAW,CAC/B,KAAK,EAAE,UAAU,EACjB,MAAM,EAAE,WAAW,GAClB,OAAO,CAAC,WAAW,GAAG,CAAC,SAAS,GAAG;IAAE,OAAO,EAAE,MAAM,CAAA;CAAE,CAAC,CAAC,CAgE1D"}
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
import { Crawler } from '../crawl/crawler.js';
|
|
2
|
+
import { deduplicatePages } from '../crawl/dedup.js';
|
|
3
|
+
import { mapUrls } from '../crawl/mapper.js';
|
|
4
|
+
import { handleFetch } from './fetch.js';
|
|
5
|
+
import { createLogger } from '../logger.js';
|
|
6
|
+
// Module-level logger for the crawl tool, created with the 'crawl' tag.
const log = createLogger('crawl');
|
|
7
|
+
// Fallback markdown character budget for handleCrawl when input.max_total_chars is null or undefined.
const DEFAULT_MAX_TOTAL_CHARS = 100000;
|
|
8
|
+
/**
 * Entry point of the crawl tool.
 *
 * With `input.strategy === 'map'` the request is handed to
 * `handleMapStrategy` (URL-only discovery, no full crawl pipeline).
 * Otherwise a `Crawler` is run, the crawled pages are passed through
 * `deduplicatePages`, and pages are returned in order until adding the next
 * one would exceed the character budget (`input.max_total_chars`, falling
 * back to `DEFAULT_MAX_TOTAL_CHARS`). The first page is always returned, even
 * if it alone exceeds the budget.
 *
 * Errors thrown while crawling are logged and reported as an empty result
 * carrying an `error` message instead of being rethrown.
 *
 * @param {object} input - Crawl arguments (`url`, `strategy`, `use_auth`, `max_total_chars`, ...).
 * @param {object} router - Fetch router; its `fetch(url, options)` is used for raw fetches.
 * @returns {Promise<object>} `{ pages, total_found, crawled, links?, error? }`, or the map-strategy result.
 */
export async function handleCrawl(input, router) {
  try {
    if (input.strategy === 'map') {
      // Lightweight URL discovery has its own handler.
      return handleMapStrategy(input, router);
    }

    // Two fetchers for the crawler: the fetch tool's handler, and a raw
    // router fetch with JS rendering disabled.
    const fetchPage = async (url) => handleFetch({ url, use_auth: input.use_auth }, router);
    const fetchRaw = async (url) => router.fetch(url, { renderJs: 'never' });
    const crawlResult = await new Crawler(fetchPage, fetchRaw).crawl(input);

    // Cross-page deduplication; the start URL's hostname is passed as the
    // domain (per the original note, for SQLite boilerplate caching).
    const { hostname } = new URL(input.url);
    const dedupInput = crawlResult.pages.map((p) => ({ url: p.url, markdown: p.markdown }));
    const deduped = deduplicatePages(dedupInput, hostname);

    // Overlay the deduplicated markdown, keeping the original when the
    // dedup output has no entry at that index.
    const pages = crawlResult.pages.map((page, index) => ({
      ...page,
      markdown: deduped[index]?.markdown ?? page.markdown,
    }));

    // Enforce the character budget across returned pages.
    const charBudget = input.max_total_chars ?? DEFAULT_MAX_TOTAL_CHARS;
    const kept = [];
    let usedChars = 0;
    for (const page of pages) {
      const wouldOverflow = usedChars + page.markdown.length > charBudget;
      if (wouldOverflow && kept.length > 0) {
        break;
      }
      kept.push(page);
      usedChars += page.markdown.length;
    }

    log.info('Crawl complete', {
      url: input.url,
      crawled: crawlResult.crawled,
      returned: kept.length,
      totalChars: usedChars,
    });

    const response = {
      pages: kept,
      total_found: crawlResult.total_found,
      crawled: crawlResult.crawled,
    };
    // `links` is only present on the response when the crawler produced it.
    if (crawlResult.links) {
      response.links = crawlResult.links;
    }
    return response;
  } catch (err) {
    log.error('Crawl failed', { url: input.url, error: String(err) });
    const message = err instanceof Error ? err.message : String(err);
    return { pages: [], total_found: 0, crawled: 0, error: message };
  }
}
|
|
60
|
+
/**
 * Handles the 'map' crawl strategy: delegates URL discovery to `mapUrls`,
 * giving it a fetcher that goes through `router.fetch` with JS rendering
 * disabled.
 *
 * `crawled` is always reported as 0. Errors thrown by `mapUrls` are logged
 * and returned as an empty result with an `error` message.
 *
 * @param {object} input - Crawl arguments; `url`, `max_depth`, `max_pages`,
 *   `include_patterns` and `exclude_patterns` are forwarded to `mapUrls`.
 * @param {object} router - Fetch router exposing `fetch(url, options)`.
 * @returns {Promise<object>} `{ urls, total_found, sitemap_found, crawled: 0, error? }`.
 */
async function handleMapStrategy(input, router) {
  // Fetcher for the mapper: only html, final URL and status code are passed on.
  const fetchStatic = async (url) => {
    const response = await router.fetch(url, { renderJs: 'never' });
    return {
      html: response.html,
      finalUrl: response.finalUrl,
      statusCode: response.statusCode,
    };
  };

  try {
    const mapOptions = {
      url: input.url,
      max_depth: input.max_depth,
      max_pages: input.max_pages,
      include_patterns: input.include_patterns,
      exclude_patterns: input.exclude_patterns,
    };
    const mapped = await mapUrls(mapOptions, fetchStatic);

    log.info('Map complete', {
      url: input.url,
      total_found: mapped.total_found,
      sitemap_found: mapped.sitemap_found,
    });

    const response = {
      urls: mapped.urls,
      total_found: mapped.total_found,
      sitemap_found: mapped.sitemap_found,
      crawled: 0,
    };
    // Forward an error reported by the mapper, if any.
    if (mapped.error) {
      response.error = mapped.error;
    }
    return response;
  } catch (err) {
    log.error('Map strategy failed', { url: input.url, error: String(err) });
    const message = err instanceof Error ? err.message : String(err);
    return {
      urls: [],
      total_found: 0,
      sitemap_found: false,
      crawled: 0,
      error: message,
    };
  }
}
|
|
97
|
+
//# sourceMappingURL=crawl.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"crawl.js","sourceRoot":"","sources":["../../src/tools/crawl.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,OAAO,EAAE,MAAM,qBAAqB,CAAC;AAC9C,OAAO,EAAE,gBAAgB,EAAE,MAAM,mBAAmB,CAAC;AACrD,OAAO,EAAE,OAAO,EAAE,MAAM,oBAAoB,CAAC;AAC7C,OAAO,EAAE,WAAW,EAAE,MAAM,YAAY,CAAC;AACzC,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AAE5C,MAAM,GAAG,GAAG,YAAY,CAAC,OAAO,CAAC,CAAC;AAElC,MAAM,uBAAuB,GAAG,MAAM,CAAC;AAEvC,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,KAAiB,EACjB,MAAmB;IAEnB,IAAI,CAAC;QACH,yEAAyE;QACzE,IAAI,KAAK,CAAC,QAAQ,KAAK,KAAK,EAAE,CAAC;YAC7B,OAAO,iBAAiB,CAAC,KAAK,EAAE,MAAM,CAAC,CAAC;QAC1C,CAAC;QAED,MAAM,OAAO,GAAG,KAAK,EAAE,GAAW,EAAE,EAAE,CACpC,WAAW,CAAC,EAAE,GAAG,EAAE,QAAQ,EAAE,KAAK,CAAC,QAAQ,EAAE,EAAE,MAAM,CAAC,CAAC;QAEzD,MAAM,UAAU,GAAG,KAAK,EAAE,GAAW,EAAE,EAAE,CACvC,MAAM,CAAC,KAAK,CAAC,GAAG,EAAE,EAAE,QAAQ,EAAE,OAAO,EAAE,CAAC,CAAC;QAE3C,MAAM,OAAO,GAAG,IAAI,OAAO,CAAC,OAAO,EAAE,UAAU,CAAC,CAAC;QACjD,MAAM,MAAM,GAAG,MAAM,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;QAE1C,8EAA8E;QAC9E,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC;QAC3C,MAAM,YAAY,GAAG,gBAAgB,CACnC,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,EAAE,GAAG,EAAE,CAAC,CAAC,GAAG,EAAE,QAAQ,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC,EAC/D,MAAM,CACP,CAAC;QAEF,uCAAuC;QACvC,MAAM,KAAK,GAAG,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;YAC3C,GAAG,IAAI;YACP,QAAQ,EAAE,YAAY,CAAC,CAAC,CAAC,EAAE,QAAQ,IAAI,IAAI,CAAC,QAAQ;SACrD,CAAC,CAAC,CAAC;QAEJ,iCAAiC;QACjC,MAAM,aAAa,GAAG,KAAK,CAAC,eAAe,IAAI,uBAAuB,CAAC;QACvE,MAAM,aAAa,GAAG,EAAE,CAAC;QACzB,IAAI,SAAS,GAAG,CAAC,CAAC;QAElB,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;YACzB,IAAI,SAAS,GAAG,IAAI,CAAC,QAAQ,CAAC,MAAM,GAAG,aAAa,IAAI,aAAa,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACjF,MAAM;YACR,CAAC;YACD,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACzB,SAAS,IAAI,IAAI,CAAC,QAAQ,CAAC,MAAM,CAAC;QACpC,CAAC;QAED,GAAG,CAAC,IAAI,CAAC,gBAAgB,EAAE;YACzB,GAAG,EAAE,KAAK,CAAC,GAAG;YACd,OAAO,EAAE,MAAM,CAAC,OAAO;YACvB,QAAQ,EAAE,aAAa,CAAC,MAAM;YAC9B,UAAU,EAAE,SAAS;SACtB,CAAC,CAAC;QAEH,OAAO;YACL,KAAK
,EAAE,aAAa;YACpB,WAAW,EAAE,MAAM,CAAC,WAAW;YAC/B,OAAO,EAAE,MAAM,CAAC,OAAO;YACvB,GAAG,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,KAAK,EAAE,MAAM,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;SACjD,CAAC;IACJ,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,GAAG,CAAC,KAAK,CAAC,cAAc,EAAE,EAAE,GAAG,EAAE,KAAK,CAAC,GAAG,EAAE,KAAK,EAAE,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;QAClE,OAAO;YACL,KAAK,EAAE,EAAE;YACT,WAAW,EAAE,CAAC;YACd,OAAO,EAAE,CAAC;YACV,KAAK,EAAE,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC;SACxD,CAAC;IACJ,CAAC;AACH,CAAC;AAED,KAAK,UAAU,iBAAiB,CAC9B,KAAiB,EACjB,MAAmB;IAEnB,MAAM,WAAW,GAAG,KAAK,EAAE,GAAW,EAAE,EAAE;QACxC,MAAM,GAAG,GAAG,MAAM,MAAM,CAAC,KAAK,CAAC,GAAG,EAAE,EAAE,QAAQ,EAAE,OAAO,EAAE,CAAC,CAAC;QAC3D,OAAO,EAAE,IAAI,EAAE,GAAG,CAAC,IAAI,EAAE,QAAQ,EAAE,GAAG,CAAC,QAAQ,EAAE,UAAU,EAAE,GAAG,CAAC,UAAU,EAAE,CAAC;IAChF,CAAC,CAAC;IAEF,IAAI,CAAC;QACH,MAAM,SAAS,GAAG,MAAM,OAAO,CAC7B;YACE,GAAG,EAAE,KAAK,CAAC,GAAG;YACd,SAAS,EAAE,KAAK,CAAC,SAAS;YAC1B,SAAS,EAAE,KAAK,CAAC,SAAS;YAC1B,gBAAgB,EAAE,KAAK,CAAC,gBAAgB;YACxC,gBAAgB,EAAE,KAAK,CAAC,gBAAgB;SACzC,EACD,WAAW,CACZ,CAAC;QAEF,GAAG,CAAC,IAAI,CAAC,cAAc,EAAE;YACvB,GAAG,EAAE,KAAK,CAAC,GAAG;YACd,WAAW,EAAE,SAAS,CAAC,WAAW;YAClC,aAAa,EAAE,SAAS,CAAC,aAAa;SACvC,CAAC,CAAC;QAEH,OAAO;YACL,IAAI,EAAE,SAAS,CAAC,IAAI;YACpB,WAAW,EAAE,SAAS,CAAC,WAAW;YAClC,aAAa,EAAE,SAAS,CAAC,aAAa;YACtC,OAAO,EAAE,CAAC;YACV,GAAG,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,KAAK,EAAE,SAAS,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;SACvD,CAAC;IACJ,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,GAAG,CAAC,KAAK,CAAC,qBAAqB,EAAE,EAAE,GAAG,EAAE,KAAK,CAAC,GAAG,EAAE,KAAK,EAAE,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;QACzE,OAAO;YACL,IAAI,EAAE,EAAE;YACR,WAAW,EAAE,CAAC;YACd,aAAa,EAAE,KAAK;YACpB,OAAO,EAAE,CAAC;YACV,KAAK,EAAE,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC;SACxD,CAAC;IACJ,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"extract.d.ts","sourceRoot":"","sources":["../../src/tools/extract.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,YAAY,EAAE,aAAa,EAAE,MAAM,aAAa,CAAC;AAC/D,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,oBAAoB,CAAC;AA8BtD,wBAAsB,aAAa,CACjC,KAAK,EAAE,YAAY,EACnB,MAAM,EAAE,WAAW,GAClB,OAAO,CAAC,aAAa,CAAC,CAoDxB"}
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
import { extractMetadata, extractSelector, extractTables } from '../extraction/extract.js';
|
|
2
|
+
import { extractWithSchema } from '../extraction/schema.js';
|
|
3
|
+
import { extractJsonLd } from '../extraction/jsonld.js';
|
|
4
|
+
import { getCachedContent, isExpired } from '../cache/store.js';
|
|
5
|
+
import { createLogger } from '../logger.js';
|
|
6
|
+
// Module-level logger for the extract tool, created with the 'extract' tag.
const log = createLogger('extract');
|
|
7
|
+
/**
 * Resolves the HTML to run extraction on.
 *
 * Without `input.url`, the caller-supplied `input.html` is returned as-is
 * (no `sourceUrl`). With a URL, a cache entry that exists and is not expired
 * is preferred; otherwise the page is fetched through the router with
 * `renderJs: 'auto'` and `useAuth: false`.
 *
 * @param {object} input - Extract arguments (`url` and/or `html`).
 * @param {object} router - Fetch router exposing `fetch(url, options)`.
 * @returns {Promise<{html: *, sourceUrl?: *}>} HTML plus the URL it came from, when known.
 */
async function resolveHtml(input, router) {
  if (!input.url) {
    return { html: input.html };
  }

  const cacheEntry = getCachedContent(input.url);
  const cacheUsable = cacheEntry && !isExpired(cacheEntry);
  if (cacheUsable) {
    log.info('Using cached HTML', { url: input.url });
    return { html: cacheEntry.rawHtml, sourceUrl: cacheEntry.url };
  }

  const fetchOptions = { renderJs: 'auto', useAuth: false };
  const fetched = await router.fetch(input.url, fetchOptions);
  return { html: fetched.html, sourceUrl: fetched.finalUrl };
}
|
|
22
|
+
/**
 * Entry point of the extract tool. Resolves HTML (from `input.url` or
 * `input.html`) and extracts data according to `input.mode`:
 *
 * - 'selector': `extractSelector(html, css_selector, multiple ?? false)`
 * - 'tables':   `extractTables(html)`
 * - 'schema':   `extractWithSchema(html, schema)`
 * - 'metadata' (default, also used for any other mode value):
 *   `extractMetadata(html)`, with a `jsonld` property added when
 *   `extractJsonLd(html)` returns a non-empty list.
 *
 * Argument problems are reported up front as `{ data, mode, error }` without
 * fetching anything. Errors during resolution or extraction are logged and
 * returned with a mode-appropriate empty `data` value.
 *
 * @param {object} input - Extract arguments (`url`, `html`, `mode`, `css_selector`, `multiple`, `schema`).
 * @param {object} router - Fetch router forwarded to `resolveHtml`.
 * @returns {Promise<object>} `{ data, source_url, mode }` on success, or an object with `error`.
 */
export async function handleExtract(input, router) {
  const mode = input.mode ?? 'metadata';

  // Up-front argument validation.
  if (!(input.url || input.html)) {
    return { data: {}, mode, error: 'Either url or html must be provided' };
  }
  if (mode === 'selector' && !input.css_selector) {
    return { data: '', mode, error: 'css_selector is required when mode is "selector"' };
  }
  if (mode === 'schema' && !input.schema?.properties) {
    return { data: {}, mode, error: 'schema is required when mode is "schema" and must have properties' };
  }

  try {
    const resolved = await resolveHtml(input, router);
    const markup = resolved.html;

    let data;
    if (mode === 'selector') {
      data = extractSelector(markup, input.css_selector, input.multiple ?? false);
    } else if (mode === 'tables') {
      data = extractTables(markup);
    } else if (mode === 'schema') {
      data = extractWithSchema(markup, input.schema);
    } else {
      // 'metadata' and any unrecognised mode: page metadata, plus JSON-LD
      // blocks when at least one was found.
      const metadata = extractMetadata(markup);
      const jsonLdBlocks = extractJsonLd(markup);
      if (jsonLdBlocks.length > 0) {
        metadata.jsonld = jsonLdBlocks;
      }
      data = metadata;
    }

    return { data, source_url: resolved.sourceUrl, mode };
  } catch (err) {
    log.error('Extract failed', { url: input.url, error: String(err) });

    // Empty value for the failure response: string for selector, array for
    // tables, object for everything else.
    let emptyData = {};
    if (mode === 'selector') {
      emptyData = '';
    } else if (mode === 'tables') {
      emptyData = [];
    }

    return {
      data: emptyData,
      source_url: input.url,
      mode,
      error: err instanceof Error ? err.message : String(err),
    };
  }
}
|
|
69
|
+
//# sourceMappingURL=extract.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"extract.js","sourceRoot":"","sources":["../../src/tools/extract.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,eAAe,EAAE,eAAe,EAAE,aAAa,EAAE,MAAM,0BAA0B,CAAC;AAC3F,OAAO,EAAE,iBAAiB,EAAE,MAAM,yBAAyB,CAAC;AAC5D,OAAO,EAAE,aAAa,EAAE,MAAM,yBAAyB,CAAC;AACxD,OAAO,EAAE,gBAAgB,EAAE,SAAS,EAAE,MAAM,mBAAmB,CAAC;AAChE,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AAE5C,MAAM,GAAG,GAAG,YAAY,CAAC,SAAS,CAAC,CAAC;AAEpC,KAAK,UAAU,WAAW,CACxB,KAAmB,EACnB,MAAmB;IAEnB,IAAI,KAAK,CAAC,GAAG,EAAE,CAAC;QACd,MAAM,MAAM,GAAG,gBAAgB,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;QAC3C,IAAI,MAAM,IAAI,CAAC,SAAS,CAAC,MAAM,CAAC,EAAE,CAAC;YACjC,GAAG,CAAC,IAAI,CAAC,mBAAmB,EAAE,EAAE,GAAG,EAAE,KAAK,CAAC,GAAG,EAAE,CAAC,CAAC;YAClD,OAAO,EAAE,IAAI,EAAE,MAAM,CAAC,OAAO,EAAE,SAAS,EAAE,MAAM,CAAC,GAAG,EAAE,CAAC;QACzD,CAAC;QAED,MAAM,GAAG,GAAG,MAAM,MAAM,CAAC,KAAK,CAAC,KAAK,CAAC,GAAG,EAAE;YACxC,QAAQ,EAAE,MAAM;YAChB,OAAO,EAAE,KAAK;SACf,CAAC,CAAC;QACH,OAAO,EAAE,IAAI,EAAE,GAAG,CAAC,IAAI,EAAE,SAAS,EAAE,GAAG,CAAC,QAAQ,EAAE,CAAC;IACrD,CAAC;IAED,OAAO,EAAE,IAAI,EAAE,KAAK,CAAC,IAAK,EAAE,CAAC;AAC/B,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,KAAmB,EACnB,MAAmB;IAEnB,MAAM,IAAI,GAAG,KAAK,CAAC,IAAI,IAAI,UAAU,CAAC;IAEtC,IAAI,CAAC,KAAK,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC;QAC9B,OAAO,EAAE,IAAI,EAAE,EAAE,EAAE,IAAI,EAAE,KAAK,EAAE,qCAAqC,EAAE,CAAC;IAC1E,CAAC;IAED,IAAI,IAAI,KAAK,UAAU,IAAI,CAAC,KAAK,CAAC,YAAY,EAAE,CAAC;QAC/C,OAAO,EAAE,IAAI,EAAE,EAAE,EAAE,IAAI,EAAE,KAAK,EAAE,kDAAkD,EAAE,CAAC;IACvF,CAAC;IAED,IAAI,IAAI,KAAK,QAAQ,IAAI,CAAC,CAAC,KAAK,CAAC,MAAM,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,UAAU,CAAC,EAAE,CAAC;QACrE,OAAO,EAAE,IAAI,EAAE,EAAE,EAAE,IAAI,EAAE,KAAK,EAAE,mEAAmE,EAAE,CAAC;IACxG,CAAC;IAED,IAAI,CAAC;QACH,MAAM,EAAE,IAAI,EAAE,SAAS,EAAE,GAAG,MAAM,WAAW,CAAC,KAAK,EAAE,MAAM,CAAC,CAAC;QAE7D,IAAI,IAA2B,CAAC;QAEhC,QAAQ,IAAI,EAAE,CAAC;YACb,KAAK,UAAU;gBACb,IAAI,GAAG,eAAe,CAAC,IAAI,EAAE,KAAK,CAAC,YAAa,EAAE,KAAK,CAAC,QAAQ,IAAI,KAAK,CAAC,CAAC;gBAC3E,MAAM;YACR,KAAK,QAAQ;gBACX,IAAI,GAAG,aAAa,CAAC,IAAI,CAAC,CAAC;gBAC3B,MAAM;YACR,KAAK,QAAQ;gBACX,IAAI,GAAG,iBAA
iB,CAAC,IAAI,EAAE,KAAK,CAAC,MAAO,CAAC,CAAC;gBAC9C,MAAM;YACR,KAAK,UAAU,CAAC;YAChB,OAAO,CAAC,CAAC,CAAC;gBACR,MAAM,IAAI,GAAG,eAAe,CAAC,IAAI,CAAC,CAAC;gBACnC,MAAM,MAAM,GAAG,aAAa,CAAC,IAAI,CAAC,CAAC;gBACnC,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;oBACtB,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC;gBACvB,CAAC;gBACD,IAAI,GAAG,IAAI,CAAC;gBACZ,MAAM;YACR,CAAC;QACH,CAAC;QAED,OAAO,EAAE,IAAI,EAAE,UAAU,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC;IAC/C,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,GAAG,CAAC,KAAK,CAAC,gBAAgB,EAAE,EAAE,GAAG,EAAE,KAAK,CAAC,GAAG,EAAE,KAAK,EAAE,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;QACpE,OAAO;YACL,IAAI,EAAE,IAAI,KAAK,UAAU,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,QAAQ,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE;YAC5D,UAAU,EAAE,KAAK,CAAC,GAAG;YACrB,IAAI;YACJ,KAAK,EAAE,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC;SACxD,CAAC;IACJ,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"fetch.d.ts","sourceRoot":"","sources":["../../src/tools/fetch.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,UAAU,EAAE,WAAW,EAAiB,MAAM,aAAa,CAAC;AAC1E,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,oBAAoB,CAAC;AAoCtD,wBAAsB,WAAW,CAC/B,KAAK,EAAE,UAAU,EACjB,MAAM,EAAE,WAAW,GAClB,OAAO,CAAC,WAAW,CAAC,CAgDtB"}
|