webskim 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,139 @@
1
+ # webskim
2
+
3
+ Context-efficient web search and reading for AI agents. MCP server powered by [Jina AI](https://jina.ai).
4
+
5
+ Built-in `WebFetch` dumps entire pages into context. One page = thousands of tokens gone.
6
+
7
+ **webskim** saves pages to disk and returns a table of contents. Your agent reads only what it needs.
8
+
9
+ ## Prerequisites
10
+
11
+ webskim uses [Jina AI](https://jina.ai) APIs under the hood — you need a **Jina API key** to use it.
12
+
13
+ > **[Get your free API key at jina.ai](https://jina.ai)** — 1M tokens included, no credit card required.
14
+
15
+ ## Quick Start
16
+
17
+ **Claude Code** — add to `.mcp.json` in your project:
18
+
19
+ ```json
20
+ {
21
+ "mcpServers": {
22
+ "webskim": {
23
+ "command": "npx",
24
+ "args": ["-y", "webskim"],
25
+ "env": { "JINA_API_KEY": "jina_..." }
26
+ }
27
+ }
28
+ }
29
+ ```
30
+
31
+ **Claude Desktop** — add to `claude_desktop_config.json`:
32
+
33
+ ```json
34
+ {
35
+ "mcpServers": {
36
+ "webskim": {
37
+ "command": "npx",
38
+ "args": ["-y", "webskim"],
39
+ "env": { "JINA_API_KEY": "jina_..." }
40
+ }
41
+ }
42
+ }
43
+ ```
44
+
45
+ **Cursor / Windsurf / other MCP clients** — same pattern, point at `npx -y webskim` with `JINA_API_KEY` in env.
46
+
47
+ ## How It Works
48
+
49
+ ```
50
+ Agent: jina_search("react server components")
51
+ → 5 results: title, URL, snippet (minimal tokens)
52
+
53
+ Agent: jina_read("https://react.dev/reference/rsc/server-components")
54
+ → Saved: .ai_pages/20260220_143052_react_dev__reference__rsc__server-components.md
54
+ → Lines: 342 | ~2800 tokens
55
+ → Table of Contents:
56
+ L1: # Server Components
57
+ L18: ## Reference
58
+ L45: ## Usage
59
+ L89: ### Fetching data
60
+ L156: ### Streaming
61
+
62
+ Agent: Read(".ai_pages/..._server-components.md", offset=89, limit=67)
64
+ → reads only the section it needs
65
+ ```
66
+
67
+ No full pages in context. No wasted tokens. The agent decides what to read.
68
+
69
+ ## Tools
70
+
71
+ | Tool | What it does |
72
+ |------|-------------|
73
+ | `jina_search` | Web search → titles, URLs, snippets |
74
+ | `jina_read` | Fetch URL/PDF → save as markdown, return TOC |
75
+
76
+ ### jina_search
77
+
78
+ | Param | Description |
79
+ |-------|-------------|
80
+ | `query` | Search query |
81
+ | `num_results` | 1–10 (default 5) |
82
+ | `site` | Restrict to domain, e.g. `"python.org"` |
83
+ | `country` | Country code for localized results, e.g. `"US"`, `"PL"` |
84
+
85
+ ### jina_read
86
+
87
+ | Param | Description |
88
+ |-------|-------------|
89
+ | `url` | Page or PDF URL |
90
+ | `max_tokens` | Server-side truncation (saves context) |
91
+ | `target_selector` | CSS — extract only this element |
92
+ | `remove_selector` | CSS — remove elements before extraction |
93
+
94
+ ## Why webskim?
95
+
96
+ **Context efficiency** — pages saved to `.ai_pages/` on disk, not dumped into context. Agent reads sections via offset/limit.
97
+
98
+ **Tiny footprint** — ~190 tokens per tool definition in system prompt. Minimal overhead vs. built-in alternatives.
99
+
100
+ **Smart search** — returns snippets, not full pages. Agent picks which URLs are worth reading.
101
+
102
+ **PDF support** — Jina Reader handles PDFs natively. Same API, same workflow.
103
+
104
+ **Server-side token budget** — `max_tokens` truncates on the server before content reaches your agent.
105
+
106
+ **CSS selectors** — `target_selector` / `remove_selector` extract exactly the part of the page you need.
107
+
108
+ **Clean markdown** — no HTML soup, no boilerplate, just readable content.
109
+
110
+ **Fast and cheap** — search ~2.5s, read ~8s. Jina API costs $0.02/1M tokens.
111
+
112
+ ## Make It the Default
113
+
114
+ Add this to your project's `CLAUDE.md` so your agent always prefers webskim over built-in tools:
115
+
116
+ ```markdown
117
+ ## Web Research
118
+
119
+ Always use Jina MCP tools for web operations:
120
+ - `jina_search` instead of `WebSearch`
121
+ - `jina_read` instead of `WebFetch`
122
+
123
+ Workflow: search → read URL to disk → Read file with offset/limit.
124
+ WebSearch/WebFetch are fallback only.
125
+ ```
126
+
127
+ Add `.ai_pages/` to your `.gitignore`.
128
+
129
+ ## Development
130
+
131
+ ```bash
132
+ git clone <repo-url> && cd webskim
133
+ npm install && npm run build
134
+ npm test
135
+ ```
136
+
137
+ ## License
138
+
139
+ MIT
@@ -0,0 +1,2 @@
1
+ #!/usr/bin/env node
2
+ export {};
package/dist/index.js ADDED
@@ -0,0 +1,24 @@
1
+ #!/usr/bin/env node
2
+ import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
3
+ import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
4
+ import { JinaClient } from "./services/jina-client.js";
5
+ import { FileManager } from "./services/file-manager.js";
6
+ import { registerSearchTool } from "./tools/search.js";
7
+ import { registerReadTool } from "./tools/read.js";
8
+ import { join } from "node:path";
9
// Entry point: check configuration, wire the services into the MCP tools,
// then serve over stdio.
const { JINA_API_KEY: apiKey } = process.env;
if (!apiKey) {
    // Without a key no Jina request can be authorized, so abort immediately.
    console.error("FATAL: JINA_API_KEY is required. Pass it via env in your MCP config.");
    process.exit(1);
}

// Pages are stored in ".ai_pages" under the directory the server was started from.
const pagesDir = join(process.cwd(), ".ai_pages");

const server = new McpServer({
    name: "webskim",
    version: "1.0.0",
});
const client = new JinaClient(apiKey);
const fileManager = new FileManager(pagesDir);

registerSearchTool(server, client);
registerReadTool(server, client, fileManager);

await server.connect(new StdioServerTransport());
// NOTE(review): status is logged via console.error (stderr), presumably because
// the stdio transport uses stdout for protocol traffic — confirm.
console.error("webskim server started");
@@ -0,0 +1,6 @@
1
/**
 * Writes fetched page content to files inside a single base directory.
 */
export declare class FileManager {
    /** Directory that receives the saved pages. */
    private baseDir;
    /**
     * @param baseDir Directory into which pages are written.
     */
    constructor(baseDir: string);
    /**
     * Derives a timestamped `.md` filename from the hostname and path of `url`.
     */
    generateFilename(url: string): string;
    /**
     * Writes `content` to a file named after `url` and resolves to the path
     * of the written file.
     */
    savePage(content: string, url: string): Promise<string>;
}
@@ -0,0 +1,33 @@
1
+ import { mkdir, writeFile } from "node:fs/promises";
2
+ import { join } from "node:path";
3
/** Matches every character that is not an ASCII letter, digit, ".", "_" or "-". */
const UNSAFE_FILENAME_CHARS = /[^A-Za-z0-9._-]/g;

/**
 * Maximum length of the URL-derived part of a filename. Together with the
 * 16-character timestamp prefix and the ".md" suffix this stays below the
 * 255-byte filename limit common to most filesystems.
 */
const MAX_SLUG_LENGTH = 200;

/**
 * Persists fetched pages as markdown files inside a single base directory.
 */
export class FileManager {
    baseDir;

    /**
     * @param {string} baseDir - Directory that receives the saved pages. It is
     *   created on the first call to `savePage` if it does not exist yet.
     */
    constructor(baseDir) {
        this.baseDir = baseDir;
    }

    /**
     * Builds a filename of the form `YYYYMMDD_HHMMSS_<host>__<path>.md`.
     *
     * - The timestamp is the current time in UTC.
     * - Dots in the hostname become `_`; path separators become `__`.
     * - Only the extension of the LAST path segment is stripped, so dotted
     *   directories such as `/v1.2/docs/intro` keep their full path.
     * - Any character outside `[A-Za-z0-9._-]` (e.g. `:`, `%`, IPv6 brackets)
     *   is replaced with `_` so the name is valid on every common filesystem.
     * - The URL-derived part is truncated to `MAX_SLUG_LENGTH` characters.
     *
     * The query string and fragment are not part of the name.
     *
     * @param {string} url - Absolute URL of the page.
     * @returns {string} The generated filename (no directory component).
     * @throws {TypeError} If `url` cannot be parsed by `new URL()`.
     */
    generateFilename(url) {
        const parsed = new URL(url);
        const domain = parsed.hostname
            .replace(/\./g, "_")
            .replace(UNSAFE_FILENAME_CHARS, "_");
        const path = parsed.pathname
            .slice(1) // remove leading /
            .replace(/\.[^./]+$/, "") // strip the extension of the last segment only
            .replace(/\//g, "__") // slashes to double underscores
            .replace(UNSAFE_FILENAME_CHARS, "_")
            .replace(/_+$/, ""); // trailing slash(es) would otherwise leave dangling underscores

        // toISOString() is always UTC and shaped like 2026-02-20T14:30:52.123Z.
        const iso = new Date().toISOString();
        const datePart = iso.slice(0, 10).replaceAll("-", "");
        const timePart = iso.slice(11, 19).replaceAll(":", "");

        const slug = (path ? `${domain}__${path}` : domain).slice(0, MAX_SLUG_LENGTH);
        return `${datePart}_${timePart}_${slug}.md`;
    }

    /**
     * Writes `content` as UTF-8 into the base directory under a name derived
     * from `url`, creating the directory (recursively) when needed. A file
     * that already has the same name is overwritten.
     *
     * @param {string} content - Markdown text to store.
     * @param {string} url - URL the content was fetched from.
     * @returns {Promise<string>} Path of the written file.
     */
    async savePage(content, url) {
        await mkdir(this.baseDir, { recursive: true });
        const filePath = join(this.baseDir, this.generateFilename(url));
        await writeFile(filePath, content, "utf-8");
        return filePath;
    }
}
@@ -0,0 +1,30 @@
1
/** One entry of a web search response. */
export interface SearchResult {
    /** Title of the result as reported by the search API. */
    title: string;
    /** URL of the result. */
    url: string;
    /** Short text excerpt; taken from the API item's `description` field. */
    snippet: string;
}
6
/** Optional settings for `JinaClient.search`. */
export interface SearchOptions {
    /** Number of results to request; sent as `num` in the request body. */
    num_results?: number;
    /** Domain to restrict the search to; sent as the `X-Site` header. */
    site?: string;
    /** Country code for localized results; sent as the `X-Locale` header. */
    country?: string;
}
11
/** Optional settings for `JinaClient.read`. */
export interface ReadOptions {
    /** CSS selector of the element to extract; sent as `X-Target-Selector`. */
    target_selector?: string;
    /** CSS selector of elements to drop; sent as `X-Remove-Selector`. */
    remove_selector?: string;
    /** Token budget for the returned content; sent as `X-Token-Budget`. */
    max_tokens?: number;
}
16
/** Page data returned by `JinaClient.read`. */
export interface ReadResult {
    /** Page title from the reader response. */
    title: string;
    /** Page content from the reader response (markdown is requested). */
    content: string;
}
20
/** Result of `JinaClient.segment`. */
export interface SegmentResult {
    /** Token count reported by the segmenter response. */
    num_tokens: number;
    /** Text chunks reported by the segmenter response. */
    chunks: string[];
}
24
/**
 * Client for the Jina AI search, reader and segmenter HTTP endpoints.
 */
export declare class JinaClient {
    /** API key sent as a Bearer token with every request. */
    private apiKey;
    /**
     * @param apiKey Jina API key.
     */
    constructor(apiKey: string);
    /**
     * Runs a web search and resolves to title/URL/snippet entries.
     * Rejects when the API answers with a non-OK status.
     */
    search(query: string, options?: SearchOptions): Promise<SearchResult[]>;
    /**
     * Fetches `url` through the reader endpoint and resolves to its title and content.
     * Rejects when the API answers with a non-OK status.
     */
    read(url: string, options?: ReadOptions): Promise<ReadResult>;
    /**
     * Sends `content` to the segmenter endpoint and resolves to its token count and chunks.
     * Rejects when the API answers with a non-OK status.
     */
    segment(content: string): Promise<SegmentResult>;
}
@@ -0,0 +1,85 @@
1
/**
 * Thin HTTP client for three Jina AI endpoints: search (`s.jina.ai`),
 * reader (`r.jina.ai`) and segmenter (`segment.jina.ai`). Every request is a
 * JSON POST authorized with the API key as a Bearer token.
 */
export class JinaClient {
    apiKey;

    /**
     * @param {string} apiKey - Jina API key used for the Authorization header.
     */
    constructor(apiKey) {
        this.apiKey = apiKey;
    }

    /**
     * Searches the web.
     *
     * @param {string} query - Search query, sent as `q`.
     * @param {{num_results?: number, site?: string, country?: string}} [options]
     *   `num_results` is sent as `num` in the body, `site` as the `X-Site`
     *   header and `country` as the `X-Locale` header; falsy values are omitted.
     * @returns {Promise<Array<{title: string, url: string, snippet: string}>>}
     *   One entry per result; a missing title or description becomes "".
     * @throws {Error} On a non-OK HTTP status or when the response has no `data` array.
     */
    async search(query, options = {}) {
        const headers = {
            Accept: "application/json",
            "X-Return-Format": "markdown",
        };
        if (options.site) {
            headers["X-Site"] = options.site;
        }
        if (options.country) {
            headers["X-Locale"] = options.country;
        }
        const body = { q: query };
        if (options.num_results) {
            body.num = options.num_results;
        }
        const json = await this.#postJson("https://s.jina.ai/", "Jina Search API", headers, body);
        // Fail with a readable message instead of "Cannot read properties of undefined".
        if (!Array.isArray(json?.data)) {
            throw new Error("Jina Search API error: unexpected response (no `data` array)");
        }
        return json.data.map((item) => ({
            title: item.title ?? "",
            url: item.url,
            snippet: item.description ?? "",
        }));
    }

    /**
     * Fetches a page (or PDF) through the reader endpoint.
     *
     * @param {string} url - URL to read, sent as `url` in the body.
     * @param {{target_selector?: string, remove_selector?: string, max_tokens?: number}} [options]
     *   Sent as the `X-Target-Selector`, `X-Remove-Selector` and
     *   `X-Token-Budget` headers respectively; falsy values are omitted.
     * @returns {Promise<{title: string, content: string}>} Title ("" when the
     *   response has none) and content of the page.
     * @throws {Error} On a non-OK HTTP status or when the response carries no string content.
     */
    async read(url, options = {}) {
        const headers = {
            Accept: "application/json",
            "X-Return-Format": "markdown",
        };
        if (options.target_selector) {
            headers["X-Target-Selector"] = options.target_selector;
        }
        if (options.remove_selector) {
            headers["X-Remove-Selector"] = options.remove_selector;
        }
        if (options.max_tokens) {
            headers["X-Token-Budget"] = String(options.max_tokens);
        }
        const json = await this.#postJson("https://r.jina.ai/", "Jina Reader API", headers, { url });
        // Callers write `content` to disk, so reject anything that is not text up front.
        if (typeof json?.data?.content !== "string") {
            throw new Error("Jina Reader API error: unexpected response (no `data.content` string)");
        }
        return { title: json.data.title ?? "", content: json.data.content };
    }

    /**
     * Sends text to the segmenter endpoint, requesting chunks (not tokens)
     * with the `cl100k_base` tokenizer.
     *
     * @param {string} content - Text to segment.
     * @returns {Promise<{num_tokens: number, chunks: string[]}>} The
     *   `num_tokens` and `chunks` fields of the response, passed through as-is.
     * @throws {Error} On a non-OK HTTP status.
     */
    async segment(content) {
        const json = await this.#postJson("https://segment.jina.ai/", "Jina Segmenter API", {}, {
            content,
            tokenizer: "cl100k_base",
            return_tokens: false,
            return_chunks: true,
        });
        return { num_tokens: json.num_tokens, chunks: json.chunks };
    }

    /**
     * POSTs `payload` as JSON with the Bearer token and returns the parsed
     * JSON response.
     *
     * @param {string} endpoint - Absolute endpoint URL.
     * @param {string} label - API name used as the prefix of error messages.
     * @param {Record<string, string>} extraHeaders - Headers added to the defaults.
     * @param {object} payload - Request body.
     * @returns {Promise<any>} Parsed response body.
     * @throws {Error} `"<label> error: <status> <statusText>"`, followed by the
     *   first 300 characters of the response body when one is available.
     */
    async #postJson(endpoint, label, extraHeaders, payload) {
        const response = await fetch(endpoint, {
            method: "POST",
            headers: {
                Authorization: `Bearer ${this.apiKey}`,
                "Content-Type": "application/json",
                ...extraHeaders,
            },
            body: JSON.stringify(payload),
        });
        if (!response.ok) {
            let detail = "";
            try {
                detail = (await response.text()).trim().slice(0, 300);
            }
            catch {
                // Body unreadable: the status line alone is still reported below.
            }
            const suffix = detail ? ` - ${detail}` : "";
            throw new Error(`${label} error: ${response.status} ${response.statusText}${suffix}`);
        }
        return response.json();
    }
}
@@ -0,0 +1 @@
1
/**
 * Lists the headings of a markdown document as `L<line number>: <heading>`
 * entries, one per line. Headings inside fenced code blocks are skipped.
 *
 * @param markdown Markdown source text.
 * @returns The entries joined with newlines; an empty string when none are found.
 */
export declare function generateToc(
    markdown: string,
): string;
@@ -0,0 +1,16 @@
1
/** Fence line: up to 3 spaces, then a run of 3+ backticks or tildes, then the info string. */
const FENCE_PATTERN = /^ {0,3}(`{3,}|~{3,})(.*)$/;
/** ATX heading: 1-6 "#" at the start of the line followed by whitespace. */
const ATX_HEADING_PATTERN = /^#{1,6}\s/;
/** Setext underline: a line made only of "=" or only of "-". */
const SETEXT_UNDERLINE_PATTERN = /^ {0,3}(=+|-+)[ \t]*$/;
/** Plain text line: not blank, not indented code, not a quote, table row or list item. */
const PARAGRAPH_LINE_PATTERN = /^ {0,3}(?![>|]|[-*+]\s|\d+[.)]\s)\S/;
/** Thematic break such as "***" or "_ _ _". */
const THEMATIC_BREAK_PATTERN = /^ {0,3}([-*_])(?:[ \t]*\1){2,}[ \t]*$/;

/**
 * Builds a table of contents for a markdown document.
 *
 * Each heading becomes one `L<line number>: <heading>` entry (line numbers
 * are 1-based). Recognized headings:
 * - ATX headings (`# ...` to `###### ...`), reported verbatim;
 * - setext headings (a text line underlined with `===` or `---`), reported
 *   at the text line as `# <text>` or `## <text>`. Detection is a line-based
 *   heuristic: only the line directly above the underline is used.
 *
 * Headings inside fenced code blocks are ignored. A fence is closed only by
 * a bare fence of the same character that is at least as long as the opener,
 * so nested and `~~~` fences are tracked correctly. Both LF and CRLF line
 * endings are accepted.
 *
 * @param {string} markdown - Markdown source text.
 * @returns {string} Entries joined with "\n"; "" when no heading is found.
 */
export function generateToc(markdown) {
    const lines = markdown.split(/\r?\n/);
    const entries = [];
    let openFence = null; // { marker, length } while inside a fenced code block
    let paragraph = null; // trimmed text of the previous line when it can be a setext heading

    for (let index = 0; index < lines.length; index++) {
        const line = lines[index];
        const fence = FENCE_PATTERN.exec(line);

        if (openFence !== null) {
            const closesFence = fence !== null
                && fence[1][0] === openFence.marker
                && fence[1].length >= openFence.length
                && fence[2].trim() === "";
            if (closesFence) {
                openFence = null;
            }
            continue;
        }

        // A backtick fence cannot have backticks in its info string; such a
        // line is inline code (e.g. ```x```), not the start of a block.
        if (fence !== null && !(fence[1][0] === "`" && fence[2].includes("`"))) {
            openFence = { marker: fence[1][0], length: fence[1].length };
            paragraph = null;
            continue;
        }

        if (ATX_HEADING_PATTERN.test(line)) {
            entries.push(`L${index + 1}: ${line}`);
            paragraph = null;
            continue;
        }

        const underline = SETEXT_UNDERLINE_PATTERN.exec(line);
        if (underline !== null) {
            // Without preceding text, "---" is a thematic break, not a heading.
            if (paragraph !== null) {
                const hashes = underline[1][0] === "=" ? "#" : "##";
                // The heading text sits on the previous line: 1-based number === index.
                entries.push(`L${index}: ${hashes} ${paragraph}`);
            }
            paragraph = null;
            continue;
        }

        const isParagraphLine = PARAGRAPH_LINE_PATTERN.test(line) && !THEMATIC_BREAK_PATTERN.test(line);
        paragraph = isParagraphLine ? line.trim() : null;
    }
    return entries.join("\n");
}
@@ -0,0 +1,4 @@
1
+ import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
2
+ import { JinaClient } from "../services/jina-client.js";
3
+ import { FileManager } from "../services/file-manager.js";
4
/**
 * Registers the `jina_read` tool on `server`.
 *
 * @param server MCP server that receives the tool.
 * @param client Jina client used to fetch the page.
 * @param fileManager Writes the fetched page to disk.
 */
export declare function registerReadTool(
    server: McpServer,
    client: JinaClient,
    fileManager: FileManager,
): void;
@@ -0,0 +1,50 @@
1
+ import { z } from "zod";
2
+ import { generateToc } from "../services/toc-generator.js";
3
/**
 * Registers the `jina_read` tool on the given MCP server.
 *
 * The tool fetches a URL through `client.read`, stores the content with
 * `fileManager.savePage`, and answers with the file path, a line count, a
 * rough token estimate and a table of contents — never the content itself.
 * Any failure is reported as an error result instead of being thrown.
 *
 * @param server - MCP server exposing `tool(name, description, schema, handler)`.
 * @param client - Object with `read(url, options)` resolving to `{ title, content }`.
 * @param fileManager - Object with `savePage(content, url)` resolving to a file path.
 */
export function registerReadTool(server, client, fileManager) {
    const description = "Read a web page or PDF from URL, save as markdown to disk, and return file path with table of contents. Use the Read tool on the returned file_path to view content — you control how much to read via offset/limit.";

    const inputSchema = {
        url: z.string().url().describe("URL of web page or PDF to read"),
        max_tokens: z.number().positive().optional().describe("Truncate content to this many tokens (saves context window)"),
        target_selector: z.string().optional().describe("CSS selector — extract only this element from the page"),
        remove_selector: z.string().optional().describe("CSS selector — remove these elements before extraction"),
    };

    const handleRead = async ({ url, max_tokens, target_selector, remove_selector }) => {
        try {
            // Fetch via Jina Reader, then persist before building the summary.
            const readOptions = { target_selector, remove_selector, max_tokens };
            const { title, content } = await client.read(url, readOptions);
            const filePath = await fileManager.savePage(content, url);

            const toc = generateToc(content);
            const lineCount = content.split("\n").length;
            // Rough estimate: ~4 chars per token for English text
            const tokenEstimate = Math.round(content.length / 4);
            const tocSection = toc ? `**Table of Contents:**\n${toc}` : "(no headings found)";

            const summary = `**${title}**\n` +
                `File: ${filePath}\n` +
                `Lines: ${lineCount} | ~${tokenEstimate} tokens (estimate)\n` +
                `\n${tocSection}\n\n` +
                "Use Read tool on the file path above to view content. Use offset/limit to read specific sections.";

            return { content: [{ type: "text", text: summary }] };
        }
        catch (error) {
            const reason = error instanceof Error ? error.message : String(error);
            return {
                isError: true,
                content: [{ type: "text", text: `Failed to read URL: ${reason}` }],
            };
        }
    };

    server.tool("jina_read", description, inputSchema, handleRead);
}
@@ -0,0 +1,3 @@
1
+ import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
2
+ import { JinaClient } from "../services/jina-client.js";
3
/**
 * Registers the `jina_search` tool on `server`.
 *
 * @param server MCP server that receives the tool.
 * @param client Jina client used to run the search.
 */
export declare function registerSearchTool(
    server: McpServer,
    client: JinaClient,
): void;
@@ -0,0 +1,41 @@
1
+ import { z } from "zod";
2
/**
 * Registers the `jina_search` tool on the given MCP server.
 *
 * The tool forwards the query to `client.search` and answers with a numbered
 * list of title / URL / snippet entries, or "No results found." when the
 * list is empty. Any failure is reported as an error result instead of
 * being thrown.
 *
 * @param server - MCP server exposing `tool(name, description, schema, handler)`.
 * @param client - Object with `search(query, options)` resolving to
 *   `{ title, url, snippet }` entries.
 */
export function registerSearchTool(server, client) {
    const description = "Search the web using Jina Search API. Returns lightweight results (title, URL, snippet) without full page content. Use jina_read on interesting URLs to get full content saved to disk.";

    const inputSchema = {
        query: z.string().describe("Search query"),
        num_results: z.number().min(1).max(10).default(5).describe("Number of results (1-10, default 5)"),
        site: z.string().optional().describe("Restrict search to this domain, e.g. 'python.org'"),
        country: z.string().optional().describe("Country code for localized results, e.g. 'US', 'PL'"),
    };

    // Wraps plain text in the tool-result shape.
    const textResult = (text) => ({ content: [{ type: "text", text }] });

    const handleSearch = async ({ query, num_results, site, country }) => {
        try {
            const results = await client.search(query, { num_results, site, country });
            if (results.length === 0) {
                return textResult("No results found.");
            }
            const entries = [];
            results.forEach((result, position) => {
                entries.push(`${position + 1}. **${result.title}**\n ${result.url}\n ${result.snippet}`);
            });
            return textResult(`Found ${results.length} results:\n\n${entries.join("\n\n")}`);
        }
        catch (error) {
            const reason = error instanceof Error ? error.message : String(error);
            return {
                isError: true,
                content: [{ type: "text", text: `Search failed: ${reason}` }],
            };
        }
    };

    server.tool("jina_search", description, inputSchema, handleSearch);
}
package/package.json ADDED
@@ -0,0 +1,42 @@
1
+ {
2
+ "name": "webskim",
3
+ "version": "1.0.0",
4
+ "description": "Context-efficient web search and reading for AI agents. MCP server powered by Jina AI.",
5
+ "type": "module",
6
+ "main": "dist/index.js",
7
+ "files": [
8
+ "dist"
9
+ ],
10
+ "scripts": {
11
+ "build": "tsc",
12
+ "prepare": "npm run build",
13
+ "start": "node dist/index.js",
14
+ "dev": "tsc --watch",
15
+ "test": "vitest run",
16
+ "test:watch": "vitest"
17
+ },
18
+ "bin": {
19
+ "webskim": "dist/index.js"
20
+ },
21
+ "keywords": [
22
+ "mcp",
23
+ "model-context-protocol",
24
+ "jina",
25
+ "search",
26
+ "web",
27
+ "reader",
28
+ "ai",
29
+ "claude"
30
+ ],
31
+ "author": "",
32
+ "license": "MIT",
33
+ "dependencies": {
34
+ "@modelcontextprotocol/sdk": "^1.26.0",
35
+ "zod": "^4.3.6"
36
+ },
37
+ "devDependencies": {
38
+ "@types/node": "^25.3.0",
39
+ "typescript": "^5.9.3",
40
+ "vitest": "^4.0.18"
41
+ }
42
+ }