npm - webskim - Versions diffs - 1.0.0 - Mend

webskim 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

package/README.md +139 -0
package/dist/index.d.ts +2 -0
package/dist/index.js +24 -0
package/dist/services/file-manager.d.ts +6 -0
package/dist/services/file-manager.js +33 -0
package/dist/services/jina-client.d.ts +30 -0
package/dist/services/jina-client.js +85 -0
package/dist/services/toc-generator.d.ts +1 -0
package/dist/services/toc-generator.js +16 -0
package/dist/tools/read.d.ts +4 -0
package/dist/tools/read.js +50 -0
package/dist/tools/search.d.ts +3 -0
package/dist/tools/search.js +41 -0
package/package.json +42 -0

package/README.md ADDED Viewed

@@ -0,0 +1,139 @@
+# webskim
+Context-efficient web search and reading for AI agents. MCP server powered by [Jina AI](https://jina.ai).
+Built-in `WebFetch` dumps entire pages into context. One page = thousands of tokens gone.
+**webskim** saves pages to disk and returns a table of contents. Your agent reads only what it needs.
+## Prerequisites
+webskim uses [Jina AI](https://jina.ai) APIs under the hood — you need a **Jina API key** to use it.
+> **[Get your free API key at jina.ai](https://jina.ai)** — 1M tokens included, no credit card required.
+## Quick Start
+**Claude Code** — add to `.mcp.json` in your project:
+```json
+{
+  "mcpServers": {
+    "webskim": {
+      "command": "npx",
+      "args": ["-y", "webskim"],
+      "env": { "JINA_API_KEY": "jina_..." }
+    }
+  }
+}
+```
+**Claude Desktop** — add to `claude_desktop_config.json`:
+```json
+{
+  "mcpServers": {
+    "webskim": {
+      "command": "npx",
+      "args": ["-y", "webskim"],
+      "env": { "JINA_API_KEY": "jina_..." }
+    }
+  }
+}
+```
+**Cursor / Windsurf / other MCP clients** — same pattern, point at `npx -y webskim` with `JINA_API_KEY` in env.
+## How It Works
+```
+Agent: jina_search("react server components")
+  → 5 results: title, URL, snippet (minimal tokens)
+Agent: jina_read("https://react.dev/reference/rsc/server-components")
+  → Saved: .ai_pages/20260220_143052_react_dev__reference__rsc__server-components.md
+  → Lines: 342 | ~2800 tokens
+  → Table of Contents:
+      L1:   # Server Components
+      L18:  ## Reference
+      L45:  ## Usage
+      L89:  ### Fetching data
+      L156: ### Streaming
+Agent: Read(".ai_pages/..._server-components.md", offset=89, limit=67)
+  → reads only the section it needs
+```
+No full pages in context. No wasted tokens. The agent decides what to read.
+## Tools
+| Tool | What it does |
+|------|-------------|
+| `jina_search` | Web search → titles, URLs, snippets |
+| `jina_read` | Fetch URL/PDF → save as markdown, return TOC |
+### jina_search
+| Param | Description |
+|-------|-------------|
+| `query` | Search query |
+| `num_results` | 1–10 (default 5) |
+| `site` | Restrict to domain, e.g. `"python.org"` |
+| `country` | Locale code, e.g. `"US"`, `"PL"` |
+### jina_read
+| Param | Description |
+|-------|-------------|
+| `url` | Page or PDF URL |
+| `max_tokens` | Server-side truncation (saves context) |
+| `target_selector` | CSS — extract only this element |
+| `remove_selector` | CSS — remove elements before extraction |
+## Why webskim?
+**Context efficiency** — pages saved to `.ai_pages/` on disk, not dumped into context. Agent reads sections via offset/limit.
+**Tiny footprint** — ~190 tokens per tool definition in system prompt. Minimal overhead vs. built-in alternatives.
+**Smart search** — returns snippets, not full pages. Agent picks which URLs are worth reading.
+**PDF support** — Jina Reader handles PDFs natively. Same API, same workflow.
+**Server-side token budget** — `max_tokens` truncates on the server before content reaches your agent.
+**CSS selectors** — `target_selector` / `remove_selector` extract exactly the part of the page you need.
+**Clean markdown** — no HTML soup, no boilerplate, just readable content.
+**Fast and cheap** — search ~2.5s, read ~8s. Jina API costs $0.02/1M tokens.
+## Make It the Default
+Add this to your project's `CLAUDE.md` so your agent always prefers webskim over built-in tools:
+```markdown
+## Web Research
+Always use Jina MCP tools for web operations:
+- `jina_search` instead of `WebSearch`
+- `jina_read` instead of `WebFetch`
+Workflow: search → read URL to disk → Read file with offset/limit.
+WebSearch/WebFetch are fallback only.
+```
+Add `.ai_pages/` to your `.gitignore`.
+## Development
+```bash
+git clone <repo-url> && cd webskim
+npm install && npm run build
+npm test
+```
+## License
+MIT

package/dist/index.d.ts ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ #!/usr/bin/env node
2	+ export {};

package/dist/index.js ADDED Viewed

@@ -0,0 +1,24 @@
+#!/usr/bin/env node
+import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
+import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
+import { JinaClient } from "./services/jina-client.js";
+import { FileManager } from "./services/file-manager.js";
+import { registerSearchTool } from "./tools/search.js";
+import { registerReadTool } from "./tools/read.js";
+import { join } from "node:path";
+// Fail fast when the API key is missing: it is handed to the Jina client below.
+const { JINA_API_KEY: apiKey } = process.env;
+if (!apiKey) {
+    console.error("FATAL: JINA_API_KEY is required. Pass it via env in your MCP config.");
+    process.exit(1);
+}
+const server = new McpServer({ name: "webskim", version: "1.0.0" });
+const client = new JinaClient(apiKey);
+// Pages are stored in `.ai_pages` under the directory the process was started from.
+const pagesDir = join(process.cwd(), ".ai_pages");
+const fileManager = new FileManager(pagesDir);
+registerSearchTool(server, client);
+registerReadTool(server, client, fileManager);
+// The startup notice goes to stderr, like the fatal message above.
+await server.connect(new StdioServerTransport());
+console.error("webskim server started");

package/dist/services/file-manager.d.ts ADDED Viewed

@@ -0,0 +1,6 @@
+/** Saves fetched page content as timestamped markdown files under one base directory. */
+export declare class FileManager {
+    /** Directory that saved pages are written into. */
+    private baseDir;
+    /** @param baseDir Directory for saved pages; it is created on the first save if missing. */
+    constructor(baseDir: string);
+    /**
+     * Derives a `<timestamp>_<host>[__<path>].md` filename from a URL.
+     * Throws when `url` cannot be parsed by the `URL` constructor.
+     */
+    generateFilename(url: string): string;
+    /** Writes `content` as UTF-8 into the base directory and resolves to the written file's path. */
+    savePage(content: string, url: string): Promise<string>;
+}

package/dist/services/file-manager.js ADDED Viewed

@@ -0,0 +1,33 @@
+import { mkdir, writeFile } from "node:fs/promises";
+import { join } from "node:path";
+/** Longest slug (host + path part) kept in a filename, so the full name stays below the common 255-byte limit. */
+const MAX_SLUG_LENGTH = 200;
+/** Characters that are reserved in Windows filenames, plus ASCII control characters. */
+const UNSAFE_FILENAME_CHARS = /[<>:"|?*\\\x00-\x1f]/g;
+/**
+ * Saves fetched page content as timestamped markdown files under one base directory.
+ */
+export class FileManager {
+    baseDir;
+    /**
+     * @param {string} baseDir - Directory for saved pages; created on the first save if missing.
+     */
+    constructor(baseDir) {
+        this.baseDir = baseDir;
+    }
+    /**
+     * Builds a `<YYYYMMDD>_<HHMMSS>_<host>[__<path>].md` filename for a URL.
+     *
+     * Dots in the host become `_`, path separators become `__`, and the extension
+     * of the last path segment is dropped. The query string and fragment are ignored.
+     *
+     * @param {string} url - Absolute URL of the page.
+     * @returns {string} Filename only (no directory part).
+     * @throws {TypeError} When `url` cannot be parsed by the `URL` constructor.
+     */
+    generateFilename(url) {
+        const parsed = new URL(url);
+        const domain = parsed.hostname
+            .replace(/\./g, "_")
+            .replace(UNSAFE_FILENAME_CHARS, "_");
+        const path = parsed.pathname
+            .slice(1) // remove leading /
+            // Strip the extension of the LAST segment only. Excluding "/" keeps a dot in an
+            // earlier segment (e.g. "/v1.2/docs/intro") from swallowing the rest of the path.
+            .replace(/\.[^./]+$/, "")
+            .replace(/\//g, "__") // slashes to double underscores
+            // URL paths may carry characters such as ":" or "*" that some filesystems reject.
+            .replace(UNSAFE_FILENAME_CHARS, "_")
+            .replace(/_+$/, ""); // remove trailing underscores
+        // toISOString() is UTC: "2026-02-20T14:30:52.123Z" -> "20260220_143052".
+        const ts = new Date()
+            .toISOString()
+            .slice(0, 19)
+            .replace(/[-:]/g, "")
+            .replace("T", "_");
+        const slug = (path ? `${domain}__${path}` : domain).slice(0, MAX_SLUG_LENGTH);
+        return `${ts}_${slug}.md`;
+    }
+    /**
+     * Writes page content to a new file in the base directory.
+     *
+     * @param {string} content - Text to store, written as UTF-8.
+     * @param {string} url - Source URL, used only to derive the filename.
+     * @returns {Promise<string>} Path of the written file.
+     */
+    async savePage(content, url) {
+        await mkdir(this.baseDir, { recursive: true });
+        const filename = this.generateFilename(url);
+        const filePath = join(this.baseDir, filename);
+        await writeFile(filePath, content, "utf-8");
+        return filePath;
+    }
+}

package/dist/services/jina-client.d.ts ADDED Viewed

@@ -0,0 +1,30 @@
+/** One search hit as returned by `JinaClient.search`. */
+export interface SearchResult {
+    title: string;
+    url: string;
+    /** Taken from the `description` field of the API response item. */
+    snippet: string;
+}
+/** Optional settings for `JinaClient.search`. */
+export interface SearchOptions {
+    /** Sent as `num` in the request body when set. */
+    num_results?: number;
+    /** Sent as the `X-Site` request header when set. */
+    site?: string;
+    /** Sent as the `X-Locale` request header when set. */
+    country?: string;
+}
+/** Optional settings for `JinaClient.read`. */
+export interface ReadOptions {
+    /** Sent as the `X-Target-Selector` request header when set. */
+    target_selector?: string;
+    /** Sent as the `X-Remove-Selector` request header when set. */
+    remove_selector?: string;
+    /** Sent (stringified) as the `X-Token-Budget` request header when set. */
+    max_tokens?: number;
+}
+/** Page data returned by `JinaClient.read`, copied from `data.title` / `data.content` of the API response. */
+export interface ReadResult {
+    title: string;
+    content: string;
+}
+/** Result of `JinaClient.segment`, copied from the `num_tokens` / `chunks` fields of the API response. */
+export interface SegmentResult {
+    num_tokens: number;
+    chunks: string[];
+}
+/** Thin client for the Jina search, reader and segmenter HTTP endpoints. */
+export declare class JinaClient {
+    /** Sent as a Bearer token with every request. */
+    private apiKey;
+    constructor(apiKey: string);
+    /** POSTs the query to `https://s.jina.ai/`; rejects when the response status is not OK. */
+    search(query: string, options?: SearchOptions): Promise<SearchResult[]>;
+    /** POSTs the URL to `https://r.jina.ai/`; rejects when the response status is not OK. */
+    read(url: string, options?: ReadOptions): Promise<ReadResult>;
+    /** POSTs the content to `https://segment.jina.ai/`; rejects when the response status is not OK. */
+    segment(content: string): Promise<SegmentResult>;
+}

package/dist/services/jina-client.js ADDED Viewed

@@ -0,0 +1,85 @@
+/**
+ * Thin client for the Jina search, reader and segmenter HTTP endpoints.
+ * Every request is a JSON POST authenticated with the API key as a Bearer token.
+ */
+export class JinaClient {
+    apiKey;
+    /**
+     * @param {string} apiKey - Jina API key, sent as `Authorization: Bearer <key>`.
+     */
+    constructor(apiKey) {
+        this.apiKey = apiKey;
+    }
+    /**
+     * Sends one authenticated JSON POST and returns the parsed response body.
+     *
+     * @param {string} endpoint - Full endpoint URL.
+     * @param {string} apiName - Name used in error messages ("Search", "Reader", "Segmenter").
+     * @param {Record<string, string>} extraHeaders - Headers added to the auth/content-type pair.
+     * @param {object} payload - Request body, serialized with JSON.stringify.
+     * @returns {Promise<any>} Parsed JSON body.
+     * @throws {Error} When the response status is not OK.
+     */
+    async #postJson(endpoint, apiName, extraHeaders, payload) {
+        const response = await fetch(endpoint, {
+            method: "POST",
+            headers: {
+                Authorization: `Bearer ${this.apiKey}`,
+                "Content-Type": "application/json",
+                ...extraHeaders,
+            },
+            body: JSON.stringify(payload),
+        });
+        if (!response.ok) {
+            throw new Error(`Jina ${apiName} API error: ${response.status} ${response.statusText}`);
+        }
+        return response.json();
+    }
+    /**
+     * Runs a web search via `https://s.jina.ai/`.
+     *
+     * @param {string} query - Search query, sent as `q`.
+     * @param {{num_results?: number, site?: string, country?: string}} [options]
+     *   `num_results` is sent as `num`, `site` as `X-Site`, `country` as `X-Locale`.
+     * @returns {Promise<Array<{title: string, url: string, snippet: string}>>}
+     * @throws {Error} On a non-OK status, or when the body has no `data` array.
+     */
+    async search(query, options = {}) {
+        const headers = {
+            Accept: "application/json",
+            "X-Return-Format": "markdown",
+        };
+        if (options.site) {
+            headers["X-Site"] = options.site;
+        }
+        if (options.country) {
+            headers["X-Locale"] = options.country;
+        }
+        const body = { q: query };
+        if (options.num_results) {
+            body.num = options.num_results;
+        }
+        const json = await this.#postJson("https://s.jina.ai/", "Search", headers, body);
+        // A 2xx body without a data array would otherwise surface as an opaque TypeError.
+        if (!Array.isArray(json?.data)) {
+            throw new Error("Jina Search API error: unexpected response shape (missing data array)");
+        }
+        return json.data.map((item) => ({
+            title: item.title,
+            url: item.url,
+            snippet: item.description,
+        }));
+    }
+    /**
+     * Fetches a page as markdown via `https://r.jina.ai/`.
+     *
+     * @param {string} url - Page URL, sent as `url` in the request body.
+     * @param {{target_selector?: string, remove_selector?: string, max_tokens?: number}} [options]
+     *   Sent as `X-Target-Selector`, `X-Remove-Selector` and `X-Token-Budget`.
+     * @returns {Promise<{title: string, content: string}>} `title` is "" when the API omits it.
+     * @throws {Error} On a non-OK status, or when the body has no string `data.content`.
+     */
+    async read(url, options = {}) {
+        const headers = {
+            Accept: "application/json",
+            "X-Return-Format": "markdown",
+        };
+        if (options.target_selector) {
+            headers["X-Target-Selector"] = options.target_selector;
+        }
+        if (options.remove_selector) {
+            headers["X-Remove-Selector"] = options.remove_selector;
+        }
+        if (options.max_tokens) {
+            headers["X-Token-Budget"] = String(options.max_tokens);
+        }
+        const json = await this.#postJson("https://r.jina.ai/", "Reader", headers, { url });
+        const data = json?.data;
+        // Callers write `content` to disk, so reject anything that is not text right here.
+        if (typeof data?.content !== "string") {
+            throw new Error("Jina Reader API error: unexpected response shape (missing data.content)");
+        }
+        return { title: data.title ?? "", content: data.content };
+    }
+    /**
+     * Tokenizes and chunks text via `https://segment.jina.ai/` using the `cl100k_base` tokenizer.
+     *
+     * @param {string} content - Text to segment.
+     * @returns {Promise<{num_tokens: number, chunks: string[]}>}
+     * @throws {Error} On a non-OK status, or when the body is not a JSON object.
+     */
+    async segment(content) {
+        const json = await this.#postJson("https://segment.jina.ai/", "Segmenter", {}, {
+            content,
+            tokenizer: "cl100k_base",
+            return_tokens: false,
+            return_chunks: true,
+        });
+        if (json === null || typeof json !== "object") {
+            throw new Error("Jina Segmenter API error: unexpected response shape");
+        }
+        return { num_tokens: json.num_tokens, chunks: json.chunks };
+    }
+}

package/dist/services/toc-generator.d.ts ADDED Viewed

	@@ -0,0 +1 @@
1	+ export declare function generateToc(markdown: string): string; // Returns one "L<line>: <heading>" entry per markdown heading outside code fences, joined by newlines.

package/dist/services/toc-generator.js ADDED Viewed

@@ -0,0 +1,16 @@
+// A fence line: up to three spaces of indent, then a run of 3+ backticks or 3+ tildes.
+const FENCE_PATTERN = /^ {0,3}(`{3,}|~{3,})(.*)$/;
+/**
+ * Builds a line-numbered table of contents from the `#`-style headings of a markdown document.
+ *
+ * Headings inside fenced code blocks are skipped. Both backtick and tilde fences are
+ * recognised, including indented ones. A block is closed only by a fence of the same
+ * character that is at least as long as the opening one and has no trailing text.
+ * Both LF and CRLF line endings are accepted.
+ *
+ * @param {string} markdown - Markdown source.
+ * @returns {string} One `L<line>: <heading line>` entry per heading (1-based line numbers),
+ *   joined by "\n"; an empty string when there are no headings.
+ */
+export function generateToc(markdown) {
+    const entries = [];
+    // Marker run (e.g. "```" or "~~~~") that opened the current code block; null outside one.
+    let openFence = null;
+    markdown.split(/\r?\n/).forEach((line, index) => {
+        const fence = FENCE_PATTERN.exec(line);
+        if (openFence !== null) {
+            const closes = fence !== null
+                && fence[1][0] === openFence[0]
+                && fence[1].length >= openFence.length
+                && fence[2].trim() === "";
+            if (closes) {
+                openFence = null;
+            }
+            return; // everything inside a code block, fence-like or not, is code
+        }
+        // A backtick run followed by more backticks on the same line is inline code, not a fence.
+        if (fence !== null && !(fence[1][0] === "`" && fence[2].includes("`"))) {
+            openFence = fence[1];
+            return;
+        }
+        if (/^#{1,6}\s/.test(line)) {
+            entries.push(`L${index + 1}: ${line}`);
+        }
+    });
+    return entries.join("\n");
+}

package/dist/tools/read.d.ts ADDED Viewed

@@ -0,0 +1,4 @@
+import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
+import { JinaClient } from "../services/jina-client.js";
+import { FileManager } from "../services/file-manager.js";
+/**
+ * Registers the `jina_read` tool on the server. The tool reads a URL through `client`,
+ * saves the content through `fileManager`, and replies with the file path and a table of contents.
+ */
+export declare function registerReadTool(server: McpServer, client: JinaClient, fileManager: FileManager): void;

package/dist/tools/read.js ADDED Viewed

@@ -0,0 +1,50 @@
+import { z } from "zod";
+import { generateToc } from "../services/toc-generator.js";
+/**
+ * Registers the `jina_read` tool on the server.
+ *
+ * The tool fetches a page through `client.read`, stores it with `fileManager.savePage`,
+ * and replies with the file path, line count, a rough token estimate and a table of
+ * contents. Any failure is returned as an `isError` result instead of being thrown.
+ *
+ * @param server - MCP server exposing `tool(name, description, schema, handler)`.
+ * @param client - Provides `read(url, options)`.
+ * @param fileManager - Provides `savePage(content, url)`.
+ */
+export function registerReadTool(server, client, fileManager) {
+    const description = "Read a web page or PDF from URL, save as markdown to disk, and return file path with table of contents. Use the Read tool on the returned file_path to view content — you control how much to read via offset/limit.";
+    const inputSchema = {
+        url: z.string().url().describe("URL of web page or PDF to read"),
+        max_tokens: z.number().positive().optional().describe("Truncate content to this many tokens (saves context window)"),
+        target_selector: z.string().optional().describe("CSS selector — extract only this element from the page"),
+        remove_selector: z.string().optional().describe("CSS selector — remove these elements before extraction"),
+    };
+    // Wraps a message in the text-content envelope of a tool result.
+    const asText = (text) => [{ type: "text", text }];
+    const handler = async ({ url, max_tokens, target_selector, remove_selector }) => {
+        try {
+            const readOptions = {
+                target_selector: target_selector ?? undefined,
+                remove_selector: remove_selector ?? undefined,
+                max_tokens: max_tokens ?? undefined,
+            };
+            const page = await client.read(url, readOptions);
+            const savedPath = await fileManager.savePage(page.content, url);
+            const toc = generateToc(page.content);
+            const lineCount = page.content.split("\n").length;
+            // Roughly four characters per token for English text.
+            const tokenEstimate = Math.round(page.content.length / 4);
+            const tocSection = toc ? `**Table of Contents:**\n${toc}` : "(no headings found)";
+            const summary = [
+                `**${page.title}**`,
+                `File: ${savedPath}`,
+                `Lines: ${lineCount} | ~${tokenEstimate} tokens (estimate)`,
+                "",
+                tocSection,
+                "",
+                "Use Read tool on the file path above to view content. Use offset/limit to read specific sections.",
+            ];
+            return { content: asText(summary.join("\n")) };
+        }
+        catch (error) {
+            const reason = error instanceof Error ? error.message : String(error);
+            return { isError: true, content: asText(`Failed to read URL: ${reason}`) };
+        }
+    };
+    server.tool("jina_read", description, inputSchema, handler);
+}

package/dist/tools/search.d.ts ADDED Viewed

@@ -0,0 +1,3 @@
+import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
+import { JinaClient } from "../services/jina-client.js";
+/**
+ * Registers the `jina_search` tool on the server. The tool runs a search through `client`
+ * and replies with a numbered list of titles, URLs and snippets.
+ */
+export declare function registerSearchTool(server: McpServer, client: JinaClient): void;

package/dist/tools/search.js ADDED Viewed

@@ -0,0 +1,41 @@
+import { z } from "zod";
+/**
+ * Registers the `jina_search` tool on the server.
+ *
+ * The tool forwards the query and options to `client.search` and replies with a numbered
+ * list of title / URL / snippet entries. Any failure is returned as an `isError` result
+ * instead of being thrown.
+ *
+ * @param server - MCP server exposing `tool(name, description, schema, handler)`.
+ * @param client - Provides `search(query, options)`.
+ */
+export function registerSearchTool(server, client) {
+    const description = "Search the web using Jina Search API. Returns lightweight results (title, URL, snippet) without full page content. Use jina_read on interesting URLs to get full content saved to disk.";
+    const inputSchema = {
+        query: z.string().describe("Search query"),
+        num_results: z.number().min(1).max(10).default(5).describe("Number of results (1-10, default 5)"),
+        site: z.string().optional().describe("Restrict search to this domain, e.g. 'python.org'"),
+        country: z.string().optional().describe("Country code for localized results, e.g. 'US', 'PL'"),
+    };
+    // Wraps a message in the text-content envelope of a tool result.
+    const asText = (text) => [{ type: "text", text }];
+    const handler = async ({ query, num_results, site, country }) => {
+        try {
+            const hits = await client.search(query, { num_results, site, country });
+            if (!(hits.length > 0)) {
+                return { content: asText("No results found.") };
+            }
+            const listing = hits
+                .map((hit, position) => `${position + 1}. **${hit.title}**\n   ${hit.url}\n   ${hit.snippet}`)
+                .join("\n\n");
+            return { content: asText(`Found ${hits.length} results:\n\n${listing}`) };
+        }
+        catch (error) {
+            const reason = error instanceof Error ? error.message : String(error);
+            return { isError: true, content: asText(`Search failed: ${reason}`) };
+        }
+    };
+    server.tool("jina_search", description, inputSchema, handler);
+}

package/package.json ADDED Viewed

@@ -0,0 +1,42 @@
+{
+  "name": "webskim",
+  "version": "1.0.0",
+  "description": "Context-efficient web search and reading for AI agents. MCP server powered by Jina AI.",
+  "type": "module",
+  "main": "dist/index.js",
+  "files": [
+    "dist"
+  ],
+  "scripts": {
+    "build": "tsc",
+    "prepare": "npm run build",
+    "start": "node dist/index.js",
+    "dev": "tsc --watch",
+    "test": "vitest run",
+    "test:watch": "vitest"
+  },
+  "bin": {
+    "webskim": "dist/index.js"
+  },
+  "keywords": [
+    "mcp",
+    "model-context-protocol",
+    "jina",
+    "search",
+    "web",
+    "reader",
+    "ai",
+    "claude"
+  ],
+  "author": "",
+  "license": "MIT",
+  "dependencies": {
+    "@modelcontextprotocol/sdk": "^1.26.0",
+    "zod": "^4.3.6"
+  },
+  "devDependencies": {
+    "@types/node": "^25.3.0",
+    "typescript": "^5.9.3",
+    "vitest": "^4.0.18"
+  }
+}