newskit-mcp-server 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,57 @@
1
+ # Contributing to NewsKit MCP Server
2
+
3
+ Thank you for your interest in contributing! This document provides guidelines for contributing to this project.
4
+
5
+ ## Development Setup
6
+
7
+ 1. Clone the repository:
8
+ ```bash
9
+ git clone https://github.com/CodeAKrome/newskit-mcp-server.git
10
+ cd newskit-mcp-server
11
+ ```
12
+
13
+ 2. Install Node.js dependencies:
14
+ ```bash
15
+ npm install
16
+ ```
17
+
18
+ 3. Install Python dependencies:
19
+ ```bash
20
+ pip install chromadb sentence-transformers pandas numpy scikit-learn
21
+ ```
22
+
23
+ 4. Build the project:
24
+ ```bash
25
+ npm run build
26
+ ```
27
+
28
+ ## Making Changes
29
+
30
+ 1. Create a new branch for your feature or bug fix
31
+ 2. Make your changes
32
+ 3. Test thoroughly
33
+ 4. Update documentation if needed
34
+ 5. Submit a pull request
35
+
36
+ ## Code Style
37
+
38
+ - TypeScript: Use strict mode, follow existing patterns
39
+ - Python: Follow PEP 8 style guide
40
+ - Keep functions focused and well-documented
41
+
42
+ ## Testing
43
+
44
+ Before submitting a PR:
45
+ - Test all four tools (categorize_articles, load_articles, search_similar, get_categories)
46
+ - Verify the build compiles without errors
47
+ - Test with sample data
48
+
49
+ ## Pull Request Process
50
+
51
+ 1. Ensure your PR description clearly describes the problem and solution
52
+ 2. Reference any related issues
53
+ 3. Wait for review and address feedback
54
+
55
+ ## Questions?
56
+
57
+ Open an issue for questions or discussion.
package/Dockerfile ADDED
@@ -0,0 +1,63 @@
1
# Multi-stage build for NewsKit MCP Server
FROM node:20-slim AS builder

WORKDIR /app

# Copy package files
COPY package*.json ./
COPY tsconfig.json ./

# Install dependencies.
# --ignore-scripts keeps npm from running the package's "prepare" script
# (npm run build) at this point, where src/ has not been copied in yet and
# the TypeScript compiler would have nothing to compile.
RUN npm ci --ignore-scripts

# Copy source code
COPY src/ ./src/

# Build the project
RUN npm run build

# Production stage
FROM node:20-slim

WORKDIR /app

# Install Python and required packages
RUN apt-get update && apt-get install -y \
    python3 \
    python3-pip \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies (no pip cache kept in the image layer)
RUN pip3 install --no-cache-dir --break-system-packages \
    chromadb \
    sentence-transformers \
    pandas \
    numpy \
    scikit-learn

# Copy package files
COPY package*.json ./

# Install production Node.js dependencies.
# --omit=dev replaces the deprecated --production flag; --ignore-scripts is
# required because "prepare" would otherwise invoke tsc, which is a
# devDependency and is not installed in this stage.
RUN npm ci --omit=dev --ignore-scripts

# Copy built files from builder
COPY --from=builder /app/build ./build

# Copy Python bridge
# NOTE(review): python_bridge.py imports file_handler, embeddings,
# chroma_manager, categorizer and config, none of which are copied into this
# image - confirm where those modules are expected to come from.
COPY python_bridge.py ./

# Set executable permissions
RUN chmod +x build/index.js

# Create directory for ChromaDB
RUN mkdir -p /app/chroma_db

# Set environment variables
ENV NODE_ENV=production
ENV PYTHONPATH=/app

# Expose no ports (uses stdio transport)

# Run the server
ENTRYPOINT ["node", "build/index.js"]
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 NewsKit Contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,174 @@
1
+ # NewsKit MCP Server
2
+
3
+ An MCP server for intelligent news article categorization using embeddings and clustering. Automatically groups similar articles together and generates human-readable category names.
4
+
5
+ ## Features
6
+
7
+ - **Semantic Categorization**: Uses sentence-transformers to generate embeddings and DBSCAN clustering to group similar articles
8
+ - **ChromaDB Integration**: Stores article embeddings for fast semantic search
9
+ - **Automatic Category Naming**: Uses TF-IDF to extract keywords and generate descriptive category names
10
+ - **Configurable Parameters**: Adjust similarity thresholds and minimum cluster sizes to fine-tune results
11
+ - **Search Capability**: Find semantically similar articles using natural language queries
12
+
13
+ ## Tools
14
+
15
+ ### categorize_articles
16
+
17
+ Run the full categorization pipeline on a TSV file of news articles.
18
+
19
+ **Parameters:**
20
+ - `inputPath` (required): Path to TSV file with `article_id` and `title` columns
21
+ - `outputPath` (optional): Output JSON file path (default: `categories.json`)
22
+ - `minClusterSize` (optional): Minimum articles per category (default: 2)
23
+ - `similarityThreshold` (optional): Cosine similarity threshold 0-1 (default: 0.75)
24
+ - `persistDir` (optional): ChromaDB storage directory (default: `./chroma_db`)
25
+
26
+ **Example:**
27
+ ```json
28
+ {
29
+ "inputPath": "/path/to/articles.tsv",
30
+ "outputPath": "/path/to/categories.json",
31
+ "similarityThreshold": 0.8,
32
+ "minClusterSize": 3
33
+ }
34
+ ```
35
+
36
+ ### load_articles
37
+
38
+ Preview articles from a TSV file without categorizing.
39
+
40
+ **Parameters:**
41
+ - `inputPath` (required): Path to TSV file
42
+ - `limit` (optional): Maximum articles to return (default: 50)
43
+
44
+ ### search_similar
45
+
46
+ Search for semantically similar articles using natural language queries.
47
+
48
+ **Parameters:**
49
+ - `query` (required): Search query text
50
+ - `persistDir` (optional): ChromaDB directory (default: `./chroma_db`)
51
+ - `nResults` (optional): Number of results (default: 5, max: 20)
52
+
53
+ ### get_categories
54
+
55
+ Display categorized results from a JSON output file.
56
+
57
+ **Parameters:**
58
+ - `resultsPath` (required): Path to categories.json file
59
+
60
+ ## Installation
61
+
62
+ ### Prerequisites
63
+
64
+ - Node.js 18 or higher
65
+ - Python 3.8 or higher
66
+ - Python dependencies: `pip install chromadb sentence-transformers pandas numpy scikit-learn`
67
+
68
+ ### From NPM
69
+
70
+ ```bash
71
+ npm install -g newskit-mcp-server
72
+ ```
73
+
74
+ ### From Source
75
+
76
+ ```bash
77
+ git clone https://github.com/CodeAKrome/newskit-mcp-server.git
78
+ cd newskit-mcp-server
79
+ npm install
80
+ npm run build
81
+ ```
82
+
83
+ ## Configuration
84
+
85
+ Add to your MCP settings file:
86
+
87
+ ```json
88
+ {
89
+ "mcpServers": {
90
+ "newskit": {
91
+ "command": "node",
92
+ "args": ["/path/to/newskit-mcp-server/build/index.js"],
93
+ "disabled": false,
94
+ "alwaysAllow": [],
95
+ "disabledTools": []
96
+ }
97
+ }
98
+ }
99
+ ```
100
+
101
+ Or if installed via npm:
102
+
103
+ ```json
104
+ {
105
+ "mcpServers": {
106
+ "newskit": {
107
+ "command": "npx",
108
+ "args": ["newskit-mcp-server"],
109
+ "disabled": false
110
+ }
111
+ }
112
+ }
113
+ ```
114
+
115
+ ## Input Format
116
+
117
+ The input TSV file should have two columns:
118
+ - `article_id`: Unique identifier for the article
119
+ - `title`: Article title text
120
+
121
+ Example:
122
+ ```tsv
123
+ article_id title
124
+ abc123 Venezuela releases over 100 political prisoners
125
+ def456 Seahawks advance to Super Bowl with thrilling win
126
+ ```
127
+
128
+ ## Output Format
129
+
130
+ The output JSON file contains:
131
+
132
+ ```json
133
+ {
134
+ "categories": [
135
+ {
136
+ "category_id": 1,
137
+ "category_name": "Venezuela / Prisoners",
138
+ "article_count": 3,
139
+ "articles": [
140
+ {"article_id": "abc123", "title": "Venezuela releases..."}
141
+ ]
142
+ }
143
+ ],
144
+ "uncategorized": [
145
+ {"article_id": "xyz789", "title": "Unique article..."}
146
+ ]
147
+ }
148
+ ```
149
+
150
+ ## Tuning Guide
151
+
152
+ | Goal | Parameter Adjustment |
153
+ |------|---------------------|
154
+ | Looser grouping (more articles end up in a category) | Lower `similarityThreshold` (try 0.65) |
155
+ | Fewer, tighter categories | Raise `similarityThreshold` (try 0.85) |
156
+ | Only major categories | Raise `minClusterSize` (try 5) |
157
+ | Include smaller clusters | Lower `minClusterSize` (try 2) |
158
+
159
+ ## Architecture
160
+
161
+ - **TypeScript MCP Server**: Provides the tool interface via stdio transport
162
+ - **Python Bridge**: Interfaces with ML libraries (sentence-transformers, scikit-learn)
163
+ - **ChromaDB**: Vector database for embedding storage and similarity search
164
+ - **Sentence-Transformers**: all-MiniLM-L6-v2 model for generating embeddings
165
+ - **DBSCAN**: Clustering algorithm for grouping similar articles
166
+ - **TF-IDF**: Keyword extraction for automatic category naming
167
+
168
+ ## License
169
+
170
+ MIT License - See LICENSE file for details
171
+
172
+ ## Contributing
173
+
174
+ Contributions welcome! Please read CONTRIBUTING.md for guidelines.
package/build/index.js ADDED
@@ -0,0 +1,237 @@
1
+ #!/usr/bin/env node
2
+ import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
3
+ import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
4
+ import { z } from "zod";
5
+ import { spawn } from "child_process";
6
+ import * as path from "path";
7
+ import * as fs from "fs";
8
+ import { fileURLToPath } from "url";
9
// Get __dirname equivalent in ES modules
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
// Path to the Python script bridge (one directory above this file)
const PYTHON_BRIDGE_PATH = path.join(__dirname, "..", "python_bridge.py");
// Working directory and PYTHONPATH handed to the bridge process.
// NOTE(review): this resolves two directories above this file - confirm that
// is where the Python modules live once the package is installed.
const SRC_DIR = path.join(__dirname, "..", "..", "src");

/**
 * Find the bridge's JSON payload in its captured stdout.
 *
 * Lines are scanned from the end because the bridge prints its JSON result
 * last; anything printed earlier is treated as log noise. A line that merely
 * looks like JSON (for example "[INFO] ...") is skipped instead of aborting
 * the search.
 *
 * @param {string} stdout - Everything the bridge wrote to stdout.
 * @returns {*} The parsed object/array, or undefined when no line parses.
 */
function parseLastJsonLine(stdout) {
    const lines = stdout.split(/\r?\n/);
    for (let index = lines.length - 1; index >= 0; index -= 1) {
        const line = lines[index].trim();
        if (!line.startsWith("{") && !line.startsWith("[")) {
            continue;
        }
        try {
            return JSON.parse(line);
        }
        catch {
            // Looked like JSON but is not; keep scanning earlier lines.
        }
    }
    return undefined;
}

/**
 * Execute the Python bridge with the given arguments and return its output.
 *
 * @param {string[]} args - Sub-command and flags passed to python_bridge.py.
 * @returns {Promise<*>} Resolves with the parsed JSON payload, or with
 *   `{ success: true, output }` (trimmed stdout) when stdout holds no JSON.
 * @throws {Error} Rejects when python3 cannot be started or exits non-zero.
 *   For a non-zero exit the message prefers the `error` field of the JSON the
 *   bridge printed, then stderr, then raw stdout.
 */
function runPythonBridge(args) {
    return new Promise((resolve, reject) => {
        const pythonProcess = spawn("python3", [PYTHON_BRIDGE_PATH, ...args], {
            cwd: SRC_DIR,
            env: { ...process.env, PYTHONPATH: SRC_DIR },
        });
        let stdout = "";
        let stderr = "";
        pythonProcess.stdout.on("data", (data) => {
            stdout += data.toString();
        });
        pythonProcess.stderr.on("data", (data) => {
            stderr += data.toString();
        });
        pythonProcess.on("error", (error) => {
            reject(new Error(`Failed to start Python: ${error.message}`, { cause: error }));
        });
        pythonProcess.on("close", (code) => {
            const payload = parseLastJsonLine(stdout);
            if (code !== 0) {
                // The bridge reports failures as {"error": "..."} on stdout;
                // surface that before unrelated warnings written to stderr.
                const reported = typeof payload?.error === "string" ? payload.error : "";
                reject(new Error(`Python bridge failed: ${reported || stderr || stdout}`));
                return;
            }
            resolve(payload ?? { success: true, output: stdout.trim() });
        });
    });
}
58
// Create MCP server
const server = new McpServer({
    name: "newskit-mcp-server",
    version: "1.0.0",
});
// Tool: Categorize articles from TSV file.
// Validates the input path, forwards the options to the Python bridge's
// "categorize" command and returns the bridge's summary as pretty-printed JSON.
server.tool("categorize_articles", {
    inputPath: z.string().describe("Path to input TSV file with article_id and title columns"),
    outputPath: z.string().optional().describe("Path to output JSON file (default: categories.json)"),
    minClusterSize: z.number().min(1).optional().describe("Minimum articles per category (default: 2)"),
    similarityThreshold: z.number().min(0).max(1).optional().describe("Cosine similarity threshold 0-1 (default: 0.75)"),
    persistDir: z.string().optional().describe("ChromaDB storage directory (default: ./chroma_db)")
}, async ({ inputPath, outputPath, minClusterSize, similarityThreshold, persistDir }) => {
    try {
        // The bridge process runs with a different working directory, so the
        // path is made absolute here: the file that is checked is then the
        // same file the bridge reads.
        const resolvedInput = path.resolve(inputPath);
        if (!fs.existsSync(resolvedInput)) {
            return {
                content: [{ type: "text", text: `Error: Input file not found: ${inputPath}` }],
                isError: true
            };
        }
        const args = [
            "categorize",
            "--input", resolvedInput,
            // An empty string is not a usable path, so || is intended here.
            "--output", outputPath || "categories.json",
            // ?? rather than ||: a threshold of 0 is allowed by the schema
            // and must not be replaced by the default.
            "--min-cluster-size", String(minClusterSize ?? 2),
            "--similarity-threshold", String(similarityThreshold ?? 0.75),
            "--persist-dir", persistDir || "./chroma_db"
        ];
        const result = await runPythonBridge(args);
        return {
            content: [
                {
                    type: "text",
                    text: `Categorization complete!\n\nResults:\n${JSON.stringify(result, null, 2)}`
                }
            ]
        };
    }
    catch (error) {
        return {
            content: [
                {
                    type: "text",
                    text: `Categorization failed: ${error instanceof Error ? error.message : String(error)}`
                }
            ],
            isError: true
        };
    }
});
109
// Tool: Load and view articles from TSV.
// Validates the input path, asks the Python bridge's "load" command for up to
// `limit` articles and returns them as pretty-printed JSON.
server.tool("load_articles", {
    inputPath: z.string().describe("Path to input TSV file with article_id and title columns"),
    limit: z.number().optional().describe("Maximum number of articles to return (default: 50)")
}, async ({ inputPath, limit }) => {
    try {
        // The bridge process runs with a different working directory, so the
        // path is made absolute here: the file that is checked is then the
        // same file the bridge reads.
        const resolvedInput = path.resolve(inputPath);
        if (!fs.existsSync(resolvedInput)) {
            return {
                content: [{ type: "text", text: `Error: Input file not found: ${inputPath}` }],
                isError: true
            };
        }
        const args = [
            "load",
            "--input", resolvedInput,
            // ?? rather than ||: an explicit limit of 0 must stay 0.
            "--limit", String(limit ?? 50)
        ];
        const result = await runPythonBridge(args);
        // Without an articles array the summary below would read
        // "undefined total"; report the unexpected output instead.
        if (!Array.isArray(result?.articles)) {
            return {
                content: [
                    {
                        type: "text",
                        text: `Failed to load articles: unexpected bridge output: ${JSON.stringify(result)}`
                    }
                ],
                isError: true
            };
        }
        return {
            content: [
                {
                    type: "text",
                    text: `Articles loaded (${result.count} total):\n\n${JSON.stringify(result.articles, null, 2)}`
                }
            ]
        };
    }
    catch (error) {
        return {
            content: [
                {
                    type: "text",
                    text: `Failed to load articles: ${error instanceof Error ? error.message : String(error)}`
                }
            ],
            isError: true
        };
    }
});
148
// Tool: Search similar articles in ChromaDB.
// Forwards the query to the Python bridge's "search" command and returns the
// bridge's response as pretty-printed JSON.
server.tool("search_similar", {
    query: z.string().describe("Search query to find similar articles"),
    persistDir: z.string().optional().describe("ChromaDB storage directory (default: ./chroma_db)"),
    nResults: z.number().min(1).max(20).optional().describe("Number of results to return (default: 5)")
}, async ({ query, persistDir, nResults }) => {
    try {
        const storageDir = persistDir || "./chroma_db";
        const resultCount = String(nResults || 5);
        const bridgeReply = await runPythonBridge([
            "search",
            "--query", query,
            "--persist-dir", storageDir,
            "--n-results", resultCount
        ]);
        const rendered = JSON.stringify(bridgeReply, null, 2);
        const success = { type: "text", text: `Search results for "${query}":\n\n${rendered}` };
        return { content: [success] };
    }
    catch (error) {
        const reason = error instanceof Error ? error.message : String(error);
        const failure = { type: "text", text: `Search failed: ${reason}` };
        return { content: [failure], isError: true };
    }
});
183
// Tool: Get categories from results file.
// Reads the JSON results file and renders every category with its articles,
// followed by a preview of at most ten uncategorized articles.
server.tool("get_categories", {
    resultsPath: z.string().describe("Path to the categories.json results file")
}, async ({ resultsPath }) => {
    const PREVIEW_LIMIT = 10;
    // One bullet line per article; shared by both sections of the report.
    const articleLine = (article) => ` - ${article.article_id}: ${article.title}\n`;
    try {
        if (!fs.existsSync(resultsPath)) {
            const missing = { type: "text", text: `Error: Results file not found: ${resultsPath}` };
            return { content: [missing], isError: true };
        }
        const parsed = JSON.parse(fs.readFileSync(resultsPath, "utf-8"));
        const pieces = ["Categories:\n\n"];
        if (parsed.categories) {
            for (const category of parsed.categories) {
                pieces.push(`Category ${category.category_id}: "${category.category_name}" (${category.article_count} articles)\n`);
                for (const article of category.articles) {
                    pieces.push(articleLine(article));
                }
                pieces.push("\n");
            }
        }
        const leftovers = parsed.uncategorized;
        if (leftovers && leftovers.length > 0) {
            pieces.push(`Uncategorized: ${leftovers.length} articles\n`);
            for (const article of leftovers.slice(0, PREVIEW_LIMIT)) {
                pieces.push(articleLine(article));
            }
            if (leftovers.length > PREVIEW_LIMIT) {
                pieces.push(` ... and ${leftovers.length - PREVIEW_LIMIT} more\n`);
            }
        }
        return { content: [{ type: "text", text: pieces.join("") }] };
    }
    catch (error) {
        const reason = error instanceof Error ? error.message : String(error);
        return {
            content: [{ type: "text", text: `Failed to read categories: ${reason}` }],
            isError: true
        };
    }
});
231
// Start the server
/**
 * Connect the MCP server to a stdio transport.
 *
 * @returns {Promise<void>} Resolves once the server is connected.
 */
async function main() {
    const transport = new StdioServerTransport();
    await server.connect(transport);
    // Status goes to stderr, presumably because the stdio transport uses
    // stdout for protocol traffic - TODO confirm.
    console.error("NewsKit MCP server running on stdio");
}
// Never leave the startup promise floating: report the failure and exit
// non-zero so the launching client can tell the server did not start.
main().catch((error) => {
    console.error(`NewsKit MCP server failed to start: ${error instanceof Error ? error.message : String(error)}`);
    process.exit(1);
});
package/package.json ADDED
@@ -0,0 +1,47 @@
1
+ {
2
+ "name": "newskit-mcp-server",
3
+ "version": "1.0.0",
4
+ "description": "MCP server for intelligent news article categorization using embeddings and clustering",
5
+ "main": "build/index.js",
6
+ "type": "module",
7
+ "bin": {
8
+ "newskit-mcp-server": "build/index.js"
9
+ },
10
+ "scripts": {
11
+ "build": "tsc && node -e \"require('fs').chmodSync('build/index.js', '755')\"",
12
+ "dev": "ts-node --esm src/index.ts",
13
+ "prepare": "npm run build"
14
+ },
15
+ "keywords": [
16
+ "mcp",
17
+ "mcp-server",
18
+ "news",
19
+ "categorization",
20
+ "clustering",
21
+ "embeddings",
22
+ "nlp",
23
+ "machine-learning"
24
+ ],
25
+ "author": "",
26
+ "license": "MIT",
27
+ "repository": {
28
+ "type": "git",
29
+ "url": "git+https://github.com/CodeAKrome/newskit-mcp-server.git"
30
+ },
31
+ "bugs": {
32
+ "url": "https://github.com/CodeAKrome/newskit-mcp-server/issues"
33
+ },
34
+ "homepage": "https://github.com/CodeAKrome/newskit-mcp-server#readme",
35
+ "engines": {
36
+ "node": ">=18.0.0"
37
+ },
38
+ "dependencies": {
39
+ "@modelcontextprotocol/sdk": "^1.25.3",
40
+ "zod": "^4.3.6"
41
+ },
42
+ "devDependencies": {
43
+ "@types/node": "^25.1.0",
44
+ "ts-node": "^10.9.2",
45
+ "typescript": "^5.9.3"
46
+ }
47
+ }
@@ -0,0 +1,189 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Python bridge script for NewsKit MCP server.
4
+ This script provides a command-line interface to the NewsKit functionality
5
+ that can be called from the TypeScript MCP server.
6
+ """
7
+
8
+ import argparse
9
+ import json
10
+ import sys
11
+ import os
12
+
13
+ # Add the parent directory to Python path to find src
14
+ parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
15
+ src_dir = os.path.join(parent_dir, 'src')
16
+ sys.path.insert(0, parent_dir)
17
+ sys.path.insert(0, src_dir)
18
+
19
+ from file_handler import load_articles, export_results
20
+ from embeddings import generate_embeddings
21
+ from chroma_manager import initialize_chroma, add_articles_to_chroma
22
+ from categorizer import cluster_articles, generate_category_names, format_results
23
+ from chromadb.config import Settings
24
+ import config
25
+
26
+ import chromadb
27
+
28
+
29
def cmd_categorize(args):
    """Run the end-to-end categorization pipeline for one input file.

    Reads ``args.input``, embeds the article titles, stores them in the
    ChromaDB collection under ``args.persist_dir``, clusters the embeddings,
    names the clusters and writes the formatted results to ``args.output``.

    Prints a single JSON object to stdout: a summary on success, or
    ``{"error": ...}`` on failure.

    Returns:
        0 on success, 1 when the input has no articles or any step raises.
    """
    try:
        frame = load_articles(args.input)
        if frame.empty:
            print(json.dumps({"error": "No articles found in input file"}))
            return 1

        # Embed the titles, then persist them alongside the articles.
        vectors = generate_embeddings(frame['title'].tolist())
        store = initialize_chroma(
            collection_name=config.COLLECTION_NAME,
            persist_dir=args.persist_dir,
        )
        add_articles_to_chroma(store, frame, vectors)

        # The similarity threshold is turned into a distance radius for DBSCAN.
        min_size = args.min_cluster_size
        radius = 1 - args.similarity_threshold
        labels = cluster_articles(
            vectors,
            method="dbscan",
            min_samples=min_size,
            eps=radius,
        )

        named = generate_category_names(frame, labels, min_size)
        results = format_results(named, frame, labels)
        export_results(results, args.output)

        print(json.dumps({
            "success": True,
            "total_articles": len(frame),
            "categories_count": len(results['categories']),
            "uncategorized_count": len(results['uncategorized']),
            "output_file": args.output,
        }))
        return 0
    except Exception as exc:
        print(json.dumps({"error": str(exc)}))
        return 1
85
+
86
+
87
def cmd_load(args):
    """Load articles from a TSV file and print a preview as JSON.

    Prints ``{"count": <total rows>, "articles": [...]}`` with at most
    ``args.limit`` articles, or ``{"error": ...}`` on failure.

    Returns:
        0 on success, 1 when loading or serialization raises.
    """
    def json_safe(value):
        # NaN is the only float that is unequal to itself. json.dumps would
        # emit it as a bare NaN token, which is not valid JSON and cannot be
        # parsed by the Node caller, so a missing value becomes null.
        if isinstance(value, float) and value != value:
            return None
        return value

    try:
        # Assumes load_articles returns a pandas DataFrame - TODO confirm.
        articles_df = load_articles(args.input)

        # A negative limit would make head() drop rows from the end instead
        # of limiting the preview, so clamp it at zero.
        limit = max(args.limit, 0)

        # Convert to list of dicts
        articles = articles_df.head(limit).to_dict('records')

        result = {
            "count": len(articles_df),
            "articles": [
                {"article_id": str(a['article_id']), "title": json_safe(a['title'])}
                for a in articles
            ]
        }
        print(json.dumps(result))
        return 0

    except Exception as e:
        print(json.dumps({"error": str(e)}))
        return 1
108
+
109
+
110
def cmd_search(args):
    """Query the persisted ChromaDB collection for articles similar to a text.

    Prints ``{"query": ..., "results": [...]}`` to stdout, where each result
    carries the article id, title, metadata and distance. Prints
    ``{"error": ...}`` when the collection is missing or anything else fails.

    Returns:
        0 on success, 1 on any failure.
    """
    try:
        client = chromadb.PersistentClient(
            path=args.persist_dir,
            settings=Settings(anonymized_telemetry=False, allow_reset=True),
        )

        # The collection only exists after a categorization run.
        try:
            collection = client.get_collection(name=config.COLLECTION_NAME)
        except Exception:
            print(json.dumps({"error": "Collection not found. Run categorization first."}))
            return 1

        results = collection.query(query_texts=[args.query], n_results=args.n_results)

        # Only one query text was sent, so only the first row of each field is read.
        hits = []
        if results['ids'] and len(results['ids']) > 0:
            hits = [
                {
                    "article_id": article_id,
                    "title": results['documents'][0][pos] if results['documents'] else "",
                    "metadata": results['metadatas'][0][pos] if results['metadatas'] else {},
                    "distance": results['distances'][0][pos] if results['distances'] else None,
                }
                for pos, article_id in enumerate(results['ids'][0])
            ]

        print(json.dumps({"query": args.query, "results": hits}))
        return 0
    except Exception as exc:
        print(json.dumps({"error": str(exc)}))
        return 1
150
+
151
+
152
def main():
    """Parse the command line and dispatch to the matching sub-command.

    Returns:
        The sub-command's exit status, or 1 (after printing the help text)
        when no sub-command was given.
    """
    parser = argparse.ArgumentParser(description="NewsKit Python Bridge")
    commands = parser.add_subparsers(dest='command', help='Command to run')

    # categorize: full pipeline
    categorize = commands.add_parser('categorize', help='Categorize articles')
    categorize.add_argument('--input', required=True, help='Input TSV file path')
    categorize.add_argument('--output', default='categories.json', help='Output JSON file path')
    categorize.add_argument('--min-cluster-size', type=int, default=2, help='Minimum cluster size')
    categorize.add_argument('--similarity-threshold', type=float, default=0.75, help='Similarity threshold')
    categorize.add_argument('--persist-dir', default='./chroma_db', help='ChromaDB persistence directory')

    # load: preview a TSV file
    load = commands.add_parser('load', help='Load articles from TSV')
    load.add_argument('--input', required=True, help='Input TSV file path')
    load.add_argument('--limit', type=int, default=50, help='Maximum articles to return')

    # search: query the stored embeddings
    search = commands.add_parser('search', help='Search similar articles')
    search.add_argument('--query', required=True, help='Search query')
    search.add_argument('--persist-dir', default='./chroma_db', help='ChromaDB persistence directory')
    search.add_argument('--n-results', type=int, default=5, help='Number of results')

    args = parser.parse_args()

    # Lambdas keep the handler lookup lazy: a handler is only touched when
    # its sub-command was actually selected.
    dispatch = {
        'categorize': lambda: cmd_categorize(args),
        'load': lambda: cmd_load(args),
        'search': lambda: cmd_search(args),
    }
    handler = dispatch.get(args.command)
    if handler is None:
        parser.print_help()
        return 1
    return handler()
186
+
187
+
188
+ if __name__ == "__main__":
189
+ sys.exit(main())
package/src/index.ts ADDED
@@ -0,0 +1,273 @@
1
+ #!/usr/bin/env node
2
+ import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
3
+ import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
4
+ import { z } from "zod";
5
+ import { spawn } from "child_process";
6
+ import * as path from "path";
7
+ import * as fs from "fs";
8
+ import { fileURLToPath } from "url";
9
+
10
// Get __dirname equivalent in ES modules
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// Path to the Python script bridge (one directory above this file)
const PYTHON_BRIDGE_PATH = path.join(__dirname, "..", "python_bridge.py");
// Working directory and PYTHONPATH handed to the bridge process.
// NOTE(review): this resolves two directories above this file - confirm that
// is where the Python modules live once the package is installed.
const SRC_DIR = path.join(__dirname, "..", "..", "src");

/**
 * Find the bridge's JSON payload in its captured stdout.
 *
 * Lines are scanned from the end because the bridge prints its JSON result
 * last; anything printed earlier is treated as log noise. A line that merely
 * looks like JSON (for example "[INFO] ...") is skipped instead of aborting
 * the search.
 *
 * @param stdout Everything the bridge wrote to stdout.
 * @returns The parsed object/array, or undefined when no line parses.
 */
function parseLastJsonLine(stdout: string): any {
  const lines = stdout.split(/\r?\n/);
  for (let index = lines.length - 1; index >= 0; index -= 1) {
    const line = lines[index].trim();
    if (!line.startsWith("{") && !line.startsWith("[")) {
      continue;
    }
    try {
      return JSON.parse(line);
    } catch {
      // Looked like JSON but is not; keep scanning earlier lines.
    }
  }
  return undefined;
}

/**
 * Execute the Python bridge with the given arguments and return its output.
 *
 * @param args Sub-command and flags passed to python_bridge.py.
 * @returns Resolves with the parsed JSON payload, or with
 *   `{ success: true, output }` (trimmed stdout) when stdout holds no JSON.
 * @throws Rejects when python3 cannot be started or exits non-zero. For a
 *   non-zero exit the message prefers the `error` field of the JSON the
 *   bridge printed, then stderr, then raw stdout.
 */
function runPythonBridge(args: string[]): Promise<any> {
  return new Promise((resolve, reject) => {
    const pythonProcess = spawn("python3", [PYTHON_BRIDGE_PATH, ...args], {
      cwd: SRC_DIR,
      env: { ...process.env, PYTHONPATH: SRC_DIR },
    });

    let stdout = "";
    let stderr = "";

    pythonProcess.stdout.on("data", (data) => {
      stdout += data.toString();
    });

    pythonProcess.stderr.on("data", (data) => {
      stderr += data.toString();
    });

    pythonProcess.on("error", (error) => {
      reject(new Error(`Failed to start Python: ${error.message}`, { cause: error }));
    });

    pythonProcess.on("close", (code) => {
      const payload = parseLastJsonLine(stdout);
      if (code !== 0) {
        // The bridge reports failures as {"error": "..."} on stdout;
        // surface that before unrelated warnings written to stderr.
        const reported = typeof payload?.error === "string" ? payload.error : "";
        reject(new Error(`Python bridge failed: ${reported || stderr || stdout}`));
        return;
      }
      resolve(payload ?? { success: true, output: stdout.trim() });
    });
  });
}
63
+
64
// Create MCP server
const server = new McpServer({
  name: "newskit-mcp-server",
  version: "1.0.0",
});

// Tool: Categorize articles from TSV file.
// Validates the input path, forwards the options to the Python bridge's
// "categorize" command and returns the bridge's summary as pretty-printed JSON.
server.tool(
  "categorize_articles",
  {
    inputPath: z.string().describe("Path to input TSV file with article_id and title columns"),
    outputPath: z.string().optional().describe("Path to output JSON file (default: categories.json)"),
    minClusterSize: z.number().min(1).optional().describe("Minimum articles per category (default: 2)"),
    similarityThreshold: z.number().min(0).max(1).optional().describe("Cosine similarity threshold 0-1 (default: 0.75)"),
    persistDir: z.string().optional().describe("ChromaDB storage directory (default: ./chroma_db)")
  },
  async ({ inputPath, outputPath, minClusterSize, similarityThreshold, persistDir }) => {
    try {
      // The bridge process runs with a different working directory, so the
      // path is made absolute here: the file that is checked is then the
      // same file the bridge reads.
      const resolvedInput = path.resolve(inputPath);
      if (!fs.existsSync(resolvedInput)) {
        return {
          content: [{ type: "text", text: `Error: Input file not found: ${inputPath}` }],
          isError: true
        };
      }

      const args = [
        "categorize",
        "--input", resolvedInput,
        // An empty string is not a usable path, so || is intended here.
        "--output", outputPath || "categories.json",
        // ?? rather than ||: a threshold of 0 is allowed by the schema
        // and must not be replaced by the default.
        "--min-cluster-size", String(minClusterSize ?? 2),
        "--similarity-threshold", String(similarityThreshold ?? 0.75),
        "--persist-dir", persistDir || "./chroma_db"
      ];

      const result = await runPythonBridge(args);

      return {
        content: [
          {
            type: "text",
            text: `Categorization complete!\n\nResults:\n${JSON.stringify(result, null, 2)}`
          }
        ]
      };
    } catch (error) {
      return {
        content: [
          {
            type: "text",
            text: `Categorization failed: ${error instanceof Error ? error.message : String(error)}`
          }
        ],
        isError: true
      };
    }
  }
);
122
+
123
// Tool: Load and view articles from TSV.
// Validates the input path, asks the Python bridge's "load" command for up to
// `limit` articles and returns them as pretty-printed JSON.
server.tool(
  "load_articles",
  {
    inputPath: z.string().describe("Path to input TSV file with article_id and title columns"),
    limit: z.number().optional().describe("Maximum number of articles to return (default: 50)")
  },
  async ({ inputPath, limit }) => {
    try {
      // The bridge process runs with a different working directory, so the
      // path is made absolute here: the file that is checked is then the
      // same file the bridge reads.
      const resolvedInput = path.resolve(inputPath);
      if (!fs.existsSync(resolvedInput)) {
        return {
          content: [{ type: "text", text: `Error: Input file not found: ${inputPath}` }],
          isError: true
        };
      }

      const args = [
        "load",
        "--input", resolvedInput,
        // ?? rather than ||: an explicit limit of 0 must stay 0.
        "--limit", String(limit ?? 50)
      ];

      const result = await runPythonBridge(args);

      // Without an articles array the summary below would read
      // "undefined total"; report the unexpected output instead.
      if (!Array.isArray(result?.articles)) {
        return {
          content: [
            {
              type: "text",
              text: `Failed to load articles: unexpected bridge output: ${JSON.stringify(result)}`
            }
          ],
          isError: true
        };
      }

      return {
        content: [
          {
            type: "text",
            text: `Articles loaded (${result.count} total):\n\n${JSON.stringify(result.articles, null, 2)}`
          }
        ]
      };
    } catch (error) {
      return {
        content: [
          {
            type: "text",
            text: `Failed to load articles: ${error instanceof Error ? error.message : String(error)}`
          }
        ],
        isError: true
      };
    }
  }
);
168
+
169
// Tool: Search similar articles in ChromaDB.
// Forwards the query to the Python bridge's "search" command and returns the
// bridge's response as pretty-printed JSON.
server.tool(
  "search_similar",
  {
    query: z.string().describe("Search query to find similar articles"),
    persistDir: z.string().optional().describe("ChromaDB storage directory (default: ./chroma_db)"),
    nResults: z.number().min(1).max(20).optional().describe("Number of results to return (default: 5)")
  },
  async ({ query, persistDir, nResults }) => {
    try {
      const storageDir = persistDir || "./chroma_db";
      const resultCount = String(nResults || 5);

      const bridgeReply = await runPythonBridge([
        "search",
        "--query", query,
        "--persist-dir", storageDir,
        "--n-results", resultCount
      ]);

      const rendered = JSON.stringify(bridgeReply, null, 2);
      return {
        content: [{ type: "text", text: `Search results for "${query}":\n\n${rendered}` }]
      };
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error);
      return {
        content: [{ type: "text", text: `Search failed: ${reason}` }],
        isError: true
      };
    }
  }
);
209
+
210
// Tool: Get categories from results file.
// Reads the JSON results file and renders every category with its articles,
// followed by a preview of at most ten uncategorized articles.
server.tool(
  "get_categories",
  {
    resultsPath: z.string().describe("Path to the categories.json results file")
  },
  async ({ resultsPath }) => {
    const PREVIEW_LIMIT = 10;
    // One bullet line per article; shared by both sections of the report.
    const articleLine = (article: any) => ` - ${article.article_id}: ${article.title}\n`;

    try {
      if (!fs.existsSync(resultsPath)) {
        return {
          content: [{ type: "text", text: `Error: Results file not found: ${resultsPath}` }],
          isError: true
        };
      }

      const parsed = JSON.parse(fs.readFileSync(resultsPath, "utf-8"));
      const pieces: string[] = ["Categories:\n\n"];

      if (parsed.categories) {
        for (const category of parsed.categories) {
          pieces.push(`Category ${category.category_id}: "${category.category_name}" (${category.article_count} articles)\n`);
          for (const article of category.articles) {
            pieces.push(articleLine(article));
          }
          pieces.push("\n");
        }
      }

      const leftovers = parsed.uncategorized;
      if (leftovers && leftovers.length > 0) {
        pieces.push(`Uncategorized: ${leftovers.length} articles\n`);
        for (const article of leftovers.slice(0, PREVIEW_LIMIT)) {
          pieces.push(articleLine(article));
        }
        if (leftovers.length > PREVIEW_LIMIT) {
          pieces.push(` ... and ${leftovers.length - PREVIEW_LIMIT} more\n`);
        }
      }

      return {
        content: [{ type: "text", text: pieces.join("") }]
      };
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error);
      return {
        content: [{ type: "text", text: `Failed to read categories: ${reason}` }],
        isError: true
      };
    }
  }
);
265
+
266
// Start the server
/**
 * Connect the MCP server to a stdio transport.
 *
 * @returns Resolves once the server is connected.
 */
async function main(): Promise<void> {
  const transport = new StdioServerTransport();
  await server.connect(transport);
  // Status goes to stderr, presumably because the stdio transport uses
  // stdout for protocol traffic - TODO confirm.
  console.error("NewsKit MCP server running on stdio");
}

// Never leave the startup promise floating: report the failure and exit
// non-zero so the launching client can tell the server did not start.
main().catch((error) => {
  console.error(`NewsKit MCP server failed to start: ${error instanceof Error ? error.message : String(error)}`);
  process.exit(1);
});
package/tsconfig.json ADDED
@@ -0,0 +1,16 @@
1
+ {
2
+ "compilerOptions": {
3
+ "target": "ES2022",
4
+ "module": "Node16",
5
+ "moduleResolution": "Node16",
6
+ "outDir": "./build",
7
+ "rootDir": "./src",
8
+ "strict": true,
9
+ "esModuleInterop": true,
10
+ "skipLibCheck": true,
11
+ "forceConsistentCasingInFileNames": true,
12
+ "resolveJsonModule": true
13
+ },
14
+ "include": ["src/**/*"],
15
+ "exclude": ["node_modules"]
16
+ }