tryll-dataset-builder-mcp 1.1.1 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +228 -28
- package/index.js +344 -14
- package/lib/store.js +131 -1
- package/package.json +2 -1
package/README.md
CHANGED
|
@@ -1,9 +1,11 @@
|
|
|
1
1
|
# Tryll Dataset Builder — MCP Server
|
|
2
2
|
|
|
3
|
-
An MCP (Model Context Protocol) server for building structured RAG knowledge base datasets. Use it with Claude Code to create, manage, and export JSON datasets via natural language.
|
|
3
|
+
An MCP (Model Context Protocol) server for building structured RAG knowledge base datasets. Use it with Claude Code to create, manage, and export JSON datasets via natural language — with optional real-time sync to the [Dataset Builder web app](https://trylljsoncreator.onrender.com).
|
|
4
4
|
|
|
5
5
|
Built by [Tryll Engine](https://tryllengine.com) | [Discord](https://discord.gg/CMnMrmapyB)
|
|
6
6
|
|
|
7
|
+
---
|
|
8
|
+
|
|
7
9
|
## Quick Start
|
|
8
10
|
|
|
9
11
|
### 1. Install
|
|
@@ -14,8 +16,6 @@ npm install -g tryll-dataset-builder-mcp
|
|
|
14
16
|
|
|
15
17
|
### 2. Add to Claude Code
|
|
16
18
|
|
|
17
|
-
Run in your terminal:
|
|
18
|
-
|
|
19
19
|
```bash
|
|
20
20
|
claude mcp add dataset-builder -- npx tryll-dataset-builder-mcp
|
|
21
21
|
```
|
|
@@ -42,55 +42,208 @@ Just talk to Claude:
|
|
|
42
42
|
|
|
43
43
|
> "Create a knowledge base about Minecraft with categories: Mobs, Blocks, Biomes. Add 10 chunks to each category."
|
|
44
44
|
|
|
45
|
-
> "
|
|
45
|
+
> "Parse this wiki page and add it to my dataset: https://minecraft.wiki/w/Creeper"
|
|
46
|
+
|
|
47
|
+
> "Show me the version history of my project"
|
|
46
48
|
|
|
47
|
-
|
|
49
|
+
---
|
|
48
50
|
|
|
49
51
|
## Configuration
|
|
50
52
|
|
|
51
53
|
| Variable | Default | Description |
|
|
52
54
|
|----------|---------|-------------|
|
|
53
|
-
| `DATA_DIR` | `./datasets` | Directory
|
|
55
|
+
| `DATA_DIR` | `./datasets` | Directory for project JSON files (local mode) |
|
|
56
|
+
|
|
57
|
+
---
|
|
58
|
+
|
|
59
|
+
## Two Modes of Operation
|
|
60
|
+
|
|
61
|
+
### Local Mode (default)
|
|
62
|
+
Data is stored as JSON files in `DATA_DIR`. No server needed.
|
|
63
|
+
|
|
64
|
+
### Connected Mode (real-time sync)
|
|
65
|
+
Connect to the [Dataset Builder web app](https://trylljsoncreator.onrender.com) for live collaboration. Changes made via MCP appear instantly in the browser, and vice versa.
|
|
66
|
+
|
|
67
|
+
```
|
|
68
|
+
You: "Connect to session ABC123"
|
|
69
|
+
Claude: *connects via WebSocket*
|
|
70
|
+
You: "Add 5 chunks about dragons"
|
|
71
|
+
→ chunks appear in the browser in real-time
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
---
|
|
54
75
|
|
|
55
|
-
## Available Tools (
|
|
76
|
+
## Available Tools (28)
|
|
77
|
+
|
|
78
|
+
### Session Management
|
|
79
|
+
|
|
80
|
+
| Tool | Description |
|
|
81
|
+
|------|-------------|
|
|
82
|
+
| `connect_session` | Connect to the web app for real-time collaboration. Requires a 6-character session code from the browser UI |
|
|
83
|
+
| `disconnect_session` | Disconnect from the web app, switch back to local storage |
|
|
56
84
|
|
|
57
85
|
### Project Management
|
|
86
|
+
|
|
58
87
|
| Tool | Description |
|
|
59
88
|
|------|-------------|
|
|
60
89
|
| `create_project` | Create a new dataset project |
|
|
61
90
|
| `list_projects` | List all projects with stats |
|
|
62
|
-
| `delete_project` |
|
|
63
|
-
| `get_project_stats` | Detailed statistics |
|
|
91
|
+
| `delete_project` | Permanently delete a project |
|
|
92
|
+
| `get_project_stats` | Detailed statistics (categories, chunks, text lengths) |
|
|
64
93
|
|
|
65
94
|
### Category Management
|
|
95
|
+
|
|
66
96
|
| Tool | Description |
|
|
67
97
|
|------|-------------|
|
|
68
|
-
| `create_category` | Add a category to
|
|
98
|
+
| `create_category` | Add a category to organize chunks |
|
|
69
99
|
| `list_categories` | List categories with chunk counts |
|
|
70
100
|
| `rename_category` | Rename a category |
|
|
71
|
-
| `delete_category` | Delete a category and its chunks |
|
|
101
|
+
| `delete_category` | Delete a category and all its chunks |
|
|
72
102
|
|
|
73
103
|
### Chunk Operations
|
|
104
|
+
|
|
74
105
|
| Tool | Description |
|
|
75
106
|
|------|-------------|
|
|
76
|
-
| `add_chunk` | Add a single knowledge chunk |
|
|
77
|
-
| `bulk_add_chunks` | Add multiple chunks at once |
|
|
78
|
-
| `get_chunk` | Get
|
|
79
|
-
| `update_chunk` | Update chunk fields |
|
|
80
|
-
| `delete_chunk` | Delete a chunk |
|
|
81
|
-
| `duplicate_chunk` | Clone a chunk |
|
|
82
|
-
| `move_chunk` | Move chunk between categories |
|
|
107
|
+
| `add_chunk` | Add a single knowledge chunk with ID, text, and metadata |
|
|
108
|
+
| `bulk_add_chunks` | Add multiple chunks at once (faster than one by one) |
|
|
109
|
+
| `get_chunk` | Get full content of a chunk by ID |
|
|
110
|
+
| `update_chunk` | Update chunk fields (ID, text, metadata) |
|
|
111
|
+
| `delete_chunk` | Delete a chunk by ID |
|
|
112
|
+
| `duplicate_chunk` | Clone a chunk (creates `id_copy`) |
|
|
113
|
+
| `move_chunk` | Move a chunk between categories |
|
|
83
114
|
|
|
84
115
|
### Search & Export
|
|
116
|
+
|
|
117
|
+
| Tool | Description |
|
|
118
|
+
|------|-------------|
|
|
119
|
+
| `search_chunks` | Search by chunk ID or text content |
|
|
120
|
+
| `export_project` | Export as flat JSON array (RAG-ready) |
|
|
121
|
+
| `import_json` | Import an existing JSON dataset |
|
|
122
|
+
| `export_category` | Export a single category as JSON |
|
|
123
|
+
|
|
124
|
+
### URL Parsing
|
|
125
|
+
|
|
126
|
+
| Tool | Description |
|
|
127
|
+
|------|-------------|
|
|
128
|
+
| `parse_url` | Fetch a web page, extract text, auto-create chunks. Splits text > 2000 chars into multiple chunks. Extracts wiki infobox metadata |
|
|
129
|
+
| `batch_parse_urls` | Parse multiple URLs at once |
|
|
130
|
+
|
|
131
|
+
### Bulk Operations
|
|
132
|
+
|
|
85
133
|
| Tool | Description |
|
|
86
134
|
|------|-------------|
|
|
87
|
-
| `
|
|
88
|
-
| `
|
|
89
|
-
|
|
135
|
+
| `bulk_update_metadata` | Set a metadata field across all chunks (or per category) |
|
|
136
|
+
| `merge_projects` | Merge all data from one project into another |
|
|
137
|
+
|
|
138
|
+
### Version History
|
|
139
|
+
|
|
140
|
+
| Tool | Description |
|
|
141
|
+
|------|-------------|
|
|
142
|
+
| `get_history` | Get version history (last 50 commits) for a project |
|
|
143
|
+
| `get_commit` | Get a specific commit with full snapshot data for diffing |
|
|
144
|
+
| `rollback` | Rollback a project to a previous commit's state |
|
|
145
|
+
|
|
146
|
+
---
|
|
147
|
+
|
|
148
|
+
## Tool Details
|
|
149
|
+
|
|
150
|
+
### `add_chunk`
|
|
151
|
+
|
|
152
|
+
```
|
|
153
|
+
project: "minecraft"
|
|
154
|
+
category: "Mobs"
|
|
155
|
+
id: "creeper"
|
|
156
|
+
text: "A Creeper is a hostile mob that silently approaches players..."
|
|
157
|
+
metadata:
|
|
158
|
+
page_title: "Creeper"
|
|
159
|
+
source: "Minecraft Wiki"
|
|
160
|
+
license: "CC BY-NC-SA 3.0"
|
|
161
|
+
health: "20" ← custom metadata field
|
|
162
|
+
behavior: "explodes" ← custom metadata field
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
Standard metadata fields: `page_title`, `source`, `license`. Any extra fields become custom metadata.
|
|
166
|
+
|
|
167
|
+
### `parse_url`
|
|
168
|
+
|
|
169
|
+
```
|
|
170
|
+
project: "minecraft"
|
|
171
|
+
category: "Mobs"
|
|
172
|
+
url: "https://minecraft.wiki/w/Creeper"
|
|
173
|
+
chunk_id: "creeper"
|
|
174
|
+
license: "CC BY-NC-SA 3.0"
|
|
175
|
+
```
|
|
90
176
|
|
|
91
|
-
|
|
177
|
+
- Fetches the page, extracts main text content
|
|
178
|
+
- If text > 2000 chars → auto-splits into `creeper_1`, `creeper_2`, etc.
|
|
179
|
+
- Extracts page title and source URL as metadata
|
|
180
|
+
- For wiki pages: extracts infobox/sidebar data as custom metadata fields
|
|
92
181
|
|
|
93
|
-
|
|
182
|
+
### `get_history`
|
|
183
|
+
|
|
184
|
+
```
|
|
185
|
+
project: "minecraft"
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
Returns:
|
|
189
|
+
```json
|
|
190
|
+
[
|
|
191
|
+
{
|
|
192
|
+
"id": "uuid",
|
|
193
|
+
"timestamp": "2026-02-27T14:30:00.000Z",
|
|
194
|
+
"source": "mcp",
|
|
195
|
+
"action": "addChunk",
|
|
196
|
+
"summary": "Added chunk 'creeper' to 'Mobs'",
|
|
197
|
+
"stats": { "categories": 3, "chunks": 12 }
|
|
198
|
+
}
|
|
199
|
+
]
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
### `rollback`
|
|
203
|
+
|
|
204
|
+
```
|
|
205
|
+
project: "minecraft"
|
|
206
|
+
commit_id: "uuid-of-target-commit"
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
Restores the project to that commit's snapshot. Creates a new "rollback" commit so you can undo the rollback later.
|
|
210
|
+
|
|
211
|
+
---
|
|
212
|
+
|
|
213
|
+
## Data Formats
|
|
214
|
+
|
|
215
|
+
### Project JSON (internal)
|
|
216
|
+
|
|
217
|
+
```json
|
|
218
|
+
{
|
|
219
|
+
"name": "minecraft",
|
|
220
|
+
"createdAt": "2026-02-27T10:00:00.000Z",
|
|
221
|
+
"categories": [
|
|
222
|
+
{
|
|
223
|
+
"id": "uuid",
|
|
224
|
+
"name": "Mobs",
|
|
225
|
+
"expanded": true,
|
|
226
|
+
"chunks": [
|
|
227
|
+
{
|
|
228
|
+
"_uid": "uuid",
|
|
229
|
+
"id": "creeper",
|
|
230
|
+
"text": "A Creeper is a hostile mob...",
|
|
231
|
+
"metadata": {
|
|
232
|
+
"page_title": "Creeper",
|
|
233
|
+
"source": "Minecraft Wiki",
|
|
234
|
+
"license": "CC BY-NC-SA 3.0"
|
|
235
|
+
},
|
|
236
|
+
"customFields": [
|
|
237
|
+
{ "key": "health", "value": "20" }
|
|
238
|
+
]
|
|
239
|
+
}
|
|
240
|
+
]
|
|
241
|
+
}
|
|
242
|
+
]
|
|
243
|
+
}
|
|
244
|
+
```
|
|
245
|
+
|
|
246
|
+
### Export Format (RAG-ready)
|
|
94
247
|
|
|
95
248
|
```json
|
|
96
249
|
[
|
|
@@ -101,20 +254,67 @@ The exported JSON is a flat array, compatible with the [Dataset Builder web app]
|
|
|
101
254
|
"page_title": "Creeper",
|
|
102
255
|
"source": "Minecraft Wiki",
|
|
103
256
|
"license": "CC BY-NC-SA 3.0",
|
|
104
|
-
"type": "hostile_mob",
|
|
105
257
|
"health": "20"
|
|
106
258
|
}
|
|
107
259
|
}
|
|
108
260
|
]
|
|
109
261
|
```
|
|
110
262
|
|
|
263
|
+
### History Commit
|
|
264
|
+
|
|
265
|
+
```json
|
|
266
|
+
{
|
|
267
|
+
"id": "uuid",
|
|
268
|
+
"timestamp": "2026-02-27T14:30:00.000Z",
|
|
269
|
+
"source": "browser | mcp",
|
|
270
|
+
"action": "addChunk",
|
|
271
|
+
"summary": "Added chunk 'creeper' to 'Mobs'",
|
|
272
|
+
"stats": { "categories": 3, "chunks": 12 },
|
|
273
|
+
"snapshot": { "...full project state..." }
|
|
274
|
+
}
|
|
275
|
+
```
|
|
276
|
+
|
|
277
|
+
---
|
|
278
|
+
|
|
279
|
+
## Real-Time Collaboration
|
|
280
|
+
|
|
281
|
+
```
|
|
282
|
+
┌─────────────┐ WebSocket ┌──────────────┐ REST API ┌─────────────┐
|
|
283
|
+
│ Browser │ ◄──────────────► │ Web Server │ ◄──────────────► │ MCP Server │
|
|
284
|
+
│ (Dataset │ data:changed │ (Express + │ POST/PUT/DEL │ (Claude │
|
|
285
|
+
│ Builder) │ mcp:connected │ WebSocket) │ + source:mcp │ Code) │
|
|
286
|
+
└─────────────┘ └──────────────┘ └─────────────┘
|
|
287
|
+
```
|
|
288
|
+
|
|
289
|
+
1. Open the [Dataset Builder](https://trylljsoncreator.onrender.com) in your browser
|
|
290
|
+
2. Copy the 6-character session code from the top bar
|
|
291
|
+
3. Tell Claude: *"Connect to session ABC123"*
|
|
292
|
+
4. All changes sync in real-time between browser and Claude
|
|
293
|
+
5. Version history tracks who made each change (browser vs MCP)
|
|
294
|
+
|
|
295
|
+
---
|
|
296
|
+
|
|
111
297
|
## Example Prompts
|
|
112
298
|
|
|
113
299
|
- *"Create a Dark Souls knowledge base with categories for Bosses, Weapons, and Locations"*
|
|
114
|
-
- *"
|
|
115
|
-
- *"
|
|
116
|
-
- *"
|
|
117
|
-
- *"
|
|
300
|
+
- *"Parse these wiki pages and add them to my Minecraft project: [url1], [url2], [url3]"*
|
|
301
|
+
- *"Bulk update the license field to 'MIT' for all chunks in the Mobs category"*
|
|
302
|
+
- *"Show me the version history of my project"*
|
|
303
|
+
- *"Rollback my project to the commit before I deleted that category"*
|
|
304
|
+
- *"Merge my test_data project into the main production project"*
|
|
305
|
+
- *"Export the Bosses category as JSON"*
|
|
306
|
+
- *"Connect to session XYZ789 and add 20 chunks about potions"*
|
|
307
|
+
|
|
308
|
+
---
|
|
309
|
+
|
|
310
|
+
## Links
|
|
311
|
+
|
|
312
|
+
- **Web App**: [trylljsoncreator.onrender.com](https://trylljsoncreator.onrender.com)
|
|
313
|
+
- **Web App Repo**: [github.com/Skizziik/json_creator](https://github.com/Skizziik/json_creator)
|
|
314
|
+
- **MCP Repo**: [github.com/Skizziik/tryll_dataset_builder](https://github.com/Skizziik/tryll_dataset_builder)
|
|
315
|
+
- **npm**: [tryll-dataset-builder-mcp](https://www.npmjs.com/package/tryll-dataset-builder-mcp)
|
|
316
|
+
- **Tryll Engine**: [tryllengine.com](https://tryllengine.com)
|
|
317
|
+
- **Discord**: [discord.gg/CMnMrmapyB](https://discord.gg/CMnMrmapyB)
|
|
118
318
|
|
|
119
319
|
## License
|
|
120
320
|
|
package/index.js
CHANGED
|
@@ -5,11 +5,12 @@ import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js"
|
|
|
5
5
|
import { CallToolRequestSchema, ListToolsRequestSchema } from "@modelcontextprotocol/sdk/types.js";
|
|
6
6
|
import { Store } from "./lib/store.js";
|
|
7
7
|
import WebSocket from "ws";
|
|
8
|
+
import * as cheerio from "cheerio";
|
|
8
9
|
|
|
9
10
|
const store = new Store(process.env.DATA_DIR);
|
|
10
11
|
|
|
11
12
|
const server = new Server(
|
|
12
|
-
{ name: "tryll-dataset-builder", version: "1.
|
|
13
|
+
{ name: "tryll-dataset-builder", version: "1.3.0" },
|
|
13
14
|
{ capabilities: { tools: {} } }
|
|
14
15
|
);
|
|
15
16
|
|
|
@@ -38,6 +39,96 @@ async function apiCall(method, path, body) {
|
|
|
38
39
|
return data;
|
|
39
40
|
}
|
|
40
41
|
|
|
42
|
+
// ============================================
|
|
43
|
+
// URL PARSING HELPERS
|
|
44
|
+
// ============================================
|
|
45
|
+
|
|
46
|
+
/** Default maximum number of characters per chunk when splitting parsed page text. */
const CHUNK_LIMIT = 2_000;
|
|
47
|
+
|
|
48
|
+
/**
 * Download a web page and extract its title, readable text, and any
 * infobox/sidebar table rows as key/value pairs.
 *
 * @param {string} url - Address of the page to fetch.
 * @returns {Promise<{text: string, pageTitle: string, infobox: Object<string, string>, source: string}>}
 *   `text` is the whitespace-normalized main content, `pageTitle` is the
 *   `<title>` text (falling back to the first `<h1>`, then to an empty string),
 *   `infobox` holds the header/value pairs found in infobox-style table rows,
 *   and `source` is the URL that was passed in.
 * @throws {Error} When the response status is not OK.
 */
async function parseUrl(url) {
  const res = await fetch(url, {
    headers: { 'User-Agent': 'Mozilla/5.0 (compatible; TryllDatasetBuilder/1.2)' },
  });
  if (!res.ok) throw new Error(`Failed to fetch ${url}: HTTP ${res.status}`);
  const html = await res.text();
  const $ = cheerio.load(html);

  // Extract page title
  const pageTitle = $('title').first().text().trim()
    || $('h1').first().text().trim()
    || '';

  // Extract wiki infobox metadata. This must run before the noise removal
  // below, because that step deletes the .infobox/.sidebar elements.
  const infobox = {};
  $('.infobox tr, .sidebar tr, .wikitable.infobox tr, table.infobox tr').each((_, row) => {
    const $row = $(row);
    const key = $row.find('th').first().text().trim().replace(/\s+/g, ' ');
    const val = $row.find('td').first().text().trim().replace(/\s+/g, ' ');
    // Only short header/value pairs are kept; longer cells are skipped.
    if (key && val && key.length < 60 && val.length < 200) {
      infobox[key] = val;
    }
  });

  // Remove noise elements
  $('script, style, nav, footer, header, .sidebar, .infobox, .navbox, .mw-editsection, .reference, .reflist, #mw-navigation, .noprint, .toc').remove();

  // Extract main text, falling back to the whole body when no known
  // content container is present.
  const mainContent = $('article, main, #mw-content-text, #content, .mw-parser-output, #bodyContent, .entry-content, .post-content').first();
  let text = '';
  if (mainContent.length) {
    text = mainContent.text();
  } else {
    text = $('body').text();
  }

  // Clean up whitespace.
  // Spaces around newlines are stripped BEFORE collapsing blank-line runs:
  // indented markup produces "\n   \n   \n", which a bare /\n{3,}/ never
  // matches, leaving stray blank lines and hiding paragraph boundaries.
  text = text
    .replace(/\r\n?/g, '\n')
    .replace(/[\t\u00a0]/g, ' ')
    .replace(/ {2,}/g, ' ')
    .replace(/ ?\n ?/g, '\n')
    .replace(/\n{3,}/g, '\n\n')
    .trim();

  return { text, pageTitle, infobox, source: url };
}
|
|
93
|
+
|
|
94
|
+
/**
 * Pick the index at which to cut `segment` so the leading piece is at most
 * `limit` characters, preferring paragraph, then sentence, then word
 * boundaries. A boundary is only used if it lies past 30% of `limit`, so
 * chunks do not become needlessly small.
 */
function findChunkCutPoint(segment, limit) {
  const minCut = limit * 0.3;

  // Paragraph boundary: the piece ends just before the blank line.
  const paraBreak = segment.lastIndexOf('\n\n', limit);
  if (paraBreak > minCut) return paraBreak;

  // Sentence boundary: search from limit - 1 so that the piece, which
  // includes the period (index + 1 characters), never exceeds `limit`.
  const sentBreak = segment.lastIndexOf('. ', limit - 1);
  if (sentBreak > minCut) return sentBreak + 1;

  // Word boundary: avoid slicing through the middle of a word.
  const wordBreak = Math.max(
    segment.lastIndexOf(' ', limit),
    segment.lastIndexOf('\n', limit),
  );
  if (wordBreak > minCut) return wordBreak;

  // No usable boundary: hard cut at the limit.
  return limit;
}

/**
 * Split text into chunks of at most `limit` characters.
 *
 * Text that already fits is returned as a single chunk whose id is `baseId`.
 * Longer text is split into chunks with ids `baseId_1`, `baseId_2`, ...;
 * each chunk's text is trimmed and whitespace-only pieces are dropped.
 *
 * @param {string} text - Text to split.
 * @param {string} baseId - Chunk id, or id prefix when the text is split.
 * @param {number} [limit=CHUNK_LIMIT] - Maximum characters per chunk (>= 1).
 * @returns {Array<{id: string, text: string}>} Chunks in reading order.
 * @throws {RangeError} If `limit` is not a number >= 1 (such a limit could
 *   never make progress through the text).
 */
function splitTextIntoChunks(text, baseId, limit = CHUNK_LIMIT) {
  if (!(limit >= 1)) {
    throw new RangeError(`limit must be a number >= 1, got ${limit}`);
  }
  if (text.length <= limit) {
    return [{ id: baseId, text }];
  }

  const chunks = [];
  let remaining = text;
  let index = 1;

  while (remaining.length > 0) {
    const cutPoint = remaining.length > limit
      ? findChunkCutPoint(remaining, limit)
      : remaining.length;

    const piece = remaining.substring(0, cutPoint).trim();
    // Skip whitespace-only pieces so no empty chunk is emitted and the
    // numeric suffixes stay contiguous.
    if (piece.length > 0) {
      chunks.push({ id: `${baseId}_${index}`, text: piece });
      index++;
    }
    remaining = remaining.substring(cutPoint).trim();
  }

  return chunks;
}
|
|
131
|
+
|
|
41
132
|
// ============================================
|
|
42
133
|
// TOOL DEFINITIONS
|
|
43
134
|
// ============================================
|
|
@@ -309,6 +400,126 @@ const TOOLS = [
|
|
|
309
400
|
required: ["project"],
|
|
310
401
|
},
|
|
311
402
|
},
|
|
403
|
+
|
|
404
|
+
// ---- URL Parsing ----
|
|
405
|
+
{
|
|
406
|
+
name: "parse_url",
|
|
407
|
+
description: "Fetch a web page, extract its text content, and auto-create chunks. If text exceeds 2000 characters, it auto-splits into multiple chunks with _1, _2 suffixes. Extracts page title and source URL as metadata. For wiki pages, extracts infobox/sidebar data as custom metadata fields.",
|
|
408
|
+
inputSchema: {
|
|
409
|
+
type: "object",
|
|
410
|
+
properties: {
|
|
411
|
+
project: { type: "string", description: "Project name" },
|
|
412
|
+
category: { type: "string", description: "Category to add chunks into" },
|
|
413
|
+
url: { type: "string", description: "URL to fetch and parse" },
|
|
414
|
+
chunk_id: { type: "string", description: "Base chunk ID. If text is split, becomes chunk_id_1, chunk_id_2, etc." },
|
|
415
|
+
license: { type: "string", description: "License for the content. Default: CC BY-NC-SA 3.0" },
|
|
416
|
+
},
|
|
417
|
+
required: ["project", "category", "url", "chunk_id"],
|
|
418
|
+
},
|
|
419
|
+
},
|
|
420
|
+
{
|
|
421
|
+
name: "batch_parse_urls",
|
|
422
|
+
description: "Parse multiple URLs at once and add all chunks to a category. Each URL gets its own chunk ID prefix. Auto-splits long texts into multiple chunks.",
|
|
423
|
+
inputSchema: {
|
|
424
|
+
type: "object",
|
|
425
|
+
properties: {
|
|
426
|
+
project: { type: "string", description: "Project name" },
|
|
427
|
+
category: { type: "string", description: "Category to add chunks into" },
|
|
428
|
+
urls: {
|
|
429
|
+
type: "array",
|
|
430
|
+
description: "Array of URL entries to parse",
|
|
431
|
+
items: {
|
|
432
|
+
type: "object",
|
|
433
|
+
properties: {
|
|
434
|
+
url: { type: "string", description: "URL to fetch" },
|
|
435
|
+
chunk_id: { type: "string", description: "Base chunk ID for this URL" },
|
|
436
|
+
},
|
|
437
|
+
required: ["url", "chunk_id"],
|
|
438
|
+
},
|
|
439
|
+
},
|
|
440
|
+
license: { type: "string", description: "License for all content. Default: CC BY-NC-SA 3.0" },
|
|
441
|
+
},
|
|
442
|
+
required: ["project", "category", "urls"],
|
|
443
|
+
},
|
|
444
|
+
},
|
|
445
|
+
|
|
446
|
+
// ---- Bulk Operations ----
|
|
447
|
+
{
|
|
448
|
+
name: "bulk_update_metadata",
|
|
449
|
+
description: "Update a metadata field across ALL chunks in a project (or a specific category). Useful for setting license, source, or custom fields in bulk.",
|
|
450
|
+
inputSchema: {
|
|
451
|
+
type: "object",
|
|
452
|
+
properties: {
|
|
453
|
+
project: { type: "string", description: "Project name" },
|
|
454
|
+
field: { type: "string", description: "Metadata field to update (e.g. 'license', 'source', or any custom field name)" },
|
|
455
|
+
value: { type: "string", description: "New value for the field" },
|
|
456
|
+
category: { type: "string", description: "Optional: only update chunks in this category. If omitted, updates all chunks in the project." },
|
|
457
|
+
},
|
|
458
|
+
required: ["project", "field", "value"],
|
|
459
|
+
},
|
|
460
|
+
},
|
|
461
|
+
{
|
|
462
|
+
name: "merge_projects",
|
|
463
|
+
description: "Merge all categories and chunks from a source project into a target project. Categories with the same name are combined. Chunks with duplicate IDs are skipped.",
|
|
464
|
+
inputSchema: {
|
|
465
|
+
type: "object",
|
|
466
|
+
properties: {
|
|
467
|
+
source: { type: "string", description: "Source project name (data is copied FROM here)" },
|
|
468
|
+
target: { type: "string", description: "Target project name (data is merged INTO here)" },
|
|
469
|
+
},
|
|
470
|
+
required: ["source", "target"],
|
|
471
|
+
},
|
|
472
|
+
},
|
|
473
|
+
{
|
|
474
|
+
name: "export_category",
|
|
475
|
+
description: "Export a single category as a flat JSON array. Same format as export_project but filtered to one category.",
|
|
476
|
+
inputSchema: {
|
|
477
|
+
type: "object",
|
|
478
|
+
properties: {
|
|
479
|
+
project: { type: "string", description: "Project name" },
|
|
480
|
+
category: { type: "string", description: "Category name to export" },
|
|
481
|
+
save_to_file: { type: "boolean", description: "If true, saves to a file. Default: false." },
|
|
482
|
+
},
|
|
483
|
+
required: ["project", "category"],
|
|
484
|
+
},
|
|
485
|
+
},
|
|
486
|
+
|
|
487
|
+
// ---- History ----
|
|
488
|
+
{
|
|
489
|
+
name: "get_history",
|
|
490
|
+
description: "Get version history (last 50 commits) for a project. Each commit shows who made the change (browser/MCP), what was changed, and when. Returns lightweight list without snapshots.",
|
|
491
|
+
inputSchema: {
|
|
492
|
+
type: "object",
|
|
493
|
+
properties: {
|
|
494
|
+
project: { type: "string", description: "Project name" },
|
|
495
|
+
},
|
|
496
|
+
required: ["project"],
|
|
497
|
+
},
|
|
498
|
+
},
|
|
499
|
+
{
|
|
500
|
+
name: "get_commit",
|
|
501
|
+
description: "Get a specific commit with full snapshot data. Returns the commit's snapshot and the previous commit's snapshot for computing diffs.",
|
|
502
|
+
inputSchema: {
|
|
503
|
+
type: "object",
|
|
504
|
+
properties: {
|
|
505
|
+
project: { type: "string", description: "Project name" },
|
|
506
|
+
commit_id: { type: "string", description: "Commit UUID" },
|
|
507
|
+
},
|
|
508
|
+
required: ["project", "commit_id"],
|
|
509
|
+
},
|
|
510
|
+
},
|
|
511
|
+
{
|
|
512
|
+
name: "rollback",
|
|
513
|
+
description: "Rollback a project to a specific commit's state. Restores the project data from that commit's snapshot and creates a new 'rollback' commit in history. Safe: you can undo a rollback by rolling back to a later commit.",
|
|
514
|
+
inputSchema: {
|
|
515
|
+
type: "object",
|
|
516
|
+
properties: {
|
|
517
|
+
project: { type: "string", description: "Project name" },
|
|
518
|
+
commit_id: { type: "string", description: "Commit UUID to rollback to" },
|
|
519
|
+
},
|
|
520
|
+
required: ["project", "commit_id"],
|
|
521
|
+
},
|
|
522
|
+
},
|
|
312
523
|
];
|
|
313
524
|
|
|
314
525
|
// ============================================
|
|
@@ -327,28 +538,28 @@ async function handleRemote(name, args) {
|
|
|
327
538
|
|
|
328
539
|
switch (name) {
|
|
329
540
|
case "create_project":
|
|
330
|
-
return apiCall('POST', '/api/projects', { name: args.name, session: s });
|
|
541
|
+
return apiCall('POST', '/api/projects', { name: args.name, session: s, source: 'mcp' });
|
|
331
542
|
case "list_projects":
|
|
332
543
|
return apiCall('GET', '/api/projects');
|
|
333
544
|
case "delete_project":
|
|
334
|
-
return apiCall('DELETE', `/api/projects/${p(args.name)}?session=${s}`);
|
|
545
|
+
return apiCall('DELETE', `/api/projects/${p(args.name)}?session=${s}&source=mcp`);
|
|
335
546
|
case "get_project_stats":
|
|
336
547
|
return apiCall('GET', `/api/projects/${p(args.name)}/stats`);
|
|
337
548
|
case "create_category":
|
|
338
|
-
return apiCall('POST', `/api/projects/${p(args.project)}/categories`, { name: args.name, session: s });
|
|
549
|
+
return apiCall('POST', `/api/projects/${p(args.project)}/categories`, { name: args.name, session: s, source: 'mcp' });
|
|
339
550
|
case "list_categories":
|
|
340
551
|
return apiCall('GET', `/api/projects/${p(args.project)}/categories`);
|
|
341
552
|
case "rename_category":
|
|
342
|
-
return apiCall('PUT', `/api/projects/${p(args.project)}/categories/${p(args.old_name)}`, { newName: args.new_name, session: s });
|
|
553
|
+
return apiCall('PUT', `/api/projects/${p(args.project)}/categories/${p(args.old_name)}`, { newName: args.new_name, session: s, source: 'mcp' });
|
|
343
554
|
case "delete_category":
|
|
344
|
-
return apiCall('DELETE', `/api/projects/${p(args.project)}/categories/${p(args.name)}?session=${s}`);
|
|
555
|
+
return apiCall('DELETE', `/api/projects/${p(args.project)}/categories/${p(args.name)}?session=${s}&source=mcp`);
|
|
345
556
|
case "add_chunk":
|
|
346
557
|
return apiCall('POST', `/api/projects/${p(args.project)}/categories/${p(args.category)}/chunks`, {
|
|
347
|
-
id: args.id, text: args.text, metadata: args.metadata, session: s,
|
|
558
|
+
id: args.id, text: args.text, metadata: args.metadata, session: s, source: 'mcp',
|
|
348
559
|
});
|
|
349
560
|
case "bulk_add_chunks":
|
|
350
561
|
return apiCall('POST', `/api/projects/${p(args.project)}/categories/${p(args.category)}/chunks/bulk`, {
|
|
351
|
-
chunks: args.chunks, session: s,
|
|
562
|
+
chunks: args.chunks, session: s, source: 'mcp',
|
|
352
563
|
});
|
|
353
564
|
case "get_chunk": {
|
|
354
565
|
const proj = await apiCall('GET', `/api/projects/${p(args.project)}`);
|
|
@@ -363,7 +574,7 @@ async function handleRemote(name, args) {
|
|
|
363
574
|
for (const cat of proj2.categories) {
|
|
364
575
|
const ch = cat.chunks.find(c => c.id === args.id);
|
|
365
576
|
if (ch) {
|
|
366
|
-
const body = { session: s };
|
|
577
|
+
const body = { session: s, source: 'mcp' };
|
|
367
578
|
if (args.new_id !== undefined) body.id = args.new_id;
|
|
368
579
|
if (args.text !== undefined) body.text = args.text;
|
|
369
580
|
const meta = {};
|
|
@@ -384,7 +595,7 @@ async function handleRemote(name, args) {
|
|
|
384
595
|
for (const cat of proj3.categories) {
|
|
385
596
|
const ch = cat.chunks.find(c => c.id === args.id);
|
|
386
597
|
if (ch) {
|
|
387
|
-
return apiCall('DELETE', `/api/projects/${p(args.project)}/categories/${cat.id}/chunks/${ch._uid}?session=${s}`);
|
|
598
|
+
return apiCall('DELETE', `/api/projects/${p(args.project)}/categories/${cat.id}/chunks/${ch._uid}?session=${s}&source=mcp`);
|
|
388
599
|
}
|
|
389
600
|
}
|
|
390
601
|
throw new Error(`Chunk "${args.id}" not found`);
|
|
@@ -394,14 +605,14 @@ async function handleRemote(name, args) {
|
|
|
394
605
|
for (const cat of proj4.categories) {
|
|
395
606
|
const ch = cat.chunks.find(c => c.id === args.id);
|
|
396
607
|
if (ch) {
|
|
397
|
-
return apiCall('POST', `/api/projects/${p(args.project)}/categories/${cat.id}/chunks/${ch._uid}/duplicate
|
|
608
|
+
return apiCall('POST', `/api/projects/${p(args.project)}/categories/${cat.id}/chunks/${ch._uid}/duplicate`, { source: 'mcp' });
|
|
398
609
|
}
|
|
399
610
|
}
|
|
400
611
|
throw new Error(`Chunk "${args.id}" not found`);
|
|
401
612
|
}
|
|
402
613
|
case "move_chunk":
|
|
403
614
|
return apiCall('POST', `/api/projects/${p(args.project)}/chunks/${p(args.id)}/move`, {
|
|
404
|
-
targetCategory: args.target_category, session: s,
|
|
615
|
+
targetCategory: args.target_category, session: s, source: 'mcp',
|
|
405
616
|
});
|
|
406
617
|
case "search_chunks":
|
|
407
618
|
return apiCall('GET', `/api/projects/${p(args.project)}/search?q=${encodeURIComponent(args.query)}`);
|
|
@@ -415,9 +626,61 @@ async function handleRemote(name, args) {
|
|
|
415
626
|
}
|
|
416
627
|
if (!jsonData) throw new Error('Provide either "json_path" or "data" parameter');
|
|
417
628
|
return apiCall('POST', `/api/projects/${p(args.project)}/import`, {
|
|
418
|
-
data: jsonData, category: args.category, session: s,
|
|
629
|
+
data: jsonData, category: args.category, session: s, source: 'mcp',
|
|
630
|
+
});
|
|
631
|
+
}
|
|
632
|
+
case "parse_url": {
|
|
633
|
+
const parsed = await parseUrl(args.url);
|
|
634
|
+
const chunks = splitTextIntoChunks(parsed.text, args.chunk_id);
|
|
635
|
+
const license = args.license || 'CC BY-NC-SA 3.0';
|
|
636
|
+
const chunkData = chunks.map(ch => ({
|
|
637
|
+
id: ch.id, text: ch.text,
|
|
638
|
+
metadata: { page_title: parsed.pageTitle, source: parsed.source, license, ...parsed.infobox },
|
|
639
|
+
}));
|
|
640
|
+
const result = await apiCall('POST', `/api/projects/${p(args.project)}/categories/${p(args.category)}/chunks/bulk`, {
|
|
641
|
+
chunks: chunkData, session: s, source: 'mcp',
|
|
419
642
|
});
|
|
643
|
+
return { ...result, pageTitle: parsed.pageTitle, chunksCreated: chunks.length, infoboxFields: Object.keys(parsed.infobox) };
|
|
420
644
|
}
|
|
645
|
+
case "batch_parse_urls": {
|
|
646
|
+
const results = [];
|
|
647
|
+
const license = args.license || 'CC BY-NC-SA 3.0';
|
|
648
|
+
for (const entry of args.urls) {
|
|
649
|
+
try {
|
|
650
|
+
const parsed = await parseUrl(entry.url);
|
|
651
|
+
const chunks = splitTextIntoChunks(parsed.text, entry.chunk_id);
|
|
652
|
+
const chunkData = chunks.map(ch => ({
|
|
653
|
+
id: ch.id, text: ch.text,
|
|
654
|
+
metadata: { page_title: parsed.pageTitle, source: parsed.source, license, ...parsed.infobox },
|
|
655
|
+
}));
|
|
656
|
+
const r = await apiCall('POST', `/api/projects/${p(args.project)}/categories/${p(args.category)}/chunks/bulk`, {
|
|
657
|
+
chunks: chunkData, session: s, source: 'mcp',
|
|
658
|
+
});
|
|
659
|
+
results.push({ url: entry.url, chunk_id: entry.chunk_id, chunks: chunks.length, added: r.added, errors: r.errors });
|
|
660
|
+
} catch (err) {
|
|
661
|
+
results.push({ url: entry.url, chunk_id: entry.chunk_id, error: err.message });
|
|
662
|
+
}
|
|
663
|
+
}
|
|
664
|
+
return { parsed: results.filter(r => !r.error).length, failed: results.filter(r => r.error).length, results };
|
|
665
|
+
}
|
|
666
|
+
case "bulk_update_metadata":
|
|
667
|
+
return apiCall('POST', `/api/projects/${p(args.project)}/bulk-metadata`, {
|
|
668
|
+
field: args.field, value: args.value, category: args.category, session: s, source: 'mcp',
|
|
669
|
+
});
|
|
670
|
+
case "merge_projects":
|
|
671
|
+
return apiCall('POST', `/api/projects/${p(args.source)}/merge`, {
|
|
672
|
+
target: args.target, session: s, source: 'mcp',
|
|
673
|
+
});
|
|
674
|
+
case "export_category":
|
|
675
|
+
return apiCall('GET', `/api/projects/${p(args.project)}/categories/${p(args.category)}/export`);
|
|
676
|
+
case "get_history":
|
|
677
|
+
return apiCall('GET', `/api/projects/${p(args.project)}/history`);
|
|
678
|
+
case "get_commit":
|
|
679
|
+
return apiCall('GET', `/api/projects/${p(args.project)}/history/${args.commit_id}`);
|
|
680
|
+
case "rollback":
|
|
681
|
+
return apiCall('POST', `/api/projects/${p(args.project)}/history/${args.commit_id}/rollback`, {
|
|
682
|
+
session: s, source: 'mcp',
|
|
683
|
+
});
|
|
421
684
|
default:
|
|
422
685
|
throw new Error(`Unknown tool: ${name}`);
|
|
423
686
|
}
|
|
@@ -593,6 +856,73 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
593
856
|
break;
|
|
594
857
|
}
|
|
595
858
|
|
|
859
|
+
case "parse_url": {
|
|
860
|
+
const parsed = await parseUrl(args.url);
|
|
861
|
+
const chunks = splitTextIntoChunks(parsed.text, args.chunk_id);
|
|
862
|
+
const license = args.license || 'CC BY-NC-SA 3.0';
|
|
863
|
+
const chunkData = chunks.map(ch => ({
|
|
864
|
+
id: ch.id, text: ch.text,
|
|
865
|
+
metadata: { page_title: parsed.pageTitle, source: parsed.source, license, ...parsed.infobox },
|
|
866
|
+
}));
|
|
867
|
+
const bulkResult = store.bulkAddChunks(args.project, args.category, chunkData);
|
|
868
|
+
result = { ...bulkResult, pageTitle: parsed.pageTitle, chunksCreated: chunks.length, infoboxFields: Object.keys(parsed.infobox) };
|
|
869
|
+
break;
|
|
870
|
+
}
|
|
871
|
+
|
|
872
|
+
case "batch_parse_urls": {
|
|
873
|
+
const results = [];
|
|
874
|
+
const license = args.license || 'CC BY-NC-SA 3.0';
|
|
875
|
+
for (const entry of args.urls) {
|
|
876
|
+
try {
|
|
877
|
+
const parsed = await parseUrl(entry.url);
|
|
878
|
+
const chunks = splitTextIntoChunks(parsed.text, entry.chunk_id);
|
|
879
|
+
const chunkData = chunks.map(ch => ({
|
|
880
|
+
id: ch.id, text: ch.text,
|
|
881
|
+
metadata: { page_title: parsed.pageTitle, source: parsed.source, license, ...parsed.infobox },
|
|
882
|
+
}));
|
|
883
|
+
const r = store.bulkAddChunks(args.project, args.category, chunkData);
|
|
884
|
+
results.push({ url: entry.url, chunk_id: entry.chunk_id, chunks: chunks.length, added: r.added, errors: r.errors });
|
|
885
|
+
} catch (err) {
|
|
886
|
+
results.push({ url: entry.url, chunk_id: entry.chunk_id, error: err.message });
|
|
887
|
+
}
|
|
888
|
+
}
|
|
889
|
+
result = { parsed: results.filter(r => !r.error).length, failed: results.filter(r => r.error).length, results };
|
|
890
|
+
break;
|
|
891
|
+
}
|
|
892
|
+
|
|
893
|
+
case "bulk_update_metadata":
|
|
894
|
+
result = store.bulkUpdateMetadata(args.project, args.field, args.value, args.category);
|
|
895
|
+
break;
|
|
896
|
+
|
|
897
|
+
case "merge_projects":
|
|
898
|
+
result = store.mergeProjects(args.source, args.target);
|
|
899
|
+
break;
|
|
900
|
+
|
|
901
|
+
case "export_category": {
|
|
902
|
+
const exported = store.exportCategory(args.project, args.category);
|
|
903
|
+
if (args.save_to_file) {
|
|
904
|
+
const outPath = store._filePath(args.project).replace('.json', `.${args.category}.export.json`);
|
|
905
|
+
const { writeFileSync } = await import('fs');
|
|
906
|
+
writeFileSync(outPath, JSON.stringify(exported, null, 2), 'utf-8');
|
|
907
|
+
result = { exported: exported.length, savedTo: outPath };
|
|
908
|
+
} else {
|
|
909
|
+
result = { exported: exported.length, data: exported };
|
|
910
|
+
}
|
|
911
|
+
break;
|
|
912
|
+
}
|
|
913
|
+
|
|
914
|
+
case "get_history":
|
|
915
|
+
result = store.getHistory(args.project);
|
|
916
|
+
break;
|
|
917
|
+
|
|
918
|
+
case "get_commit":
|
|
919
|
+
result = store.getCommit(args.project, args.commit_id);
|
|
920
|
+
break;
|
|
921
|
+
|
|
922
|
+
case "rollback":
|
|
923
|
+
result = store.rollback(args.project, args.commit_id, 'mcp');
|
|
924
|
+
break;
|
|
925
|
+
|
|
596
926
|
default:
|
|
597
927
|
throw new Error(`Unknown tool: ${name}`);
|
|
598
928
|
}
|
|
@@ -617,7 +947,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
617
947
|
/**
 * Entry point: attach the MCP server to a stdio transport and announce
 * on stderr that it is running.
 */
async function main() {
  await server.connect(new StdioServerTransport());
  console.error("Tryll Dataset Builder MCP server running (v1.3.0)");
}
|
|
622
952
|
|
|
623
953
|
main().catch((err) => {
|
package/lib/store.js
CHANGED
|
@@ -4,6 +4,7 @@ import { randomUUID } from 'crypto';
|
|
|
4
4
|
|
|
5
5
|
const DEFAULT_LICENSE = 'CC BY-NC-SA 3.0';
|
|
6
6
|
const STANDARD_META = ['page_title', 'source', 'license'];
|
|
7
|
+
const MAX_HISTORY = 50;
|
|
7
8
|
|
|
8
9
|
export class Store {
|
|
9
10
|
constructor(dataDir) {
|
|
@@ -25,7 +26,7 @@ export class Store {
|
|
|
25
26
|
|
|
26
27
|
listProjects() {
|
|
27
28
|
this._ensureDir();
|
|
28
|
-
const files = readdirSync(this.dataDir).filter(f => f.endsWith('.json'));
|
|
29
|
+
const files = readdirSync(this.dataDir).filter(f => f.endsWith('.json') && !f.endsWith('.history.json'));
|
|
29
30
|
return files.map(f => {
|
|
30
31
|
const name = f.replace(/\.json$/, '');
|
|
31
32
|
try {
|
|
@@ -343,6 +344,74 @@ export class Store {
|
|
|
343
344
|
return { project: projectName, category: catName, imported, skipped };
|
|
344
345
|
}
|
|
345
346
|
|
|
347
|
+
// ---- BULK UPDATE METADATA ----
|
|
348
|
+
|
|
349
|
+
bulkUpdateMetadata(projectName, field, value, categoryName) {
|
|
350
|
+
const data = this._load(projectName);
|
|
351
|
+
let updated = 0;
|
|
352
|
+
const cats = categoryName
|
|
353
|
+
? [this._findCategory(data, categoryName)]
|
|
354
|
+
: data.categories;
|
|
355
|
+
for (const cat of cats) {
|
|
356
|
+
for (const ch of cat.chunks) {
|
|
357
|
+
if (STANDARD_META.includes(field)) {
|
|
358
|
+
ch.metadata[field] = value;
|
|
359
|
+
} else {
|
|
360
|
+
if (!ch.customFields) ch.customFields = [];
|
|
361
|
+
const existing = ch.customFields.find(cf => cf.key === field);
|
|
362
|
+
if (existing) { existing.value = value; }
|
|
363
|
+
else { ch.customFields.push({ key: field, value }); }
|
|
364
|
+
}
|
|
365
|
+
updated++;
|
|
366
|
+
}
|
|
367
|
+
}
|
|
368
|
+
this._save(projectName, data);
|
|
369
|
+
return { project: projectName, field, value, updated };
|
|
370
|
+
}
|
|
371
|
+
|
|
372
|
+
// ---- MERGE PROJECTS ----
|
|
373
|
+
|
|
374
|
+
mergeProjects(sourceName, targetName) {
|
|
375
|
+
const source = this._load(sourceName);
|
|
376
|
+
const target = this._load(targetName);
|
|
377
|
+
let categoriesMerged = 0, chunksAdded = 0, chunksSkipped = 0;
|
|
378
|
+
|
|
379
|
+
for (const srcCat of source.categories) {
|
|
380
|
+
let tgtCat = target.categories.find(c => c.name.toLowerCase() === srcCat.name.toLowerCase());
|
|
381
|
+
if (!tgtCat) {
|
|
382
|
+
tgtCat = { id: randomUUID(), name: srcCat.name, expanded: true, chunks: [] };
|
|
383
|
+
target.categories.push(tgtCat);
|
|
384
|
+
categoriesMerged++;
|
|
385
|
+
}
|
|
386
|
+
for (const ch of srcCat.chunks) {
|
|
387
|
+
if (this._isIdTaken(target, ch.id)) { chunksSkipped++; continue; }
|
|
388
|
+
tgtCat.chunks.push({ ...JSON.parse(JSON.stringify(ch)), _uid: randomUUID() });
|
|
389
|
+
chunksAdded++;
|
|
390
|
+
}
|
|
391
|
+
}
|
|
392
|
+
|
|
393
|
+
this._save(targetName, target);
|
|
394
|
+
return { source: sourceName, target: targetName, categoriesMerged, chunksAdded, chunksSkipped };
|
|
395
|
+
}
|
|
396
|
+
|
|
397
|
+
// ---- EXPORT CATEGORY ----
|
|
398
|
+
|
|
399
|
+
exportCategory(projectName, categoryName) {
|
|
400
|
+
const data = this._load(projectName);
|
|
401
|
+
const cat = this._findCategory(data, categoryName);
|
|
402
|
+
const flat = [];
|
|
403
|
+
for (const ch of cat.chunks) {
|
|
404
|
+
const entry = { id: ch.id, text: ch.text, metadata: { ...ch.metadata } };
|
|
405
|
+
if (ch.customFields) {
|
|
406
|
+
for (const cf of ch.customFields) {
|
|
407
|
+
if (cf.key && cf.key.trim()) entry.metadata[cf.key.trim()] = String(cf.value ?? '');
|
|
408
|
+
}
|
|
409
|
+
}
|
|
410
|
+
flat.push(entry);
|
|
411
|
+
}
|
|
412
|
+
return flat;
|
|
413
|
+
}
|
|
414
|
+
|
|
346
415
|
// ---- INTERNAL ----
|
|
347
416
|
|
|
348
417
|
_load(name) {
|
|
@@ -371,6 +440,67 @@ export class Store {
|
|
|
371
440
|
return false;
|
|
372
441
|
}
|
|
373
442
|
|
|
443
|
+
// ---- HISTORY ----
|
|
444
|
+
|
|
445
|
+
_historyFilePath(name) {
|
|
446
|
+
return join(this.dataDir, `${name}.history.json`);
|
|
447
|
+
}
|
|
448
|
+
|
|
449
|
+
_loadHistory(name) {
|
|
450
|
+
const fp = this._historyFilePath(name);
|
|
451
|
+
if (!existsSync(fp)) return { project: name, commits: [] };
|
|
452
|
+
return JSON.parse(readFileSync(fp, 'utf-8'));
|
|
453
|
+
}
|
|
454
|
+
|
|
455
|
+
_saveHistory(name, history) {
|
|
456
|
+
this._ensureDir();
|
|
457
|
+
writeFileSync(this._historyFilePath(name), JSON.stringify(history, null, 2), 'utf-8');
|
|
458
|
+
}
|
|
459
|
+
|
|
460
|
+
_commit(projectName, action, summary, source) {
|
|
461
|
+
try {
|
|
462
|
+
const data = this._load(projectName);
|
|
463
|
+
const history = this._loadHistory(projectName);
|
|
464
|
+
const totalChunks = data.categories.reduce((sum, c) => sum + c.chunks.length, 0);
|
|
465
|
+
history.commits.unshift({
|
|
466
|
+
id: randomUUID(),
|
|
467
|
+
timestamp: new Date().toISOString(),
|
|
468
|
+
source: source || 'mcp',
|
|
469
|
+
action, summary,
|
|
470
|
+
stats: { categories: data.categories.length, chunks: totalChunks },
|
|
471
|
+
snapshot: JSON.parse(JSON.stringify(data)),
|
|
472
|
+
});
|
|
473
|
+
if (history.commits.length > MAX_HISTORY) history.commits.length = MAX_HISTORY;
|
|
474
|
+
this._saveHistory(projectName, history);
|
|
475
|
+
} catch { /* history logging should never break mutations */ }
|
|
476
|
+
}
|
|
477
|
+
|
|
478
|
+
getHistory(name) {
|
|
479
|
+
const history = this._loadHistory(name);
|
|
480
|
+
return history.commits.map(c => ({
|
|
481
|
+
id: c.id, timestamp: c.timestamp, source: c.source,
|
|
482
|
+
action: c.action, summary: c.summary, stats: c.stats,
|
|
483
|
+
}));
|
|
484
|
+
}
|
|
485
|
+
|
|
486
|
+
getCommit(name, commitId) {
|
|
487
|
+
const history = this._loadHistory(name);
|
|
488
|
+
const idx = history.commits.findIndex(c => c.id === commitId);
|
|
489
|
+
if (idx === -1) throw new Error('Commit not found');
|
|
490
|
+
const commit = history.commits[idx];
|
|
491
|
+
const prev = idx + 1 < history.commits.length ? history.commits[idx + 1].snapshot : null;
|
|
492
|
+
return { ...commit, prevSnapshot: prev };
|
|
493
|
+
}
|
|
494
|
+
|
|
495
|
+
rollback(name, commitId, source) {
|
|
496
|
+
const history = this._loadHistory(name);
|
|
497
|
+
const commit = history.commits.find(c => c.id === commitId);
|
|
498
|
+
if (!commit) throw new Error('Commit not found');
|
|
499
|
+
this._save(name, commit.snapshot);
|
|
500
|
+
this._commit(name, 'rollback', `Rolled back to commit from ${commit.timestamp}`, source || 'mcp');
|
|
501
|
+
return this._load(name);
|
|
502
|
+
}
|
|
503
|
+
|
|
374
504
|
_parseCustomFields(metadata) {
|
|
375
505
|
if (!metadata || typeof metadata !== 'object') return [];
|
|
376
506
|
return Object.entries(metadata)
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "tryll-dataset-builder-mcp",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "1.3.0",
|
|
4
4
|
"description": "MCP server for building RAG knowledge base datasets. Create, manage and export structured JSON datasets via Claude Code.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "index.js",
|
|
@@ -31,6 +31,7 @@
|
|
|
31
31
|
},
|
|
32
32
|
"dependencies": {
|
|
33
33
|
"@modelcontextprotocol/sdk": "^1.12.1",
|
|
34
|
+
"cheerio": "^1.2.0",
|
|
34
35
|
"ws": "^8.19.0",
|
|
35
36
|
"zod": "^3.24.0"
|
|
36
37
|
}
|