tryll-dataset-builder-mcp 1.1.1 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4)
  1. package/README.md +228 -28
  2. package/index.js +344 -14
  3. package/lib/store.js +131 -1
  4. package/package.json +2 -1
package/README.md CHANGED
@@ -1,9 +1,11 @@
  # Tryll Dataset Builder — MCP Server

- An MCP (Model Context Protocol) server for building structured RAG knowledge base datasets. Use it with Claude Code to create, manage, and export JSON datasets via natural language.
+ An MCP (Model Context Protocol) server for building structured RAG knowledge base datasets. Use it with Claude Code to create, manage, and export JSON datasets via natural language — with optional real-time sync to the [Dataset Builder web app](https://trylljsoncreator.onrender.com).

  Built by [Tryll Engine](https://tryllengine.com) | [Discord](https://discord.gg/CMnMrmapyB)

+ ---
+
  ## Quick Start

  ### 1. Install
@@ -14,8 +16,6 @@ npm install -g tryll-dataset-builder-mcp

  ### 2. Add to Claude Code

- Run in your terminal:
-
  ```bash
  claude mcp add dataset-builder -- npx tryll-dataset-builder-mcp
  ```
@@ -42,55 +42,208 @@ Just talk to Claude:

  > "Create a knowledge base about Minecraft with categories: Mobs, Blocks, Biomes. Add 10 chunks to each category."

- > "Import my existing dataset from ./data/minecraft.json"
+ > "Parse this wiki page and add it to my dataset: https://minecraft.wiki/w/Creeper"
+
+ > "Show me the version history of my project"

- > "Search for all chunks mentioning 'diamond' in my project"
+ ---

  ## Configuration

  | Variable | Default | Description |
  |----------|---------|-------------|
- | `DATA_DIR` | `./datasets` | Directory where project JSON files are stored |
+ | `DATA_DIR` | `./datasets` | Directory for project JSON files (local mode) |
+
+ ---
+
+ ## Two Modes of Operation
+
+ ### Local Mode (default)
+ Data is stored as JSON files in `DATA_DIR`. No server needed.
+
+ ### Connected Mode (real-time sync)
+ Connect to the [Dataset Builder web app](https://trylljsoncreator.onrender.com) for live collaboration. Changes made via MCP appear instantly in the browser, and vice versa.
+
+ ```
+ You: "Connect to session ABC123"
+ Claude: *connects via WebSocket*
+ You: "Add 5 chunks about dragons"
+ → chunks appear in the browser in real-time
+ ```
+
+ ---

- ## Available Tools (18)
+ ## Available Tools (27)
+
+ ### Session Management
+
+ | Tool | Description |
+ |------|-------------|
+ | `connect_session` | Connect to the web app for real-time collaboration. Requires a 6-character session code from the browser UI |
+ | `disconnect_session` | Disconnect from the web app, switch back to local storage |

  ### Project Management
+
  | Tool | Description |
  |------|-------------|
  | `create_project` | Create a new dataset project |
  | `list_projects` | List all projects with stats |
- | `delete_project` | Delete a project |
- | `get_project_stats` | Detailed statistics |
+ | `delete_project` | Permanently delete a project |
+ | `get_project_stats` | Detailed statistics (categories, chunks, text lengths) |

  ### Category Management
+
  | Tool | Description |
  |------|-------------|
- | `create_category` | Add a category to a project |
+ | `create_category` | Add a category to organize chunks |
  | `list_categories` | List categories with chunk counts |
  | `rename_category` | Rename a category |
- | `delete_category` | Delete a category and its chunks |
+ | `delete_category` | Delete a category and all its chunks |

  ### Chunk Operations
+
  | Tool | Description |
  |------|-------------|
- | `add_chunk` | Add a single knowledge chunk |
- | `bulk_add_chunks` | Add multiple chunks at once |
- | `get_chunk` | Get chunk content by ID |
- | `update_chunk` | Update chunk fields |
- | `delete_chunk` | Delete a chunk |
- | `duplicate_chunk` | Clone a chunk |
- | `move_chunk` | Move chunk between categories |
+ | `add_chunk` | Add a single knowledge chunk with ID, text, and metadata |
+ | `bulk_add_chunks` | Add multiple chunks at once (faster than one by one) |
+ | `get_chunk` | Get full content of a chunk by ID |
+ | `update_chunk` | Update chunk fields (ID, text, metadata) |
+ | `delete_chunk` | Delete a chunk by ID |
+ | `duplicate_chunk` | Clone a chunk (creates `id_copy`) |
+ | `move_chunk` | Move a chunk between categories |

  ### Search & Export
+
+ | Tool | Description |
+ |------|-------------|
+ | `search_chunks` | Search by chunk ID or text content |
+ | `export_project` | Export as flat JSON array (RAG-ready) |
+ | `import_json` | Import an existing JSON dataset |
+ | `export_category` | Export a single category as JSON |
+
+ ### URL Parsing
+
+ | Tool | Description |
+ |------|-------------|
+ | `parse_url` | Fetch a web page, extract text, auto-create chunks. Splits text > 2000 chars into multiple chunks. Extracts wiki infobox metadata |
+ | `batch_parse_urls` | Parse multiple URLs at once |
+
+ ### Bulk Operations
+
  | Tool | Description |
  |------|-------------|
- | `search_chunks` | Search by ID or text content |
- | `export_project` | Export as flat JSON (RAG-ready) |
- | `import_json` | Import existing JSON dataset |
+ | `bulk_update_metadata` | Set a metadata field across all chunks (or per category) |
+ | `merge_projects` | Merge all data from one project into another |
+
+ ### Version History
+
+ | Tool | Description |
+ |------|-------------|
+ | `get_history` | Get version history (last 50 commits) for a project |
+ | `get_commit` | Get a specific commit with full snapshot data for diffing |
+ | `rollback` | Rollback a project to a previous commit's state |
+
+ ---
+
+ ## Tool Details
+
+ ### `add_chunk`
+
+ ```
+ project: "minecraft"
+ category: "Mobs"
+ id: "creeper"
+ text: "A Creeper is a hostile mob that silently approaches players..."
+ metadata:
+   page_title: "Creeper"
+   source: "Minecraft Wiki"
+   license: "CC BY-NC-SA 3.0"
+   health: "20"            ← custom metadata field
+   behavior: "explodes"    ← custom metadata field
+ ```
+
+ Standard metadata fields: `page_title`, `source`, `license`. Any extra fields become custom metadata.
+
+ ### `parse_url`
+
+ ```
+ project: "minecraft"
+ category: "Mobs"
+ url: "https://minecraft.wiki/w/Creeper"
+ chunk_id: "creeper"
+ license: "CC BY-NC-SA 3.0"
+ ```

- ## Export Format
+ - Fetches the page, extracts main text content
+ - If text > 2000 chars → auto-splits into `creeper_1`, `creeper_2`, etc.
+ - Extracts page title and source URL as metadata
+ - For wiki pages: extracts infobox/sidebar data as custom metadata fields

- The exported JSON is a flat array, compatible with the [Dataset Builder web app](https://github.com/Skizziik/json_creator) and ready for RAG pipelines:
+ ### `get_history`
+
+ ```
+ project: "minecraft"
+ ```
+
+ Returns:
+ ```json
+ [
+   {
+     "id": "uuid",
+     "timestamp": "2026-02-27T14:30:00.000Z",
+     "source": "mcp",
+     "action": "addChunk",
+     "summary": "Added chunk 'creeper' to 'Mobs'",
+     "stats": { "categories": 3, "chunks": 12 }
+   }
+ ]
+ ```
+
+ ### `rollback`
+
+ ```
+ project: "minecraft"
+ commit_id: "uuid-of-target-commit"
+ ```
+
+ Restores the project to that commit's snapshot. Creates a new "rollback" commit so you can undo the rollback later.
+
+ ---
+
+ ## Data Formats
+
+ ### Project JSON (internal)
+
+ ```json
+ {
+   "name": "minecraft",
+   "createdAt": "2026-02-27T10:00:00.000Z",
+   "categories": [
+     {
+       "id": "uuid",
+       "name": "Mobs",
+       "expanded": true,
+       "chunks": [
+         {
+           "_uid": "uuid",
+           "id": "creeper",
+           "text": "A Creeper is a hostile mob...",
+           "metadata": {
+             "page_title": "Creeper",
+             "source": "Minecraft Wiki",
+             "license": "CC BY-NC-SA 3.0"
+           },
+           "customFields": [
+             { "key": "health", "value": "20" }
+           ]
+         }
+       ]
+     }
+   ]
+ }
+ ```
+
+ ### Export Format (RAG-ready)

  ```json
  [
@@ -101,20 +254,67 @@ The exported JSON is a flat array, compatible with the [Dataset Builder web app]
      "page_title": "Creeper",
      "source": "Minecraft Wiki",
      "license": "CC BY-NC-SA 3.0",
-      "type": "hostile_mob",
      "health": "20"
    }
  }
  ]
  ```

+ ### History Commit
+
+ ```json
+ {
+   "id": "uuid",
+   "timestamp": "2026-02-27T14:30:00.000Z",
+   "source": "browser | mcp",
+   "action": "addChunk",
+   "summary": "Added chunk 'creeper' to 'Mobs'",
+   "stats": { "categories": 3, "chunks": 12 },
+   "snapshot": { "...full project state..." }
+ }
+ ```
+
+ ---
+
+ ## Real-Time Collaboration
+
+ ```
+ ┌─────────────┐    WebSocket     ┌──────────────┐    REST API      ┌─────────────┐
+ │   Browser   │ ◄──────────────► │  Web Server  │ ◄──────────────► │ MCP Server  │
+ │  (Dataset   │   data:changed   │  (Express +  │   POST/PUT/DEL   │  (Claude    │
+ │   Builder)  │   mcp:connected  │  WebSocket)  │   + source:mcp   │   Code)     │
+ └─────────────┘                  └──────────────┘                  └─────────────┘
+ ```
+
+ 1. Open the [Dataset Builder](https://trylljsoncreator.onrender.com) in your browser
+ 2. Copy the 6-character session code from the top bar
+ 3. Tell Claude: *"Connect to session ABC123"*
+ 4. All changes sync in real-time between browser and Claude
+ 5. Version history tracks who made each change (browser vs MCP)
+
+ ---
+
  ## Example Prompts

  - *"Create a Dark Souls knowledge base with categories for Bosses, Weapons, and Locations"*
- - *"Add 15 chunks about Minecraft mobs with detailed descriptions"*
- - *"Export my project as JSON and save to file"*
- - *"Search for chunks about 'fire' in my dark_souls project"*
- - *"Move chunk 'ancient_dragon' from Bosses to Enemies category"*
+ - *"Parse these wiki pages and add them to my Minecraft project: [url1], [url2], [url3]"*
+ - *"Bulk update the license field to 'MIT' for all chunks in the Mobs category"*
+ - *"Show me the version history of my project"*
+ - *"Rollback my project to the commit before I deleted that category"*
+ - *"Merge my test_data project into the main production project"*
+ - *"Export the Bosses category as JSON"*
+ - *"Connect to session XYZ789 and add 20 chunks about potions"*
+
+ ---
+
+ ## Links
+
+ - **Web App**: [trylljsoncreator.onrender.com](https://trylljsoncreator.onrender.com)
+ - **Web App Repo**: [github.com/Skizziik/json_creator](https://github.com/Skizziik/json_creator)
+ - **MCP Repo**: [github.com/Skizziik/tryll_dataset_builder](https://github.com/Skizziik/tryll_dataset_builder)
+ - **npm**: [tryll-dataset-builder-mcp](https://www.npmjs.com/package/tryll-dataset-builder-mcp)
+ - **Tryll Engine**: [tryllengine.com](https://tryllengine.com)
+ - **Discord**: [discord.gg/CMnMrmapyB](https://discord.gg/CMnMrmapyB)

  ## License
 
package/index.js CHANGED
@@ -5,11 +5,12 @@ import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js"
  import { CallToolRequestSchema, ListToolsRequestSchema } from "@modelcontextprotocol/sdk/types.js";
  import { Store } from "./lib/store.js";
  import WebSocket from "ws";
+ import * as cheerio from "cheerio";

  const store = new Store(process.env.DATA_DIR);

  const server = new Server(
-   { name: "tryll-dataset-builder", version: "1.1.0" },
+   { name: "tryll-dataset-builder", version: "1.3.0" },
    { capabilities: { tools: {} } }
  );

@@ -38,6 +39,96 @@ async function apiCall(method, path, body) {
    return data;
  }

+ // ============================================
+ // URL PARSING HELPERS
+ // ============================================
+
+ const CHUNK_LIMIT = 2000;
+
+ async function parseUrl(url) {
+   const res = await fetch(url, {
+     headers: { 'User-Agent': 'Mozilla/5.0 (compatible; TryllDatasetBuilder/1.2)' },
+   });
+   if (!res.ok) throw new Error(`Failed to fetch ${url}: HTTP ${res.status}`);
+   const html = await res.text();
+   const $ = cheerio.load(html);
+
+   // Extract page title
+   const pageTitle = $('title').first().text().trim()
+     || $('h1').first().text().trim()
+     || '';
+
+   // Extract wiki infobox metadata
+   const infobox = {};
+   $('.infobox tr, .sidebar tr, .wikitable.infobox tr, table.infobox tr').each((_, row) => {
+     const $row = $(row);
+     const key = $row.find('th').first().text().trim().replace(/\s+/g, ' ');
+     const val = $row.find('td').first().text().trim().replace(/\s+/g, ' ');
+     if (key && val && key.length < 60 && val.length < 200) {
+       infobox[key] = val;
+     }
+   });
+
+   // Remove noise elements
+   $('script, style, nav, footer, header, .sidebar, .infobox, .navbox, .mw-editsection, .reference, .reflist, #mw-navigation, .noprint, .toc').remove();
+
+   // Extract main text
+   const mainContent = $('article, main, #mw-content-text, #content, .mw-parser-output, #bodyContent, .entry-content, .post-content').first();
+   let text = '';
+   if (mainContent.length) {
+     text = mainContent.text();
+   } else {
+     text = $('body').text();
+   }
+
+   // Clean up whitespace
+   text = text
+     .replace(/\t/g, ' ')
+     .replace(/[ ]{2,}/g, ' ')
+     .replace(/\n{3,}/g, '\n\n')
+     .trim();
+
+   return { text, pageTitle, infobox, source: url };
+ }
+
+ function splitTextIntoChunks(text, baseId, limit = CHUNK_LIMIT) {
+   if (text.length <= limit) {
+     return [{ id: baseId, text }];
+   }
+
+   const chunks = [];
+   let remaining = text;
+   let index = 1;
+
+   while (remaining.length > 0) {
+     let cutPoint = limit;
+     if (remaining.length > limit) {
+       // Try to cut at paragraph boundary
+       const paraBreak = remaining.lastIndexOf('\n\n', limit);
+       if (paraBreak > limit * 0.3) {
+         cutPoint = paraBreak;
+       } else {
+         // Try sentence boundary
+         const sentBreak = remaining.lastIndexOf('. ', limit);
+         if (sentBreak > limit * 0.3) {
+           cutPoint = sentBreak + 1;
+         }
+       }
+     } else {
+       cutPoint = remaining.length;
+     }
+
+     chunks.push({
+       id: `${baseId}_${index}`,
+       text: remaining.substring(0, cutPoint).trim(),
+     });
+     remaining = remaining.substring(cutPoint).trim();
+     index++;
+   }
+
+   return chunks;
+ }
+
  // ============================================
  // TOOL DEFINITIONS
  // ============================================
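The `splitTextIntoChunks` helper above is what produces the `_1`, `_2` chunk suffixes the README describes. As a rough illustration (not part of the package), pasting `CHUNK_LIMIT` and `splitTextIntoChunks` from the hunk above into a scratch ES module and feeding them a synthetic multi-paragraph text gives:

```js
// Scratch demo — assumes CHUNK_LIMIT and splitTextIntoChunks from the hunk above
// are defined in the same file; the sample text is synthetic.
const sample = Array.from({ length: 30 }, (_, i) => `Paragraph ${i + 1}. `.repeat(10)).join('\n\n');

const pieces = splitTextIntoChunks(sample, 'creeper');
console.log(pieces.map(p => ({ id: p.id, chars: p.text.length })));
// e.g. [ { id: 'creeper_1', chars: ... }, { id: 'creeper_2', chars: ... }, ... ]
// Each piece ends near a paragraph boundary and stays at or just under the 2000-character limit.
```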
@@ -309,6 +400,126 @@ const TOOLS = [
      required: ["project"],
    },
  },
+
+ // ---- URL Parsing ----
+ {
+   name: "parse_url",
+   description: "Fetch a web page, extract its text content, and auto-create chunks. If text exceeds 2000 characters, it auto-splits into multiple chunks with _1, _2 suffixes. Extracts page title and source URL as metadata. For wiki pages, extracts infobox/sidebar data as custom metadata fields.",
+   inputSchema: {
+     type: "object",
+     properties: {
+       project: { type: "string", description: "Project name" },
+       category: { type: "string", description: "Category to add chunks into" },
+       url: { type: "string", description: "URL to fetch and parse" },
+       chunk_id: { type: "string", description: "Base chunk ID. If text is split, becomes chunk_id_1, chunk_id_2, etc." },
+       license: { type: "string", description: "License for the content. Default: CC BY-NC-SA 3.0" },
+     },
+     required: ["project", "category", "url", "chunk_id"],
+   },
+ },
+ {
+   name: "batch_parse_urls",
+   description: "Parse multiple URLs at once and add all chunks to a category. Each URL gets its own chunk ID prefix. Auto-splits long texts into multiple chunks.",
+   inputSchema: {
+     type: "object",
+     properties: {
+       project: { type: "string", description: "Project name" },
+       category: { type: "string", description: "Category to add chunks into" },
+       urls: {
+         type: "array",
+         description: "Array of URL entries to parse",
+         items: {
+           type: "object",
+           properties: {
+             url: { type: "string", description: "URL to fetch" },
+             chunk_id: { type: "string", description: "Base chunk ID for this URL" },
+           },
+           required: ["url", "chunk_id"],
+         },
+       },
+       license: { type: "string", description: "License for all content. Default: CC BY-NC-SA 3.0" },
+     },
+     required: ["project", "category", "urls"],
+   },
+ },
+
+ // ---- Bulk Operations ----
+ {
+   name: "bulk_update_metadata",
+   description: "Update a metadata field across ALL chunks in a project (or a specific category). Useful for setting license, source, or custom fields in bulk.",
+   inputSchema: {
+     type: "object",
+     properties: {
+       project: { type: "string", description: "Project name" },
+       field: { type: "string", description: "Metadata field to update (e.g. 'license', 'source', or any custom field name)" },
+       value: { type: "string", description: "New value for the field" },
+       category: { type: "string", description: "Optional: only update chunks in this category. If omitted, updates all chunks in the project." },
+     },
+     required: ["project", "field", "value"],
+   },
+ },
+ {
+   name: "merge_projects",
+   description: "Merge all categories and chunks from a source project into a target project. Categories with the same name are combined. Chunks with duplicate IDs are skipped.",
+   inputSchema: {
+     type: "object",
+     properties: {
+       source: { type: "string", description: "Source project name (data is copied FROM here)" },
+       target: { type: "string", description: "Target project name (data is merged INTO here)" },
+     },
+     required: ["source", "target"],
+   },
+ },
+ {
+   name: "export_category",
+   description: "Export a single category as a flat JSON array. Same format as export_project but filtered to one category.",
+   inputSchema: {
+     type: "object",
+     properties: {
+       project: { type: "string", description: "Project name" },
+       category: { type: "string", description: "Category name to export" },
+       save_to_file: { type: "boolean", description: "If true, saves to a file. Default: false." },
+     },
+     required: ["project", "category"],
+   },
+ },
+
+ // ---- History ----
+ {
+   name: "get_history",
+   description: "Get version history (last 50 commits) for a project. Each commit shows who made the change (browser/MCP), what was changed, and when. Returns lightweight list without snapshots.",
+   inputSchema: {
+     type: "object",
+     properties: {
+       project: { type: "string", description: "Project name" },
+     },
+     required: ["project"],
+   },
+ },
+ {
+   name: "get_commit",
+   description: "Get a specific commit with full snapshot data. Returns the commit's snapshot and the previous commit's snapshot for computing diffs.",
+   inputSchema: {
+     type: "object",
+     properties: {
+       project: { type: "string", description: "Project name" },
+       commit_id: { type: "string", description: "Commit UUID" },
+     },
+     required: ["project", "commit_id"],
+   },
+ },
+ {
+   name: "rollback",
+   description: "Rollback a project to a specific commit's state. Restores the project data from that commit's snapshot and creates a new 'rollback' commit in history. Safe: you can undo a rollback by rolling back to a later commit.",
+   inputSchema: {
+     type: "object",
+     properties: {
+       project: { type: "string", description: "Project name" },
+       commit_id: { type: "string", description: "Commit UUID to rollback to" },
+     },
+     required: ["project", "commit_id"],
+   },
+ },
  ];

  // ============================================
@@ -327,28 +538,28 @@ async function handleRemote(name, args) {

    switch (name) {
      case "create_project":
-       return apiCall('POST', '/api/projects', { name: args.name, session: s });
+       return apiCall('POST', '/api/projects', { name: args.name, session: s, source: 'mcp' });
      case "list_projects":
        return apiCall('GET', '/api/projects');
      case "delete_project":
-       return apiCall('DELETE', `/api/projects/${p(args.name)}?session=${s}`);
+       return apiCall('DELETE', `/api/projects/${p(args.name)}?session=${s}&source=mcp`);
      case "get_project_stats":
        return apiCall('GET', `/api/projects/${p(args.name)}/stats`);
      case "create_category":
-       return apiCall('POST', `/api/projects/${p(args.project)}/categories`, { name: args.name, session: s });
+       return apiCall('POST', `/api/projects/${p(args.project)}/categories`, { name: args.name, session: s, source: 'mcp' });
      case "list_categories":
        return apiCall('GET', `/api/projects/${p(args.project)}/categories`);
      case "rename_category":
-       return apiCall('PUT', `/api/projects/${p(args.project)}/categories/${p(args.old_name)}`, { newName: args.new_name, session: s });
+       return apiCall('PUT', `/api/projects/${p(args.project)}/categories/${p(args.old_name)}`, { newName: args.new_name, session: s, source: 'mcp' });
      case "delete_category":
-       return apiCall('DELETE', `/api/projects/${p(args.project)}/categories/${p(args.name)}?session=${s}`);
+       return apiCall('DELETE', `/api/projects/${p(args.project)}/categories/${p(args.name)}?session=${s}&source=mcp`);
      case "add_chunk":
        return apiCall('POST', `/api/projects/${p(args.project)}/categories/${p(args.category)}/chunks`, {
-         id: args.id, text: args.text, metadata: args.metadata, session: s,
+         id: args.id, text: args.text, metadata: args.metadata, session: s, source: 'mcp',
        });
      case "bulk_add_chunks":
        return apiCall('POST', `/api/projects/${p(args.project)}/categories/${p(args.category)}/chunks/bulk`, {
-         chunks: args.chunks, session: s,
+         chunks: args.chunks, session: s, source: 'mcp',
        });
      case "get_chunk": {
        const proj = await apiCall('GET', `/api/projects/${p(args.project)}`);
@@ -363,7 +574,7 @@ async function handleRemote(name, args) {
        for (const cat of proj2.categories) {
          const ch = cat.chunks.find(c => c.id === args.id);
          if (ch) {
-           const body = { session: s };
+           const body = { session: s, source: 'mcp' };
            if (args.new_id !== undefined) body.id = args.new_id;
            if (args.text !== undefined) body.text = args.text;
            const meta = {};
@@ -384,7 +595,7 @@ async function handleRemote(name, args) {
        for (const cat of proj3.categories) {
          const ch = cat.chunks.find(c => c.id === args.id);
          if (ch) {
-           return apiCall('DELETE', `/api/projects/${p(args.project)}/categories/${cat.id}/chunks/${ch._uid}?session=${s}`);
+           return apiCall('DELETE', `/api/projects/${p(args.project)}/categories/${cat.id}/chunks/${ch._uid}?session=${s}&source=mcp`);
          }
        }
        throw new Error(`Chunk "${args.id}" not found`);
@@ -394,14 +605,14 @@ async function handleRemote(name, args) {
        for (const cat of proj4.categories) {
          const ch = cat.chunks.find(c => c.id === args.id);
          if (ch) {
-           return apiCall('POST', `/api/projects/${p(args.project)}/categories/${cat.id}/chunks/${ch._uid}/duplicate`);
+           return apiCall('POST', `/api/projects/${p(args.project)}/categories/${cat.id}/chunks/${ch._uid}/duplicate`, { source: 'mcp' });
          }
        }
        throw new Error(`Chunk "${args.id}" not found`);
      }
      case "move_chunk":
        return apiCall('POST', `/api/projects/${p(args.project)}/chunks/${p(args.id)}/move`, {
-         targetCategory: args.target_category, session: s,
+         targetCategory: args.target_category, session: s, source: 'mcp',
        });
      case "search_chunks":
        return apiCall('GET', `/api/projects/${p(args.project)}/search?q=${encodeURIComponent(args.query)}`);
@@ -415,9 +626,61 @@ async function handleRemote(name, args) {
        }
        if (!jsonData) throw new Error('Provide either "json_path" or "data" parameter');
        return apiCall('POST', `/api/projects/${p(args.project)}/import`, {
-         data: jsonData, category: args.category, session: s,
+         data: jsonData, category: args.category, session: s, source: 'mcp',
+       });
+     }
+     case "parse_url": {
+       const parsed = await parseUrl(args.url);
+       const chunks = splitTextIntoChunks(parsed.text, args.chunk_id);
+       const license = args.license || 'CC BY-NC-SA 3.0';
+       const chunkData = chunks.map(ch => ({
+         id: ch.id, text: ch.text,
+         metadata: { page_title: parsed.pageTitle, source: parsed.source, license, ...parsed.infobox },
+       }));
+       const result = await apiCall('POST', `/api/projects/${p(args.project)}/categories/${p(args.category)}/chunks/bulk`, {
+         chunks: chunkData, session: s, source: 'mcp',
        });
+       return { ...result, pageTitle: parsed.pageTitle, chunksCreated: chunks.length, infoboxFields: Object.keys(parsed.infobox) };
      }
+     case "batch_parse_urls": {
+       const results = [];
+       const license = args.license || 'CC BY-NC-SA 3.0';
+       for (const entry of args.urls) {
+         try {
+           const parsed = await parseUrl(entry.url);
+           const chunks = splitTextIntoChunks(parsed.text, entry.chunk_id);
+           const chunkData = chunks.map(ch => ({
+             id: ch.id, text: ch.text,
+             metadata: { page_title: parsed.pageTitle, source: parsed.source, license, ...parsed.infobox },
+           }));
+           const r = await apiCall('POST', `/api/projects/${p(args.project)}/categories/${p(args.category)}/chunks/bulk`, {
+             chunks: chunkData, session: s, source: 'mcp',
+           });
+           results.push({ url: entry.url, chunk_id: entry.chunk_id, chunks: chunks.length, added: r.added, errors: r.errors });
+         } catch (err) {
+           results.push({ url: entry.url, chunk_id: entry.chunk_id, error: err.message });
+         }
+       }
+       return { parsed: results.filter(r => !r.error).length, failed: results.filter(r => r.error).length, results };
+     }
+     case "bulk_update_metadata":
+       return apiCall('POST', `/api/projects/${p(args.project)}/bulk-metadata`, {
+         field: args.field, value: args.value, category: args.category, session: s, source: 'mcp',
+       });
+     case "merge_projects":
+       return apiCall('POST', `/api/projects/${p(args.source)}/merge`, {
+         target: args.target, session: s, source: 'mcp',
+       });
+     case "export_category":
+       return apiCall('GET', `/api/projects/${p(args.project)}/categories/${p(args.category)}/export`);
+     case "get_history":
+       return apiCall('GET', `/api/projects/${p(args.project)}/history`);
+     case "get_commit":
+       return apiCall('GET', `/api/projects/${p(args.project)}/history/${args.commit_id}`);
+     case "rollback":
+       return apiCall('POST', `/api/projects/${p(args.project)}/history/${args.commit_id}/rollback`, {
+         session: s, source: 'mcp',
+       });
      default:
        throw new Error(`Unknown tool: ${name}`);
    }
@@ -593,6 +856,73 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
        break;
      }

+     case "parse_url": {
+       const parsed = await parseUrl(args.url);
+       const chunks = splitTextIntoChunks(parsed.text, args.chunk_id);
+       const license = args.license || 'CC BY-NC-SA 3.0';
+       const chunkData = chunks.map(ch => ({
+         id: ch.id, text: ch.text,
+         metadata: { page_title: parsed.pageTitle, source: parsed.source, license, ...parsed.infobox },
+       }));
+       const bulkResult = store.bulkAddChunks(args.project, args.category, chunkData);
+       result = { ...bulkResult, pageTitle: parsed.pageTitle, chunksCreated: chunks.length, infoboxFields: Object.keys(parsed.infobox) };
+       break;
+     }
+
+     case "batch_parse_urls": {
+       const results = [];
+       const license = args.license || 'CC BY-NC-SA 3.0';
+       for (const entry of args.urls) {
+         try {
+           const parsed = await parseUrl(entry.url);
+           const chunks = splitTextIntoChunks(parsed.text, entry.chunk_id);
+           const chunkData = chunks.map(ch => ({
+             id: ch.id, text: ch.text,
+             metadata: { page_title: parsed.pageTitle, source: parsed.source, license, ...parsed.infobox },
+           }));
+           const r = store.bulkAddChunks(args.project, args.category, chunkData);
+           results.push({ url: entry.url, chunk_id: entry.chunk_id, chunks: chunks.length, added: r.added, errors: r.errors });
+         } catch (err) {
+           results.push({ url: entry.url, chunk_id: entry.chunk_id, error: err.message });
+         }
+       }
+       result = { parsed: results.filter(r => !r.error).length, failed: results.filter(r => r.error).length, results };
+       break;
+     }
+
+     case "bulk_update_metadata":
+       result = store.bulkUpdateMetadata(args.project, args.field, args.value, args.category);
+       break;
+
+     case "merge_projects":
+       result = store.mergeProjects(args.source, args.target);
+       break;
+
+     case "export_category": {
+       const exported = store.exportCategory(args.project, args.category);
+       if (args.save_to_file) {
+         const outPath = store._filePath(args.project).replace('.json', `.${args.category}.export.json`);
+         const { writeFileSync } = await import('fs');
+         writeFileSync(outPath, JSON.stringify(exported, null, 2), 'utf-8');
+         result = { exported: exported.length, savedTo: outPath };
+       } else {
+         result = { exported: exported.length, data: exported };
+       }
+       break;
+     }
+
+     case "get_history":
+       result = store.getHistory(args.project);
+       break;
+
+     case "get_commit":
+       result = store.getCommit(args.project, args.commit_id);
+       break;
+
+     case "rollback":
+       result = store.rollback(args.project, args.commit_id, 'mcp');
+       break;
+
      default:
        throw new Error(`Unknown tool: ${name}`);
    }
@@ -617,7 +947,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
  async function main() {
    const transport = new StdioServerTransport();
    await server.connect(transport);
-   console.error("Tryll Dataset Builder MCP server running (v1.1.0)");
+   console.error("Tryll Dataset Builder MCP server running (v1.3.0)");
  }

  main().catch((err) => {
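Since `cheerio` is the new dependency doing the scraping here, a small self-contained sketch (not from the package) of the infobox pattern `parseUrl()` relies on: `<th>`/`<td>` pairs inside `table.infobox` rows become metadata keys and values. The HTML fragment below is made up:

```js
import * as cheerio from "cheerio";

// Made-up infobox fragment mirroring the structure parseUrl() scrapes.
const html = `
  <table class="infobox">
    <tr><th>Health</th><td>20</td></tr>
    <tr><th>Behavior</th><td>Explodes when close</td></tr>
  </table>`;

const $ = cheerio.load(html);
const infobox = {};
$('table.infobox tr').each((_, row) => {
  const key = $(row).find('th').first().text().trim();
  const val = $(row).find('td').first().text().trim();
  if (key && val) infobox[key] = val; // parseUrl() additionally caps key/value lengths
});

console.log(infobox); // { Health: '20', Behavior: 'Explodes when close' }
```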
package/lib/store.js CHANGED
@@ -4,6 +4,7 @@ import { randomUUID } from 'crypto';

  const DEFAULT_LICENSE = 'CC BY-NC-SA 3.0';
  const STANDARD_META = ['page_title', 'source', 'license'];
+ const MAX_HISTORY = 50;

  export class Store {
    constructor(dataDir) {
@@ -25,7 +26,7 @@ export class Store {

    listProjects() {
      this._ensureDir();
-     const files = readdirSync(this.dataDir).filter(f => f.endsWith('.json'));
+     const files = readdirSync(this.dataDir).filter(f => f.endsWith('.json') && !f.endsWith('.history.json'));
      return files.map(f => {
        const name = f.replace(/\.json$/, '');
        try {
@@ -343,6 +344,74 @@ export class Store {
      return { project: projectName, category: catName, imported, skipped };
    }

+   // ---- BULK UPDATE METADATA ----
+
+   bulkUpdateMetadata(projectName, field, value, categoryName) {
+     const data = this._load(projectName);
+     let updated = 0;
+     const cats = categoryName
+       ? [this._findCategory(data, categoryName)]
+       : data.categories;
+     for (const cat of cats) {
+       for (const ch of cat.chunks) {
+         if (STANDARD_META.includes(field)) {
+           ch.metadata[field] = value;
+         } else {
+           if (!ch.customFields) ch.customFields = [];
+           const existing = ch.customFields.find(cf => cf.key === field);
+           if (existing) { existing.value = value; }
+           else { ch.customFields.push({ key: field, value }); }
+         }
+         updated++;
+       }
+     }
+     this._save(projectName, data);
+     return { project: projectName, field, value, updated };
+   }
+
+   // ---- MERGE PROJECTS ----
+
+   mergeProjects(sourceName, targetName) {
+     const source = this._load(sourceName);
+     const target = this._load(targetName);
+     let categoriesMerged = 0, chunksAdded = 0, chunksSkipped = 0;
+
+     for (const srcCat of source.categories) {
+       let tgtCat = target.categories.find(c => c.name.toLowerCase() === srcCat.name.toLowerCase());
+       if (!tgtCat) {
+         tgtCat = { id: randomUUID(), name: srcCat.name, expanded: true, chunks: [] };
+         target.categories.push(tgtCat);
+         categoriesMerged++;
+       }
+       for (const ch of srcCat.chunks) {
+         if (this._isIdTaken(target, ch.id)) { chunksSkipped++; continue; }
+         tgtCat.chunks.push({ ...JSON.parse(JSON.stringify(ch)), _uid: randomUUID() });
+         chunksAdded++;
+       }
+     }
+
+     this._save(targetName, target);
+     return { source: sourceName, target: targetName, categoriesMerged, chunksAdded, chunksSkipped };
+   }
+
+   // ---- EXPORT CATEGORY ----
+
+   exportCategory(projectName, categoryName) {
+     const data = this._load(projectName);
+     const cat = this._findCategory(data, categoryName);
+     const flat = [];
+     for (const ch of cat.chunks) {
+       const entry = { id: ch.id, text: ch.text, metadata: { ...ch.metadata } };
+       if (ch.customFields) {
+         for (const cf of ch.customFields) {
+           if (cf.key && cf.key.trim()) entry.metadata[cf.key.trim()] = String(cf.value ?? '');
+         }
+       }
+       flat.push(entry);
+     }
+     return flat;
+   }
+
    // ---- INTERNAL ----

    _load(name) {
@@ -371,6 +440,67 @@ export class Store {
      return false;
    }

+   // ---- HISTORY ----
+
+   _historyFilePath(name) {
+     return join(this.dataDir, `${name}.history.json`);
+   }
+
+   _loadHistory(name) {
+     const fp = this._historyFilePath(name);
+     if (!existsSync(fp)) return { project: name, commits: [] };
+     return JSON.parse(readFileSync(fp, 'utf-8'));
+   }
+
+   _saveHistory(name, history) {
+     this._ensureDir();
+     writeFileSync(this._historyFilePath(name), JSON.stringify(history, null, 2), 'utf-8');
+   }
+
+   _commit(projectName, action, summary, source) {
+     try {
+       const data = this._load(projectName);
+       const history = this._loadHistory(projectName);
+       const totalChunks = data.categories.reduce((sum, c) => sum + c.chunks.length, 0);
+       history.commits.unshift({
+         id: randomUUID(),
+         timestamp: new Date().toISOString(),
+         source: source || 'mcp',
+         action, summary,
+         stats: { categories: data.categories.length, chunks: totalChunks },
+         snapshot: JSON.parse(JSON.stringify(data)),
+       });
+       if (history.commits.length > MAX_HISTORY) history.commits.length = MAX_HISTORY;
+       this._saveHistory(projectName, history);
+     } catch { /* history logging should never break mutations */ }
+   }
+
+   getHistory(name) {
+     const history = this._loadHistory(name);
+     return history.commits.map(c => ({
+       id: c.id, timestamp: c.timestamp, source: c.source,
+       action: c.action, summary: c.summary, stats: c.stats,
+     }));
+   }
+
+   getCommit(name, commitId) {
+     const history = this._loadHistory(name);
+     const idx = history.commits.findIndex(c => c.id === commitId);
+     if (idx === -1) throw new Error('Commit not found');
+     const commit = history.commits[idx];
+     const prev = idx + 1 < history.commits.length ? history.commits[idx + 1].snapshot : null;
+     return { ...commit, prevSnapshot: prev };
+   }
+
+   rollback(name, commitId, source) {
+     const history = this._loadHistory(name);
+     const commit = history.commits.find(c => c.id === commitId);
+     if (!commit) throw new Error('Commit not found');
+     this._save(name, commit.snapshot);
+     this._commit(name, 'rollback', `Rolled back to commit from ${commit.timestamp}`, source || 'mcp');
+     return this._load(name);
+   }
+
    _parseCustomFields(metadata) {
      if (!metadata || typeof metadata !== 'object') return [];
      return Object.entries(metadata)
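A note on the new history layer: commits are written to a sidecar `<project>.history.json` next to the project file (which is why `listProjects()` now filters those files out), and `getHistory()` returns commits without their snapshots. A standalone sketch with one synthetic commit, just to show the shapes involved:

```js
// Synthetic history object in the shape _commit() writes to <project>.history.json.
const history = {
  project: 'minecraft',
  commits: [
    {
      id: 'uuid-1',
      timestamp: '2026-02-27T14:30:00.000Z',
      source: 'mcp', // or 'browser' when the change came from the web app
      action: 'addChunk',
      summary: "Added chunk 'creeper' to 'Mobs'",
      stats: { categories: 3, chunks: 12 },
      snapshot: { name: 'minecraft', categories: [] }, // real commits hold the full project state
    },
  ],
};

// getHistory() maps each commit to the same fields minus the snapshot:
const listing = history.commits.map(({ id, timestamp, source, action, summary, stats }) =>
  ({ id, timestamp, source, action, summary, stats }));
console.log(listing);
```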
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "tryll-dataset-builder-mcp",
-   "version": "1.1.1",
+   "version": "1.3.0",
    "description": "MCP server for building RAG knowledge base datasets. Create, manage and export structured JSON datasets via Claude Code.",
    "type": "module",
    "main": "index.js",
@@ -31,6 +31,7 @@
    },
    "dependencies": {
      "@modelcontextprotocol/sdk": "^1.12.1",
+     "cheerio": "^1.2.0",
      "ws": "^8.19.0",
      "zod": "^3.24.0"
    }
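Finally, a rough sense of what consuming the RAG-ready export documented in the README looks like; the file name below is a placeholder for wherever you saved the output of `export_project` or `export_category`:

```js
import { readFileSync } from 'fs';

// Each exported entry is { id, text, metadata }, with custom fields already flattened into metadata.
const chunks = JSON.parse(readFileSync('./minecraft.export.json', 'utf-8'));

const bySource = {};
for (const { metadata } of chunks) {
  const source = metadata.source ?? 'unknown';
  bySource[source] = (bySource[source] ?? 0) + 1;
}

console.log(`${chunks.length} chunks ready for ingestion`, bySource);
```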