tryll-dataset-builder-mcp 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4) hide show
  1. package/README.md +121 -0
  2. package/index.js +406 -0
  3. package/lib/store.js +389 -0
  4. package/package.json +36 -0
package/README.md ADDED
@@ -0,0 +1,121 @@
1
+ # Tryll Dataset Builder — MCP Server
2
+
3
+ An MCP (Model Context Protocol) server for building structured RAG knowledge base datasets. Use it with Claude Code to create, manage, and export JSON datasets via natural language.
4
+
5
+ Built by [Tryll Engine](https://tryllengine.com) | [Discord](https://discord.gg/CMnMrmapyB)
6
+
7
+ ## Quick Start
8
+
9
+ ### 1. Install
10
+
11
+ ```bash
12
+ npm install -g tryll-dataset-builder-mcp
13
+ ```
14
+
15
+ ### 2. Add to Claude Code
16
+
17
+ Run in your terminal:
18
+
19
+ ```bash
20
+ claude mcp add dataset-builder -- npx tryll-dataset-builder-mcp
21
+ ```
22
+
23
+ Or manually add to `~/.claude/mcp_settings.json`:
24
+
25
+ ```json
26
+ {
27
+ "mcpServers": {
28
+ "dataset-builder": {
29
+ "command": "npx",
30
+ "args": ["tryll-dataset-builder-mcp"],
31
+ "env": {
32
+ "DATA_DIR": "./datasets"
33
+ }
34
+ }
35
+ }
36
+ }
37
+ ```
38
+
39
+ ### 3. Use
40
+
41
+ Just talk to Claude:
42
+
43
+ > "Create a knowledge base about Minecraft with categories: Mobs, Blocks, Biomes. Add 10 chunks to each category."
44
+
45
+ > "Import my existing dataset from ./data/minecraft.json"
46
+
47
+ > "Search for all chunks mentioning 'diamond' in my project"
48
+
49
+ ## Configuration
50
+
51
+ | Variable | Default | Description |
52
+ |----------|---------|-------------|
53
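+ | `DATA_DIR` | `./datasets` | Directory where project JSON files are stored. Relative paths are resolved against the server's working directory; the directory is created automatically if it does not exist. |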
54
+
55
+ ## Available Tools (18)
56
+
57
+ ### Project Management
58
+ | Tool | Description |
59
+ |------|-------------|
60
+ | `create_project` | Create a new dataset project |
61
+ | `list_projects` | List all projects with stats |
62
+ | `delete_project` | Delete a project |
63
+ | `get_project_stats` | Detailed statistics |
64
+
65
+ ### Category Management
66
+ | Tool | Description |
67
+ |------|-------------|
68
+ | `create_category` | Add a category to a project |
69
+ | `list_categories` | List categories with chunk counts |
70
+ | `rename_category` | Rename a category |
71
+ | `delete_category` | Delete a category and its chunks |
72
+
73
+ ### Chunk Operations
74
+ | Tool | Description |
75
+ |------|-------------|
76
+ | `add_chunk` | Add a single knowledge chunk |
77
+ | `bulk_add_chunks` | Add multiple chunks at once |
78
+ | `get_chunk` | Get chunk content by ID |
79
+ | `update_chunk` | Update chunk fields |
80
+ | `delete_chunk` | Delete a chunk |
81
+ | `duplicate_chunk` | Clone a chunk under a new ID (`<id>_copy`, then `<id>_copy_1`, … if taken) |
82
+ | `move_chunk` | Move chunk between categories |
83
+
84
+ ### Search & Export
85
+ | Tool | Description |
86
+ |------|-------------|
87
+ | `search_chunks` | Search by ID or text content |
88
+ | `export_project` | Export as flat JSON (RAG-ready) |
89
+ | `import_json` | Import existing JSON dataset |
90
+
91
+ ## Export Format
92
+
93
+ The exported JSON is a flat array, compatible with the [Dataset Builder web app](https://github.com/Skizziik/json_creator) and ready for RAG pipelines:
94
+
95
+ ```json
96
+ [
97
+ {
98
+ "id": "creeper",
99
+ "text": "A Creeper is a hostile mob that silently approaches players and explodes...",
100
+ "metadata": {
101
+ "page_title": "Creeper",
102
+ "source": "Minecraft Wiki",
103
+ "license": "CC BY-NC-SA 3.0",
104
+ "type": "hostile_mob",
105
+ "health": "20"
106
+ }
107
+ }
108
+ ]
109
+ ```
110
+
111
+ ## Example Prompts
112
+
113
+ - *"Create a Dark Souls knowledge base with categories for Bosses, Weapons, and Locations"*
114
+ - *"Add 15 chunks about Minecraft mobs with detailed descriptions"*
115
+ - *"Export my project as JSON and save to file"*
116
+ - *"Search for chunks about 'fire' in my dark_souls project"*
117
+ - *"Move chunk 'ancient_dragon' from Bosses to Enemies category"*
118
+
119
+ ## License
120
+
121
+ MIT
package/index.js ADDED
@@ -0,0 +1,406 @@
1
+ #!/usr/bin/env node
2
+
3
+ import { Server } from "@modelcontextprotocol/sdk/server/index.js";
4
+ import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
5
+ import { CallToolRequestSchema, ListToolsRequestSchema } from "@modelcontextprotocol/sdk/types.js";
6
+ import { Store } from "./lib/store.js";
7
+
8
// Dataset storage; Store itself falls back to ./datasets when DATA_DIR is unset.
const store = new Store(process.env.DATA_DIR);

// Identity and capabilities announced to MCP clients: this server only exposes tools.
const serverInfo = { name: "tryll-dataset-builder", version: "1.0.0" };
const serverOptions = { capabilities: { tools: {} } };

const server = new Server(serverInfo, serverOptions);
14
+
15
+ // ============================================
16
+ // TOOL DEFINITIONS
17
+ // ============================================
18
+
19
// ---- Schema helpers ----
// Key insertion order is kept identical to a hand-written literal
// (type, description / properties, required) so the serialized catalogue is unchanged.

/** JSON-Schema string property; the description is omitted when not given. */
const stringProp = (description) =>
  description === undefined ? { type: "string" } : { type: "string", description };

/** JSON-Schema object; `required` is only emitted when a list is supplied. */
const objectSchema = (properties, required) => {
  const schema = { type: "object", properties };
  if (required !== undefined) schema.required = required;
  return schema;
};

/** One tool catalogue entry: name, description and its input schema. */
const defineTool = (name, description, properties, required) => ({
  name,
  description,
  inputSchema: objectSchema(properties, required),
});

// Properties repeated across many tools (fresh object per use, never shared).
const projectProp = () => stringProp("Project name");
const categoryProp = () => stringProp("Category name");

/**
 * Catalogue of every tool this server exposes, in the order they are listed to clients.
 * Each entry carries the tool name, a human-readable description and a JSON Schema
 * describing the accepted arguments.
 */
const TOOLS = [
  // ---- Project ----
  defineTool(
    "create_project",
    "Create a new dataset project. Each project stores categories and chunks, exported as a single JSON file.",
    { name: stringProp("Project name (used as filename on export)") },
    ["name"],
  ),
  defineTool(
    "list_projects",
    "List all existing dataset projects with basic stats (category count, chunk count).",
    {},
  ),
  defineTool(
    "delete_project",
    "Permanently delete a project and all its data.",
    { name: stringProp("Project name to delete") },
    ["name"],
  ),
  defineTool(
    "get_project_stats",
    "Get detailed statistics for a project: category names, total chunks, average text length, longest/shortest chunk.",
    { name: stringProp("Project name") },
    ["name"],
  ),

  // ---- Category ----
  defineTool(
    "create_category",
    "Add a new category to a project. Categories organize chunks by topic (e.g. 'Mobs', 'Weapons', 'Biomes').",
    { project: projectProp(), name: categoryProp() },
    ["project", "name"],
  ),
  defineTool(
    "list_categories",
    "List all categories in a project with chunk counts.",
    { project: projectProp() },
    ["project"],
  ),
  defineTool(
    "rename_category",
    "Rename an existing category.",
    {
      project: projectProp(),
      old_name: stringProp("Current category name"),
      new_name: stringProp("New category name"),
    },
    ["project", "old_name", "new_name"],
  ),
  defineTool(
    "delete_category",
    "Delete a category and all its chunks.",
    { project: projectProp(), name: stringProp("Category name to delete") },
    ["project", "name"],
  ),

  // ---- Chunk ----
  defineTool(
    "add_chunk",
    "Add a single knowledge chunk to a category. Each chunk has a unique ID, text content, and optional metadata (page_title, source, license, plus any custom fields).",
    {
      project: projectProp(),
      category: categoryProp(),
      id: stringProp("Unique chunk ID (e.g. 'creeper', 'diamond_sword')"),
      text: stringProp("Main text content of the chunk (knowledge entry)"),
      metadata: {
        type: "object",
        description: "Optional metadata. Standard fields: page_title, source, license. Any extra fields become custom metadata.",
        properties: {
          page_title: stringProp(),
          source: stringProp(),
          license: stringProp(),
        },
        additionalProperties: stringProp(),
      },
    },
    ["project", "category", "id", "text"],
  ),
  defineTool(
    "bulk_add_chunks",
    "Add multiple chunks at once to a category. Much faster than adding one by one. Skips chunks with duplicate IDs and reports errors.",
    {
      project: projectProp(),
      category: categoryProp(),
      chunks: {
        type: "array",
        description: "Array of chunk objects, each with id, text, and optional metadata",
        items: objectSchema(
          {
            id: stringProp("Unique chunk ID"),
            text: stringProp("Chunk text content"),
            metadata: { type: "object", additionalProperties: stringProp() },
          },
          ["id", "text"],
        ),
      },
    },
    ["project", "category", "chunks"],
  ),
  defineTool(
    "get_chunk",
    "Get full content of a specific chunk by its ID.",
    { project: projectProp(), id: stringProp("Chunk ID") },
    ["project", "id"],
  ),
  defineTool(
    "update_chunk",
    "Update fields of an existing chunk. Only provided fields will be changed.",
    {
      project: projectProp(),
      id: stringProp("Current chunk ID"),
      new_id: stringProp("New chunk ID (if renaming)"),
      text: stringProp("New text content"),
      page_title: stringProp("New page title"),
      source: stringProp("New source"),
      license: stringProp("New license"),
      metadata: {
        type: "object",
        description: "Custom metadata fields to update",
        additionalProperties: stringProp(),
      },
    },
    ["project", "id"],
  ),
  defineTool(
    "delete_chunk",
    "Delete a chunk by its ID.",
    { project: projectProp(), id: stringProp("Chunk ID to delete") },
    ["project", "id"],
  ),
  defineTool(
    "duplicate_chunk",
    "Create a copy of an existing chunk with a new ID (original_id + '_copy' suffix).",
    { project: projectProp(), id: stringProp("Chunk ID to duplicate") },
    ["project", "id"],
  ),
  defineTool(
    "move_chunk",
    "Move a chunk from its current category to a different one.",
    {
      project: projectProp(),
      id: stringProp("Chunk ID to move"),
      target_category: stringProp("Target category name"),
    },
    ["project", "id", "target_category"],
  ),

  // ---- Search & Export ----
  defineTool(
    "search_chunks",
    "Search for chunks by ID or text content across the entire project. Returns matching chunks with preview.",
    {
      project: projectProp(),
      query: stringProp("Search query (searches in chunk ID and text)"),
    },
    ["project", "query"],
  ),
  defineTool(
    "export_project",
    "Export the project as a flat JSON array — compatible with Dataset Builder web app and ready for RAG systems. Each entry has id, text, and metadata.",
    {
      project: projectProp(),
      save_to_file: {
        type: "boolean",
        description: "If true, saves to a .export.json file in the data directory. Default: false (returns JSON in response).",
      },
    },
    ["project"],
  ),
  defineTool(
    "import_json",
    "Import a JSON array of chunks into a project. Expected format: [{id, text, metadata}, ...]. Skips entries with duplicate IDs.",
    {
      project: stringProp("Project name (will be created if it doesn't exist)"),
      category: stringProp("Category to import into (default: 'Imported')"),
      json_path: stringProp("Absolute path to the JSON file to import"),
      data: {
        type: "array",
        description: "Or provide the JSON array directly instead of a file path",
        items: { type: "object" },
      },
    },
    ["project"],
  ),
];
268
+
269
+ // ============================================
270
+ // LIST TOOLS
271
+ // ============================================
272
+
273
// Tool discovery: every request gets the same static TOOLS catalogue.
server.setRequestHandler(ListToolsRequestSchema, async function listTools() {
  return { tools: TOOLS };
});
274
+
275
+ // ============================================
276
+ // CALL TOOL
277
+ // ============================================
278
+
279
/**
 * Tool invocation: routes a call to the matching Store method and wraps the
 * outcome in an MCP tool result.
 *
 * On success the result object is returned as pretty-printed JSON text.
 * Any thrown error (unknown tool, store failure, unreadable import file, ...)
 * is reported as `Error: <message>` with `isError: true` instead of rejecting.
 */
server.setRequestHandler(CallToolRequestSchema, async (request) => {
  const { name } = request.params;
  // `arguments` can be absent for tools without required parameters
  // (e.g. list_projects); default to {} so `args.x` reads never throw.
  const args = request.params.arguments ?? {};

  try {
    let result;

    switch (name) {
      // ---- Project ----
      case "create_project":
        result = store.createProject(args.name);
        break;
      case "list_projects":
        result = store.listProjects();
        break;
      case "delete_project":
        result = store.deleteProject(args.name);
        break;
      case "get_project_stats":
        result = store.getStats(args.name);
        break;

      // ---- Category ----
      case "create_category":
        result = store.createCategory(args.project, args.name);
        break;
      case "list_categories":
        result = store.listCategories(args.project);
        break;
      case "rename_category":
        result = store.renameCategory(args.project, args.old_name, args.new_name);
        break;
      case "delete_category":
        result = store.deleteCategory(args.project, args.name);
        break;

      // ---- Chunk ----
      case "add_chunk":
        result = store.addChunk(args.project, args.category, {
          id: args.id,
          text: args.text,
          metadata: args.metadata,
        });
        break;
      case "bulk_add_chunks":
        result = store.bulkAddChunks(args.project, args.category, args.chunks);
        break;
      case "get_chunk":
        result = store.getChunk(args.project, args.id);
        break;
      case "update_chunk":
        result = store.updateChunk(args.project, args.id, {
          newId: args.new_id,
          text: args.text,
          page_title: args.page_title,
          source: args.source,
          license: args.license,
          metadata: args.metadata,
        });
        break;
      case "delete_chunk":
        result = store.deleteChunk(args.project, args.id);
        break;
      case "duplicate_chunk":
        result = store.duplicateChunk(args.project, args.id);
        break;
      case "move_chunk":
        result = store.moveChunk(args.project, args.id, args.target_category);
        break;

      // ---- Search & Export ----
      case "search_chunks":
        result = store.searchChunks(args.project, args.query);
        break;

      case "export_project": {
        const exported = store.exportProject(args.project);
        if (args.save_to_file) {
          // Anchor on the trailing extension: a plain-string pattern would rewrite
          // the FIRST ".json" anywhere in the path (data dir or project name).
          const outPath = store._filePath(args.project).replace(/\.json$/, '.export.json');
          const { writeFileSync } = await import('fs');
          writeFileSync(outPath, JSON.stringify(exported, null, 2), 'utf-8');
          result = { exported: exported.length, savedTo: outPath };
        } else {
          result = { exported: exported.length, data: exported };
        }
        break;
      }

      case "import_json": {
        // Inline `data` wins; the file is only read when no data was supplied.
        let jsonData = args.data;
        if (!jsonData && args.json_path) {
          const { readFileSync } = await import('fs');
          jsonData = JSON.parse(readFileSync(args.json_path, 'utf-8'));
        }
        if (!jsonData) throw new Error('Provide either "json_path" or "data" parameter');
        result = store.importJSON(args.project, jsonData, args.category);
        break;
      }

      default:
        throw new Error(`Unknown tool: ${name}`);
    }

    return {
      content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
    };

  } catch (err) {
    // Report failures as a tool-level error result rather than a rejected request.
    return {
      content: [{ type: "text", text: `Error: ${err.message}` }],
      isError: true,
    };
  }
});
392
+
393
+ // ============================================
394
+ // START
395
+ // ============================================
396
+
397
/**
 * Attaches the server to a stdio transport and announces readiness.
 * The banner goes to stderr — presumably because stdout carries protocol
 * traffic for the stdio transport (confirm against the SDK docs).
 */
async function main() {
  await server.connect(new StdioServerTransport());
  console.error("Tryll Dataset Builder MCP server running");
}

// Startup failures are logged and turned into a non-zero exit code.
const exitOnFatal = (err) => {
  console.error("Fatal:", err);
  process.exit(1);
};

main().catch(exitOnFatal);
package/lib/store.js ADDED
@@ -0,0 +1,389 @@
1
+ import { readFileSync, writeFileSync, readdirSync, mkdirSync, unlinkSync, existsSync } from 'fs';
2
+ import { join } from 'path';
3
+ import { randomUUID } from 'crypto';
4
+
5
// License recorded on a chunk when none is supplied.
const DEFAULT_LICENSE = 'CC BY-NC-SA 3.0';
// Metadata keys stored in `chunk.metadata`; every other key becomes a custom field.
const STANDARD_META = ['page_title', 'source', 'license'];

// Comparison key for category names: trimmed and case-insensitive.
const nameKey = (value) => String(value ?? '').trim().toLowerCase();

// Coerces chunk text to a string so later `.toLowerCase()` / `.substring()` calls are safe.
const toText = (value) => (typeof value === 'string' ? value : value == null ? '' : String(value));

/**
 * File-backed storage for dataset projects.
 *
 * Each project lives in `<dataDir>/<name>.json` with the shape
 * `{ name, createdAt, categories: [{ id, name, chunks: [...] }] }`.
 * A chunk is `{ _uid, id, text, metadata: { page_title, source, license }, customFields: [{ key, value }] }`.
 * All operations are synchronous and re-read the project file on every call.
 */
export class Store {
  /**
   * @param {string} [dataDir] Directory for project files. Falls back to the
   *   DATA_DIR environment variable, then to `./datasets`. Created if missing.
   */
  constructor(dataDir) {
    this.dataDir = dataDir || process.env.DATA_DIR || './datasets';
    this._ensureDir();
  }

  /** Creates the data directory (including parents) when it does not exist yet. */
  _ensureDir() {
    if (!existsSync(this.dataDir)) {
      mkdirSync(this.dataDir, { recursive: true });
    }
  }

  /**
   * Maps a project name to its JSON file inside the data directory.
   *
   * @param {string} name Project name.
   * @returns {string} Path of `<dataDir>/<name>.json`.
   * @throws {Error} If the name is not a non-empty string or contains a path
   *   separator / NUL. Without this check a name such as `../x` would let
   *   load, save and delete reach files outside the data directory.
   */
  _filePath(name) {
    if (typeof name !== 'string' || name === '' || /[\\\/\0]/.test(name)) {
      throw new Error(`Invalid project name: ${JSON.stringify(name)}`);
    }
    return join(this.dataDir, `${name}.json`);
  }

  // ---- PROJECT ----

  /**
   * Lists every project file in the data directory with basic counts.
   *
   * Files whose top-level JSON value is an array are skipped: that is the flat
   * shape produced by exportProject, not a project. Files that cannot be read
   * or do not look like a project are still listed, with zeroed counts.
   *
   * @returns {{name: string, categories: number, chunks: number, createdAt: (string|null)}[]}
   */
  listProjects() {
    this._ensureDir();
    const projects = [];
    for (const file of readdirSync(this.dataDir)) {
      if (!file.endsWith('.json')) continue;
      const name = file.slice(0, -'.json'.length);
      try {
        const data = JSON.parse(readFileSync(join(this.dataDir, file), 'utf-8'));
        if (Array.isArray(data)) continue;
        const chunks = data.categories.reduce((sum, c) => sum + c.chunks.length, 0);
        projects.push({ name, categories: data.categories.length, chunks, createdAt: data.createdAt });
      } catch {
        // Best effort: keep unreadable files visible instead of failing the whole listing.
        projects.push({ name, categories: 0, chunks: 0, createdAt: null });
      }
    }
    return projects;
  }

  /**
   * Creates an empty project. Characters other than letters, digits, `_`, `-`,
   * `.` and space are stripped from the name before use.
   *
   * @param {string} name Requested project name.
   * @returns {{name: string, createdAt: string, categories: Array}} The stored project.
   * @throws {Error} If the sanitized name is empty or the project already exists.
   */
  createProject(name) {
    const safeName = String(name ?? '').replace(/[^a-zA-Z0-9_\-. ]/g, '').trim();
    if (!safeName) throw new Error('Invalid project name');
    if (existsSync(this._filePath(safeName))) throw new Error(`Project "${safeName}" already exists`);
    const project = { name: safeName, createdAt: new Date().toISOString(), categories: [] };
    this._save(safeName, project);
    return project;
  }

  /**
   * Deletes a project file.
   *
   * @param {string} name Project name.
   * @returns {{deleted: string}}
   * @throws {Error} If the project does not exist.
   */
  deleteProject(name) {
    const fp = this._filePath(name);
    if (!existsSync(fp)) throw new Error(`Project "${name}" not found`);
    unlinkSync(fp);
    return { deleted: name };
  }

  /**
   * Computes text-length statistics over every chunk of a project.
   *
   * @param {string} name Project name.
   * @returns {object} Category count and labels, total chunks, and average /
   *   longest / shortest text length (all 0 when the project has no chunks).
   */
  getStats(name) {
    const data = this._load(name);
    let totalChunks = 0;
    let totalLength = 0;
    let longest = 0;
    let shortest = Infinity;
    for (const cat of data.categories) {
      for (const ch of cat.chunks) {
        const len = (ch.text || '').length;
        totalChunks++;
        totalLength += len;
        longest = Math.max(longest, len);
        shortest = Math.min(shortest, len);
      }
    }
    return {
      project: name,
      categories: data.categories.length,
      categoryNames: data.categories.map((c) => `${c.name} (${c.chunks.length} chunks)`),
      totalChunks,
      avgTextLength: totalChunks ? Math.round(totalLength / totalChunks) : 0,
      longestChunk: totalChunks ? longest : 0,
      // `shortest` is still Infinity when there are no chunks; report 0 instead.
      shortestChunk: totalChunks ? shortest : 0,
      createdAt: data.createdAt,
    };
  }

  // ---- CATEGORY ----

  /**
   * @param {string} projectName Project name.
   * @returns {{name: string, chunks: number}[]} Categories with their chunk counts.
   */
  listCategories(projectName) {
    const data = this._load(projectName);
    return data.categories.map((c) => ({
      name: c.name,
      chunks: c.chunks.length,
    }));
  }

  /**
   * Adds an empty category. Names are trimmed and unique case-insensitively.
   *
   * @returns {{id: string, name: string, chunks: Array}} The new category.
   * @throws {Error} If the name is empty or already used in the project.
   */
  createCategory(projectName, categoryName) {
    const data = this._load(projectName);
    const trimmed = String(categoryName ?? '').trim();
    if (!trimmed) throw new Error('Category name cannot be empty');
    const key = nameKey(trimmed);
    if (data.categories.some((c) => nameKey(c.name) === key)) {
      throw new Error(`Category "${trimmed}" already exists in project "${projectName}"`);
    }
    const cat = { id: randomUUID(), name: trimmed, chunks: [] };
    data.categories.push(cat);
    this._save(projectName, data);
    return cat;
  }

  /**
   * Renames a category; changing only the letter case of a name is allowed.
   *
   * @returns {{old: string, new: string}}
   * @throws {Error} If the category is missing, the new name is empty, or
   *   another category already uses the new name.
   */
  renameCategory(projectName, oldName, newName) {
    const data = this._load(projectName);
    const cat = this._findCategory(data, oldName);
    const trimmed = String(newName ?? '').trim();
    if (!trimmed) throw new Error('New name cannot be empty');
    const key = nameKey(trimmed);
    if (data.categories.some((c) => c.id !== cat.id && nameKey(c.name) === key)) {
      throw new Error(`Category "${trimmed}" already exists`);
    }
    cat.name = trimmed;
    this._save(projectName, data);
    return { old: oldName, new: trimmed };
  }

  /**
   * Removes a category together with all of its chunks.
   *
   * @returns {{deleted: string, chunksRemoved: number}}
   * @throws {Error} If the category does not exist.
   */
  deleteCategory(projectName, categoryName) {
    const data = this._load(projectName);
    const cat = this._findCategory(data, categoryName);
    data.categories.splice(data.categories.indexOf(cat), 1);
    this._save(projectName, data);
    return { deleted: cat.name, chunksRemoved: cat.chunks.length };
  }

  // ---- CHUNK ----

  /**
   * Adds one chunk to a category.
   *
   * @param {string} projectName Project name.
   * @param {string} categoryName Existing category (matched case-insensitively).
   * @param {object} chunk `{ id, text, metadata?, page_title?, source?, license? }`.
   * @returns {{id: string, category: string}}
   * @throws {Error} If the category is missing, the ID is empty, or the ID is
   *   already used anywhere in the project.
   */
  addChunk(projectName, categoryName, chunk) {
    const data = this._load(projectName);
    const cat = this._findCategory(data, categoryName);
    const id = this._normalizeId(chunk?.id);
    if (!id) throw new Error('Chunk ID is required');
    if (this._isIdTaken(data, id)) throw new Error(`Chunk ID "${id}" already exists in this project. Try adding _1, _2 suffix.`);

    const newChunk = this._buildChunk(id, chunk);
    cat.chunks.push(newChunk);
    this._save(projectName, data);
    return { id: newChunk.id, category: cat.name };
  }

  /**
   * Adds many chunks to one category, skipping entries with an empty or
   * duplicate ID instead of failing the whole batch.
   *
   * @param {object[]} chunks Chunk objects shaped like the `addChunk` argument.
   * @returns {{added: number, errors: number, details: (object[]|undefined), ids: string[]}}
   * @throws {Error} If `chunks` is not an array or the category is missing.
   */
  bulkAddChunks(projectName, categoryName, chunks) {
    if (!Array.isArray(chunks)) throw new Error('"chunks" must be an array');
    const data = this._load(projectName);
    const cat = this._findCategory(data, categoryName);
    // One pass to collect existing IDs, then O(1) duplicate checks per entry.
    const taken = this._collectIds(data);
    const added = [];
    const errors = [];

    for (const chunk of chunks) {
      const id = this._normalizeId(chunk?.id);
      if (!id) { errors.push({ id: '(empty)', reason: 'ID is required' }); continue; }
      if (taken.has(id)) { errors.push({ id, reason: 'Duplicate ID' }); continue; }

      cat.chunks.push(this._buildChunk(id, chunk));
      taken.add(id);
      added.push(id);
    }

    // Nothing changed when every entry was rejected, so skip the write.
    if (added.length) this._save(projectName, data);
    return { added: added.length, errors: errors.length, details: errors.length ? errors : undefined, ids: added };
  }

  /**
   * @returns {object} `{ id, text, metadata, customFields, category }` for the chunk.
   * @throws {Error} If no chunk in the project has that ID.
   */
  getChunk(projectName, chunkId) {
    const data = this._load(projectName);
    const found = this._locateChunk(data, chunkId);
    if (!found) throw new Error(`Chunk "${chunkId}" not found in project "${projectName}"`);
    return { ...this._formatChunk(found.chunk), category: found.cat.name };
  }

  /**
   * Updates the provided fields of a chunk; `undefined` fields are left alone.
   *
   * `page_title`, `source` and `license` may be given top-level or inside
   * `updates.metadata` (top-level wins), matching what `addChunk` accepts.
   * When `updates.metadata` contains custom keys, they REPLACE the chunk's
   * whole custom-field list.
   *
   * @param {object} updates `{ newId?, text?, page_title?, source?, license?, metadata? }`.
   * @returns {{updated: string, category: string}}
   * @throws {Error} If the chunk is missing, or `newId` is blank after
   *   trimming or already used by another chunk.
   */
  updateChunk(projectName, chunkId, updates) {
    const data = this._load(projectName);
    const found = this._locateChunk(data, chunkId);
    if (!found) throw new Error(`Chunk "${chunkId}" not found`);
    const { cat, chunk: ch } = found;

    if (updates.newId) {
      // Trim BEFORE the uniqueness check so " a " cannot slip past an existing "a".
      const newId = this._normalizeId(updates.newId);
      if (!newId) throw new Error('New chunk ID cannot be empty');
      if (newId !== ch.id) {
        if (this._isIdTaken(data, newId, ch._uid)) {
          throw new Error(`Chunk ID "${newId}" already exists`);
        }
        ch.id = newId;
      }
    }
    if (updates.text !== undefined) ch.text = updates.text;

    const meta = updates.metadata && typeof updates.metadata === 'object' ? updates.metadata : null;
    for (const key of STANDARD_META) {
      const value = updates[key] !== undefined ? updates[key] : meta?.[key];
      if (value !== undefined) ch.metadata[key] = value;
    }
    if (meta) {
      const custom = this._parseCustomFields(meta);
      if (custom.length) ch.customFields = custom;
    }

    this._save(projectName, data);
    return { updated: ch.id, category: cat.name };
  }

  /**
   * @returns {{deleted: string, category: string}}
   * @throws {Error} If the chunk does not exist.
   */
  deleteChunk(projectName, chunkId) {
    const data = this._load(projectName);
    const found = this._locateChunk(data, chunkId);
    if (!found) throw new Error(`Chunk "${chunkId}" not found`);
    found.cat.chunks.splice(found.index, 1);
    this._save(projectName, data);
    return { deleted: chunkId, category: found.cat.name };
  }

  /**
   * Clones a chunk into the same category under `<id>_copy`, falling back to
   * `<id>_copy_1`, `<id>_copy_2`, ... while the candidate ID is taken.
   *
   * @returns {{original: string, duplicate: string, category: string}}
   * @throws {Error} If the chunk does not exist.
   */
  duplicateChunk(projectName, chunkId) {
    const data = this._load(projectName);
    const found = this._locateChunk(data, chunkId);
    if (!found) throw new Error(`Chunk "${chunkId}" not found`);

    const taken = this._collectIds(data);
    let newId = `${chunkId}_copy`;
    for (let n = 1; taken.has(newId); n++) newId = `${chunkId}_copy_${n}`;

    // JSON round-trip gives a deep copy, so the clone shares no nested objects.
    const clone = { ...JSON.parse(JSON.stringify(found.chunk)), _uid: randomUUID(), id: newId };
    found.cat.chunks.push(clone);
    this._save(projectName, data);
    return { original: chunkId, duplicate: newId, category: found.cat.name };
  }

  /**
   * Moves a chunk to another category of the same project.
   *
   * @returns {{moved: string, from: string, to: string}}
   * @throws {Error} If the target category or the chunk is missing, or the
   *   chunk already lives in the target category.
   */
  moveChunk(projectName, chunkId, targetCategory) {
    const data = this._load(projectName);
    // Resolve the target first so a bad category is reported before a bad chunk ID.
    const targetCat = this._findCategory(data, targetCategory);
    const found = this._locateChunk(data, chunkId);
    if (!found) throw new Error(`Chunk "${chunkId}" not found`);
    if (found.cat.id === targetCat.id) throw new Error('Chunk is already in that category');
    const [chunk] = found.cat.chunks.splice(found.index, 1);
    targetCat.chunks.push(chunk);
    this._save(projectName, data);
    return { moved: chunkId, from: found.cat.name, to: targetCat.name };
  }

  // ---- SEARCH ----

  /**
   * Case-insensitive substring search over chunk IDs and text.
   *
   * @returns {{query: string, found: number, results: {id: string, category: string, preview: string}[]}}
   *   `preview` is the first 120 characters of the text, suffixed with `...` when truncated.
   */
  searchChunks(projectName, query) {
    const data = this._load(projectName);
    const q = String(query ?? '').toLowerCase();
    const results = [];
    for (const cat of data.categories) {
      for (const ch of cat.chunks) {
        // Normalize once: a chunk matched by ID may have no text at all.
        const text = toText(ch.text);
        if (String(ch.id).toLowerCase().includes(q) || text.toLowerCase().includes(q)) {
          results.push({ id: ch.id, category: cat.name, preview: text.substring(0, 120) + (text.length > 120 ? '...' : '') });
        }
      }
    }
    return { query, found: results.length, results };
  }

  // ---- EXPORT / IMPORT ----

  /**
   * Flattens a project into `[{ id, text, metadata }]`, merging custom fields
   * into `metadata` as strings (a custom key overrides a standard key of the same name).
   *
   * @returns {{id: string, text: string, metadata: object}[]}
   */
  exportProject(projectName) {
    const data = this._load(projectName);
    const flat = [];
    for (const cat of data.categories) {
      for (const ch of cat.chunks) {
        const entry = {
          id: ch.id,
          text: ch.text,
          metadata: { ...ch.metadata },
        };
        for (const cf of ch.customFields || []) {
          const key = cf.key && cf.key.trim();
          if (key) entry.metadata[key] = String(cf.value ?? '');
        }
        flat.push(entry);
      }
    }
    return flat;
  }

  /**
   * Imports `[{ id, text, metadata }]` entries into one category, creating the
   * project and/or the category when needed.
   *
   * Entries that are not objects, have no usable ID, or whose ID already exists
   * are counted as skipped. String and numeric IDs are accepted.
   *
   * @param {string} projectName Project name; sanitized by `createProject` when the project is new.
   * @param {object[]} jsonArray Entries to import.
   * @param {string} [categoryName] Target category (default `Imported`).
   * @returns {{project: string, category: string, imported: number, skipped: number}}
   *   `project` / `category` are the names actually written to.
   * @throws {Error} If `jsonArray` is not an array.
   */
  importJSON(projectName, jsonArray, categoryName) {
    if (!Array.isArray(jsonArray)) throw new Error('Import data must be a JSON array');

    let data;
    let storedName = projectName;
    try {
      data = this._load(projectName);
    } catch {
      data = this.createProject(projectName);
      // createProject may have sanitized the name; save under the SAME name it
      // created, otherwise the data lands in a different file than the project.
      storedName = data.name;
    }

    const catName = String(categoryName ?? '').trim() || 'Imported';
    const wanted = nameKey(catName);
    let cat = data.categories.find((c) => nameKey(c.name) === wanted);
    if (!cat) {
      cat = { id: randomUUID(), name: catName, chunks: [] };
      data.categories.push(cat);
    }

    const taken = this._collectIds(data);
    let imported = 0;
    let skipped = 0;
    for (const entry of jsonArray) {
      const id = entry && typeof entry === 'object' ? this._normalizeId(entry.id) : '';
      if (!id || taken.has(id)) { skipped++; continue; }

      // Only `text` and `metadata` are read from imported entries.
      cat.chunks.push(this._buildChunk(id, { text: entry.text, metadata: entry.metadata }));
      taken.add(id);
      imported++;
    }

    this._save(storedName, data);
    return { project: storedName, category: cat.name, imported, skipped };
  }

  // ---- INTERNAL ----

  /**
   * Reads and parses a project file.
   *
   * @throws {Error} If the file is missing, is not valid JSON, or is not an
   *   object with a `categories` array (e.g. a flat export file).
   */
  _load(name) {
    const fp = this._filePath(name);
    if (!existsSync(fp)) throw new Error(`Project "${name}" not found`);
    const data = JSON.parse(readFileSync(fp, 'utf-8'));
    if (!data || typeof data !== 'object' || !Array.isArray(data.categories)) {
      throw new Error(`File for "${name}" is not a valid project`);
    }
    return data;
  }

  /** Writes the project as pretty-printed JSON, recreating the data directory if needed. */
  _save(name, data) {
    this._ensureDir();
    writeFileSync(this._filePath(name), JSON.stringify(data, null, 2), 'utf-8');
  }

  /**
   * Finds a category by name, ignoring case and surrounding whitespace.
   *
   * @throws {Error} If no category matches.
   */
  _findCategory(data, name) {
    const wanted = nameKey(name);
    const cat = data.categories.find((c) => nameKey(c.name) === wanted);
    if (!cat) throw new Error(`Category "${name}" not found`);
    return cat;
  }

  /**
   * Finds a chunk by exact ID across all categories.
   *
   * @returns {{cat: object, index: number, chunk: object}|null} Owning category,
   *   position inside it and the chunk, or null when absent.
   */
  _locateChunk(data, chunkId) {
    for (const cat of data.categories) {
      const index = cat.chunks.findIndex((c) => c.id === chunkId);
      if (index !== -1) return { cat, index, chunk: cat.chunks[index] };
    }
    return null;
  }

  /** True when another chunk (any chunk other than `excludeUid`) already uses `id`. */
  _isIdTaken(data, id, excludeUid) {
    for (const cat of data.categories) {
      for (const ch of cat.chunks) {
        if (ch.id === id && ch._uid !== excludeUid) return true;
      }
    }
    return false;
  }

  /** Collects every chunk ID of the project into a Set for repeated lookups. */
  _collectIds(data) {
    const ids = new Set();
    for (const cat of data.categories) {
      for (const ch of cat.chunks) ids.add(ch.id);
    }
    return ids;
  }

  /** Turns a string or number into a trimmed ID; anything else yields ''. */
  _normalizeId(value) {
    return typeof value === 'string' || typeof value === 'number' ? String(value).trim() : '';
  }

  /**
   * Builds the stored chunk record. Standard metadata is taken from top-level
   * fields first, then from `raw.metadata`; remaining metadata keys become custom fields.
   */
  _buildChunk(id, raw) {
    const meta = raw.metadata && typeof raw.metadata === 'object' ? raw.metadata : {};
    return {
      _uid: randomUUID(),
      id,
      text: toText(raw.text),
      metadata: {
        page_title: raw.page_title || meta.page_title || '',
        source: raw.source || meta.source || '',
        license: raw.license || meta.license || DEFAULT_LICENSE,
      },
      customFields: this._parseCustomFields(raw.metadata),
    };
  }

  /** Converts the non-standard keys of a metadata object into `[{ key, value }]` with string values. */
  _parseCustomFields(metadata) {
    if (!metadata || typeof metadata !== 'object' || Array.isArray(metadata)) return [];
    return Object.entries(metadata)
      .filter(([k]) => !STANDARD_META.includes(k))
      .map(([key, value]) => ({ key, value: String(value ?? '') }));
  }

  /** Public view of a chunk: drops the internal `_uid`. */
  _formatChunk(ch) {
    return {
      id: ch.id,
      text: ch.text,
      metadata: ch.metadata,
      customFields: ch.customFields || [],
    };
  }
}
package/package.json ADDED
@@ -0,0 +1,36 @@
1
+ {
2
+ "name": "tryll-dataset-builder-mcp",
3
+ "version": "1.0.0",
4
+ "description": "MCP server for building RAG knowledge base datasets. Create, manage and export structured JSON datasets via Claude Code.",
5
+ "type": "module",
6
+ "main": "index.js",
7
+ "bin": {
8
+ "tryll-dataset-builder-mcp": "index.js"
9
+ },
10
+ "files": [
11
+ "index.js",
12
+ "lib/",
13
+ "README.md",
14
+ "LICENSE"
15
+ ],
16
+ "keywords": [
17
+ "mcp",
18
+ "model-context-protocol",
19
+ "rag",
20
+ "dataset",
21
+ "knowledge-base",
22
+ "claude",
23
+ "ai",
24
+ "tryll-engine"
25
+ ],
26
+ "author": "Tryll Engine",
27
+ "license": "MIT",
28
+ "repository": {
29
+ "type": "git",
30
+ "url": "https://github.com/Skizziik/tryll_dataset_builder"
31
+ },
32
+ "dependencies": {
33
+ "@modelcontextprotocol/sdk": "^1.12.1",
34
+ "zod": "^3.24.0"
35
+ }
36
+ }