tryll-dataset-builder-mcp 1.1.1 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/index.js +275 -2
- package/lib/store.js +68 -0
- package/package.json +2 -1
package/index.js
CHANGED
|
@@ -5,11 +5,12 @@ import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js"
|
|
|
5
5
|
import { CallToolRequestSchema, ListToolsRequestSchema } from "@modelcontextprotocol/sdk/types.js";
|
|
6
6
|
import { Store } from "./lib/store.js";
|
|
7
7
|
import WebSocket from "ws";
|
|
8
|
+
import * as cheerio from "cheerio";
|
|
8
9
|
|
|
9
10
|
const store = new Store(process.env.DATA_DIR);
|
|
10
11
|
|
|
11
12
|
const server = new Server(
|
|
12
|
-
{ name: "tryll-dataset-builder", version: "1.
|
|
13
|
+
{ name: "tryll-dataset-builder", version: "1.2.0" },
|
|
13
14
|
{ capabilities: { tools: {} } }
|
|
14
15
|
);
|
|
15
16
|
|
|
@@ -38,6 +39,96 @@ async function apiCall(method, path, body) {
|
|
|
38
39
|
return data;
|
|
39
40
|
}
|
|
40
41
|
|
|
42
|
+
// ============================================
|
|
43
|
+
// URL PARSING HELPERS
|
|
44
|
+
// ============================================
|
|
45
|
+
|
|
46
|
+
// Default maximum chunk length, in characters (UTF-16 code units, as measured
// by String.prototype.length). Used as the default `limit` of splitTextIntoChunks.
const CHUNK_LIMIT = 2000;
|
|
47
|
+
|
|
48
|
+
/**
 * Fetch a web page and extract its readable text, title and infobox metadata.
 *
 * The page is loaded with cheerio; infobox/sidebar table rows are collected as
 * key/value pairs, noise elements (scripts, navigation, references, ...) are
 * removed, and the text of the first matching main-content container (or the
 * whole <body> as a fallback) is returned with whitespace normalized.
 *
 * @param {string} url - Absolute http(s) URL of the page to fetch.
 * @returns {Promise<{text: string, pageTitle: string, infobox: Object<string, string>, source: string}>}
 *   Extracted text, page title ('' when none found), infobox pairs, and the
 *   URL that was requested.
 * @throws {Error} If the URL is malformed or not http(s), the request times
 *   out, or the server answers with a non-2xx status. Other network errors
 *   from fetch are rethrown unchanged.
 */
async function parseUrl(url) {
  // Upper bound for the whole request (headers + body) so a stalled server
  // cannot hang the tool call indefinitely.
  const FETCH_TIMEOUT_MS = 30000;

  // The URL comes from the tool caller: accept only web schemes.
  let protocol;
  try {
    ({ protocol } = new URL(url));
  } catch (err) {
    throw new Error(`Invalid URL: ${url}`, { cause: err });
  }
  if (protocol !== 'http:' && protocol !== 'https:') {
    throw new Error(`Unsupported URL protocol "${protocol}" in ${url}`);
  }

  let html;
  try {
    const res = await fetch(url, {
      headers: { 'User-Agent': 'Mozilla/5.0 (compatible; TryllDatasetBuilder/1.2)' },
      signal: AbortSignal.timeout(FETCH_TIMEOUT_MS),
    });
    if (!res.ok) throw new Error(`Failed to fetch ${url}: HTTP ${res.status}`);
    html = await res.text();
  } catch (err) {
    // AbortSignal.timeout() aborts with a DOMException named "TimeoutError".
    if (err?.name === 'TimeoutError') {
      throw new Error(`Failed to fetch ${url}: timed out after ${FETCH_TIMEOUT_MS} ms`, { cause: err });
    }
    throw err;
  }

  const $ = cheerio.load(html);

  // Page title: <title> first, then the first <h1>, else empty.
  const pageTitle = $('title').first().text().trim()
    || $('h1').first().text().trim()
    || '';

  // Infobox metadata: first <th> is the key, first <td> the value. Must run
  // before the noise removal below, which deletes .infobox / .sidebar.
  const infobox = {};
  $('.infobox tr, .sidebar tr, .wikitable.infobox tr, table.infobox tr').each((_, row) => {
    const $row = $(row);
    const key = $row.find('th').first().text().trim().replace(/\s+/g, ' ');
    const val = $row.find('td').first().text().trim().replace(/\s+/g, ' ');
    // Skip empty cells and over-long cells (length caps of 60 / 200 chars).
    if (key && val && key.length < 60 && val.length < 200) {
      infobox[key] = val;
    }
  });

  // Remove noise elements
  $('script, style, nav, footer, header, .sidebar, .infobox, .navbox, .mw-editsection, .reference, .reflist, #mw-navigation, .noprint, .toc').remove();

  // Extract main text from the first recognised content container.
  const mainContent = $('article, main, #mw-content-text, #content, .mw-parser-output, #bodyContent, .entry-content, .post-content').first();
  const rawText = mainContent.length ? mainContent.text() : $('body').text();

  // Normalize whitespace. Spaces around newlines are stripped before the
  // blank-line collapse; otherwise whitespace-only lines ("\n \n \n") would
  // survive it and hide the "\n\n" paragraph boundaries.
  const text = rawText
    .replace(/\r\n?/g, '\n')
    .replace(/\t/g, ' ')
    .replace(/[ ]{2,}/g, ' ')
    .replace(/ ?\n ?/g, '\n')
    .replace(/\n{3,}/g, '\n\n')
    .trim();

  return { text, pageTitle, infobox, source: url };
}
|
|
93
|
+
|
|
94
|
+
/**
 * Split text into chunks of at most `limit` characters.
 *
 * Text that already fits is returned as a single chunk whose id is `baseId`.
 * Otherwise chunks get ids `${baseId}_1`, `${baseId}_2`, ... and each cut is
 * placed, in order of preference, at the last paragraph break ("\n\n"), at
 * the last sentence end (". "), or hard at `limit`. A paragraph or sentence
 * boundary is used only if it lies beyond 30% of `limit`, to avoid tiny chunks.
 *
 * @param {string} text - Text to split.
 * @param {string} baseId - Chunk id, or id prefix when the text is split.
 * @param {number} [limit=CHUNK_LIMIT] - Maximum chunk length in UTF-16 code units.
 * @returns {Array<{id: string, text: string}>} Trimmed, non-empty chunks
 *   (the single-chunk fast path returns the text untrimmed, as before).
 * @throws {RangeError} If the text needs splitting and `limit` is below 1.
 */
function splitTextIntoChunks(text, baseId, limit = CHUNK_LIMIT) {
  if (text.length <= limit) {
    return [{ id: baseId, text }];
  }
  // A limit below 1 yields a zero-length cut, so the loop would never advance.
  if (limit < 1) {
    throw new RangeError(`Chunk limit must be at least 1, got ${limit}`);
  }

  const minCut = limit * 0.3;
  const chunks = [];
  let remaining = text;

  while (remaining.length > 0) {
    let cutPoint = remaining.length;

    if (remaining.length > limit) {
      cutPoint = limit;
      const paraBreak = remaining.lastIndexOf('\n\n', limit);
      if (paraBreak > minCut) {
        cutPoint = paraBreak;
      } else {
        // Search up to limit - 1: the cut goes AFTER the period, so a match at
        // index `limit` would produce a chunk of limit + 1 characters.
        const sentBreak = remaining.lastIndexOf('. ', limit - 1);
        if (sentBreak > minCut) {
          cutPoint = sentBreak + 1;
        } else {
          // Hard cut: never separate a UTF-16 surrogate pair.
          const before = remaining.charCodeAt(cutPoint - 1);
          const after = remaining.charCodeAt(cutPoint);
          const splitsPair = before >= 0xd800 && before <= 0xdbff
            && after >= 0xdc00 && after <= 0xdfff;
          if (splitsPair) {
            // Back off one unit; with limit 1 that would be an empty cut, so
            // keep the whole pair instead (one unit over the limit).
            cutPoint += cutPoint > 1 ? -1 : 1;
          }
        }
      }
    }

    const piece = remaining.substring(0, cutPoint).trim();
    // A window of pure whitespace trims to '' - skip it rather than emit an
    // empty chunk; ids stay consecutive because they derive from chunks.length.
    if (piece) {
      chunks.push({ id: `${baseId}_${chunks.length + 1}`, text: piece });
    }
    remaining = remaining.substring(cutPoint).trim();
  }

  return chunks;
}
|
|
131
|
+
|
|
41
132
|
// ============================================
|
|
42
133
|
// TOOL DEFINITIONS
|
|
43
134
|
// ============================================
|
|
@@ -309,6 +400,89 @@ const TOOLS = [
|
|
|
309
400
|
required: ["project"],
|
|
310
401
|
},
|
|
311
402
|
},
|
|
403
|
+
|
|
404
|
+
// ---- URL Parsing ----
|
|
405
|
+
{
|
|
406
|
+
name: "parse_url",
|
|
407
|
+
description: "Fetch a web page, extract its text content, and auto-create chunks. If text exceeds 2000 characters, it auto-splits into multiple chunks with _1, _2 suffixes. Extracts page title and source URL as metadata. For wiki pages, extracts infobox/sidebar data as custom metadata fields.",
|
|
408
|
+
inputSchema: {
|
|
409
|
+
type: "object",
|
|
410
|
+
properties: {
|
|
411
|
+
project: { type: "string", description: "Project name" },
|
|
412
|
+
category: { type: "string", description: "Category to add chunks into" },
|
|
413
|
+
url: { type: "string", description: "URL to fetch and parse" },
|
|
414
|
+
chunk_id: { type: "string", description: "Base chunk ID. If text is split, becomes chunk_id_1, chunk_id_2, etc." },
|
|
415
|
+
license: { type: "string", description: "License for the content. Default: CC BY-NC-SA 3.0" },
|
|
416
|
+
},
|
|
417
|
+
required: ["project", "category", "url", "chunk_id"],
|
|
418
|
+
},
|
|
419
|
+
},
|
|
420
|
+
{
|
|
421
|
+
name: "batch_parse_urls",
|
|
422
|
+
description: "Parse multiple URLs at once and add all chunks to a category. Each URL gets its own chunk ID prefix. Auto-splits long texts into multiple chunks.",
|
|
423
|
+
inputSchema: {
|
|
424
|
+
type: "object",
|
|
425
|
+
properties: {
|
|
426
|
+
project: { type: "string", description: "Project name" },
|
|
427
|
+
category: { type: "string", description: "Category to add chunks into" },
|
|
428
|
+
urls: {
|
|
429
|
+
type: "array",
|
|
430
|
+
description: "Array of URL entries to parse",
|
|
431
|
+
items: {
|
|
432
|
+
type: "object",
|
|
433
|
+
properties: {
|
|
434
|
+
url: { type: "string", description: "URL to fetch" },
|
|
435
|
+
chunk_id: { type: "string", description: "Base chunk ID for this URL" },
|
|
436
|
+
},
|
|
437
|
+
required: ["url", "chunk_id"],
|
|
438
|
+
},
|
|
439
|
+
},
|
|
440
|
+
license: { type: "string", description: "License for all content. Default: CC BY-NC-SA 3.0" },
|
|
441
|
+
},
|
|
442
|
+
required: ["project", "category", "urls"],
|
|
443
|
+
},
|
|
444
|
+
},
|
|
445
|
+
|
|
446
|
+
// ---- Bulk Operations ----
|
|
447
|
+
{
|
|
448
|
+
name: "bulk_update_metadata",
|
|
449
|
+
description: "Update a metadata field across ALL chunks in a project (or a specific category). Useful for setting license, source, or custom fields in bulk.",
|
|
450
|
+
inputSchema: {
|
|
451
|
+
type: "object",
|
|
452
|
+
properties: {
|
|
453
|
+
project: { type: "string", description: "Project name" },
|
|
454
|
+
field: { type: "string", description: "Metadata field to update (e.g. 'license', 'source', or any custom field name)" },
|
|
455
|
+
value: { type: "string", description: "New value for the field" },
|
|
456
|
+
category: { type: "string", description: "Optional: only update chunks in this category. If omitted, updates all chunks in the project." },
|
|
457
|
+
},
|
|
458
|
+
required: ["project", "field", "value"],
|
|
459
|
+
},
|
|
460
|
+
},
|
|
461
|
+
{
|
|
462
|
+
name: "merge_projects",
|
|
463
|
+
description: "Merge all categories and chunks from a source project into a target project. Categories with the same name are combined. Chunks with duplicate IDs are skipped.",
|
|
464
|
+
inputSchema: {
|
|
465
|
+
type: "object",
|
|
466
|
+
properties: {
|
|
467
|
+
source: { type: "string", description: "Source project name (data is copied FROM here)" },
|
|
468
|
+
target: { type: "string", description: "Target project name (data is merged INTO here)" },
|
|
469
|
+
},
|
|
470
|
+
required: ["source", "target"],
|
|
471
|
+
},
|
|
472
|
+
},
|
|
473
|
+
{
|
|
474
|
+
name: "export_category",
|
|
475
|
+
description: "Export a single category as a flat JSON array. Same format as export_project but filtered to one category.",
|
|
476
|
+
inputSchema: {
|
|
477
|
+
type: "object",
|
|
478
|
+
properties: {
|
|
479
|
+
project: { type: "string", description: "Project name" },
|
|
480
|
+
category: { type: "string", description: "Category name to export" },
|
|
481
|
+
save_to_file: { type: "boolean", description: "If true, saves to a file. Default: false." },
|
|
482
|
+
},
|
|
483
|
+
required: ["project", "category"],
|
|
484
|
+
},
|
|
485
|
+
},
|
|
312
486
|
];
|
|
313
487
|
|
|
314
488
|
// ============================================
|
|
@@ -418,6 +592,50 @@ async function handleRemote(name, args) {
|
|
|
418
592
|
data: jsonData, category: args.category, session: s,
|
|
419
593
|
});
|
|
420
594
|
}
|
|
595
|
+
case "parse_url": {
|
|
596
|
+
const parsed = await parseUrl(args.url);
|
|
597
|
+
const chunks = splitTextIntoChunks(parsed.text, args.chunk_id);
|
|
598
|
+
const license = args.license || 'CC BY-NC-SA 3.0';
|
|
599
|
+
const chunkData = chunks.map(ch => ({
|
|
600
|
+
id: ch.id, text: ch.text,
|
|
601
|
+
metadata: { page_title: parsed.pageTitle, source: parsed.source, license, ...parsed.infobox },
|
|
602
|
+
}));
|
|
603
|
+
const result = await apiCall('POST', `/api/projects/${p(args.project)}/categories/${p(args.category)}/chunks/bulk`, {
|
|
604
|
+
chunks: chunkData, session: s,
|
|
605
|
+
});
|
|
606
|
+
return { ...result, pageTitle: parsed.pageTitle, chunksCreated: chunks.length, infoboxFields: Object.keys(parsed.infobox) };
|
|
607
|
+
}
|
|
608
|
+
case "batch_parse_urls": {
|
|
609
|
+
const results = [];
|
|
610
|
+
const license = args.license || 'CC BY-NC-SA 3.0';
|
|
611
|
+
for (const entry of args.urls) {
|
|
612
|
+
try {
|
|
613
|
+
const parsed = await parseUrl(entry.url);
|
|
614
|
+
const chunks = splitTextIntoChunks(parsed.text, entry.chunk_id);
|
|
615
|
+
const chunkData = chunks.map(ch => ({
|
|
616
|
+
id: ch.id, text: ch.text,
|
|
617
|
+
metadata: { page_title: parsed.pageTitle, source: parsed.source, license, ...parsed.infobox },
|
|
618
|
+
}));
|
|
619
|
+
const r = await apiCall('POST', `/api/projects/${p(args.project)}/categories/${p(args.category)}/chunks/bulk`, {
|
|
620
|
+
chunks: chunkData, session: s,
|
|
621
|
+
});
|
|
622
|
+
results.push({ url: entry.url, chunk_id: entry.chunk_id, chunks: chunks.length, added: r.added, errors: r.errors });
|
|
623
|
+
} catch (err) {
|
|
624
|
+
results.push({ url: entry.url, chunk_id: entry.chunk_id, error: err.message });
|
|
625
|
+
}
|
|
626
|
+
}
|
|
627
|
+
return { parsed: results.filter(r => !r.error).length, failed: results.filter(r => r.error).length, results };
|
|
628
|
+
}
|
|
629
|
+
case "bulk_update_metadata":
|
|
630
|
+
return apiCall('POST', `/api/projects/${p(args.project)}/bulk-metadata`, {
|
|
631
|
+
field: args.field, value: args.value, category: args.category, session: s,
|
|
632
|
+
});
|
|
633
|
+
case "merge_projects":
|
|
634
|
+
return apiCall('POST', `/api/projects/${p(args.source)}/merge`, {
|
|
635
|
+
target: args.target, session: s,
|
|
636
|
+
});
|
|
637
|
+
case "export_category":
|
|
638
|
+
return apiCall('GET', `/api/projects/${p(args.project)}/categories/${p(args.category)}/export`);
|
|
421
639
|
default:
|
|
422
640
|
throw new Error(`Unknown tool: ${name}`);
|
|
423
641
|
}
|
|
@@ -593,6 +811,61 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
593
811
|
break;
|
|
594
812
|
}
|
|
595
813
|
|
|
814
|
+
case "parse_url": {
|
|
815
|
+
const parsed = await parseUrl(args.url);
|
|
816
|
+
const chunks = splitTextIntoChunks(parsed.text, args.chunk_id);
|
|
817
|
+
const license = args.license || 'CC BY-NC-SA 3.0';
|
|
818
|
+
const chunkData = chunks.map(ch => ({
|
|
819
|
+
id: ch.id, text: ch.text,
|
|
820
|
+
metadata: { page_title: parsed.pageTitle, source: parsed.source, license, ...parsed.infobox },
|
|
821
|
+
}));
|
|
822
|
+
const bulkResult = store.bulkAddChunks(args.project, args.category, chunkData);
|
|
823
|
+
result = { ...bulkResult, pageTitle: parsed.pageTitle, chunksCreated: chunks.length, infoboxFields: Object.keys(parsed.infobox) };
|
|
824
|
+
break;
|
|
825
|
+
}
|
|
826
|
+
|
|
827
|
+
case "batch_parse_urls": {
|
|
828
|
+
const results = [];
|
|
829
|
+
const license = args.license || 'CC BY-NC-SA 3.0';
|
|
830
|
+
for (const entry of args.urls) {
|
|
831
|
+
try {
|
|
832
|
+
const parsed = await parseUrl(entry.url);
|
|
833
|
+
const chunks = splitTextIntoChunks(parsed.text, entry.chunk_id);
|
|
834
|
+
const chunkData = chunks.map(ch => ({
|
|
835
|
+
id: ch.id, text: ch.text,
|
|
836
|
+
metadata: { page_title: parsed.pageTitle, source: parsed.source, license, ...parsed.infobox },
|
|
837
|
+
}));
|
|
838
|
+
const r = store.bulkAddChunks(args.project, args.category, chunkData);
|
|
839
|
+
results.push({ url: entry.url, chunk_id: entry.chunk_id, chunks: chunks.length, added: r.added, errors: r.errors });
|
|
840
|
+
} catch (err) {
|
|
841
|
+
results.push({ url: entry.url, chunk_id: entry.chunk_id, error: err.message });
|
|
842
|
+
}
|
|
843
|
+
}
|
|
844
|
+
result = { parsed: results.filter(r => !r.error).length, failed: results.filter(r => r.error).length, results };
|
|
845
|
+
break;
|
|
846
|
+
}
|
|
847
|
+
|
|
848
|
+
case "bulk_update_metadata":
|
|
849
|
+
result = store.bulkUpdateMetadata(args.project, args.field, args.value, args.category);
|
|
850
|
+
break;
|
|
851
|
+
|
|
852
|
+
case "merge_projects":
|
|
853
|
+
result = store.mergeProjects(args.source, args.target);
|
|
854
|
+
break;
|
|
855
|
+
|
|
856
|
+
case "export_category": {
|
|
857
|
+
const exported = store.exportCategory(args.project, args.category);
|
|
858
|
+
if (args.save_to_file) {
|
|
859
|
+
const outPath = store._filePath(args.project).replace('.json', `.${args.category}.export.json`);
|
|
860
|
+
const { writeFileSync } = await import('fs');
|
|
861
|
+
writeFileSync(outPath, JSON.stringify(exported, null, 2), 'utf-8');
|
|
862
|
+
result = { exported: exported.length, savedTo: outPath };
|
|
863
|
+
} else {
|
|
864
|
+
result = { exported: exported.length, data: exported };
|
|
865
|
+
}
|
|
866
|
+
break;
|
|
867
|
+
}
|
|
868
|
+
|
|
596
869
|
default:
|
|
597
870
|
throw new Error(`Unknown tool: ${name}`);
|
|
598
871
|
}
|
|
@@ -617,7 +890,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
617
890
|
async function main() {
|
|
618
891
|
const transport = new StdioServerTransport();
|
|
619
892
|
await server.connect(transport);
|
|
620
|
-
console.error("Tryll Dataset Builder MCP server running (v1.
|
|
893
|
+
console.error("Tryll Dataset Builder MCP server running (v1.2.0)");
|
|
621
894
|
}
|
|
622
895
|
|
|
623
896
|
main().catch((err) => {
|
package/lib/store.js
CHANGED
|
@@ -343,6 +343,74 @@ export class Store {
|
|
|
343
343
|
return { project: projectName, category: catName, imported, skipped };
|
|
344
344
|
}
|
|
345
345
|
|
|
346
|
+
// ---- BULK UPDATE METADATA ----
|
|
347
|
+
|
|
348
|
+
bulkUpdateMetadata(projectName, field, value, categoryName) {
|
|
349
|
+
const data = this._load(projectName);
|
|
350
|
+
let updated = 0;
|
|
351
|
+
const cats = categoryName
|
|
352
|
+
? [this._findCategory(data, categoryName)]
|
|
353
|
+
: data.categories;
|
|
354
|
+
for (const cat of cats) {
|
|
355
|
+
for (const ch of cat.chunks) {
|
|
356
|
+
if (STANDARD_META.includes(field)) {
|
|
357
|
+
ch.metadata[field] = value;
|
|
358
|
+
} else {
|
|
359
|
+
if (!ch.customFields) ch.customFields = [];
|
|
360
|
+
const existing = ch.customFields.find(cf => cf.key === field);
|
|
361
|
+
if (existing) { existing.value = value; }
|
|
362
|
+
else { ch.customFields.push({ key: field, value }); }
|
|
363
|
+
}
|
|
364
|
+
updated++;
|
|
365
|
+
}
|
|
366
|
+
}
|
|
367
|
+
this._save(projectName, data);
|
|
368
|
+
return { project: projectName, field, value, updated };
|
|
369
|
+
}
|
|
370
|
+
|
|
371
|
+
// ---- MERGE PROJECTS ----
|
|
372
|
+
|
|
373
|
+
mergeProjects(sourceName, targetName) {
|
|
374
|
+
const source = this._load(sourceName);
|
|
375
|
+
const target = this._load(targetName);
|
|
376
|
+
let categoriesMerged = 0, chunksAdded = 0, chunksSkipped = 0;
|
|
377
|
+
|
|
378
|
+
for (const srcCat of source.categories) {
|
|
379
|
+
let tgtCat = target.categories.find(c => c.name.toLowerCase() === srcCat.name.toLowerCase());
|
|
380
|
+
if (!tgtCat) {
|
|
381
|
+
tgtCat = { id: randomUUID(), name: srcCat.name, expanded: true, chunks: [] };
|
|
382
|
+
target.categories.push(tgtCat);
|
|
383
|
+
categoriesMerged++;
|
|
384
|
+
}
|
|
385
|
+
for (const ch of srcCat.chunks) {
|
|
386
|
+
if (this._isIdTaken(target, ch.id)) { chunksSkipped++; continue; }
|
|
387
|
+
tgtCat.chunks.push({ ...JSON.parse(JSON.stringify(ch)), _uid: randomUUID() });
|
|
388
|
+
chunksAdded++;
|
|
389
|
+
}
|
|
390
|
+
}
|
|
391
|
+
|
|
392
|
+
this._save(targetName, target);
|
|
393
|
+
return { source: sourceName, target: targetName, categoriesMerged, chunksAdded, chunksSkipped };
|
|
394
|
+
}
|
|
395
|
+
|
|
396
|
+
// ---- EXPORT CATEGORY ----
|
|
397
|
+
|
|
398
|
+
exportCategory(projectName, categoryName) {
|
|
399
|
+
const data = this._load(projectName);
|
|
400
|
+
const cat = this._findCategory(data, categoryName);
|
|
401
|
+
const flat = [];
|
|
402
|
+
for (const ch of cat.chunks) {
|
|
403
|
+
const entry = { id: ch.id, text: ch.text, metadata: { ...ch.metadata } };
|
|
404
|
+
if (ch.customFields) {
|
|
405
|
+
for (const cf of ch.customFields) {
|
|
406
|
+
if (cf.key && cf.key.trim()) entry.metadata[cf.key.trim()] = String(cf.value ?? '');
|
|
407
|
+
}
|
|
408
|
+
}
|
|
409
|
+
flat.push(entry);
|
|
410
|
+
}
|
|
411
|
+
return flat;
|
|
412
|
+
}
|
|
413
|
+
|
|
346
414
|
// ---- INTERNAL ----
|
|
347
415
|
|
|
348
416
|
_load(name) {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "tryll-dataset-builder-mcp",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "1.2.0",
|
|
4
4
|
"description": "MCP server for building RAG knowledge base datasets. Create, manage and export structured JSON datasets via Claude Code.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "index.js",
|
|
@@ -31,6 +31,7 @@
|
|
|
31
31
|
},
|
|
32
32
|
"dependencies": {
|
|
33
33
|
"@modelcontextprotocol/sdk": "^1.12.1",
|
|
34
|
+
"cheerio": "^1.2.0",
|
|
34
35
|
"ws": "^8.19.0",
|
|
35
36
|
"zod": "^3.24.0"
|
|
36
37
|
}
|