tryll-dataset-builder-mcp 1.1.1 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3):
  1. package/index.js +275 -2
  2. package/lib/store.js +68 -0
  3. package/package.json +2 -1
package/index.js CHANGED
@@ -5,11 +5,12 @@ import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js"
5
5
  import { CallToolRequestSchema, ListToolsRequestSchema } from "@modelcontextprotocol/sdk/types.js";
6
6
  import { Store } from "./lib/store.js";
7
7
  import WebSocket from "ws";
8
+ import * as cheerio from "cheerio";
8
9
 
9
10
  const store = new Store(process.env.DATA_DIR);
10
11
 
11
12
  const server = new Server(
12
- { name: "tryll-dataset-builder", version: "1.1.0" },
13
+ { name: "tryll-dataset-builder", version: "1.2.0" },
13
14
  { capabilities: { tools: {} } }
14
15
  );
15
16
 
@@ -38,6 +39,96 @@ async function apiCall(method, path, body) {
38
39
  return data;
39
40
  }
40
41
 
42
+ // ============================================
43
+ // URL PARSING HELPERS
44
+ // ============================================
45
+
46
+ const CHUNK_LIMIT = 2000;
47
+
48
+ async function parseUrl(url) {
49
+ const res = await fetch(url, {
50
+ headers: { 'User-Agent': 'Mozilla/5.0 (compatible; TryllDatasetBuilder/1.2)' },
51
+ });
52
+ if (!res.ok) throw new Error(`Failed to fetch ${url}: HTTP ${res.status}`);
53
+ const html = await res.text();
54
+ const $ = cheerio.load(html);
55
+
56
+ // Extract page title
57
+ const pageTitle = $('title').first().text().trim()
58
+ || $('h1').first().text().trim()
59
+ || '';
60
+
61
+ // Extract wiki infobox metadata
62
+ const infobox = {};
63
+ $('.infobox tr, .sidebar tr, .wikitable.infobox tr, table.infobox tr').each((_, row) => {
64
+ const $row = $(row);
65
+ const key = $row.find('th').first().text().trim().replace(/\s+/g, ' ');
66
+ const val = $row.find('td').first().text().trim().replace(/\s+/g, ' ');
67
+ if (key && val && key.length < 60 && val.length < 200) {
68
+ infobox[key] = val;
69
+ }
70
+ });
71
+
72
+ // Remove noise elements
73
+ $('script, style, nav, footer, header, .sidebar, .infobox, .navbox, .mw-editsection, .reference, .reflist, #mw-navigation, .noprint, .toc').remove();
74
+
75
+ // Extract main text
76
+ const mainContent = $('article, main, #mw-content-text, #content, .mw-parser-output, #bodyContent, .entry-content, .post-content').first();
77
+ let text = '';
78
+ if (mainContent.length) {
79
+ text = mainContent.text();
80
+ } else {
81
+ text = $('body').text();
82
+ }
83
+
84
+ // Clean up whitespace
85
+ text = text
86
+ .replace(/\t/g, ' ')
87
+ .replace(/[ ]{2,}/g, ' ')
88
+ .replace(/\n{3,}/g, '\n\n')
89
+ .trim();
90
+
91
+ return { text, pageTitle, infobox, source: url };
92
+ }
93
+
94
/**
 * Split text into chunks of at most `limit` characters, preferring to cut at
 * paragraph boundaries, then sentence boundaries. Text that already fits
 * yields a single chunk keyed by the bare base ID; split text yields chunks
 * keyed baseId_1, baseId_2, ...
 *
 * @param {string} text - Text to split.
 * @param {string} baseId - Base chunk ID used to derive chunk IDs.
 * @param {number} [limit=CHUNK_LIMIT] - Maximum characters per chunk.
 * @returns {{id: string, text: string}[]} Ordered list of chunks.
 */
function splitTextIntoChunks(text, baseId, limit = CHUNK_LIMIT) {
  if (text.length <= limit) {
    return [{ id: baseId, text }];
  }

  const pieces = [];
  let rest = text;
  let seq = 1;

  while (rest.length > 0) {
    let cut = rest.length;
    if (rest.length > limit) {
      cut = limit;
      // Prefer a paragraph break, then a sentence break, but only when the
      // cut keeps more than 30% of the window (avoids degenerate tiny chunks).
      const para = rest.lastIndexOf('\n\n', limit);
      const sentence = rest.lastIndexOf('. ', limit);
      if (para > limit * 0.3) {
        cut = para;
      } else if (sentence > limit * 0.3) {
        cut = sentence + 1; // keep the period with the preceding sentence
      }
    }

    pieces.push({ id: `${baseId}_${seq}`, text: rest.substring(0, cut).trim() });
    rest = rest.substring(cut).trim();
    seq += 1;
  }

  return pieces;
}
131
+
41
132
  // ============================================
42
133
  // TOOL DEFINITIONS
43
134
  // ============================================
@@ -309,6 +400,89 @@ const TOOLS = [
309
400
  required: ["project"],
310
401
  },
311
402
  },
403
+
404
+ // ---- URL Parsing ----
405
+ {
406
+ name: "parse_url",
407
+ description: "Fetch a web page, extract its text content, and auto-create chunks. If text exceeds 2000 characters, it auto-splits into multiple chunks with _1, _2 suffixes. Extracts page title and source URL as metadata. For wiki pages, extracts infobox/sidebar data as custom metadata fields.",
408
+ inputSchema: {
409
+ type: "object",
410
+ properties: {
411
+ project: { type: "string", description: "Project name" },
412
+ category: { type: "string", description: "Category to add chunks into" },
413
+ url: { type: "string", description: "URL to fetch and parse" },
414
+ chunk_id: { type: "string", description: "Base chunk ID. If text is split, becomes chunk_id_1, chunk_id_2, etc." },
415
+ license: { type: "string", description: "License for the content. Default: CC BY-NC-SA 3.0" },
416
+ },
417
+ required: ["project", "category", "url", "chunk_id"],
418
+ },
419
+ },
420
+ {
421
+ name: "batch_parse_urls",
422
+ description: "Parse multiple URLs at once and add all chunks to a category. Each URL gets its own chunk ID prefix. Auto-splits long texts into multiple chunks.",
423
+ inputSchema: {
424
+ type: "object",
425
+ properties: {
426
+ project: { type: "string", description: "Project name" },
427
+ category: { type: "string", description: "Category to add chunks into" },
428
+ urls: {
429
+ type: "array",
430
+ description: "Array of URL entries to parse",
431
+ items: {
432
+ type: "object",
433
+ properties: {
434
+ url: { type: "string", description: "URL to fetch" },
435
+ chunk_id: { type: "string", description: "Base chunk ID for this URL" },
436
+ },
437
+ required: ["url", "chunk_id"],
438
+ },
439
+ },
440
+ license: { type: "string", description: "License for all content. Default: CC BY-NC-SA 3.0" },
441
+ },
442
+ required: ["project", "category", "urls"],
443
+ },
444
+ },
445
+
446
+ // ---- Bulk Operations ----
447
+ {
448
+ name: "bulk_update_metadata",
449
+ description: "Update a metadata field across ALL chunks in a project (or a specific category). Useful for setting license, source, or custom fields in bulk.",
450
+ inputSchema: {
451
+ type: "object",
452
+ properties: {
453
+ project: { type: "string", description: "Project name" },
454
+ field: { type: "string", description: "Metadata field to update (e.g. 'license', 'source', or any custom field name)" },
455
+ value: { type: "string", description: "New value for the field" },
456
+ category: { type: "string", description: "Optional: only update chunks in this category. If omitted, updates all chunks in the project." },
457
+ },
458
+ required: ["project", "field", "value"],
459
+ },
460
+ },
461
+ {
462
+ name: "merge_projects",
463
+ description: "Merge all categories and chunks from a source project into a target project. Categories with the same name are combined. Chunks with duplicate IDs are skipped.",
464
+ inputSchema: {
465
+ type: "object",
466
+ properties: {
467
+ source: { type: "string", description: "Source project name (data is copied FROM here)" },
468
+ target: { type: "string", description: "Target project name (data is merged INTO here)" },
469
+ },
470
+ required: ["source", "target"],
471
+ },
472
+ },
473
+ {
474
+ name: "export_category",
475
+ description: "Export a single category as a flat JSON array. Same format as export_project but filtered to one category.",
476
+ inputSchema: {
477
+ type: "object",
478
+ properties: {
479
+ project: { type: "string", description: "Project name" },
480
+ category: { type: "string", description: "Category name to export" },
481
+ save_to_file: { type: "boolean", description: "If true, saves to a file. Default: false." },
482
+ },
483
+ required: ["project", "category"],
484
+ },
485
+ },
312
486
  ];
313
487
 
314
488
  // ============================================
@@ -418,6 +592,50 @@ async function handleRemote(name, args) {
418
592
  data: jsonData, category: args.category, session: s,
419
593
  });
420
594
  }
595
+ case "parse_url": {
596
+ const parsed = await parseUrl(args.url);
597
+ const chunks = splitTextIntoChunks(parsed.text, args.chunk_id);
598
+ const license = args.license || 'CC BY-NC-SA 3.0';
599
+ const chunkData = chunks.map(ch => ({
600
+ id: ch.id, text: ch.text,
601
+ metadata: { page_title: parsed.pageTitle, source: parsed.source, license, ...parsed.infobox },
602
+ }));
603
+ const result = await apiCall('POST', `/api/projects/${p(args.project)}/categories/${p(args.category)}/chunks/bulk`, {
604
+ chunks: chunkData, session: s,
605
+ });
606
+ return { ...result, pageTitle: parsed.pageTitle, chunksCreated: chunks.length, infoboxFields: Object.keys(parsed.infobox) };
607
+ }
608
+ case "batch_parse_urls": {
609
+ const results = [];
610
+ const license = args.license || 'CC BY-NC-SA 3.0';
611
+ for (const entry of args.urls) {
612
+ try {
613
+ const parsed = await parseUrl(entry.url);
614
+ const chunks = splitTextIntoChunks(parsed.text, entry.chunk_id);
615
+ const chunkData = chunks.map(ch => ({
616
+ id: ch.id, text: ch.text,
617
+ metadata: { page_title: parsed.pageTitle, source: parsed.source, license, ...parsed.infobox },
618
+ }));
619
+ const r = await apiCall('POST', `/api/projects/${p(args.project)}/categories/${p(args.category)}/chunks/bulk`, {
620
+ chunks: chunkData, session: s,
621
+ });
622
+ results.push({ url: entry.url, chunk_id: entry.chunk_id, chunks: chunks.length, added: r.added, errors: r.errors });
623
+ } catch (err) {
624
+ results.push({ url: entry.url, chunk_id: entry.chunk_id, error: err.message });
625
+ }
626
+ }
627
+ return { parsed: results.filter(r => !r.error).length, failed: results.filter(r => r.error).length, results };
628
+ }
629
+ case "bulk_update_metadata":
630
+ return apiCall('POST', `/api/projects/${p(args.project)}/bulk-metadata`, {
631
+ field: args.field, value: args.value, category: args.category, session: s,
632
+ });
633
+ case "merge_projects":
634
+ return apiCall('POST', `/api/projects/${p(args.source)}/merge`, {
635
+ target: args.target, session: s,
636
+ });
637
+ case "export_category":
638
+ return apiCall('GET', `/api/projects/${p(args.project)}/categories/${p(args.category)}/export`);
421
639
  default:
422
640
  throw new Error(`Unknown tool: ${name}`);
423
641
  }
@@ -593,6 +811,61 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
593
811
  break;
594
812
  }
595
813
 
814
+ case "parse_url": {
815
+ const parsed = await parseUrl(args.url);
816
+ const chunks = splitTextIntoChunks(parsed.text, args.chunk_id);
817
+ const license = args.license || 'CC BY-NC-SA 3.0';
818
+ const chunkData = chunks.map(ch => ({
819
+ id: ch.id, text: ch.text,
820
+ metadata: { page_title: parsed.pageTitle, source: parsed.source, license, ...parsed.infobox },
821
+ }));
822
+ const bulkResult = store.bulkAddChunks(args.project, args.category, chunkData);
823
+ result = { ...bulkResult, pageTitle: parsed.pageTitle, chunksCreated: chunks.length, infoboxFields: Object.keys(parsed.infobox) };
824
+ break;
825
+ }
826
+
827
+ case "batch_parse_urls": {
828
+ const results = [];
829
+ const license = args.license || 'CC BY-NC-SA 3.0';
830
+ for (const entry of args.urls) {
831
+ try {
832
+ const parsed = await parseUrl(entry.url);
833
+ const chunks = splitTextIntoChunks(parsed.text, entry.chunk_id);
834
+ const chunkData = chunks.map(ch => ({
835
+ id: ch.id, text: ch.text,
836
+ metadata: { page_title: parsed.pageTitle, source: parsed.source, license, ...parsed.infobox },
837
+ }));
838
+ const r = store.bulkAddChunks(args.project, args.category, chunkData);
839
+ results.push({ url: entry.url, chunk_id: entry.chunk_id, chunks: chunks.length, added: r.added, errors: r.errors });
840
+ } catch (err) {
841
+ results.push({ url: entry.url, chunk_id: entry.chunk_id, error: err.message });
842
+ }
843
+ }
844
+ result = { parsed: results.filter(r => !r.error).length, failed: results.filter(r => r.error).length, results };
845
+ break;
846
+ }
847
+
848
+ case "bulk_update_metadata":
849
+ result = store.bulkUpdateMetadata(args.project, args.field, args.value, args.category);
850
+ break;
851
+
852
+ case "merge_projects":
853
+ result = store.mergeProjects(args.source, args.target);
854
+ break;
855
+
856
+ case "export_category": {
857
+ const exported = store.exportCategory(args.project, args.category);
858
+ if (args.save_to_file) {
859
+ const outPath = store._filePath(args.project).replace('.json', `.${args.category}.export.json`);
860
+ const { writeFileSync } = await import('fs');
861
+ writeFileSync(outPath, JSON.stringify(exported, null, 2), 'utf-8');
862
+ result = { exported: exported.length, savedTo: outPath };
863
+ } else {
864
+ result = { exported: exported.length, data: exported };
865
+ }
866
+ break;
867
+ }
868
+
596
869
  default:
597
870
  throw new Error(`Unknown tool: ${name}`);
598
871
  }
@@ -617,7 +890,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
617
890
  async function main() {
618
891
  const transport = new StdioServerTransport();
619
892
  await server.connect(transport);
620
- console.error("Tryll Dataset Builder MCP server running (v1.1.0)");
893
+ console.error("Tryll Dataset Builder MCP server running (v1.2.0)");
621
894
  }
622
895
 
623
896
  main().catch((err) => {
package/lib/store.js CHANGED
@@ -343,6 +343,74 @@ export class Store {
343
343
  return { project: projectName, category: catName, imported, skipped };
344
344
  }
345
345
 
346
+ // ---- BULK UPDATE METADATA ----
347
+
348
+ bulkUpdateMetadata(projectName, field, value, categoryName) {
349
+ const data = this._load(projectName);
350
+ let updated = 0;
351
+ const cats = categoryName
352
+ ? [this._findCategory(data, categoryName)]
353
+ : data.categories;
354
+ for (const cat of cats) {
355
+ for (const ch of cat.chunks) {
356
+ if (STANDARD_META.includes(field)) {
357
+ ch.metadata[field] = value;
358
+ } else {
359
+ if (!ch.customFields) ch.customFields = [];
360
+ const existing = ch.customFields.find(cf => cf.key === field);
361
+ if (existing) { existing.value = value; }
362
+ else { ch.customFields.push({ key: field, value }); }
363
+ }
364
+ updated++;
365
+ }
366
+ }
367
+ this._save(projectName, data);
368
+ return { project: projectName, field, value, updated };
369
+ }
370
+
371
+ // ---- MERGE PROJECTS ----
372
+
373
+ mergeProjects(sourceName, targetName) {
374
+ const source = this._load(sourceName);
375
+ const target = this._load(targetName);
376
+ let categoriesMerged = 0, chunksAdded = 0, chunksSkipped = 0;
377
+
378
+ for (const srcCat of source.categories) {
379
+ let tgtCat = target.categories.find(c => c.name.toLowerCase() === srcCat.name.toLowerCase());
380
+ if (!tgtCat) {
381
+ tgtCat = { id: randomUUID(), name: srcCat.name, expanded: true, chunks: [] };
382
+ target.categories.push(tgtCat);
383
+ categoriesMerged++;
384
+ }
385
+ for (const ch of srcCat.chunks) {
386
+ if (this._isIdTaken(target, ch.id)) { chunksSkipped++; continue; }
387
+ tgtCat.chunks.push({ ...JSON.parse(JSON.stringify(ch)), _uid: randomUUID() });
388
+ chunksAdded++;
389
+ }
390
+ }
391
+
392
+ this._save(targetName, target);
393
+ return { source: sourceName, target: targetName, categoriesMerged, chunksAdded, chunksSkipped };
394
+ }
395
+
396
+ // ---- EXPORT CATEGORY ----
397
+
398
+ exportCategory(projectName, categoryName) {
399
+ const data = this._load(projectName);
400
+ const cat = this._findCategory(data, categoryName);
401
+ const flat = [];
402
+ for (const ch of cat.chunks) {
403
+ const entry = { id: ch.id, text: ch.text, metadata: { ...ch.metadata } };
404
+ if (ch.customFields) {
405
+ for (const cf of ch.customFields) {
406
+ if (cf.key && cf.key.trim()) entry.metadata[cf.key.trim()] = String(cf.value ?? '');
407
+ }
408
+ }
409
+ flat.push(entry);
410
+ }
411
+ return flat;
412
+ }
413
+
346
414
  // ---- INTERNAL ----
347
415
 
348
416
  _load(name) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "tryll-dataset-builder-mcp",
3
- "version": "1.1.1",
3
+ "version": "1.2.0",
4
4
  "description": "MCP server for building RAG knowledge base datasets. Create, manage and export structured JSON datasets via Claude Code.",
5
5
  "type": "module",
6
6
  "main": "index.js",
@@ -31,6 +31,7 @@
31
31
  },
32
32
  "dependencies": {
33
33
  "@modelcontextprotocol/sdk": "^1.12.1",
34
+ "cheerio": "^1.2.0",
34
35
  "ws": "^8.19.0",
35
36
  "zod": "^3.24.0"
36
37
  }