0nmcp 2.6.0 → 2.7.0

@@ -0,0 +1,520 @@
+ // ============================================================
+ // 0nMCP — Training Feed Engine
+ // ============================================================
+ // Continuously fetches data from verified public sources and
+ // feeds it into the 0nAI Training Center. Zero cost — uses
+ // public RSS/Atom feeds, open APIs, and free data endpoints.
+ //
+ // Run modes:
+ //   - Once: node engine/training-feed.js
+ //   - Loop: node engine/training-feed.js --loop 300   (every 5 min)
+ //   - CLI:  0nmcp feed [--loop <seconds>]
+ //
+ // All sources are public and free; no API keys required.
+ // ============================================================
+
+ import { createHash } from "crypto";
+
+ // ── Verified Source Registry ────────────────────────────────
+ // Every source here is public, free, and factual.
+
+ export const FEED_SOURCES = [
+
+   // ── AI & ML Industry ──────────────────────────────────────
+   {
+     id: "hn-ai",
+     name: "Hacker News — AI",
+     type: "api",
+     url: "https://hn.algolia.com/api/v1/search_by_date?query=AI+LLM+MCP&tags=story&hitsPerPage=10",
+     category: "ai_industry",
+     interval: 600, // 10 min
+     parser: "hn",
+   },
+   {
+     id: "arxiv-ai",
+     name: "arXiv — AI Papers",
+     type: "rss",
+     url: "https://export.arxiv.org/api/query?search_query=cat:cs.AI&start=0&max_results=5&sortBy=submittedDate&sortOrder=descending",
+     category: "ai_research",
+     interval: 3600, // 1 hour
+     parser: "arxiv",
+   },
+   {
+     id: "devto-ai",
+     name: "Dev.to — AI Articles",
+     type: "api",
+     url: "https://dev.to/api/articles?tag=ai&per_page=10&top=1",
+     category: "ai_industry",
+     interval: 900,
+     parser: "devto",
+   },
+   {
+     id: "devto-mcp",
+     name: "Dev.to — MCP Articles",
+     type: "api",
+     url: "https://dev.to/api/articles?tag=mcp&per_page=10&top=1",
+     category: "ai_industry",
+     interval: 900,
+     parser: "devto",
+   },
+
+   // ── Tech Industry ─────────────────────────────────────────
+   {
+     id: "hn-top",
+     name: "Hacker News — Top Stories",
+     type: "api",
+     url: "https://hn.algolia.com/api/v1/search?tags=front_page&hitsPerPage=10",
+     category: "tech",
+     interval: 600,
+     parser: "hn",
+   },
+   {
+     id: "github-trending",
+     name: "GitHub — Trending Repos",
+     type: "api",
+     // Search the last 30 days rather than a hardcoded date, so the
+     // query stays fresh instead of returning an ever-growing window.
+     url: `https://api.github.com/search/repositories?q=stars:>100+created:>${new Date(Date.now() - 30 * 86400e3).toISOString().slice(0, 10)}&sort=stars&order=desc&per_page=10`,
+     category: "open_source",
+     interval: 3600,
+     parser: "github_repos",
+   },
+   {
+     id: "npm-mcp",
+     name: "npm — MCP Packages",
+     type: "api",
+     url: "https://registry.npmjs.org/-/v1/search?text=mcp+model+context+protocol&size=10",
+     category: "open_source",
+     interval: 3600,
+     parser: "npm",
+   },
+
+   // ── SaaS & API Ecosystem ──────────────────────────────────
+   {
+     id: "devto-api",
+     name: "Dev.to — API Development",
+     type: "api",
+     url: "https://dev.to/api/articles?tag=api&per_page=10&top=1",
+     category: "saas",
+     interval: 1800,
+     parser: "devto",
+   },
+   {
+     id: "devto-automation",
+     name: "Dev.to — Automation",
+     type: "api",
+     url: "https://dev.to/api/articles?tag=automation&per_page=10&top=1",
+     category: "automation",
+     interval: 1800,
+     parser: "devto",
+   },
+
+   // ── Crypto / Web3 (Jaxx origin) ───────────────────────────
+   {
+     id: "coingecko-global",
+     name: "CoinGecko — Market Stats",
+     type: "api",
+     url: "https://api.coingecko.com/api/v3/global",
+     category: "crypto",
+     interval: 600,
+     parser: "coingecko",
+   },
+
+   // ── Public Data / Statistics ──────────────────────────────
+   {
+     id: "wikipedia-ai",
+     name: "Wikipedia — Featured Content",
+     type: "api",
+     // Build today's date path at load time instead of hardcoding a
+     // single day, so the daily fetch does not return the same page forever.
+     url: `https://en.wikipedia.org/api/rest_v1/feed/featured/${new Date().toISOString().slice(0, 10).replace(/-/g, "/")}`,
+     category: "general_knowledge",
+     interval: 86400, // daily
+     parser: "wikipedia",
+   },
+ ];
+
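+ // A quick sketch of extending the registry (illustrative; "hn-rust" is a
+ // hypothetical entry, not shipped with the package). Any object of this
+ // shape works, as long as `parser` names a key in PARSERS below:
+ //
+ //   FEED_SOURCES.push({
+ //     id: "hn-rust",
+ //     name: "Hacker News — Rust",
+ //     type: "api",
+ //     url: "https://hn.algolia.com/api/v1/search_by_date?query=rust&tags=story&hitsPerPage=10",
+ //     category: "tech",
+ //     interval: 900,
+ //     parser: "hn", // reuses the existing Algolia parser
+ //   });
+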
+ // ── Parsers ─────────────────────────────────────────────────
+ // Each parser takes the raw response for its format (parsed JSON, or an
+ // XML string for arXiv) and returns an array of
+ // { title, content, url, metadata } items.
+
+ const PARSERS = {
+   hn(data) {
+     if (!data?.hits) return [];
+     return data.hits.map(h => ({
+       title: h.title || "",
+       content: `${h.title}. ${h.url || ""} — ${h.points || 0} points, ${h.num_comments || 0} comments on Hacker News.`,
+       url: h.url || `https://news.ycombinator.com/item?id=${h.objectID}`,
+       metadata: { points: h.points, comments: h.num_comments, author: h.author },
+     })).filter(h => h.title.length > 5);
+   },
+
+   arxiv(data) {
+     // arXiv returns Atom XML; pull out each <entry> with a lightweight
+     // regex scan rather than a full XML parser.
+     const entries = [];
+     const entryRegex = /<entry>([\s\S]*?)<\/entry>/g;
+     let match;
+     while ((match = entryRegex.exec(data)) !== null) {
+       const block = match[1];
+       const title = (block.match(/<title>([\s\S]*?)<\/title>/) || [])[1]?.trim() || "";
+       const summary = (block.match(/<summary>([\s\S]*?)<\/summary>/) || [])[1]?.trim() || "";
+       const id = (block.match(/<id>([\s\S]*?)<\/id>/) || [])[1]?.trim() || "";
+       entries.push({
+         title,
+         content: `${title}\n\n${summary.slice(0, 1000)}`,
+         url: id,
+         metadata: { source: "arxiv" },
+       });
+     }
+     return entries;
+   },
+
+   devto(data) {
+     if (!Array.isArray(data)) return [];
+     return data.map(a => ({
+       title: a.title || "",
+       content: `${a.title}. ${a.description || ""}. By ${a.user?.name || "unknown"} — ${a.positive_reactions_count || 0} reactions, ${a.comments_count || 0} comments.`,
+       url: a.url || "",
+       metadata: { reactions: a.positive_reactions_count, comments: a.comments_count, author: a.user?.name, tags: a.tag_list },
+     })).filter(a => a.title.length > 5);
+   },
+
+   github_repos(data) {
+     if (!data?.items) return [];
+     return data.items.map(r => ({
+       title: `${r.full_name} — ${r.description || "No description"}`,
+       content: `GitHub repo: ${r.full_name}. ${r.description || ""}. Language: ${r.language || "unknown"}. Stars: ${r.stargazers_count}. Forks: ${r.forks_count}. Created: ${r.created_at}.`,
+       url: r.html_url,
+       metadata: { stars: r.stargazers_count, forks: r.forks_count, language: r.language },
+     }));
+   },
+
+   npm(data) {
+     if (!data?.objects) return [];
+     return data.objects.map(o => ({
+       title: `${o.package?.name} — ${o.package?.description || ""}`,
+       content: `npm package: ${o.package?.name}@${o.package?.version}. ${o.package?.description || ""}. Keywords: ${(o.package?.keywords || []).join(", ")}. Registry search score: ${o.searchScore || 0}.`,
+       url: o.package?.links?.npm || "",
+       metadata: { version: o.package?.version, keywords: o.package?.keywords, score: o.searchScore },
+     }));
+   },
+
+   coingecko(data) {
+     if (!data?.data) return [];
+     const d = data.data;
+     return [{
+       title: "Global Crypto Market Stats",
+       content: `Global crypto market cap: $${Math.round((d.total_market_cap?.usd || 0) / 1e9)}B. 24h volume: $${Math.round((d.total_volume?.usd || 0) / 1e9)}B. BTC dominance: ${(d.market_cap_percentage?.btc || 0).toFixed(1)}%. Active cryptocurrencies: ${d.active_cryptocurrencies || 0}. Markets: ${d.markets || 0}.`,
+       url: "https://www.coingecko.com",
+       metadata: { market_cap_usd: d.total_market_cap?.usd, btc_dominance: d.market_cap_percentage?.btc, active_coins: d.active_cryptocurrencies },
+     }];
+   },
+
+   wikipedia(data) {
+     const items = [];
+     if (data?.tfa) {
+       items.push({
+         title: `Wikipedia Featured: ${data.tfa.titles?.normalized || ""}`,
+         content: data.tfa.extract || "",
+         url: data.tfa.content_urls?.desktop?.page || "",
+         metadata: { type: "featured_article" },
+       });
+     }
+     if (data?.mostread?.articles) {
+       for (const a of data.mostread.articles.slice(0, 5)) {
+         items.push({
+           title: `Trending: ${a.titles?.normalized || ""}`,
+           content: a.extract || "",
+           url: a.content_urls?.desktop?.page || "",
+           metadata: { type: "most_read", views: a.views },
+         });
+       }
+     }
+     return items;
+   },
+ };
+
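+ // A minimal sketch of what a new parser looks like (illustrative; "reddit"
+ // is hypothetical and not shipped). A parser receives the raw response and
+ // must return { title, content, url, metadata } items:
+ //
+ //   PARSERS.reddit = (data) =>
+ //     (data?.data?.children || []).map(c => ({
+ //       title: c.data.title || "",
+ //       content: `${c.data.title}. ${(c.data.selftext || "").slice(0, 1000)}`,
+ //       url: `https://reddit.com${c.data.permalink || ""}`,
+ //       metadata: { score: c.data.score, comments: c.data.num_comments },
+ //     }));
+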
+ // ── Feed Engine Class ───────────────────────────────────────
+
+ export class TrainingFeedEngine {
+   /**
+    * @param {object}   [options]
+    * @param {object}   [options.supabase] — Supabase client
+    * @param {string[]} [options.categories] — Only fetch these categories
+    * @param {Function} [options.onItem] — Callback per ingested item
+    */
+   constructor(options = {}) {
+     this.supabase = options.supabase || null;
+     this.categories = options.categories ? new Set(options.categories) : null;
+     this.onItem = options.onItem || null;
+     this.lastFetch = new Map(); // source_id → timestamp
+     this.stats = { fetched: 0, ingested: 0, duplicates: 0, errors: 0 };
+     this._loopTimer = null;
+   }
+
+   async _getSupabase() {
+     if (this.supabase) return this.supabase;
+     const { createClient } = await import("@supabase/supabase-js");
+     const url = process.env.SUPABASE_URL || "https://pwujhhmlrtxjmjzyttwn.supabase.co";
+     const key = process.env.SUPABASE_SERVICE_KEY || process.env.SUPABASE_SERVICE_ROLE_KEY;
+     if (!key) throw new Error("Missing SUPABASE_SERVICE_KEY (or SUPABASE_SERVICE_ROLE_KEY)");
+     this.supabase = createClient(url, key);
+     return this.supabase;
+   }
+
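+   // Configuration sketch (illustrative values; the fallback URL above is
+   // the one shipped with the package):
+   //   SUPABASE_URL=https://<your-project>.supabase.co
+   //   SUPABASE_SERVICE_KEY=<service-role key with insert access to training_sources>
+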
+   /**
+    * Fetch a single source and ingest new items.
+    */
+   async fetchSource(source) {
+     try {
+       // Skip if this source's per-source interval has not elapsed yet
+       const lastTime = this.lastFetch.get(source.id) || 0;
+       if (Date.now() - lastTime < source.interval * 1000) return { skipped: true };
+
+       const res = await fetch(source.url, {
+         headers: { "User-Agent": "0nMCP-TrainingFeed/1.0", "Accept": "application/json" },
+         signal: AbortSignal.timeout(10000),
+       });
+
+       if (!res.ok) {
+         this.stats.errors++;
+         return { error: `HTTP ${res.status}` };
+       }
+
+       let rawData;
+       const contentType = res.headers.get("content-type") || "";
+       if (contentType.includes("xml") || source.parser === "arxiv") {
+         rawData = await res.text();
+       } else {
+         rawData = await res.json();
+       }
+
+       // Parse
+       const parser = PARSERS[source.parser];
+       if (!parser) return { error: `Unknown parser: ${source.parser}` };
+
+       const items = parser(rawData);
+       this.stats.fetched += items.length;
+       this.lastFetch.set(source.id, Date.now());
+
+       // Deduplicate and ingest
+       const sb = await this._getSupabase();
+       let ingested = 0;
+
+       for (const item of items) {
+         if (!item.title || !item.content) continue;
+
+         // Content hash for dedup
+         const hash = createHash("md5").update(item.title + item.url).digest("hex");
+
+         // Check if already ingested
+         const { data: existing } = await sb.from("training_sources")
+           .select("id")
+           .eq("metadata->>content_hash", hash)
+           .limit(1);
+
+         if (existing && existing.length > 0) {
+           this.stats.duplicates++;
+           continue;
+         }
+
+         // Insert
+         const { error: insertError } = await sb.from("training_sources").insert({
+           source_type: "feed",
+           source_path: item.url,
+           title: item.title.slice(0, 500),
+           content: item.content.slice(0, 5000),
+           token_count: Math.ceil(item.content.length / 4), // rough ~4 chars/token estimate
+           tags: [source.category, source.id],
+           status: "raw",
+           metadata: {
+             ...item.metadata,
+             feed_source: source.id,
+             feed_name: source.name,
+             content_hash: hash,
+             fetched_at: new Date().toISOString(),
+           },
+         });
+
+         if (!insertError) {
+           ingested++;
+           this.stats.ingested++;
+           if (this.onItem) this.onItem(item, source);
+         }
+       }
+
+       return { items: items.length, ingested, source: source.name };
+     } catch (err) {
+       this.stats.errors++;
+       return { error: err.message, source: source.name };
+     }
+   }
+
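+   // fetchSource resolves to one of these shapes (values illustrative):
+   //   { skipped: true }                          // per-source interval not yet elapsed
+   //   { error: "HTTP 429" }                      // non-OK response ({ error, source } on exceptions)
+   //   { items: 10, ingested: 3, source: "Hacker News — AI" }
+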
+   /**
+    * Run one full fetch cycle across all sources.
+    */
+   async fetchAll() {
+     const activeSources = FEED_SOURCES.filter(s =>
+       !this.categories || this.categories.has(s.category)
+     );
+
+     const results = [];
+     for (const source of activeSources) {
+       const result = await this.fetchSource(source);
+       if (!result.skipped) {
+         results.push(result);
+       }
+     }
+
+     return {
+       cycle_at: new Date().toISOString(),
+       sources_checked: activeSources.length,
+       results, // skipped sources are already excluded above
+       stats: { ...this.stats },
+     };
+   }
+
+   /**
+    * Start a continuous fetch loop.
+    * @param {number} intervalSec — Seconds between cycles (default: 300 = 5 min)
+    */
+   startLoop(intervalSec = 300) {
+     console.log(` → [training-feed] Starting feed loop (every ${intervalSec}s)`);
+     console.log(` → [training-feed] ${FEED_SOURCES.length} sources registered`);
+
+     // Run immediately. stats.ingested is cumulative across cycles, so track
+     // a high-water mark to report per-cycle counts.
+     let lastIngested = this.stats.ingested;
+     this.fetchAll().then(r => {
+       const delta = r.stats.ingested - lastIngested;
+       lastIngested = r.stats.ingested;
+       console.log(` → [training-feed] Cycle complete: ${delta} new items`);
+     }).catch(err => {
+       console.error(` ✗ [training-feed] Cycle error: ${err.message}`);
+     });
+
+     // Then loop
+     this._loopTimer = setInterval(async () => {
+       try {
+         const result = await this.fetchAll();
+         const delta = result.stats.ingested - lastIngested;
+         lastIngested = result.stats.ingested;
+         if (delta > 0) {
+           console.log(` → [training-feed] +${delta} items ingested`);
+         }
+       } catch (err) {
+         console.error(` ✗ [training-feed] Cycle error: ${err.message}`);
+       }
+     }, intervalSec * 1000);
+   }
+
+   /**
+    * Stop the feed loop.
+    */
+   stopLoop() {
+     if (this._loopTimer) {
+       clearInterval(this._loopTimer);
+       this._loopTimer = null;
+     }
+   }
+
+   /**
+    * Get current stats.
+    */
+   getStats() {
+     return {
+       ...this.stats,
+       sources: FEED_SOURCES.length,
+       categories: [...new Set(FEED_SOURCES.map(s => s.category))],
+       last_fetches: Object.fromEntries(
+         [...this.lastFetch.entries()].map(([k, v]) => [k, new Date(v).toISOString()])
+       ),
+     };
+   }
+ }
+
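+ // Usage sketch (illustrative; assumes SUPABASE_URL and SUPABASE_SERVICE_KEY
+ // are set, or a client is injected via options.supabase):
+ //
+ //   const engine = new TrainingFeedEngine({
+ //     categories: ["ai_industry", "open_source"],
+ //     onItem: (item, source) => console.log(`[${source.id}] ${item.title}`),
+ //   });
+ //   const summary = await engine.fetchAll(); // one cycle
+ //   console.log(summary.stats);
+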
+ // ── MCP Tool Registration ───────────────────────────────────
+
+ /**
+  * Register training feed tools on an MCP server.
+  */
+ export function registerFeedTools(server, z) {
+   let feedEngine = null;
+
+   function getFeed() {
+     if (!feedEngine) feedEngine = new TrainingFeedEngine();
+     return feedEngine;
+   }
+
+   server.tool(
+     "training_feed",
+     `Manage the 0nAI training feed — continuous data ingestion from verified public sources.
+ Fetches from ${FEED_SOURCES.length} sources: Hacker News, arXiv, Dev.to, GitHub, npm, CoinGecko, Wikipedia.
+
+ Example: training_feed({ action: "fetch" }) — run one fetch cycle
+ Example: training_feed({ action: "start", interval: 300 }) — start 5-min loop
+ Example: training_feed({ action: "stop" }) — stop the loop
+ Example: training_feed({ action: "sources" }) — list all sources
+ Example: training_feed({ action: "stats" }) — show feed statistics`,
+     {
+       action: z.enum(["fetch", "start", "stop", "sources", "stats"]).describe("Feed action"),
+       interval: z.number().optional().describe("Loop interval in seconds (default: 300)"),
+       categories: z.array(z.string()).optional().describe("Only fetch these categories"),
+     },
+     async ({ action, interval, categories }) => {
+       try {
+         const feed = getFeed();
+         if (categories) feed.categories = new Set(categories);
+
+         switch (action) {
+           case "fetch": {
+             const result = await feed.fetchAll();
+             return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
+           }
+
+           case "start": {
+             feed.startLoop(interval || 300);
+             return { content: [{ type: "text", text: JSON.stringify({ status: "started", interval: interval || 300, sources: FEED_SOURCES.length }) }] };
+           }
+
+           case "stop": {
+             feed.stopLoop();
+             return { content: [{ type: "text", text: JSON.stringify({ status: "stopped", stats: feed.getStats() }) }] };
+           }
+
+           case "sources": {
+             return {
+               content: [{
+                 type: "text",
+                 text: JSON.stringify({
+                   count: FEED_SOURCES.length,
+                   categories: [...new Set(FEED_SOURCES.map(s => s.category))],
+                   sources: FEED_SOURCES.map(s => ({
+                     id: s.id,
+                     name: s.name,
+                     category: s.category,
+                     interval: `${s.interval}s`,
+                     // Only truncate long URLs; short ones are shown as-is
+                     url: s.url.length > 80 ? s.url.slice(0, 80) + "..." : s.url,
+                   })),
+                 }, null, 2),
+               }],
+             };
+           }
+
+           case "stats": {
+             return { content: [{ type: "text", text: JSON.stringify({ status: "ok", ...feed.getStats() }, null, 2) }] };
+           }
+         }
+       } catch (err) {
+         return { content: [{ type: "text", text: JSON.stringify({ status: "failed", error: err.message }) }] };
+       }
+     }
+   );
+ }
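+
+ // Wiring sketch (illustrative; assumes an MCP server built with the
+ // @modelcontextprotocol/sdk package, whose zod dependency supplies `z`):
+ //
+ //   import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
+ //   import { z } from "zod";
+ //   import { registerFeedTools } from "./engine/training-feed.js";
+ //
+ //   const server = new McpServer({ name: "0nmcp", version: "2.7.0" });
+ //   registerFeedTools(server, z);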
+
+ // ── CLI Entry Point ─────────────────────────────────────────
+
+ const isMain = process.argv[1]?.endsWith("training-feed.js");
+ if (isMain) {
+   const loopFlag = process.argv.indexOf("--loop");
+   const interval = loopFlag > -1 ? parseInt(process.argv[loopFlag + 1], 10) || 300 : 0;
+
+   const feed = new TrainingFeedEngine();
+
+   if (interval > 0) {
+     feed.startLoop(interval);
+     console.log("Press Ctrl+C to stop.");
+   } else {
+     feed.fetchAll().then(result => {
+       console.log(JSON.stringify(result, null, 2));
+       process.exit(0);
+     }).catch(err => {
+       console.error(err.message);
+       process.exit(1);
+     });
+   }
+ }