noosphere 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,1044 @@
1
+ # noosphere
2
+
3
+ Unified AI creation engine — text, image, video, and audio generation across all providers through a single interface.
4
+
5
+ One import. Every model. Every modality.
6
+
7
+ ## Features
8
+
9
+ - **4 modalities** — LLM chat, image generation, video generation, and text-to-speech
10
+ - **246+ LLM models** — via Pi-AI gateway (OpenAI, Anthropic, Google, Groq, Mistral, xAI, Cerebras, OpenRouter)
11
+ - **867+ media endpoints** — via FAL (Flux, SDXL, Kling, Sora 2, VEO 3, Kokoro, ElevenLabs, and hundreds more)
12
+ - **30+ HuggingFace tasks** — LLM, image, TTS, translation, summarization, classification, and more
13
+ - **Local-first architecture** — Auto-detects ComfyUI, Ollama, Piper, and Kokoro on your machine
14
+ - **Agentic capabilities** — Tool use, function calling, reasoning/thinking, vision, and agent loops via Pi-AI
15
+ - **Failover & retry** — Automatic retries with exponential backoff and cross-provider failover
16
+ - **Usage tracking** — Real-time cost, latency, and token tracking across all providers
17
+ - **TypeScript-first** — Full type definitions with ESM and CommonJS support
18
+
19
+ ## Install
20
+
21
+ ```bash
22
+ npm install noosphere
23
+ ```
24
+
25
+ ## Quick Start
26
+
27
+ ```typescript
28
+ import { Noosphere } from 'noosphere';
29
+
30
+ const ai = new Noosphere();
31
+
32
+ // Chat with any LLM
33
+ const response = await ai.chat({
34
+ messages: [{ role: 'user', content: 'Hello!' }],
35
+ });
36
+ console.log(response.content);
37
+
38
+ // Generate an image
39
+ const image = await ai.image({
40
+ prompt: 'A sunset over mountains',
41
+ width: 1024,
42
+ height: 1024,
43
+ });
44
+ console.log(image.url);
45
+
46
+ // Generate a video
47
+ const video = await ai.video({
48
+ prompt: 'Ocean waves crashing on rocks',
49
+ duration: 5,
50
+ });
51
+ console.log(video.url);
52
+
53
+ // Text-to-speech
54
+ const audio = await ai.speak({
55
+ text: 'Welcome to Noosphere',
56
+ voice: 'alloy',
57
+ format: 'mp3',
58
+ });
59
+ // audio.buffer contains the audio data
60
+ ```
61
+
62
+ ## Configuration
63
+
64
+ API keys are resolved from the constructor config or environment variables (config takes priority):
65
+
66
+ ```typescript
67
+ const ai = new Noosphere({
68
+ keys: {
69
+ openai: 'sk-...',
70
+ anthropic: 'sk-ant-...',
71
+ google: 'AIza...',
72
+ fal: 'fal-...',
73
+ huggingface: 'hf_...',
74
+ groq: 'gsk_...',
75
+ mistral: '...',
76
+ xai: '...',
77
+ openrouter: 'sk-or-...',
78
+ },
79
+ });
80
+ ```
81
+
82
+ Or set environment variables:
83
+
84
+ | Variable | Provider |
85
+ |---|---|
86
+ | `OPENAI_API_KEY` | OpenAI |
87
+ | `ANTHROPIC_API_KEY` | Anthropic |
88
+ | `GEMINI_API_KEY` | Google Gemini |
89
+ | `FAL_KEY` | FAL.ai |
90
+ | `HUGGINGFACE_TOKEN` | Hugging Face |
91
+ | `GROQ_API_KEY` | Groq |
92
+ | `MISTRAL_API_KEY` | Mistral |
93
+ | `XAI_API_KEY` | xAI (Grok) |
94
+ | `OPENROUTER_API_KEY` | OpenRouter |
95
+
96
+ ### Full Configuration Reference
97
+
98
+ ```typescript
99
+ const ai = new Noosphere({
100
+ // API keys (or use env vars above)
101
+ keys: { /* ... */ },
102
+
103
+ // Default models per modality
104
+ defaults: {
105
+ llm: { provider: 'pi-ai', model: 'claude-sonnet-4-20250514' },
106
+ image: { provider: 'fal', model: 'fal-ai/flux/schnell' },
107
+ video: { provider: 'fal', model: 'fal-ai/kling-video/v2/master/text-to-video' },
108
+ tts: { provider: 'fal', model: 'fal-ai/kokoro/american-english' },
109
+ },
110
+
111
+ // Local service configuration
112
+ autoDetectLocal: true, // env: NOOSPHERE_AUTO_DETECT_LOCAL
113
+ local: {
114
+ ollama: { enabled: true, host: 'http://localhost', port: 11434 },
115
+ comfyui: { enabled: true, host: 'http://localhost', port: 8188 },
116
+ piper: { enabled: true, host: 'http://localhost', port: 5500 },
117
+ kokoro: { enabled: true, host: 'http://localhost', port: 5501 },
118
+ custom: [], // additional LocalServiceConfig[]
119
+ },
120
+
121
+ // Retry & failover
122
+ retry: {
123
+ maxRetries: 2, // default: 2
124
+ backoffMs: 1000, // default: 1000 (exponential: 1s, 2s, 4s...)
125
+ failover: true, // default: true — try other providers on failure
126
+ retryableErrors: ['PROVIDER_UNAVAILABLE', 'RATE_LIMITED', 'TIMEOUT'],
127
+ },
128
+
129
+ // Timeouts per modality (ms)
130
+ timeout: {
131
+ llm: 30000, // 30s
132
+ image: 120000, // 2min
133
+ video: 300000, // 5min
134
+ tts: 60000, // 1min
135
+ },
136
+
137
+ // Model discovery cache (minutes)
138
+ discoveryCacheTTL: 60, // env: NOOSPHERE_DISCOVERY_CACHE_TTL
139
+
140
+ // Real-time usage callback
141
+ onUsage: (event) => {
142
+ console.log(`${event.provider}/${event.model}: $${event.cost} (${event.latencyMs}ms)`);
143
+ },
144
+ });
145
+ ```
146
+
147
+ ### Local Service Environment Variables
148
+
149
+ | Variable | Default | Description |
150
+ |---|---|---|
151
+ | `OLLAMA_HOST` | `http://localhost` | Ollama server host |
152
+ | `OLLAMA_PORT` | `11434` | Ollama server port |
153
+ | `COMFYUI_HOST` | `http://localhost` | ComfyUI server host |
154
+ | `COMFYUI_PORT` | `8188` | ComfyUI server port |
155
+ | `PIPER_HOST` | `http://localhost` | Piper TTS server host |
156
+ | `PIPER_PORT` | `5500` | Piper TTS server port |
157
+ | `KOKORO_HOST` | `http://localhost` | Kokoro TTS server host |
158
+ | `KOKORO_PORT` | `5501` | Kokoro TTS server port |
159
+ | `NOOSPHERE_AUTO_DETECT_LOCAL` | `true` | Enable/disable local service auto-detection |
160
+ | `NOOSPHERE_DISCOVERY_CACHE_TTL` | `60` | Model cache TTL in minutes |
161
+
162
+ ---
163
+
164
+ ## API Reference
165
+
166
+ ### `new Noosphere(config?)`
167
+
168
+ Creates a new instance. Providers are initialized lazily on first API call. Auto-detects local services via HTTP pings (2s timeout each).
169
+
170
+ ### Generation Methods
171
+
172
+ #### `ai.chat(options): Promise<NoosphereResult>`
173
+
174
+ Generate text with any LLM. Supports 246+ models across 8 providers.
175
+
176
+ ```typescript
177
+ const result = await ai.chat({
178
+ provider: 'anthropic', // optional — auto-resolved if omitted
179
+ model: 'claude-sonnet-4-20250514', // optional — uses default or first available
180
+ messages: [
181
+ { role: 'system', content: 'You are helpful.' },
182
+ { role: 'user', content: 'Explain quantum computing' },
183
+ ],
184
+ temperature: 0.7, // optional (0-2)
185
+ maxTokens: 1024, // optional
186
+ jsonMode: false, // optional
187
+ });
188
+
189
+ console.log(result.content); // response text
190
+ console.log(result.thinking); // reasoning output (Claude, GPT-5, o3, Gemini, Grok-4)
191
+ console.log(result.usage.cost); // cost in USD
192
+ console.log(result.usage.input); // input tokens
193
+ console.log(result.usage.output); // output tokens
194
+ console.log(result.latencyMs); // response time in ms
195
+ ```
196
+
197
+ #### `ai.stream(options): NoosphereStream`
198
+
199
+ Stream LLM responses token-by-token. Same options as `chat()`.
200
+
201
+ ```typescript
202
+ const stream = ai.stream({
203
+ messages: [{ role: 'user', content: 'Write a story' }],
204
+ });
205
+
206
+ for await (const event of stream) {
207
+ switch (event.type) {
208
+ case 'text_delta':
209
+ process.stdout.write(event.delta!);
210
+ break;
211
+ case 'thinking_delta':
212
+ console.log('[thinking]', event.delta);
213
+ break;
214
+ case 'done':
215
+ console.log('\n\nUsage:', event.result!.usage);
216
+ break;
217
+ case 'error':
218
+ console.error(event.error);
219
+ break;
220
+ }
221
+ }
222
+
223
+ // Or consume the full result
224
+ const result = await stream.result();
225
+
226
+ // Abort at any time
227
+ stream.abort();
228
+ ```
229
+
230
+ #### `ai.image(options): Promise<NoosphereResult>`
231
+
232
+ Generate images. Supports 200+ image models via FAL, HuggingFace, and ComfyUI.
233
+
234
+ ```typescript
235
+ const result = await ai.image({
236
+ provider: 'fal', // optional
237
+ model: 'fal-ai/flux-2-pro', // optional
238
+ prompt: 'A futuristic cityscape at sunset',
239
+ negativePrompt: 'blurry, low quality', // optional
240
+ width: 1024, // optional
241
+ height: 768, // optional
242
+ seed: 42, // optional — reproducible results
243
+ steps: 30, // optional — inference steps (more = higher quality)
244
+ guidanceScale: 7.5, // optional — prompt adherence (higher = stricter)
245
+ });
246
+
247
+ console.log(result.url); // image URL (FAL)
248
+ console.log(result.buffer); // image Buffer (HuggingFace, ComfyUI)
249
+ console.log(result.media?.width); // actual dimensions
250
+ console.log(result.media?.height);
251
+ console.log(result.media?.format); // 'png'
252
+ ```
253
+
254
+ #### `ai.video(options): Promise<NoosphereResult>`
255
+
256
+ Generate videos. Supports 150+ video models via FAL (Kling, Sora 2, VEO 3, WAN, Pixverse, and more).
257
+
258
+ ```typescript
259
+ const result = await ai.video({
260
+ provider: 'fal',
261
+ model: 'fal-ai/kling-video/v2/master/text-to-video',
262
+ prompt: 'A bird flying through clouds',
263
+ imageUrl: 'https://...', // optional — image-to-video
264
+ duration: 5, // optional — seconds
265
+ fps: 24, // optional
266
+ width: 1280, // optional
267
+ height: 720, // optional
268
+ });
269
+
270
+ console.log(result.url); // video URL
271
+ console.log(result.media?.duration); // actual duration
272
+ console.log(result.media?.fps); // frames per second
273
+ console.log(result.media?.format); // 'mp4'
274
+ ```
275
+
276
+ #### `ai.speak(options): Promise<NoosphereResult>`
277
+
278
+ Text-to-speech synthesis. Supports 50+ TTS models via FAL, HuggingFace, Piper, and Kokoro.
279
+
280
+ ```typescript
281
+ const result = await ai.speak({
282
+ provider: 'fal',
283
+ model: 'fal-ai/kokoro/american-english',
284
+ text: 'Hello world',
285
+ voice: 'af_heart', // optional — voice ID
286
+ language: 'en', // optional
287
+ speed: 1.0, // optional
288
+ format: 'mp3', // optional — 'mp3' | 'wav' | 'ogg'
289
+ });
290
+
291
+ console.log(result.buffer); // audio Buffer
292
+ console.log(result.url); // audio URL (FAL)
293
+ ```
294
+
295
+ ### Discovery Methods
296
+
297
+ #### `ai.getProviders(modality?): Promise<ProviderInfo[]>`
298
+
299
+ List available providers, optionally filtered by modality.
300
+
301
+ ```typescript
302
+ const providers = await ai.getProviders('llm');
303
+ // [{ id: 'pi-ai', name: 'Pi-AI', modalities: ['llm'], local: false, status: 'online', modelCount: 246 }]
304
+ ```
305
+
306
+ #### `ai.getModels(modality?): Promise<ModelInfo[]>`
307
+
308
+ List all available models with full metadata.
309
+
310
+ ```typescript
311
+ const models = await ai.getModels('image');
312
+ // Returns ModelInfo[] with id, provider, name, modality, local, cost, capabilities
313
+ ```
314
+
315
+ #### `ai.getModel(provider, modelId): Promise<ModelInfo | null>`
316
+
317
+ Get details about a specific model.
318
+
319
+ #### `ai.syncModels(): Promise<SyncResult>`
320
+
321
+ Refresh model lists from all providers. Returns sync count, per-provider breakdown, and any errors.
322
+
323
+ ### Usage Tracking
324
+
325
+ #### `ai.getUsage(options?): UsageSummary`
326
+
327
+ Get aggregated usage statistics with optional filtering.
328
+
329
+ ```typescript
330
+ const usage = ai.getUsage({
331
+ since: '2024-01-01', // optional — ISO date or Date object
332
+ until: '2024-12-31', // optional
333
+ provider: 'openai', // optional — filter by provider
334
+ modality: 'llm', // optional — filter by modality
335
+ });
336
+
337
+ console.log(usage.totalCost); // total USD spent
338
+ console.log(usage.totalRequests); // number of requests
339
+ console.log(usage.byProvider); // { openai: 2.50, anthropic: 1.20, fal: 0.30 }
340
+ console.log(usage.byModality); // { llm: 3.70, image: 0.20, video: 0.10, tts: 0.00 }
341
+ ```
342
+
343
+ ### Lifecycle
344
+
345
+ #### `ai.registerProvider(provider): void`
346
+
347
+ Register a custom provider (see [Custom Providers](#custom-providers)).
348
+
349
+ #### `ai.dispose(): Promise<void>`
350
+
351
+ Clean up all provider resources, clear the model cache, and reset the usage tracker.
352
+
353
+ ### NoosphereResult
354
+
355
+ Every generation method returns a `NoosphereResult`:
356
+
357
+ ```typescript
358
+ interface NoosphereResult {
359
+ content?: string; // LLM response text
360
+ thinking?: string; // reasoning/thinking output (supported models)
361
+ url?: string; // media URL (images, videos, audio from cloud providers)
362
+ buffer?: Buffer; // media binary data (local providers, HuggingFace)
363
+ provider: string; // which provider handled the request
364
+ model: string; // which model was used
365
+ modality: Modality; // 'llm' | 'image' | 'video' | 'tts'
366
+ latencyMs: number; // request duration in milliseconds
367
+ usage: {
368
+ cost: number; // cost in USD
369
+ input?: number; // input tokens/characters
370
+ output?: number; // output tokens
371
+ unit?: string; // 'tokens' | 'characters' | 'per_image' | 'per_second' | 'free'
372
+ };
373
+ media?: {
374
+ width?: number; // image/video width
375
+ height?: number; // image/video height
376
+ duration?: number; // video/audio duration in seconds
377
+ format?: string; // 'png' | 'mp4' | 'mp3' | 'wav' | 'ogg'
378
+ fps?: number; // video frames per second
379
+ };
380
+ }
381
+ ```
382
+
383
+ ---
384
+
385
+ ## Providers In Depth
386
+
387
+ ### Pi-AI — LLM Gateway (246+ models)
388
+
389
+ **Provider ID:** `pi-ai`
390
+ **Modalities:** LLM (chat + streaming)
391
+ **Library:** `@mariozechner/pi-ai`
392
+
393
+ A unified gateway that routes to 8 LLM providers through 4 different API protocols:
394
+
395
+ | API Protocol | Providers |
396
+ |---|---|
397
+ | `anthropic-messages` | Anthropic |
398
+ | `google-generative-ai` | Google |
399
+ | `openai-responses` | OpenAI (reasoning models) |
400
+ | `openai-completions` | OpenAI, xAI, Groq, Cerebras, Zai, OpenRouter |
401
+
402
+ #### Anthropic Models (19)
403
+
404
+ | Model | Context | Reasoning | Vision | Input Cost | Output Cost |
405
+ |---|---|---|---|---|---|
406
+ | `claude-opus-4-0` | 200k | Yes | Yes | $15/M | $75/M |
407
+ | `claude-opus-4-1` | 200k | Yes | Yes | $15/M | $75/M |
408
+ | `claude-sonnet-4-20250514` | 200k | Yes | Yes | $3/M | $15/M |
409
+ | `claude-sonnet-4-5-20250929` | 200k | Yes | Yes | $3/M | $15/M |
410
+ | `claude-3-7-sonnet-20250219` | 200k | Yes | Yes | $3/M | $15/M |
411
+ | `claude-3-5-sonnet-20241022` | 200k | No | Yes | $3/M | $15/M |
412
+ | `claude-haiku-4-5-20251001` | 200k | No | Yes | $0.80/M | $4/M |
413
+ | `claude-3-5-haiku-20241022` | 200k | No | Yes | $0.80/M | $4/M |
414
+ | `claude-3-haiku-20240307` | 200k | No | Yes | $0.25/M | $1.25/M |
415
+ | *...and 10 more variants* | | | | | |
416
+
417
+ #### OpenAI Models (24)
418
+
419
+ | Model | Context | Reasoning | Vision | Input Cost | Output Cost |
420
+ |---|---|---|---|---|---|
421
+ | `gpt-5` | 200k | Yes | Yes | $10/M | $30/M |
422
+ | `gpt-5-mini` | 200k | Yes | Yes | $2.50/M | $10/M |
423
+ | `gpt-4.1` | 128k | No | Yes | $2/M | $8/M |
424
+ | `gpt-4.1-mini` | 128k | No | Yes | $0.40/M | $1.60/M |
425
+ | `gpt-4.1-nano` | 128k | No | Yes | $0.10/M | $0.40/M |
426
+ | `gpt-4o` | 128k | No | Yes | $2.50/M | $10/M |
427
+ | `gpt-4o-mini` | 128k | No | Yes | $0.15/M | $0.60/M |
428
+ | `o3-pro` | 200k | Yes | Yes | $20/M | $80/M |
429
+ | `o3-mini` | 200k | Yes | Yes | $1.10/M | $4.40/M |
430
+ | `o4-mini` | 200k | Yes | Yes | $1.10/M | $4.40/M |
431
+ | `codex-mini-latest` | 200k | Yes | No | $1.50/M | $6/M |
432
+ | *...and 13 more variants* | | | | | |
433
+
434
+ #### Google Gemini Models (19)
435
+
436
+ | Model | Context | Reasoning | Vision | Cost |
437
+ |---|---|---|---|---|
438
+ | `gemini-2.5-flash` | 1M | Yes | Yes | $0.15-0.60/M |
439
+ | `gemini-2.5-pro` | 1M | Yes | Yes | $1.25-10/M |
440
+ | `gemini-2.0-flash` | 1M | No | Yes | $0.10-0.40/M |
441
+ | `gemini-2.0-flash-lite` | 1M | No | Yes | $0.025-0.10/M |
442
+ | `gemini-1.5-flash` | 1M | No | Yes | $0.075-0.30/M |
443
+ | `gemini-1.5-pro` | 2M | No | Yes | $1.25-5/M |
444
+ | *...and 13 more variants* | | | | |
445
+
446
+ #### xAI Grok Models (20)
447
+
448
+ | Model | Context | Reasoning | Vision | Input Cost |
449
+ |---|---|---|---|---|
450
+ | `grok-4` | 256k | Yes | Yes | $5/M |
451
+ | `grok-4-fast` | 256k | Yes | Yes | $3/M |
452
+ | `grok-3` | 131k | No | Yes | $3/M |
453
+ | `grok-3-fast` | 131k | No | Yes | $5/M |
454
+ | `grok-3-mini-fast-latest` | 131k | Yes | No | $0.30/M |
455
+ | `grok-2-vision` | 32k | No | Yes | $2/M |
456
+ | *...and 14 more variants* | | | | |
457
+
458
+ #### Groq Models (15)
459
+
460
+ | Model | Context | Cost |
461
+ |---|---|---|
462
+ | `llama-3.3-70b-versatile` | 128k | $0.59/M |
463
+ | `llama-3.1-8b-instant` | 128k | $0.05/M |
464
+ | `mistral-saba-24b` | 32k | $0.40/M |
465
+ | `qwen-qwq-32b` | 128k | $0.29/M |
466
+ | `deepseek-r1-distill-llama-70b` | 128k | $0.75/M |
467
+ | *...and 10 more* | | |
468
+
469
+ #### Cerebras Models (3)
470
+
471
+ `gpt-oss-120b`, `qwen-3-235b-a22b-instruct-2507`, `qwen-3-coder-480b`
472
+
473
+ #### Zai Models (5)
474
+
475
+ `glm-4.6`, `glm-4.5`, `glm-4.5-flash`, `glm-4.5v`, `glm-4.5-air`
476
+
477
+ #### OpenRouter (141 models)
478
+
479
+ Aggregator providing access to 141 additional models, including Llama, DeepSeek, Mistral, Qwen, and many more. The full list is available via `ai.getModels('llm')`.
480
+
481
+ #### Agentic Capabilities (via Pi-AI library)
482
+
483
+ The underlying `@mariozechner/pi-ai` library exposes powerful agentic features. Noosphere currently surfaces only chat and streaming; the features below are available by using the library directly:
484
+
485
+ **Tool Use / Function Calling:**
486
+ ```typescript
487
+ // Supported across Anthropic, OpenAI, Google, xAI, Groq
488
+ // Tool definitions use TypeBox schemas for runtime validation
489
+ interface Tool<TParameters extends TSchema = TSchema> {
490
+ name: string;
491
+ description: string;
492
+ parameters: TParameters; // TypeBox schema — validated at runtime with AJV
493
+ }
494
+ ```
495
+
496
+ **Reasoning / Thinking:**
497
+ - **Anthropic:** `thinkingEnabled`, `thinkingBudgetTokens` — Claude Opus/Sonnet extended thinking
498
+ - **OpenAI:** `reasoningEffort` (minimal/low/medium/high) — o1/o3/o4/GPT-5 reasoning
499
+ - **Google:** `thinking.enabled`, `thinking.budgetTokens` — Gemini 2.5 thinking
500
+ - **xAI:** Grok-4 native reasoning
501
+ - Thinking blocks are automatically extracted and streamed as separate `thinking_delta` events
502
+
503
+ **Vision / Multimodal Input:**
504
+ ```typescript
505
+ // Send images alongside text to vision-capable models
506
+ {
507
+ role: "user",
508
+ content: [
509
+ { type: "text", text: "What's in this image?" },
510
+ { type: "image", data: base64String, mimeType: "image/png" }
511
+ ]
512
+ }
513
+ ```
514
+
515
+ **Agent Loop:**
516
+ ```typescript
517
+ // Built-in agentic execution loop with automatic tool calling
518
+ import { agentLoop, getModel } from '@mariozechner/pi-ai';
519
+
520
+ const events = agentLoop(prompt, context, {
521
+ tools: [myTool],
522
+ model: getModel('anthropic', 'claude-sonnet-4-20250514'),
523
+ });
524
+
525
+ for await (const event of events) {
526
+ // event.type: agent_start → turn_start → message_start →
527
+ // message_update → tool_execution_start → tool_execution_end →
528
+ // message_end → turn_end → agent_end
529
+ }
530
+ ```
531
+
532
+ **Cost Tracking per Model:**
533
+ ```typescript
534
+ // Costs tracked per 1M tokens with cache-aware pricing
535
+ {
536
+ input: number, // cost per 1M input tokens
537
+ output: number, // cost per 1M output tokens
538
+ cacheRead: number, // prompt cache hit cost
539
+ cacheWrite: number, // prompt cache write cost
540
+ }
541
+ ```
542
+
543
+ ---
544
+
545
+ ### FAL — Media Generation (867+ endpoints)
546
+
547
+ **Provider ID:** `fal`
548
+ **Modalities:** Image, Video, TTS
549
+ **Library:** `@fal-ai/client`
550
+
551
+ The largest media generation provider with dynamic pricing fetched at runtime from `https://api.fal.ai/v1/models/pricing`.
552
+
553
+ #### Image Models (200+)
554
+
555
+ **FLUX Family (20+ variants):**
556
+ | Model | Description |
557
+ |---|---|
558
+ | `fal-ai/flux/schnell` | Fast generation (default) |
559
+ | `fal-ai/flux/dev` | Higher quality |
560
+ | `fal-ai/flux-2` | Next generation |
561
+ | `fal-ai/flux-2-pro` | Professional quality |
562
+ | `fal-ai/flux-2-flex` | Flexible variant |
563
+ | `fal-ai/flux-2/edit` | Image editing |
564
+ | `fal-ai/flux-2/lora` | LoRA fine-tuning |
565
+ | `fal-ai/flux-pro/v1.1-ultra` | Ultra high quality |
566
+ | `fal-ai/flux-pro/kontext` | Context-aware generation |
567
+ | `fal-ai/flux-lora` | Custom style training |
568
+ | `fal-ai/flux-vision-upscaler` | AI upscaling |
569
+ | `fal-ai/flux-krea-trainer` | Model training |
570
+ | `fal-ai/flux-lora-fast-training` | Fast fine-tuning |
571
+ | `fal-ai/flux-lora-portrait-trainer` | Portrait specialist |
572
+
573
+ **Stable Diffusion:**
574
+ `fal-ai/stable-diffusion-v15`, `fal-ai/stable-diffusion-v35-large`, `fal-ai/stable-diffusion-v35-medium`, `fal-ai/stable-diffusion-v3-medium`
575
+
576
+ **Other Image Models:**
577
+ | Model | Description |
578
+ |---|---|
579
+ | `fal-ai/recraft/v3/text-to-image` | Artistic generation |
580
+ | `fal-ai/ideogram/v2`, `v2a`, `v3` | Ideogram series |
581
+ | `fal-ai/imagen3`, `fal-ai/imagen4/preview` | Google Imagen |
582
+ | `fal-ai/gpt-image-1` | GPT image generation |
583
+ | `fal-ai/gpt-image-1/edit-image` | GPT image editing |
584
+ | `fal-ai/reve/text-to-image` | Reve generation |
585
+ | `fal-ai/sana`, `fal-ai/sana/sprint` | Sana models |
586
+ | `fal-ai/pixart-sigma` | PixArt Sigma |
587
+ | `fal-ai/bria/text-to-image/base` | Bria AI |
588
+
589
+ **Pre-trained LoRA Styles:**
590
+ `fal-ai/flux-2-lora-gallery/sepia-vintage`, `virtual-tryon`, `satellite-view-style`, `realism`, `multiple-angles`, `hdr-style`, `face-to-full-portrait`, `digital-comic-art`, `ballpoint-pen-sketch`, `apartment-staging`, `add-background`
591
+
592
+ **Image Editing/Enhancement (30+ tools):**
593
+ `fal-ai/image-editing/age-progression`, `baby-version`, `background-change`, `hair-change`, `expression-change`, `object-removal`, `photo-restoration`, `style-transfer`, and many more.
594
+
595
+ #### Video Models (150+)
596
+
597
+ **Kling Video (20+ variants):**
598
+ | Model | Description |
599
+ |---|---|
600
+ | `fal-ai/kling-video/v2/master/text-to-video` | Default text-to-video |
601
+ | `fal-ai/kling-video/v2/master/image-to-video` | Image-to-video |
602
+ | `fal-ai/kling-video/v2.5-turbo/pro/text-to-video` | Turbo pro |
603
+ | `fal-ai/kling-video/o1/image-to-video` | O1 quality |
604
+ | `fal-ai/kling-video/o1/video-to-video/edit` | Video editing |
605
+ | `fal-ai/kling-video/lipsync/audio-to-video` | Lip sync |
606
+ | `fal-ai/kling-video/video-to-audio` | Audio extraction |
607
+
608
+ **Sora 2 (OpenAI):**
609
+ | Model | Description |
610
+ |---|---|
611
+ | `fal-ai/sora-2/text-to-video` | Text-to-video |
612
+ | `fal-ai/sora-2/text-to-video/pro` | Pro quality |
613
+ | `fal-ai/sora-2/image-to-video` | Image-to-video |
614
+ | `fal-ai/sora-2/video-to-video/remix` | Video remixing |
615
+
616
+ **VEO 3 (Google):**
617
+ | Model | Description |
618
+ |---|---|
619
+ | `fal-ai/veo3` | VEO 3 standard |
620
+ | `fal-ai/veo3/fast` | Fast variant |
621
+ | `fal-ai/veo3/image-to-video` | Image-to-video |
622
+ | `fal-ai/veo3.1` | Latest version |
623
+ | `fal-ai/veo3.1/reference-to-video` | Reference-guided |
624
+ | `fal-ai/veo3.1/first-last-frame-to-video` | Frame interpolation |
625
+
626
+ **WAN (15+ variants):**
627
+ `fal-ai/wan-pro/text-to-video`, `fal-ai/wan-pro/image-to-video`, `fal-ai/wan/v2.2-a14b/text-to-video`, `fal-ai/wan-vace-14b/depth`, `fal-ai/wan-vace-14b/inpainting`, `fal-ai/wan-vace-14b/pose`, `fal-ai/wan-effects`
628
+
629
+ **Pixverse (20+ variants):**
630
+ `fal-ai/pixverse/v5.5/text-to-video`, `fal-ai/pixverse/v5.5/image-to-video`, `fal-ai/pixverse/v5.5/effects`, `fal-ai/pixverse/lipsync`, `fal-ai/pixverse/sound-effects`
631
+
632
+ **Minimax / Hailuo:**
633
+ `fal-ai/minimax/hailuo-2.3/text-to-video/pro`, `fal-ai/minimax/hailuo-2.3/image-to-video/pro`, `fal-ai/minimax/video-01-director`, `fal-ai/minimax/video-01-live`
634
+
635
+ **Other Video Models:**
636
+ | Provider | Models |
637
+ |---|---|
638
+ | Hunyuan | `fal-ai/hunyuan-video/text-to-video`, `image-to-video`, `video-to-video`, `foley` |
639
+ | Pika | `fal-ai/pika/v2.2/text-to-video`, `pikascenes`, `pikaffects` |
640
+ | LTX | `fal-ai/ltx-2/text-to-video`, `image-to-video`, `retake-video` |
641
+ | Luma | `fal-ai/luma-dream-machine/ray-2`, `ray-2-flash`, `luma-photon` |
642
+ | Vidu | `fal-ai/vidu/q2/text-to-video`, `image-to-video/pro` |
643
+ | CogVideoX | `fal-ai/cogvideox-5b/text-to-video`, `video-to-video` |
644
+ | Seedance | `fal-ai/bytedance/seedance/v1/text-to-video`, `image-to-video` |
645
+ | Magi | `fal-ai/magi/text-to-video`, `extend-video` |
646
+
647
+ #### TTS / Speech Models (50+)
648
+
649
+ **Kokoro (9 languages, 20+ voices per language):**
650
+ | Model | Language | Example Voices |
651
+ |---|---|---|
652
+ | `fal-ai/kokoro/american-english` | English (US) | af_heart, af_alloy, af_bella, af_nova, am_adam, am_echo, am_onyx |
653
+ | `fal-ai/kokoro/british-english` | English (UK) | British voice set |
654
+ | `fal-ai/kokoro/french` | French | French voice set |
655
+ | `fal-ai/kokoro/japanese` | Japanese | Japanese voice set |
656
+ | `fal-ai/kokoro/spanish` | Spanish | Spanish voice set |
657
+ | `fal-ai/kokoro/mandarin-chinese` | Chinese | Mandarin voice set |
658
+ | `fal-ai/kokoro/italian` | Italian | Italian voice set |
659
+ | `fal-ai/kokoro/hindi` | Hindi | Hindi voice set |
660
+ | `fal-ai/kokoro/brazilian-portuguese` | Portuguese | Portuguese voice set |
661
+
662
+ **ElevenLabs:**
663
+ | Model | Description |
664
+ |---|---|
665
+ | `fal-ai/elevenlabs/tts/eleven-v3` | Professional quality |
666
+ | `fal-ai/elevenlabs/tts/turbo-v2.5` | Faster inference |
667
+ | `fal-ai/elevenlabs/tts/multilingual-v2` | Multi-language |
668
+ | `fal-ai/elevenlabs/text-to-dialogue/eleven-v3` | Dialogue generation |
669
+ | `fal-ai/elevenlabs/sound-effects/v2` | Sound effects |
670
+ | `fal-ai/elevenlabs/speech-to-text` | Transcription |
671
+ | `fal-ai/elevenlabs/audio-isolation` | Background noise removal (voice isolation) |
672
+
673
+ **Other TTS:**
674
+ `fal-ai/f5-tts` (voice cloning), `fal-ai/dia-tts`, `fal-ai/minimax/speech-2.6-turbo`, `fal-ai/minimax/speech-2.6-hd`, `fal-ai/chatterbox/text-to-speech`, `fal-ai/index-tts-2/text-to-speech`
675
+
676
+ #### FAL Client Capabilities
677
+
678
+ The `@fal-ai/client` library provides additional features beyond what Noosphere surfaces:
679
+
680
+ - **Queue API** — Submit jobs, poll status, get results, cancel. Supports webhooks and priority levels
681
+ - **Streaming API** — Real-time streaming responses via async iterators
682
+ - **Realtime API** — WebSocket connections for interactive use (e.g., real-time image generation)
683
+ - **Storage API** — File upload with configurable TTL (1h, 1d, 7d, 30d, 1y, never)
684
+ - **Retry logic** — Configurable retries with exponential backoff and jitter
685
+ - **Request middleware** — Custom request interceptors and proxy support
686
+
687
+ ---
688
+
689
+ ### Hugging Face — Open Source AI (30+ tasks)
690
+
691
+ **Provider ID:** `huggingface`
692
+ **Modalities:** LLM, Image, TTS
693
+ **Library:** `@huggingface/inference`
694
+
695
+ Access to the entire Hugging Face Hub ecosystem. Any model hosted on Hugging Face can be used by passing its ID directly.
696
+
697
+ #### Default Models
698
+
699
+ | Modality | Default Model | Description |
700
+ |---|---|---|
701
+ | LLM | `meta-llama/Llama-3.1-8B-Instruct` | Llama 3.1 8B |
702
+ | Image | `stabilityai/stable-diffusion-xl-base-1.0` | SDXL Base |
703
+ | TTS | `facebook/mms-tts-eng` | MMS TTS English |
704
+
705
+ Any HuggingFace model ID works — just pass it as the `model` parameter:
706
+
707
+ ```typescript
708
+ await ai.chat({
709
+ provider: 'huggingface',
710
+ model: 'mistralai/Mixtral-8x7B-Instruct-v0.1',
711
+ messages: [{ role: 'user', content: 'Hello' }],
712
+ });
713
+ ```
714
+
715
+ #### Full Library Capabilities
716
+
717
+ The `@huggingface/inference` library (v3.15.0) provides 30+ AI tasks, including capabilities not yet surfaced by Noosphere:
718
+
719
+ **Natural Language Processing:**
720
+ | Task | Method | Description |
721
+ |---|---|---|
722
+ | Chat | `chatCompletion()` | OpenAI-compatible chat completions |
723
+ | Chat Streaming | `chatCompletionStream()` | Token-by-token streaming |
724
+ | Text Generation | `textGeneration()` | Raw text completion |
725
+ | Summarization | `summarization()` | Text summarization |
726
+ | Translation | `translation()` | Language translation |
727
+ | Question Answering | `questionAnswering()` | Extract answers from context |
728
+ | Text Classification | `textClassification()` | Sentiment, topic classification |
729
+ | Zero-Shot Classification | `zeroShotClassification()` | Classify without training |
730
+ | Token Classification | `tokenClassification()` | NER, POS tagging |
731
+ | Sentence Similarity | `sentenceSimilarity()` | Semantic similarity scores |
732
+ | Feature Extraction | `featureExtraction()` | Text embeddings |
733
+ | Fill Mask | `fillMask()` | Fill in masked tokens |
734
+ | Table QA | `tableQuestionAnswering()` | Answer questions about tables |
735
+
736
+ **Computer Vision:**
737
+ | Task | Method | Description |
738
+ |---|---|---|
739
+ | Text-to-Image | `textToImage()` | Generate images from text |
740
+ | Image-to-Image | `imageToImage()` | Transform/edit images |
741
+ | Image Captioning | `imageToText()` | Describe images |
742
+ | Classification | `imageClassification()` | Classify image content |
743
+ | Object Detection | `objectDetection()` | Detect and locate objects |
744
+ | Segmentation | `imageSegmentation()` | Pixel-level segmentation |
745
+ | Zero-Shot Image | `zeroShotImageClassification()` | Classify without training |
746
+ | Text-to-Video | `textToVideo()` | Generate videos |
747
+
748
+ **Audio:**
749
+ | Task | Method | Description |
750
+ |---|---|---|
751
+ | Text-to-Speech | `textToSpeech()` | Generate speech |
752
+ | Speech-to-Text | `automaticSpeechRecognition()` | Transcription |
753
+ | Audio Classification | `audioClassification()` | Classify sounds |
754
+ | Audio-to-Audio | `audioToAudio()` | Source separation, enhancement |
755
+
756
+ **Multimodal:**
757
+ | Task | Method | Description |
758
+ |---|---|---|
759
+ | Visual QA | `visualQuestionAnswering()` | Answer questions about images |
760
+ | Document QA | `documentQuestionAnswering()` | Answer questions about documents |
761
+
762
+ **Tabular:**
763
+ | Task | Method | Description |
764
+ |---|---|---|
765
+ | Classification | `tabularClassification()` | Classify tabular data |
766
+ | Regression | `tabularRegression()` | Predict continuous values |
767
+
768
+ #### HuggingFace Agentic Features
769
+
770
+ - **Tool/Function Calling:** Full support via `tools` parameter with `tool_choice` control (auto/none/required)
771
+ - **JSON Schema Responses:** `response_format: { type: 'json_schema', json_schema: {...} }`
772
+ - **Reasoning:** `reasoning_effort` parameter (none/minimal/low/medium/high/xhigh)
773
+ - **Multimodal Input:** Images via `image_url` content chunks in chat messages
774
+ - **17 Inference Providers:** Route through Groq, Together, Fireworks, Replicate, Cerebras, Cohere, and more
775
+
776
+ ---
777
+
778
+ ### ComfyUI — Local Image Generation
779
+
780
+ **Provider ID:** `comfyui`
781
+ **Modalities:** Image, Video (planned)
782
+ **Type:** Local
783
+ **Default Port:** 8188
784
+
785
+ Connects to a local ComfyUI instance for Stable Diffusion workflows.
786
+
787
+ #### How It Works
788
+
789
+ 1. Clones a built-in txt2img workflow template (KSampler + SDXL pipeline)
790
+ 2. Injects your parameters (prompt, dimensions, seed, steps, guidance)
791
+ 3. POSTs the workflow to ComfyUI's `/prompt` endpoint
792
+ 4. Polls `/history/{promptId}` every second until completion (max 5 minutes)
793
+ 5. Fetches the generated image from `/view`
794
+ 6. Returns a PNG buffer
795
+
796
+ #### Configuration
797
+
798
+ ```typescript
799
+ const ai = new Noosphere({
800
+ local: {
801
+ comfyui: {
802
+ enabled: true,
803
+ host: 'http://localhost',
804
+ port: 8188,
805
+ },
806
+ },
807
+ });
808
+ ```
809
+
810
+ #### Default Workflow
811
+
812
+ - **Checkpoint:** `sd_xl_base_1.0.safetensors`
813
+ - **Sampler:** euler with normal scheduler
814
+ - **Default Steps:** 20
815
+ - **Default CFG/Guidance:** 7
816
+ - **Default Size:** 1024x1024
817
+ - **Max Size:** 2048x2048
818
+ - **Output:** PNG
819
+
820
+ #### Models Exposed
821
+
822
+ | Model ID | Modality | Description |
823
+ |---|---|---|
824
+ | `comfyui-txt2img` | Image | Text-to-image via workflow |
825
+ | `comfyui-txt2vid` | Video | Planned (requires AnimateDiff workflow) |
826
+
827
+ ---
828
+
829
+ ### Local TTS — Piper & Kokoro
830
+
831
+ **Provider IDs:** `piper`, `kokoro`
832
+ **Modality:** TTS
833
+ **Type:** Local
834
+
835
+ Connects to local OpenAI-compatible TTS servers.
836
+
837
+ #### Supported Engines
838
+
839
+ | Engine | Default Port | Health Check | Voice Discovery |
840
+ |---|---|---|---|
841
+ | Piper | 5500 | `GET /health` | `GET /voices` |
842
+ | Kokoro | 5501 | `GET /health` | `GET /v1/models` (fallback) |
843
+
844
+ #### API
845
+
846
+ Uses the OpenAI-compatible TTS endpoint:
847
+
848
+ ```
849
+ POST /v1/audio/speech
850
+ {
851
+ "model": "tts-1",
852
+ "input": "Hello world",
853
+ "voice": "default",
854
+ "speed": 1.0,
855
+ "response_format": "mp3"
856
+ }
857
+ ```
858
+
859
+ Supports `mp3`, `wav`, and `ogg` formats. Returns audio as a Buffer.
860
+
861
+ ---
862
+
863
+ ## Architecture
864
+
865
+ ### Provider Resolution (Local-First)
866
+
867
+ When you call a generation method without specifying a provider, Noosphere resolves one automatically:
868
+
869
+ 1. If `model` is specified without `provider` → looks up the model in the registry cache and uses the provider it maps to
870
+ 2. If a `default` is configured for the modality → uses that
871
+ 3. Otherwise → **local providers first**, then cloud providers
872
+
873
+ ```
874
+ resolveProvider(modality):
875
+ 1. Check user-specified provider ID → return if found
876
+ 2. Check configured defaults → return if found
877
+ 3. Scan all providers:
878
+ → Return first LOCAL provider supporting this modality
879
+ → Fall back to first CLOUD provider
880
+ 4. Throw NO_PROVIDER error
881
+ ```
882
+
883
+ ### Retry & Failover Logic
884
+
885
+ ```
886
+ executeWithRetry(modality, provider, fn):
887
+ for attempt = 0..maxRetries:
888
+ try: return fn()
889
+ catch:
890
+ if error is retryable AND attempts remain:
891
+ wait backoffMs * 2^attempt (exponential backoff)
892
+ retry same provider
893
+ if error is failover-eligible (retryable, but NOT GENERATION_FAILED) AND failover enabled:
894
+ try each alternative provider for this modality
895
+ throw last error
896
+ ```
897
+
898
+ **Retryable errors (same provider):** `PROVIDER_UNAVAILABLE`, `RATE_LIMITED`, `TIMEOUT`, `GENERATION_FAILED`
899
+
900
+ **Failover-eligible errors (cross-provider):** `PROVIDER_UNAVAILABLE`, `RATE_LIMITED`, `TIMEOUT` (NOT `GENERATION_FAILED`)
901
+
902
+ ### Model Registry & Caching
903
+
904
+ - Models are fetched from providers via `listModels()` and cached in memory
905
+ - Cache TTL is configurable (default: 60 minutes)
906
+ - `syncModels()` forces a refresh of all provider model lists
907
+ - Registry tracks model → provider mappings for fast resolution
908
+
909
+ ### Usage Tracking
910
+
911
+ Every API call (success or failure) records a `UsageEvent`:
912
+
913
+ ```typescript
914
+ interface UsageEvent {
915
+ modality: 'llm' | 'image' | 'video' | 'tts';
916
+ provider: string;
917
+ model: string;
918
+ cost: number; // USD
919
+ latencyMs: number;
920
+ input?: number; // tokens or characters
921
+ output?: number; // tokens
922
+ unit?: string;
923
+ timestamp: string; // ISO 8601
924
+ success: boolean;
925
+ error?: string; // error message if failed
926
+ metadata?: Record<string, unknown>;
927
+ }
928
+ ```
929
+
930
+ ---
931
+
932
+ ## Error Handling
933
+
934
+ All errors are instances of `NoosphereError`:
935
+
936
+ ```typescript
937
+ import { NoosphereError } from 'noosphere';
938
+
939
+ try {
940
+ await ai.chat({ messages: [{ role: 'user', content: 'Hello' }] });
941
+ } catch (err) {
942
+ if (err instanceof NoosphereError) {
943
+ console.log(err.code); // error code
944
+ console.log(err.provider); // which provider failed
945
+ console.log(err.modality); // which modality
946
+ console.log(err.model); // which model (if known)
947
+ console.log(err.cause); // underlying error
948
+ console.log(err.isRetryable()); // whether retry might help
949
+ }
950
+ }
951
+ ```
952
+
953
+ ### Error Codes
954
+
955
+ | Code | Description | Retryable | Failover |
956
+ |---|---|---|---|
957
+ | `PROVIDER_UNAVAILABLE` | Provider is down or unreachable | Yes | Yes |
958
+ | `RATE_LIMITED` | API rate limit exceeded | Yes | Yes |
959
+ | `TIMEOUT` | Request exceeded timeout | Yes | Yes |
960
+ | `GENERATION_FAILED` | Generation error (bad prompt, model issue) | Yes | No |
961
+ | `AUTH_FAILED` | Invalid or missing API key | No | No |
962
+ | `MODEL_NOT_FOUND` | Requested model doesn't exist | No | No |
963
+ | `INVALID_INPUT` | Bad parameters or unsupported operation | No | No |
964
+ | `NO_PROVIDER` | No provider available for the requested modality | No | No |
965
+
966
+ ---
967
+
968
+ ## Custom Providers
969
+
970
+ Extend Noosphere with your own providers:
971
+
972
+ ```typescript
973
+ import type { NoosphereProvider, ModelInfo, ChatOptions, NoosphereResult, Modality } from 'noosphere';
974
+
975
+ const myProvider: NoosphereProvider = {
976
+ // Required properties
977
+ id: 'my-provider',
978
+ name: 'My Custom Provider',
979
+ modalities: ['llm', 'image'] as Modality[],
980
+ isLocal: false,
981
+
982
+ // Required methods
983
+ async ping() { return true; },
984
+ async listModels(modality?: Modality): Promise<ModelInfo[]> {
985
+ return [{
986
+ id: 'my-model',
987
+ provider: 'my-provider',
988
+ name: 'My Model',
989
+ modality: 'llm',
990
+ local: false,
991
+ cost: { price: 1.0, unit: 'per_1m_tokens' },
992
+ capabilities: {
993
+ contextWindow: 128000,
994
+ maxTokens: 4096,
995
+ supportsVision: false,
996
+ supportsStreaming: true,
997
+ },
998
+ }];
999
+ },
1000
+
1001
+ // Optional methods — implement per modality
1002
+ async chat(options: ChatOptions): Promise<NoosphereResult> {
1003
+ const start = Date.now();
1004
+ // ... your implementation
1005
+ return {
1006
+ content: 'Response text',
1007
+ provider: 'my-provider',
1008
+ model: 'my-model',
1009
+ modality: 'llm',
1010
+ latencyMs: Date.now() - start,
1011
+ usage: { cost: 0.001, input: 100, output: 50, unit: 'tokens' },
1012
+ };
1013
+ },
1014
+
1015
+ // stream?(options): NoosphereStream
1016
+ // image?(options): Promise<NoosphereResult>
1017
+ // video?(options): Promise<NoosphereResult>
1018
+ // speak?(options): Promise<NoosphereResult>
1019
+ // dispose?(): Promise<void>
1020
+ };
1021
+
1022
+ ai.registerProvider(myProvider);
1023
+ ```
1024
+
1025
+ ---
1026
+
1027
+ ## Provider Summary
1028
+
1029
+ | Provider | ID | Modalities | Type | Models | Library |
1030
+ |---|---|---|---|---|---|
1031
+ | Pi-AI Gateway | `pi-ai` | LLM | Cloud | 246+ | `@mariozechner/pi-ai` |
1032
+ | FAL.ai | `fal` | Image, Video, TTS | Cloud | 867+ | `@fal-ai/client` |
1033
+ | Hugging Face | `huggingface` | LLM, Image, TTS | Cloud | Unlimited (any HF model) | `@huggingface/inference` |
1034
+ | ComfyUI | `comfyui` | Image | Local | SDXL workflows | Direct HTTP |
1035
+ | Piper TTS | `piper` | TTS | Local | Piper voices | Direct HTTP |
1036
+ | Kokoro TTS | `kokoro` | TTS | Local | Kokoro voices | Direct HTTP |
1037
+
1038
+ ## Requirements
1039
+
1040
+ - Node.js >= 18.0.0
1041
+
1042
+ ## License
1043
+
1044
+ MIT