smart_prompt 0.4.4 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +10 -10
  3. data/README.cn.md +307 -64
  4. data/README.md +311 -64
  5. data/Rakefile +10 -1
  6. data/config/anthropic_config.yml +151 -0
  7. data/config/image_generation_config.yml +22 -0
  8. data/config/multimodal_config.yml +85 -0
  9. data/config/sensenova_config.yml +63 -0
  10. data/config/zhipu_config.yml +73 -0
  11. data/examples/anthropic_basic_chat.rb +143 -0
  12. data/examples/anthropic_example.rb +232 -0
  13. data/examples/anthropic_multimodal.rb +212 -0
  14. data/examples/anthropic_streaming.rb +312 -0
  15. data/examples/anthropic_tool_calling.rb +393 -0
  16. data/examples/automatic_cleanup_example.rb +109 -0
  17. data/examples/history_management_examples.rb +522 -0
  18. data/examples/image_generation_example.rb +130 -0
  19. data/examples/monitoring_example.rb +121 -0
  20. data/examples/multimodal_example.rb +63 -0
  21. data/examples/relevance_based_strategy_example.rb +87 -0
  22. data/examples/sensenova_example.rb +129 -0
  23. data/examples/stt_example.rb +287 -0
  24. data/examples/tts_example.rb +244 -0
  25. data/examples/video_generation_example.rb +189 -0
  26. data/examples/zhipu_example.rb +151 -0
  27. data/lib/smart_prompt/anthropic_adapter.rb +363 -281
  28. data/lib/smart_prompt/compression_engine.rb +201 -0
  29. data/lib/smart_prompt/context_strategy.rb +22 -0
  30. data/lib/smart_prompt/conversation.rb +81 -191
  31. data/lib/smart_prompt/engine.rb +36 -19
  32. data/lib/smart_prompt/history_manager.rb +596 -0
  33. data/lib/smart_prompt/hybrid_strategy.rb +222 -0
  34. data/lib/smart_prompt/image_generation_adapter.rb +297 -0
  35. data/lib/smart_prompt/lru_cache.rb +133 -0
  36. data/lib/smart_prompt/message.rb +57 -0
  37. data/lib/smart_prompt/multimodal_adapter.rb +277 -0
  38. data/lib/smart_prompt/openai_adapter.rb +1 -25
  39. data/lib/smart_prompt/persistence_layer.rb +197 -0
  40. data/lib/smart_prompt/relevance_based_strategy.rb +221 -0
  41. data/lib/smart_prompt/sensenova_adapter.rb +410 -0
  42. data/lib/smart_prompt/session.rb +140 -0
  43. data/lib/smart_prompt/sliding_window_strategy.rb +100 -0
  44. data/lib/smart_prompt/stt_adapter.rb +381 -0
  45. data/lib/smart_prompt/summary_based_strategy.rb +152 -0
  46. data/lib/smart_prompt/token_counter.rb +74 -0
  47. data/lib/smart_prompt/tts_adapter.rb +403 -0
  48. data/lib/smart_prompt/version.rb +1 -1
  49. data/lib/smart_prompt/video_generation_adapter.rb +330 -0
  50. data/lib/smart_prompt/worker.rb +25 -3
  51. data/lib/smart_prompt/zhipu_adapter.rb +616 -0
  52. data/lib/smart_prompt.rb +22 -2
  53. data/workers/history_management_examples.rb +407 -0
  54. data/workers/image_generation_workers.rb +119 -0
  55. data/workers/multimodal_workers.rb +110 -0
  56. data/workers/sensenova_workers.rb +62 -0
  57. data/workers/stt_workers.rb +195 -0
  58. data/workers/tts_workers.rb +388 -0
  59. data/workers/video_generation_workers.rb +264 -0
  60. data/workers/zhipu_workers.rb +113 -0
  61. metadata +84 -8
data/README.md CHANGED
@@ -11,13 +11,24 @@ SmartPrompt is a powerful Ruby gem that provides an elegant domain-specific lang
11
11
 
12
12
  ### Multi-LLM Support
13
13
  - **OpenAI API Compatible**: Full support for OpenAI GPT models and compatible APIs
14
- - **Llama.cpp Integration**: Direct integration with local Llama.cpp servers
14
+ - **Anthropic Claude**: Native support for Claude models with multimodal capabilities
15
+ - **SenseNova (商汤日日新)**: One adapter covers chat (商量), multimodal vision (图文多模态), Cupido embeddings (向量), and 秒画 text-to-image — see `examples/sensenova_example.rb`
16
+ - **智谱 AI (BigModel / GLM)**: One adapter covers all categories — chat (GLM-4), vision (GLM-4V), embeddings (embedding-3), text-to-image (CogView), text-to-video (CogVideoX), TTS (GLM-TTS), ASR (GLM-ASR) — see `examples/zhipu_example.rb`
17
+ - **Llama.cpp Integration**: Direct integration with local Llama.cpp servers
15
18
  - **Extensible Adapters**: Easy-to-extend adapter system for new LLM providers
16
19
  - **Unified Interface**: Same API regardless of the underlying LLM provider
17
20
 
21
+ ### Multimodal AI Capabilities
22
+ - **Vision Models**: Support for image understanding and analysis
23
+ - **Image Generation**: Create images from text prompts using diffusion models
24
+ - **Video Generation**: Generate videos from text or image prompts
25
+ - **Text-to-Speech**: Convert text to natural-sounding speech
26
+ - **Speech-to-Text**: Transcribe audio files to text with multi-language support
27
+
18
28
  ### Flexible Architecture
19
29
  - **Worker-based Tasks**: Define reusable workers for specific AI tasks
20
30
  - **Template System**: ERB-based prompt templates with parameter injection
31
+ - **Intelligent History Management**: Session isolation, automatic compression, and multiple context strategies
21
32
  - **Conversation Management**: Built-in conversation history and context management
22
33
  - **Streaming Support**: Real-time response streaming for better user experience
23
34
 
@@ -26,6 +37,8 @@ SmartPrompt is a powerful Ruby gem that provides an elegant domain-specific lang
26
37
  - **Retry Logic**: Robust error handling with configurable retry mechanisms
27
38
  - **Embeddings**: Text embedding generation for semantic search and RAG applications
28
39
  - **Configuration-driven**: YAML-based configuration for easy deployment management
40
+ - **Batch Processing**: Efficient processing of multiple files and tasks
41
+ - **Language Detection**: Automatic language identification from text and audio
29
42
 
30
43
  ### Production Ready
31
44
  - **Comprehensive Logging**: Detailed logging for debugging and monitoring
@@ -61,6 +74,7 @@ Create a YAML configuration file (`config/smart_prompt.yml`):
61
74
  # Adapter definitions
62
75
  adapters:
63
76
  openai: OpenAIAdapter
77
+ anthropic: AnthropicAdapter
64
78
  # LLM configurations
65
79
  llms:
66
80
  SiliconFlow:
@@ -68,40 +82,25 @@ llms:
68
82
  url: https://api.siliconflow.cn/v1/
69
83
  api_key: ENV["APIKey"]
70
84
  default_model: Qwen/Qwen2.5-7B-Instruct
71
- local:
85
+ claude:
86
+ adapter: anthropic
87
+ api_key: ENV["ANTHROPIC_API_KEY"]
88
+ model: claude-3-5-sonnet-20241022
89
+ temperature: 0.7
90
+ max_tokens: 4096
91
+ llamacpp:
72
92
  adapter: openai
73
93
  url: http://localhost:8080/
74
94
  ollama:
75
95
  adapter: openai
76
96
  url: http://localhost:11434/
77
97
  default_model: deepseek-r1
78
- gemma4_local:
79
- adapter: openai
80
- url: http://localhost:8000/v1
81
- api_key: dummy
82
- default_model: gemma-4-12B-it
83
- temperature: 1.0
84
- top_p: 0.95
85
- top_k: 64
86
98
  deepseek:
87
99
  adapter: openai
88
100
  url: https://api.deepseek.com
89
101
  api_key: ENV["DSKEY"]
90
102
  default_model: deepseek-reasoner
91
103
 
92
- # Model aliases
93
- models:
94
- local/qwen3.5:
95
- use: local
96
- model: qwen3.5
97
- deepseekv3.2:
98
- use: SiliconFlow
99
- model: Pro/deepseek-ai/DeepSeek-V3.2
100
- gemma4/12b:
101
- use: gemma4_local
102
- model: gemma-4-12B-it
103
- max_tokens: 1024
104
-
105
104
  # Default settings
106
105
  default_llm: SiliconFlow
107
106
  template_path: "./templates"
@@ -129,8 +128,9 @@ Create worker files in your `workers/` directory:
129
128
  **workers/chat_worker.rb**:
130
129
  ```ruby
131
130
  SmartPrompt.define_worker :chat_assistant do
132
- # Use a configured model alias
133
- use_model "deepseekv3.2"
131
+ # Use a specific LLM
132
+ use "SiliconFlow"
133
+ model "deepseek-ai/DeepSeek-V3"
134
134
  # Set system message
135
135
  sys_msg("You are a helpful AI assistant.", params)
136
136
  # Use template with parameters
@@ -182,26 +182,6 @@ engine.call_worker_by_stream(:streaming_chat, {
182
182
  end
183
183
  ```
184
184
 
185
- ### Gemma 4 12B Multimodal
186
-
187
- Gemma 4 12B can be connected through OpenAI-compatible local servers such as LiteRT-LM, LM Studio, Ollama, or llama.cpp. SmartPrompt places images before text and audio after text to match Gemma 4 multimodal best practices.
188
-
189
- ```ruby
190
- SmartPrompt.define_worker :gemma_multimodal_assistant do
191
- use_model "gemma4/12b"
192
- thinking params.fetch(:thinking, true)
193
- sys_msg("You are a precise local multimodal assistant.", params)
194
-
195
- image(params[:image], token_budget: params[:token_budget] || 280) if params[:image]
196
- video(params[:video], fps: 1, max_seconds: 60) if params[:video]
197
- audio(params[:audio]) if params[:audio]
198
- prompt(params[:message])
199
-
200
- request_options(response_format: { type: "json_object" }) if params[:json]
201
- send_msg
202
- end
203
- ```
204
-
205
185
  ### Tool Integration
206
186
 
207
187
  ```ruby
@@ -238,7 +218,10 @@ end
238
218
 
239
219
  ### Conversation History
240
220
 
221
+ SmartPrompt provides intelligent conversation history management with session isolation, automatic compression, and multiple context strategies.
222
+
241
223
  ```ruby
224
+ # Basic usage with automatic history management
242
225
  SmartPrompt.define_worker :conversational_chat do
243
226
  use "deepseek"
244
227
  model "deepseek-chat"
@@ -246,8 +229,38 @@ SmartPrompt.define_worker :conversational_chat do
246
229
  prompt(params[:message], with_history: true)
247
230
  send_msg
248
231
  end
232
+
233
+ # Advanced usage with explicit session management
234
+ SmartPrompt.define_worker :session_chat do
235
+ use "deepseek"
236
+ model "deepseek-chat"
237
+
238
+ # Use session_id for isolated conversations
239
+ session_id = params[:session_id] || "default"
240
+
241
+ # Configure session behavior
242
+ session_config = {
243
+ max_messages: 100,
244
+ max_tokens: 4000,
245
+ context_strategy: :sliding_window # or :relevance_based, :summary_based, :hybrid
246
+ }
247
+
248
+ sys_msg("You are a helpful assistant.", params)
249
+ prompt(params[:message], with_history: true)
250
+ params.merge!(session_id: session_id, session_config: session_config)
251
+ send_msg
252
+ end
249
253
  ```
250
254
 
255
+ **History Management Features:**
256
+ - **Session Isolation**: Each conversation has independent history
257
+ - **Context Strategies**: Choose from sliding window, relevance-based, summary-based, or hybrid
258
+ - **Automatic Compression**: Reduce token usage while preserving context
259
+ - **Persistence**: Save and restore conversations across restarts
260
+ - **Performance**: LRU caching and async I/O for optimal performance
261
+
262
+ See [History Management Guide](HISTORY_MANAGEMENT_GUIDE.md) for detailed documentation.
263
+
251
264
  ### Embeddings Generation
252
265
 
253
266
  ```ruby
@@ -265,6 +278,78 @@ embeddings = engine.call_worker(:text_embedder, {
265
278
  })
266
279
  ```
267
280
 
281
+ ### Multimodal AI Examples
282
+
283
+ #### Image Generation
284
+ ```ruby
285
+ # Generate image from text prompt (SiliconFlow /v1/images/generations)
286
+ result = engine.call_worker(:image_generator, {
287
+ prompt: "A beautiful sunset over mountains",
288
+ image_size: "1024x1024", # format "WIDTHxHEIGHT"; `size:` is accepted as an alias
289
+ batch_size: 1, # Kolors only; `n:` is accepted as an alias
290
+ negative_prompt: "blurry, low quality",
291
+ save_to_file: true,
292
+ output_dir: "./generated_images"
293
+ })
294
+
295
+ puts "Generated #{result[:images].size} image(s)"
296
+ puts "First image URL: #{result[:images].first[:url]}"
297
+ puts "Saved files: #{result[:saved_files]}"
298
+ ```
299
+
300
+ #### Video Generation
301
+ ```ruby
302
+ # Generate video from text prompt
303
+ result = engine.call_worker(:video_generator, {
304
+ prompt: "A cat playing with a ball of yarn",
305
+ duration: 5,
306
+ resolution: "720p",
307
+ save_to_file: true,
308
+ output_dir: "./generated_videos"
309
+ })
310
+
311
+ puts "Video generation started: #{result[:video_id]}"
312
+ puts "Check status with: engine.call_worker(:video_status, {video_id: '#{result[:video_id]}'})"
313
+ ```
314
+
315
+ #### Text-to-Speech
316
+ ```ruby
317
+ # Convert text to speech
318
+ result = engine.call_worker(:tts_synthesizer, {
319
+ text: "Welcome to SmartPrompt, your AI assistant",
320
+ voice: "alloy",
321
+ speed: 1.0,
322
+ save_to_file: true,
323
+ output_dir: "./generated_audio"
324
+ })
325
+
326
+ puts "Audio file created: #{result[:audio_file][:file_path]}"
327
+ ```
328
+
329
+ #### Speech-to-Text
330
+ ```ruby
331
+ # Transcribe audio to text
332
+ result = engine.call_worker(:stt_transcriber, {
333
+ audio_file: "./audio.wav",
334
+ language: "en",
335
+ response_format: "json"
336
+ })
337
+
338
+ puts "Transcribed text: #{result[:transcription][:text]}"
339
+ puts "Language: #{result[:transcription][:language]}"
340
+ ```
341
+
342
+ #### Vision Analysis
343
+ ```ruby
344
+ # Analyze image with vision model
345
+ result = engine.call_worker(:vision_analyzer, {
346
+ image_file: "./image.jpg",
347
+ prompt: "Describe what you see in this image"
348
+ })
349
+
350
+ puts "Analysis: #{result[:response]}"
351
+ ```
352
+
268
353
  ## 🏗️ Architecture Overview
269
354
 
270
355
  SmartPrompt follows a modular architecture:
@@ -282,6 +367,13 @@ SmartPrompt follows a modular architecture:
282
367
  │Workers│ │Conv.│ │Template│
283
368
  │ │ │Mgmt │ │ System │
284
369
  └───────┘ └─────┘ └────────┘
370
+
371
+ ┌────────┴────────┐
372
+ │ │
373
+ ┌───▼────────┐ ┌─────▼──────┐
374
+ │ History │ │Persistence │
375
+ │ Manager │ │ Layer │
376
+ └────────────┘ └────────────┘
285
377
  ```
286
378
 
287
379
  ### Core Components
@@ -289,8 +381,10 @@ SmartPrompt follows a modular architecture:
289
381
  - **Engine**: Central orchestrator managing configuration, adapters, and workers
290
382
  - **Workers**: Reusable task definitions with embedded business logic
291
383
  - **Conversation**: Context and message history management
292
- - **Adapters**: LLM provider integrations (OpenAI, Llama.cpp, etc.)
384
+ - **History Manager**: Intelligent conversation history with session isolation and context strategies
385
+ - **Adapters**: LLM provider integrations (OpenAI, Anthropic, Llama.cpp, etc.)
293
386
  - **Templates**: ERB-based prompt template system
387
+ - **Persistence Layer**: Save and restore conversation history across restarts
294
388
 
295
389
  ## 🔧 Configuration Reference
296
390
 
@@ -298,32 +392,178 @@ SmartPrompt follows a modular architecture:
298
392
 
299
393
  ```yaml
300
394
  adapters:
301
- openai: "OpenAIAdapter" # For OpenAI API
395
+ openai: "OpenAIAdapter" # For OpenAI API
396
+ anthropic: "AnthropicAdapter" # For Anthropic Claude API
397
+ sensenova: "SenseNovaAdapter" # For 商汤 SenseNova (chat/vision/embeddings/image)
398
+ zhipu: "ZhipuAIAdapter" # For 智谱 BigModel/GLM (chat/vision/embed/image/video/tts/asr)
399
+ multimodal: "MultimodalAdapter" # For vision models
400
+ image_generation: "ImageGenerationAdapter" # For image generation
401
+ video_generation: "VideoGenerationAdapter" # For video generation
402
+ tts: "TTSAdapter" # For text-to-speech
403
+ stt: "STTAdapter" # For speech-to-text
302
404
  ```
303
405
 
304
406
  ### LLM Configuration
305
407
 
306
408
  ```yaml
307
409
  llms:
308
- model_name:
309
- adapter: "adapter_name"
310
- api_key: "your_api_key" # Can use ENV['KEY_NAME']
311
- url: "https://api.url"
312
- model: "model_identifier"
410
+ # Text models
411
+ gpt:
412
+ adapter: "openai"
413
+ api_key: ENV["OPENAI_API_KEY"]
414
+ model: "gpt-4"
313
415
  temperature: 0.7
314
- # Additional provider-specific options
315
- ```
316
416
 
317
- ### Model Alias Configuration
417
+ # Anthropic Claude models
418
+ claude:
419
+ adapter: "anthropic"
420
+ api_key: ENV["ANTHROPIC_API_KEY"]
421
+ model: "claude-3-5-sonnet-20241022"
422
+ temperature: 0.7
423
+ max_tokens: 4096
318
424
 
319
- ```yaml
320
- models:
321
- model_alias:
322
- use: "llm_name"
323
- model: "model_identifier"
324
- ```
425
+ claude_opus:
426
+ adapter: "anthropic"
427
+ api_key: ENV["ANTHROPIC_API_KEY"]
428
+ model: "claude-3-opus-20240229"
429
+ temperature: 0.7
430
+ max_tokens: 4096
325
431
 
326
- In a worker, `use_model "model_alias"` is equivalent to calling `use "llm_name"` and `model "model_identifier"`.
432
+ claude_haiku:
433
+ adapter: "anthropic"
434
+ api_key: ENV["ANTHROPIC_API_KEY"]
435
+ model: "claude-3-5-haiku-20241022"
436
+ temperature: 0.7
437
+ max_tokens: 4096
438
+
439
+ # Custom Anthropic endpoint (for proxy or custom deployment)
440
+ claude_custom:
441
+ adapter: "anthropic"
442
+ api_key: ENV["ANTHROPIC_API_KEY"]
443
+ url: "https://your-custom-endpoint.com"
444
+ model: "claude-3-5-sonnet-20241022"
445
+ temperature: 0.7
446
+ max_tokens: 4096
447
+
448
+ # 商汤 SenseNova — one adapter covers all four model categories; just change `model`.
449
+ # Free-tier models run on token.sensenova.cn/v1; paid models (SenseChat-5,
450
+ # SenseNova-V6-*, Cupido) run on api.sensenova.cn/compatible-mode/v2 (returns 403 if your key lacks them).
451
+ sensechat: # 商量 文本对话 (free-tier)
452
+ adapter: "sensenova"
453
+ url: "https://token.sensenova.cn/v1"
454
+ api_key: ENV["SENSENOVA_API_KEY"]
455
+ model: "sensenova-6.7-flash-lite"
456
+ temperature: 0.7
457
+ # Optional SenseNova sampling extras (forwarded to /chat/completions):
458
+ # reasoning_effort: "medium"
459
+ # max_completion_tokens: 4096
460
+ # Paid: url https://api.sensenova.cn/compatible-mode/v2, model SenseChat-5
461
+
462
+ sensevision: # 商量 图文多模态 (flash-lite is natively multimodal)
463
+ adapter: "sensenova"
464
+ url: "https://token.sensenova.cn/v1"
465
+ api_key: ENV["SENSENOVA_API_KEY"]
466
+ model: "sensenova-6.7-flash-lite"
467
+ # Paid: url https://api.sensenova.cn/compatible-mode/v2, model SenseNova-V6-Pro
468
+
469
+ senseembedding: # Cupido 向量模型 (paid; native endpoint)
470
+ adapter: "sensenova"
471
+ url: "https://api.sensenova.cn/compatible-mode/v2"
472
+ embeddings_url: "https://api.sensenova.cn/v1/llm/embeddings"
473
+ api_key: ENV["SENSENOVA_API_KEY"]
474
+ model: "Cupido"
475
+
476
+ senseimage: # 秒画 文生图 (sensenova-u1-fast; token.sensenova.cn base)
477
+ adapter: "sensenova"
478
+ url: "https://token.sensenova.cn/v1"
479
+ image_url: "https://token.sensenova.cn/v1/images/generations"
480
+ api_key: ENV["SENSENOVA_API_KEY"]
481
+ model: "sensenova-u1-fast"
482
+ # sensenova-u1-fast only accepts specific sizes (default 2048x2048); see
483
+ # VALID_IMAGE_SIZES in sensenova_adapter.rb.
484
+
485
+ # 智谱 AI (BigModel/GLM) — one adapter covers all categories; just change `model`.
486
+ # Base https://open.bigmodel.cn/api/paas/v4 ; Bearer auth. Defaults use free-tier models.
487
+ glm: # 文本对话 (free glm-4-flash; paid glm-4-plus/glm-5.2)
488
+ adapter: "zhipu"
489
+ url: "https://open.bigmodel.cn/api/paas/v4"
490
+ api_key: ENV["ZHIPUAI_API_KEY"]
491
+ model: "glm-4-flash"
492
+ temperature: 0.7
493
+ # CodeGeeX-4: set `coding: true` and model: codegeex-4 (uses the coding base)
494
+
495
+ glm_vision: # 图文多模态 (free glm-4v-flash; paid glm-4v-plus)
496
+ adapter: "zhipu"
497
+ url: "https://open.bigmodel.cn/api/paas/v4"
498
+ api_key: ENV["ZHIPUAI_API_KEY"]
499
+ model: "glm-4v-flash"
500
+
501
+ embedding: # 向量模型 (embedding-3; custom dimensions 256/512/1024/2048)
502
+ adapter: "zhipu"
503
+ url: "https://open.bigmodel.cn/api/paas/v4"
504
+ api_key: ENV["ZHIPUAI_API_KEY"]
505
+ model: "embedding-3"
506
+ dimensions: 1024
507
+
508
+ cogview: # 文生图 (free cogview-3-flash; paid cogview-4/glm-image)
509
+ adapter: "zhipu"
510
+ url: "https://open.bigmodel.cn/api/paas/v4"
511
+ api_key: ENV["ZHIPUAI_API_KEY"]
512
+ model: "cogview-3-flash"
513
+
514
+ cogvideo: # 文生视频 (async submit->poll->download; free cogvideox-flash)
515
+ adapter: "zhipu"
516
+ url: "https://open.bigmodel.cn/api/paas/v4"
517
+ api_key: ENV["ZHIPUAI_API_KEY"]
518
+ model: "cogvideox-flash"
519
+
520
+ glm_tts: # 语音合成 (GLM-TTS)
521
+ adapter: "zhipu"
522
+ url: "https://open.bigmodel.cn/api/paas/v4"
523
+ api_key: ENV["ZHIPUAI_API_KEY"]
524
+ model: "glm-tts"
525
+
526
+ glm_asr: # 语音识别 (GLM-ASR-2512)
527
+ adapter: "zhipu"
528
+ url: "https://open.bigmodel.cn/api/paas/v4"
529
+ api_key: ENV["ZHIPUAI_API_KEY"]
530
+ model: "glm-asr-2512"
531
+
532
+ # Vision models
533
+ vision:
534
+ adapter: "multimodal"
535
+ url: "https://api.siliconflow.cn/v1/"
536
+ api_key: ENV["SILICONFLOW_API_KEY"]
537
+ model: "Qwen/Qwen2.5-VL-7B-Instruct"
538
+
539
+ # Image generation (Kolors supports batch_size/guidance_scale; see Qwen-Image for cfg)
540
+ image_gen:
541
+ adapter: "image_generation"
542
+ url: "https://api.siliconflow.cn/v1/"
543
+ api_key: ENV["SILICONFLOW_API_KEY"]
544
+ model: "Kwai-Kolors/Kolors"
545
+
546
+ # Video generation
547
+ video_gen:
548
+ adapter: "video_generation"
549
+ url: "https://api.siliconflow.cn/v1/"
550
+ api_key: ENV["SILICONFLOW_API_KEY"]
551
+ model: "Wan-AI/Wan2.2-T2V-A14B"
552
+
553
+ # Text-to-speech
554
+ tts_service:
555
+ adapter: "tts"
556
+ url: "https://api.siliconflow.cn/v1/"
557
+ api_key: ENV["SILICONFLOW_API_KEY"]
558
+ model: "FunAudioLLM/CosyVoice2-0.5B"
559
+
560
+ # Speech-to-text
561
+ stt_service:
562
+ adapter: "stt"
563
+ url: "https://api.siliconflow.cn/v1/"
564
+ api_key: ENV["SILICONFLOW_API_KEY"]
565
+ model: "FunAudioLLM/SenseVoiceSmall"
566
+ ```
327
567
 
328
568
  ### Path Configuration
329
569
 
@@ -398,20 +638,27 @@ end
398
638
  ## 🚀 Real-world Use Cases
399
639
 
400
640
  - **Chatbots and Conversational AI**: Build sophisticated chatbots with context awareness
401
- - **Content Generation**: Automated content creation with template-driven prompts
641
+ - **Content Generation**: Automated content creation with template-driven prompts
402
642
  - **Code Analysis**: AI-powered code review and documentation generation
403
643
  - **Customer Support**: Intelligent ticket routing and response suggestions
404
644
  - **Data Processing**: LLM-powered data extraction and transformation
405
645
  - **Educational Tools**: AI tutors and learning assistance systems
646
+ - **Multimedia Content Creation**: Generate images, videos, and audio content
647
+ - **Voice Interfaces**: Build voice-enabled applications with TTS and STT
648
+ - **Visual Analysis**: Image understanding and object detection applications
649
+ - **Accessibility Tools**: Audio descriptions and text-to-speech for visually impaired users
406
650
 
407
651
  ## 🛣️ Roadmap
408
652
 
653
+ - [x] **Multimodal AI Support** - Vision, Image Generation, Video Generation, TTS, STT
409
654
  - [ ] Additional LLM provider adapters (Anthropic Claude, Google PaLM)
410
655
  - [ ] Visual prompt builder and management interface
411
656
  - [ ] Enhanced caching and performance optimizations
412
657
  - [ ] Integration with vector databases for RAG applications
413
658
  - [ ] Built-in evaluation and testing framework for prompts
414
659
  - [ ] Distributed worker execution support
660
+ - [ ] Real-time audio/video streaming support
661
+ - [ ] Advanced multimodal prompt chaining
415
662
 
416
663
  ## 🤝 Contributing
417
664
 
@@ -442,4 +689,4 @@ This project is licensed under the MIT License - see the [LICENSE.txt](LICENSE.t
442
689
 
443
690
  ---
444
691
 
445
- **SmartPrompt** - Making LLM integration in Ruby applications simple, powerful, and elegant.
692
+ **SmartPrompt** - Making LLM integration in Ruby applications simple, powerful, and elegant.
data/Rakefile CHANGED
@@ -1,4 +1,13 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require "bundler/gem_tasks"
4
- task default: %i[]
4
+ require "rake/testtask"
5
+
6
+ Rake::TestTask.new(:test) do |t|
7
+ t.libs << "lib"
8
+ t.libs << "test"
9
+ t.test_files = FileList["test/**/*_test.rb"]
10
+ t.verbose = true
11
+ end
12
+
13
+ task default: :test
@@ -0,0 +1,151 @@
1
+ # Anthropic Configuration for SmartPrompt
2
+ # This configuration enables Anthropic Claude models
3
+
4
+ # Adapter definitions
5
+ adapters:
6
+ openai: "OpenAIAdapter"
7
+ anthropic: "AnthropicAdapter"
8
+
9
+ # LLM configurations
10
+ llms:
11
+ deepseek_anthropic:
12
+ adapter: anthropic
13
+ api_key: ENV["ANTHROPIC_AUTH_TOKEN"]
14
+ url: "https://api.deepseek.com/anthropic"
15
+ temperature: 0.7
16
+ max_tokens: 4096
17
+ deepseek:
18
+ adapter: openai
19
+ api_key: ENV["DSKEY"]
20
+ url: "https://api.deepseek.com"
21
+
22
+ # Path configurations
23
+ template_path: "./templates"
24
+ worker_path: "./workers"
25
+ logger_file: "./logs/smart_prompt.log"
26
+
27
+ # Advanced settings
28
+ advanced:
29
+ # Timeout settings (in seconds)
30
+ request_timeout: 240
31
+ connection_timeout: 30
32
+
33
+ # Retry settings
34
+ max_retries: 3
35
+ retry_delay: 2
36
+
37
+ # Rate limiting
38
+ requests_per_minute: 60
39
+
40
+ # History Management Configuration
41
+ # SmartPrompt provides intelligent conversation history management with session isolation,
42
+ # automatic compression, and multiple context strategies.
43
+ history:
44
+ # Cache Configuration
45
+ # Maximum number of sessions to keep in memory (LRU eviction)
46
+ cache_size: 100
47
+
48
+ # Default Session Configuration
49
+ # These settings apply to all sessions unless overridden
50
+ session_defaults:
51
+ max_messages: 100 # Maximum messages per session (older messages removed)
52
+ max_tokens: 4000 # Maximum tokens per session (enforced during context retrieval)
53
+ context_strategy: sliding_window # Default strategy: sliding_window, relevance_based, summary_based, hybrid
54
+ preserve_system_messages: true # Always keep system messages regardless of limits
55
+
56
+ # Context Strategy Configurations
57
+ # Each strategy has specific parameters for fine-tuning behavior
58
+ strategies:
59
+ # Sliding Window: Keep the most recent N messages
60
+ sliding_window:
61
+ window_size: 10 # Number of recent messages to keep
62
+ preserve_system: true # Always include system messages
63
+
64
+ # Relevance-Based: Select messages based on semantic similarity
65
+ relevance_based:
66
+ top_k: 10 # Number of most relevant messages to select
67
+ recency_weight: 0.3 # Weight for recency (0.0-1.0)
68
+ relevance_weight: 0.7 # Weight for relevance (0.0-1.0)
69
+ embedding_service: null # Optional: embedding service for semantic similarity
70
+
71
+ # Summary-Based: Automatically compress old messages into summaries
72
+ summary_based:
73
+ summary_threshold: 20 # Trigger summarization after this many messages
74
+ keep_recent: 5 # Number of recent messages to keep uncompressed
75
+ compression_ratio: 0.5 # Target compression ratio (0.0-1.0)
76
+
77
+ # Hybrid: Adaptively combine multiple strategies
78
+ hybrid:
79
+ mode: adaptive # Mode: 'adaptive' (auto-select) or 'combined' (merge results)
80
+ sliding_window: {} # Override sliding window config
81
+ relevance_based: {} # Override relevance-based config
82
+ summary_based: {} # Override summary-based config
83
+
84
+ # Compression Configuration
85
+ # Automatic summarization to reduce token usage
86
+ compression:
87
+ enabled: true # Enable automatic compression
88
+ auto_compress_threshold: 50 # Auto-compress when session exceeds this many messages
89
+ compression_ratio: 0.5 # Target compression ratio
90
+ llm_adapter: null # LLM to use for summarization (uses default if null)
91
+
92
+ # Persistence Configuration
93
+ # Save and restore conversation history across restarts
94
+ persistence:
95
+ enabled: true # Enable persistence to disk
96
+ backend: filesystem # Backend type: 'filesystem' (more backends coming soon)
97
+ storage_path: "./history_data" # Directory for storing session data
98
+ async: true # Use async writes for better performance
99
+
100
+ # Cleanup Configuration
101
+ # Automatic cleanup of old or expired sessions
102
+ cleanup:
103
+ auto_cleanup: false # Enable automatic cleanup thread
104
+ cleanup_interval: 3600 # Cleanup interval in seconds (1 hour)
105
+ session_ttl: 86400 # Session time-to-live in seconds (24 hours)
106
+ cleanup_callback: null # Optional: custom cleanup logic (Ruby proc)
107
+
108
+ # Monitoring Configuration
109
+ # Logging and metrics for debugging and monitoring
110
+ monitoring:
111
+ enabled: true # Enable monitoring and logging
112
+ log_level: info # Log level: debug, info, warn, error
113
+ metrics_format: prometheus # Metrics format: prometheus, json, hash
114
+
115
+ # Example Configurations for Different Use Cases:
116
+ #
117
+ # 1. High-Volume Chat Application (optimize for performance):
118
+ # cache_size: 1000
119
+ # session_defaults:
120
+ # max_messages: 50
121
+ # max_tokens: 2000
122
+ # context_strategy: sliding_window
123
+ # cleanup:
124
+ # auto_cleanup: true
125
+ # session_ttl: 3600 # 1 hour
126
+ #
127
+ # 2. Long-Running Conversations (optimize for context retention):
128
+ # session_defaults:
129
+ # max_messages: 500
130
+ # max_tokens: 16000
131
+ # context_strategy: summary_based
132
+ # compression:
133
+ # enabled: true
134
+ # auto_compress_threshold: 100
135
+ #
136
+ # 3. Semantic Search Application (optimize for relevance):
137
+ # session_defaults:
138
+ # context_strategy: relevance_based
139
+ # strategies:
140
+ # relevance_based:
141
+ # top_k: 20
142
+ # recency_weight: 0.2
143
+ # relevance_weight: 0.8
144
+ #
145
+ # 4. Development/Testing (disable persistence and cleanup):
146
+ # persistence:
147
+ # enabled: false
148
+ # cleanup:
149
+ # auto_cleanup: false
150
+ # monitoring:
151
+ # log_level: debug