abstractcore-2.4.2-py3-none-any.whl → abstractcore-2.4.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. abstractcore/apps/app_config_utils.py +19 -0
  2. abstractcore/apps/summarizer.py +85 -56
  3. abstractcore/architectures/detection.py +15 -4
  4. abstractcore/assets/architecture_formats.json +1 -1
  5. abstractcore/assets/model_capabilities.json +420 -11
  6. abstractcore/core/interface.py +2 -0
  7. abstractcore/core/session.py +4 -0
  8. abstractcore/embeddings/manager.py +54 -16
  9. abstractcore/media/__init__.py +116 -148
  10. abstractcore/media/auto_handler.py +363 -0
  11. abstractcore/media/base.py +456 -0
  12. abstractcore/media/capabilities.py +335 -0
  13. abstractcore/media/types.py +300 -0
  14. abstractcore/media/vision_fallback.py +260 -0
  15. abstractcore/providers/anthropic_provider.py +18 -1
  16. abstractcore/providers/base.py +187 -0
  17. abstractcore/providers/huggingface_provider.py +111 -12
  18. abstractcore/providers/lmstudio_provider.py +88 -5
  19. abstractcore/providers/mlx_provider.py +33 -1
  20. abstractcore/providers/ollama_provider.py +37 -3
  21. abstractcore/providers/openai_provider.py +18 -1
  22. abstractcore/server/app.py +1390 -104
  23. abstractcore/tools/common_tools.py +12 -8
  24. abstractcore/utils/__init__.py +9 -5
  25. abstractcore/utils/cli.py +199 -17
  26. abstractcore/utils/message_preprocessor.py +182 -0
  27. abstractcore/utils/structured_logging.py +117 -16
  28. abstractcore/utils/version.py +1 -1
  29. {abstractcore-2.4.2.dist-info → abstractcore-2.4.4.dist-info}/METADATA +214 -20
  30. {abstractcore-2.4.2.dist-info → abstractcore-2.4.4.dist-info}/RECORD +34 -27
  31. {abstractcore-2.4.2.dist-info → abstractcore-2.4.4.dist-info}/entry_points.txt +1 -0
  32. {abstractcore-2.4.2.dist-info → abstractcore-2.4.4.dist-info}/WHEEL +0 -0
  33. {abstractcore-2.4.2.dist-info → abstractcore-2.4.4.dist-info}/licenses/LICENSE +0 -0
  34. {abstractcore-2.4.2.dist-info → abstractcore-2.4.4.dist-info}/top_level.txt +0 -0
@@ -473,21 +473,23 @@ class HuggingFaceProvider(BaseProvider):
                  messages: Optional[List[Dict[str, str]]] = None,
                  system_prompt: Optional[str] = None,
                  tools: Optional[List[Dict[str, Any]]] = None,
+                 media: Optional[List['MediaContent']] = None,
                  stream: bool = False,
                  response_model: Optional[Type[BaseModel]] = None,
                  **kwargs) -> Union[GenerateResponse, Iterator[GenerateResponse]]:
         """Generate response using appropriate backend"""
 
         if self.model_type == "gguf":
-            return self._generate_gguf(prompt, messages, system_prompt, tools, stream, **kwargs)
+            return self._generate_gguf(prompt, messages, system_prompt, tools, media, stream, **kwargs)
         else:
-            return self._generate_transformers(prompt, messages, system_prompt, tools, stream, **kwargs)
+            return self._generate_transformers(prompt, messages, system_prompt, tools, media, stream, **kwargs)
 
     def _generate_transformers(self,
                                prompt: str,
                                messages: Optional[List[Dict[str, str]]] = None,
                                system_prompt: Optional[str] = None,
                                tools: Optional[List[Dict[str, Any]]] = None,
+                               media: Optional[List['MediaContent']] = None,
                                stream: bool = False,
                                **kwargs) -> Union[GenerateResponse, Iterator[GenerateResponse]]:
         """Generate using transformers backend (original implementation)"""
@@ -499,7 +501,37 @@ class HuggingFaceProvider(BaseProvider):
                 finish_reason="error"
             )
 
-        # Build input text with tool support
+        # Build input text with tool and media support
+        # Handle media content first if present
+        if media:
+            try:
+                from ..media.handlers import LocalMediaHandler
+                media_handler = LocalMediaHandler("huggingface", self.model_capabilities, model_name=self.model)
+
+                # Create multimodal message combining text and media
+                multimodal_message = media_handler.create_multimodal_message(prompt, media)
+
+                # For local providers, we get text-embedded content
+                if isinstance(multimodal_message, str):
+                    prompt = multimodal_message
+                else:
+                    # If we get a structured message, extract the content
+                    if isinstance(multimodal_message, dict) and "content" in multimodal_message:
+                        if isinstance(multimodal_message["content"], list):
+                            # Find text content in the structured message
+                            text_content = ""
+                            for item in multimodal_message["content"]:
+                                if item.get("type") == "text":
+                                    text_content = item.get("text", "")
+                                    break
+                            prompt = text_content or prompt
+                        else:
+                            prompt = str(multimodal_message["content"])
+            except ImportError:
+                self.logger.warning("Media processing not available. Install with: pip install abstractcore[media]")
+            except Exception as e:
+                self.logger.warning(f"Failed to process media content: {e}")
+
         input_text = self._build_input_text_transformers(prompt, messages, system_prompt, tools)
 
         # Generation parameters using unified system
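Note: the fallback above reduces a structured multimodal message to its first text part before prompt assembly. A standalone illustration of that extraction logic, using an invented sample message:

    multimodal_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image."},
            {"type": "image", "image": "file:///tmp/cat.jpg"},
        ],
    }
    prompt = "original prompt"

    content = multimodal_message["content"]
    if isinstance(content, list):
        # Keep only the first text item, mirroring the loop in the hunk above.
        text = next((item.get("text", "") for item in content if item.get("type") == "text"), "")
        prompt = text or prompt
    else:
        prompt = str(content)

    print(prompt)  # -> "Describe this image."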
@@ -532,6 +564,7 @@ class HuggingFaceProvider(BaseProvider):
                  messages: Optional[List[Dict[str, str]]] = None,
                  system_prompt: Optional[str] = None,
                  tools: Optional[List[Dict[str, Any]]] = None,
+                 media: Optional[List['MediaContent']] = None,
                  stream: bool = False,
                  **kwargs) -> Union[GenerateResponse, Iterator[GenerateResponse]]:
         """Generate using GGUF backend with llama-cpp-python"""
@@ -552,7 +585,64 @@ class HuggingFaceProvider(BaseProvider):
         if messages:
             chat_messages.extend(messages)
 
-        chat_messages.append({"role": "user", "content": prompt})
+        # Handle media content for the user message - use proper vision format for GGUF models
+        if media:
+            try:
+                from ..architectures.detection import supports_vision
+
+                # Check if this model supports vision natively
+                if supports_vision(self.model):
+                    # Use HuggingFace multimodal format for vision-capable GGUF models
+                    user_message_content = []
+
+                    # Add text content
+                    user_message_content.append({"type": "text", "text": prompt})
+
+                    # Add media content
+                    for media_item in media:
+                        if hasattr(media_item, 'file_path') and media_item.file_path:
+                            # Use file:// URL format as specified in HuggingFace docs
+                            file_path = str(media_item.file_path)
+                            if not file_path.startswith('file://'):
+                                file_path = f"file://{file_path}"
+                            user_message_content.append({
+                                "type": "image",
+                                "image": file_path
+                            })
+                        elif hasattr(media_item, 'content') and media_item.content:
+                            # For base64 or other content, we might need to save to temp file
+                            import tempfile
+                            import base64
+                            with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as tmp_file:
+                                if isinstance(media_item.content, str) and media_item.content.startswith('data:'):
+                                    # Handle base64 data URLs
+                                    header, data = media_item.content.split(',', 1)
+                                    decoded_data = base64.b64decode(data)
+                                    tmp_file.write(decoded_data)
+                                else:
+                                    tmp_file.write(media_item.content)
+                                tmp_file.flush()
+                                user_message_content.append({
+                                    "type": "image",
+                                    "image": f"file://{tmp_file.name}"
+                                })
+                else:
+                    # Fallback to text-based media handling for non-vision models
+                    from ..media.handlers import LocalMediaHandler
+                    media_handler = LocalMediaHandler("huggingface", self.model_capabilities, model_name=self.model)
+                    multimodal_message = media_handler.create_multimodal_message(prompt, media)
+                    user_message_content = multimodal_message if isinstance(multimodal_message, str) else prompt
+
+            except ImportError:
+                self.logger.warning("Media processing not available. Install with: pip install abstractcore[media]")
+                user_message_content = prompt
+            except Exception as e:
+                self.logger.warning(f"Failed to process media content: {e}")
+                user_message_content = prompt
+        else:
+            user_message_content = prompt
+
+        chat_messages.append({"role": "user", "content": user_message_content})
 
         # Prepare parameters using unified system
         unified_kwargs = self._prepare_generation_kwargs(**kwargs)
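Note: for a vision-capable GGUF model, the branch above builds the user turn as a list of typed parts with file:// image references. Roughly what chat_messages ends up holding (values illustrative):

    chat_messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What is in this picture?"},
                {"type": "image", "image": "file:///home/user/picture.png"},
            ],
        },
    ]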
@@ -774,19 +864,14 @@ class HuggingFaceProvider(BaseProvider):
         if outputs and len(outputs) > 0:
             response_text = outputs[0]['generated_text'].strip()
 
-            # Calculate token usage
-            input_tokens = len(self.tokenizer.encode(input_text))
-            output_tokens = len(self.tokenizer.encode(response_text))
+            # Calculate token usage using centralized utilities
+            usage = self._calculate_usage(input_text, response_text)
 
             return GenerateResponse(
                 content=response_text,
                 model=self.model,
                 finish_reason="stop",
-                usage={
-                    "prompt_tokens": input_tokens,
-                    "completion_tokens": output_tokens,
-                    "total_tokens": input_tokens + output_tokens
-                }
+                usage=usage
             )
         else:
             return GenerateResponse(
@@ -802,6 +887,20 @@ class HuggingFaceProvider(BaseProvider):
                 finish_reason="error"
             )
 
+    def _calculate_usage(self, prompt: str, response: str) -> Dict[str, int]:
+        """Calculate token usage using centralized token utilities."""
+        from ..utils.token_utils import TokenUtils
+
+        prompt_tokens = TokenUtils.estimate_tokens(prompt, self.model)
+        completion_tokens = TokenUtils.estimate_tokens(response, self.model)
+        total_tokens = prompt_tokens + completion_tokens
+
+        return {
+            "prompt_tokens": prompt_tokens,
+            "completion_tokens": completion_tokens,
+            "total_tokens": total_tokens
+        }
+
     def _stream_generate_transformers(self, input_text: str, max_new_tokens: int,
                                       temperature: float, top_p: float, tool_call_tags: Optional[str] = None) -> Iterator[GenerateResponse]:
         """Stream response using transformers (simulated, original implementation) with tool tag rewriting support"""
@@ -99,6 +99,7 @@ class LMStudioProvider(BaseProvider):
                  messages: Optional[List[Dict[str, str]]] = None,
                  system_prompt: Optional[str] = None,
                  tools: Optional[List[Dict[str, Any]]] = None,
+                 media: Optional[List['MediaContent']] = None,
                  stream: bool = False,
                  response_model: Optional[Type[BaseModel]] = None,
                  execute_tools: Optional[bool] = None,
@@ -129,11 +130,63 @@ class LMStudioProvider(BaseProvider):
         if messages:
             chat_messages.extend(messages)
 
-        # Add current prompt
-        chat_messages.append({
-            "role": "user",
-            "content": prompt
-        })
+        # Handle media content regardless of prompt (media can be used with messages too)
+        if media:
+            # Get the last user message content to combine with media
+            user_message_text = prompt.strip() if prompt else ""
+            if not user_message_text and chat_messages:
+                # If no prompt, try to get text from the last user message
+                for msg in reversed(chat_messages):
+                    if msg.get("role") == "user" and msg.get("content"):
+                        user_message_text = msg["content"]
+                        break
+            try:
+                # CRITICAL FIX: Process media files into MediaContent objects first
+                processed_media = self._process_media_content(media)
+
+                # Use capability-based media handler selection
+                media_handler = self._get_media_handler_for_model(self.model)
+
+                # Create multimodal message combining text and processed media
+                multimodal_message = media_handler.create_multimodal_message(user_message_text, processed_media)
+
+                # For LMStudio (OpenAI-compatible), we might get a string (embedded text) or dict (structured)
+                if isinstance(multimodal_message, str):
+                    # Replace the last user message with the multimodal message, or add new one
+                    if chat_messages and chat_messages[-1].get("role") == "user":
+                        chat_messages[-1]["content"] = multimodal_message
+                    else:
+                        chat_messages.append({
+                            "role": "user",
+                            "content": multimodal_message
+                        })
+                else:
+                    if chat_messages and chat_messages[-1].get("role") == "user":
+                        # Replace last user message with structured multimodal message
+                        chat_messages[-1] = multimodal_message
+                    else:
+                        chat_messages.append(multimodal_message)
+            except ImportError:
+                self.logger.warning("Media processing not available. Install with: pip install abstractcore[media]")
+                if user_message_text:
+                    chat_messages.append({
+                        "role": "user",
+                        "content": user_message_text
+                    })
+            except Exception as e:
+                self.logger.warning(f"Failed to process media content: {e}")
+                if user_message_text:
+                    chat_messages.append({
+                        "role": "user",
+                        "content": user_message_text
+                    })
+
+        # Add prompt as separate message if provided (for backward compatibility)
+        elif prompt and prompt.strip():
+            chat_messages.append({
+                "role": "user",
+                "content": prompt
+            })
 
         # Build request payload using unified system
         generation_kwargs = self._prepare_generation_kwargs(**kwargs)
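Note: when media is present, the LMStudio path above either rewrites the trailing user turn or appends a new one, depending on what the handler returned. The replace-or-append rule in isolation (the embedded-text format shown is assumed, not read from this diff):

    chat_messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is in this image?"},
    ]
    multimodal_message = "What is in this image?\n[image: photo.jpg]"  # handler output format assumed

    if chat_messages and chat_messages[-1].get("role") == "user":
        chat_messages[-1]["content"] = multimodal_message      # reuse the trailing user turn
    else:
        chat_messages.append({"role": "user", "content": multimodal_message})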
@@ -313,7 +366,37 @@ class LMStudioProvider(BaseProvider):
         except Exception:
             pass # Best effort - don't fail the operation
 
+    def _normalize_model_name(self, model_name: str) -> str:
+        """Remove common provider prefixes from model name."""
+        for prefix in ["lmstudio/", "qwen/", "ollama/", "huggingface/"]:
+            if model_name.startswith(prefix):
+                model_name = model_name[len(prefix):]
+        return model_name
+
+    def _get_media_handler_for_model(self, model_name: str):
+        """Get appropriate media handler based on model vision capabilities."""
+        from ..media.handlers import OpenAIMediaHandler, LocalMediaHandler
+
+        # Normalize model name by removing provider prefixes
+        clean_model_name = self._normalize_model_name(model_name)
+
+        # Determine if model supports vision
+        try:
+            from ..architectures.detection import supports_vision
+            use_vision_handler = supports_vision(clean_model_name)
+        except Exception as e:
+            self.logger.debug(f"Vision detection failed: {e}, defaulting to LocalMediaHandler")
+            use_vision_handler = False
+
+        # Create appropriate handler
+        if use_vision_handler:
+            handler = OpenAIMediaHandler(self.model_capabilities, model_name=model_name)
+            self.logger.debug(f"Using OpenAIMediaHandler for vision model: {clean_model_name}")
+        else:
+            handler = LocalMediaHandler("lmstudio", self.model_capabilities, model_name=model_name)
+            self.logger.debug(f"Using LocalMediaHandler for model: {clean_model_name}")
 
+        return handler
 
     def list_available_models(self, **kwargs) -> List[str]:
         """List available models from LMStudio server."""
@@ -139,6 +139,7 @@ class MLXProvider(BaseProvider):
                  messages: Optional[List[Dict[str, str]]] = None,
                  system_prompt: Optional[str] = None,
                  tools: Optional[List[Dict[str, Any]]] = None,
+                 media: Optional[List['MediaContent']] = None,
                  stream: bool = False,
                  response_model: Optional[Type[BaseModel]] = None,
                  **kwargs) -> Union[GenerateResponse, Iterator[GenerateResponse]]:
@@ -151,8 +152,39 @@ class MLXProvider(BaseProvider):
                 finish_reason="error"
             )
 
+        # Handle media content first if present
+        processed_prompt = prompt
+        if media:
+            try:
+                from ..media.handlers import LocalMediaHandler
+                media_handler = LocalMediaHandler("mlx", self.model_capabilities, model_name=self.model)
+
+                # Create multimodal message combining text and media
+                multimodal_message = media_handler.create_multimodal_message(prompt, media)
+
+                # For MLX (local provider), we get text-embedded content
+                if isinstance(multimodal_message, str):
+                    processed_prompt = multimodal_message
+                else:
+                    # If we get a structured message, extract the content
+                    if isinstance(multimodal_message, dict) and "content" in multimodal_message:
+                        if isinstance(multimodal_message["content"], list):
+                            # Find text content in the structured message
+                            text_content = ""
+                            for item in multimodal_message["content"]:
+                                if item.get("type") == "text":
+                                    text_content = item.get("text", "")
+                                    break
+                            processed_prompt = text_content or prompt
+                        else:
+                            processed_prompt = str(multimodal_message["content"])
+            except ImportError:
+                self.logger.warning("Media processing not available. Install with: pip install abstractcore[media]")
+            except Exception as e:
+                self.logger.warning(f"Failed to process media content: {e}")
+
         # Build full prompt with tool support
-        full_prompt = self._build_prompt(prompt, messages, system_prompt, tools)
+        full_prompt = self._build_prompt(processed_prompt, messages, system_prompt, tools)
 
         # MLX generation parameters using unified system
         generation_kwargs = self._prepare_generation_kwargs(**kwargs)
@@ -109,6 +109,7 @@ class OllamaProvider(BaseProvider):
                  messages: Optional[List[Dict[str, str]]] = None,
                  system_prompt: Optional[str] = None,
                  tools: Optional[List[Dict[str, Any]]] = None,
+                 media: Optional[List['MediaContent']] = None,
                  stream: bool = False,
                  response_model: Optional[Type[BaseModel]] = None,
                  **kwargs) -> Union[GenerateResponse, Iterator[GenerateResponse]]:
@@ -160,9 +161,42 @@ class OllamaProvider(BaseProvider):
             converted_messages = self._convert_messages_for_ollama(messages)
             payload["messages"].extend(converted_messages)
 
-        # Add current prompt as user message (only if non-empty)
-        # When using messages array, prompt should be empty or already in messages
-        if prompt and prompt.strip():
+        # Handle media content regardless of prompt (media can be used with messages too)
+        if media:
+            # Get the text to combine with media
+            user_message_text = prompt.strip() if prompt else ""
+            try:
+                from ..media.handlers import LocalMediaHandler
+                media_handler = LocalMediaHandler("ollama", self.model_capabilities, model_name=self.model)
+
+                # Create multimodal message combining text and media
+                multimodal_message = media_handler.create_multimodal_message(user_message_text, media)
+
+                # For local providers, we might get a string (embedded text) or dict (structured)
+                if isinstance(multimodal_message, str):
+                    payload["messages"].append({
+                        "role": "user",
+                        "content": multimodal_message
+                    })
+                else:
+                    payload["messages"].append(multimodal_message)
+            except ImportError:
+                self.logger.warning("Media processing not available. Install with: pip install abstractcore[media]")
+                if user_message_text:
+                    payload["messages"].append({
+                        "role": "user",
+                        "content": user_message_text
+                    })
+            except Exception as e:
+                self.logger.warning(f"Failed to process media content: {e}")
+                if user_message_text:
+                    payload["messages"].append({
+                        "role": "user",
+                        "content": user_message_text
+                    })
+
+        # Add prompt as separate message if provided (for backward compatibility)
+        elif prompt and prompt.strip():
             payload["messages"].append({
                 "role": "user",
                 "content": prompt
@@ -65,6 +65,7 @@ class OpenAIProvider(BaseProvider):
                  messages: Optional[List[Dict[str, str]]] = None,
                  system_prompt: Optional[str] = None,
                  tools: Optional[List[Dict[str, Any]]] = None,
+                 media: Optional[List['MediaContent']] = None,
                  stream: bool = False,
                  response_model: Optional[Type[BaseModel]] = None,
                  **kwargs) -> Union[GenerateResponse, Iterator[GenerateResponse]]:
@@ -89,7 +90,23 @@ class OpenAIProvider(BaseProvider):
 
         # Add current prompt as user message
         if prompt and prompt not in [msg.get("content") for msg in (messages or [])]:
-            api_messages.append({"role": "user", "content": prompt})
+            # Handle multimodal message with media content
+            if media:
+                try:
+                    from ..media.handlers import OpenAIMediaHandler
+                    media_handler = OpenAIMediaHandler(self.model_capabilities)
+
+                    # Create multimodal message combining text and media
+                    multimodal_message = media_handler.create_multimodal_message(prompt, media)
+                    api_messages.append(multimodal_message)
+                except ImportError:
+                    self.logger.warning("Media processing not available. Install with: pip install abstractcore[media]")
+                    api_messages.append({"role": "user", "content": prompt})
+                except Exception as e:
+                    self.logger.warning(f"Failed to process media content: {e}")
+                    api_messages.append({"role": "user", "content": prompt})
+            else:
+                api_messages.append({"role": "user", "content": prompt})
 
         # Prepare API call parameters using unified system
         generation_kwargs = self._prepare_generation_kwargs(**kwargs)
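Note: OpenAIMediaHandler.create_multimodal_message is not shown in this diff; for the OpenAI chat API it would presumably emit the standard vision message layout, so the appended entry likely resembles the following (structure is the public OpenAI format, not read from this diff):

    api_messages = []
    multimodal_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image."},
            {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,..."}},
        ],
    }
    api_messages.append(multimodal_message)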