abstractcore 2.6.2__py3-none-any.whl → 2.6.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -178,7 +178,7 @@ def format_assessment_plain(assessment: dict) -> str:
     lines.append(f"Overall Score: {assessment.get('overall_score', 0)}/5")
     lines.append("")
 
-    # Individual scores
+    # Predefined criterion scores
     score_fields = [
         ('clarity_score', 'Clarity'),
         ('simplicity_score', 'Simplicity'),
@@ -191,13 +191,28 @@ def format_assessment_plain(assessment: dict) -> str:
         ('coherence_score', 'Coherence')
     ]
 
-    lines.append("Individual Scores:")
-    lines.append("-" * 20)
-    for field, label in score_fields:
-        score = assessment.get(field)
-        if score is not None:
-            lines.append(f"{label:15}: {score}/5")
-    lines.append("")
+    # Check if any predefined scores exist
+    has_predefined_scores = any(assessment.get(field) is not None for field, _ in score_fields)
+
+    if has_predefined_scores:
+        lines.append("📋 Predefined Criterion Scores:")
+        lines.append("-" * 32)
+        for field, label in score_fields:
+            score = assessment.get(field)
+            if score is not None:
+                lines.append(f"{label:15}: {score}/5")
+        lines.append("")
+
+    # Custom criterion scores
+    custom_scores = assessment.get('custom_scores', {})
+    if custom_scores:
+        lines.append("🎯 Custom Criterion Scores:")
+        lines.append("-" * 28)
+        for criterion, score in custom_scores.items():
+            # Format criterion name nicely
+            criterion_display = criterion.replace('_', ' ').title()
+            lines.append(f"{criterion_display:30}: {score}/5")
+        lines.append("")
 
     # Strengths
     strengths = assessment.get('strengths', [])
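For reference, the new rendering behaves roughly as follows. This is a self-contained sketch with invented assessment data, not a call into the packaged `format_assessment_plain`:

```python
# Illustrative sketch of the new plain-text rendering (hypothetical assessment data).
assessment = {
    "overall_score": 4,
    "clarity_score": 4,
    "simplicity_score": 3,
    "custom_scores": {"logical_coherence": 5, "citation_quality": 2},
}

lines = [f"Overall Score: {assessment.get('overall_score', 0)}/5", ""]

score_fields = [("clarity_score", "Clarity"), ("simplicity_score", "Simplicity")]
if any(assessment.get(f) is not None for f, _ in score_fields):
    lines.append("📋 Predefined Criterion Scores:")
    lines.append("-" * 32)
    for field, label in score_fields:
        score = assessment.get(field)
        if score is not None:
            lines.append(f"{label:15}: {score}/5")
    lines.append("")

custom_scores = assessment.get("custom_scores", {})
if custom_scores:
    lines.append("🎯 Custom Criterion Scores:")
    lines.append("-" * 28)
    for criterion, score in custom_scores.items():
        lines.append(f"{criterion.replace('_', ' ').title():30}: {score}/5")

print("\n".join(lines))
```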
@@ -939,13 +939,18 @@ class BasicSession:
         summary_tokens = self._estimate_tokens_for_summary(summary_text)
         return original_tokens / summary_tokens if summary_tokens > 0 else 1.0
 
-    def generate_assessment(self, criteria: Optional[Dict[str, bool]] = None) -> Dict[str, Any]:
+    def generate_assessment(
+        self,
+        criteria: Optional[Dict[str, bool]] = None,
+        custom_criteria: Optional[Dict[str, str]] = None
+    ) -> Dict[str, Any]:
         """
         Generate a quality assessment of the entire conversation and store it in session.assessment.
-
+
         Args:
-            criteria: Optional criteria for assessment
-
+            criteria: Optional predefined criteria toggles (e.g., {"clarity": True, "coherence": False})
+            custom_criteria: Optional custom domain-specific criteria with descriptions (e.g., {"logical_coherence": "Are results logically consistent?"})
+
         Returns:
             Dict containing the generated assessment
         """
@@ -989,13 +994,27 @@ class BasicSession:
         assessment_result = judge.evaluate(
             content=conversation_text,
             context="conversation quality assessment",
-            criteria=judge_criteria
+            criteria=judge_criteria,
+            custom_criteria=custom_criteria
         )
 
         # Store assessment in session
         self.assessment = {
             "created_at": start_time.isoformat(),
             "criteria": criteria,
+            "custom_criteria": custom_criteria,
+            "scores": {
+                "clarity": assessment_result.get('clarity_score'),
+                "simplicity": assessment_result.get('simplicity_score'),
+                "actionability": assessment_result.get('actionability_score'),
+                "soundness": assessment_result.get('soundness_score'),
+                "innovation": assessment_result.get('innovation_score'),
+                "effectiveness": assessment_result.get('effectiveness_score'),
+                "relevance": assessment_result.get('relevance_score'),
+                "completeness": assessment_result.get('completeness_score'),
+                "coherence": assessment_result.get('coherence_score'),
+            },
+            "custom_scores": assessment_result.get('custom_scores', {}),
             "overall_score": assessment_result.get('overall_score', 0),
             "judge_summary": assessment_result.get('judge_summary', ''),
             "strengths": assessment_result.get('strengths', []),
@@ -9,11 +9,11 @@ Features:
 - Clear, simple and actionable feedback
 """
 
-from typing import Optional, List, Dict, Any, Union
+from typing import Optional, List, Dict, Any, Union, Type
 import json
 import logging
 from pathlib import Path
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, create_model
 
 from ..core.interface import AbstractCoreInterface
 from ..core.factory import create_llm
@@ -44,7 +44,7 @@ class Assessment(BaseModel):
     judge_summary: str = Field(..., description="Judge's experiential note summarizing the assessment task and key findings")
     source_reference: str = Field(..., description="Reference to what was assessed (file, content type, context)")
 
-    # Individual criterion scores
+    # Individual criterion scores (predefined criteria)
     clarity_score: Optional[int] = Field(None, description="Clarity score (1-5)")
     simplicity_score: Optional[int] = Field(None, description="Simplicity score (1-5)")
     actionability_score: Optional[int] = Field(None, description="Actionability score (1-5)")
@@ -161,6 +161,30 @@ class BasicJudge:
 
         self.retry_strategy = FeedbackRetry(max_attempts=3)
 
+    def _create_dynamic_assessment_model(self, custom_criteria: Optional[Dict[str, str]]) -> Type[BaseModel]:
+        """Create a dynamic Assessment model with custom score fields"""
+        if not custom_criteria:
+            return Assessment
+
+        # Build fields dict for dynamic model creation
+        fields_dict = {}
+
+        # Add custom score fields dynamically as REQUIRED (not Optional)
+        # This forces the LLM to provide scores for all custom criteria
+        for criterion_name in custom_criteria.keys():
+            field_name = f"{criterion_name}_score"
+            # Make it required (int, not Optional[int]) with Field(...)
+            fields_dict[field_name] = (int, Field(..., description=f"{criterion_name} score (1-5)", ge=1, le=5))
+
+        # Create dynamic model that inherits from Assessment using create_model
+        DynamicAssessment = create_model(
+            'DynamicAssessment',
+            __base__=Assessment,
+            **fields_dict
+        )
+
+        return DynamicAssessment
+
     def evaluate(
         self,
         content: str,
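The dynamic-model pattern can be reproduced in isolation with nothing but pydantic. The sketch below uses a stand-in base model rather than the package's `Assessment`, but shows the same `create_model(..., __base__=...)` call producing required, range-checked `_score` fields:

```python
# Standalone illustration of the create_model pattern; SimpleAssessment is a stand-in base.
from pydantic import BaseModel, Field, create_model

class SimpleAssessment(BaseModel):
    overall_score: int = Field(..., ge=1, le=5)

custom_criteria = {"logical_coherence": "Are results logically consistent?"}
fields = {
    f"{name}_score": (int, Field(..., description=f"{name} score (1-5)", ge=1, le=5))
    for name in custom_criteria
}
DynamicAssessment = create_model("DynamicAssessment", __base__=SimpleAssessment, **fields)

print(DynamicAssessment(overall_score=4, logical_coherence_score=5).logical_coherence_score)  # 5
try:
    DynamicAssessment(overall_score=4)  # custom score field is required, so this fails
except Exception as exc:
    print(type(exc).__name__)  # ValidationError
```

Making the custom fields required rather than `Optional[int]` is, as the in-code comments note, what forces the model to emit a score for every custom criterion.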
@@ -168,7 +192,8 @@ class BasicJudge:
         criteria: Optional[JudgmentCriteria] = None,
         focus: Optional[str] = None,
         reference: Optional[str] = None,
-        include_criteria: bool = False
+        include_criteria: bool = False,
+        custom_criteria: Optional[Dict[str, str]] = None
     ) -> dict:
         """
         Evaluate content against specified criteria
@@ -180,6 +205,7 @@ class BasicJudge:
             focus: Specific areas to focus evaluation on (e.g., "technical accuracy, performance")
             reference: Optional reference/expected output for comparison
             include_criteria: Include detailed explanation of evaluation criteria in assessment
+            custom_criteria: Custom domain-specific criteria with descriptions (e.g., {"logical_coherence": "Are results logically consistent?"})
 
         Returns:
             dict: Structured assessment result
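Seen from the caller's side, the extended `evaluate()` can be exercised roughly like this. The `judge` instance is assumed to already exist; its constructor arguments are not part of this diff, and the criteria descriptions are illustrative:

```python
# Hypothetical call on an existing BasicJudge instance; construction omitted.
result = judge.evaluate(
    content="The experiment shows a 12% improvement over baseline...",
    context="experiment report review",
    custom_criteria={
        "logical_coherence": "Are results logically consistent?",
        "reproducibility": "Could a reader re-run the experiment from this report?",
    },
)
print(result["overall_score"])
print(result["custom_scores"])  # e.g. {"logical_coherence": 4, "reproducibility": 2}
```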
@@ -196,13 +222,16 @@ class BasicJudge:
         logger.info("Starting evaluation", context=context)
 
         # Build the evaluation prompt
-        prompt = self._build_evaluation_prompt(content, context, criteria, focus, reference, include_criteria)
+        prompt = self._build_evaluation_prompt(content, context, criteria, focus, reference, include_criteria, custom_criteria)
+
+        # Create dynamic assessment model with custom score fields
+        AssessmentModel = self._create_dynamic_assessment_model(custom_criteria)
 
         # Generate structured assessment
         try:
             result = self.llm.generate(
                 prompt,
-                response_model=Assessment,
+                response_model=AssessmentModel,
                 retry_strategy=self.retry_strategy
             )
 
@@ -216,6 +245,19 @@ class BasicJudge:
         # Convert to dict and add metadata
         assessment_dict = result.dict() if hasattr(result, 'dict') else result
 
+        # Extract custom scores from individual fields and add to custom_scores dict
+        if custom_criteria:
+            custom_scores = {}
+            for criterion_name in custom_criteria.keys():
+                field_name = f"{criterion_name}_score"
+                if field_name in assessment_dict:
+                    score_value = assessment_dict.pop(field_name) # Remove individual field
+                    if score_value is not None:
+                        custom_scores[criterion_name] = score_value
+            assessment_dict['custom_scores'] = custom_scores
+        else:
+            assessment_dict['custom_scores'] = {}
+
         # Log results
         overall_score = assessment_dict.get('overall_score', 0)
         logger.info("Evaluation completed", overall_score=overall_score, max_score=5)
@@ -247,7 +289,8 @@ class BasicJudge:
         reference: Optional[str] = None,
         include_criteria: bool = False,
         max_file_size: int = 1000000, # 1MB default limit per file
-        exclude_global: bool = False # Include global assessment by default
+        exclude_global: bool = False, # Include global assessment by default
+        custom_criteria: Optional[Dict[str, str]] = None
     ) -> Union[dict, List[dict]]:
         """
         Evaluate content from one or multiple files sequentially to avoid context overflow
@@ -261,6 +304,7 @@ class BasicJudge:
             include_criteria: Include detailed explanation of evaluation criteria in assessment
             max_file_size: Maximum file size in bytes (default 1MB to avoid context overflow)
             exclude_global: If True, skip global assessment for multiple files (default False)
+            custom_criteria: Custom domain-specific criteria with descriptions (e.g., {"logical_coherence": "Are results logically consistent?"})
 
         Returns:
             dict: Single assessment if one file provided
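A sketch of how the new parameter reaches the multi-file path. The `judge` instance and file paths are assumptions for illustration, and the file list is assumed to be the first positional argument:

```python
# Hypothetical evaluate_files call; judge construction omitted, paths are examples.
results = judge.evaluate_files(
    ["reports/chapter1.md", "reports/chapter2.md"],
    context="technical report review",
    custom_criteria={"logical_coherence": "Are results logically consistent?"},
)
# With several files and exclude_global left False, the same custom criteria are
# applied per file and again in the global assessment.
```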
@@ -360,7 +404,8 @@ class BasicJudge:
                 criteria=criteria,
                 focus=focus,
                 reference=reference,
-                include_criteria=include_criteria
+                include_criteria=include_criteria,
+                custom_criteria=custom_criteria
             )
 
             # Update source reference to include file name
@@ -382,7 +427,7 @@ class BasicJudge:
         # Generate global assessment and return structured result
         logger.info("Generating global assessment from individual file evaluations", file_count=len(assessments))
         global_assessment = self._generate_global_assessment(
-            assessments, context, criteria, focus, include_criteria
+            assessments, context, criteria, focus, include_criteria, custom_criteria
         )
 
         return {
@@ -396,7 +441,8 @@ class BasicJudge:
         context: str,
         criteria: Optional[JudgmentCriteria],
         focus: Optional[str],
-        include_criteria: bool
+        include_criteria: bool,
+        custom_criteria: Optional[Dict[str, str]] = None
     ) -> dict:
         """
         Generate a global assessment from multiple individual file assessments
@@ -475,7 +521,8 @@ Provide a comprehensive global assessment of overall quality and recommendations
             context=f"global assessment summary for {total_files} files ({context})",
             criteria=criteria,
             focus=focus,
-            include_criteria=include_criteria
+            include_criteria=include_criteria,
+            custom_criteria=custom_criteria
         )
 
         # Update the source reference to indicate this is a global assessment
@@ -506,6 +553,19 @@ Provide a comprehensive global assessment of overall quality and recommendations
             "evaluation_criteria_details": None
         }
 
+    def _build_custom_scores_format(self, custom_criteria: Optional[Dict[str, str]]) -> str:
+        """Build custom score fields for the prompt (individual fields, not dict)"""
+        if not custom_criteria:
+            return ""
+
+        # Build individual score fields for each custom criterion
+        score_fields = []
+        for criterion_name in custom_criteria.keys():
+            field_name = f"{criterion_name}_score"
+            score_fields.append(f' "{field_name}": <1-5 integer>,')
+
+        return "\n" + "\n".join(score_fields)
+
     def _build_evaluation_prompt(
         self,
         content: str,
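For a concrete sense of what this helper injects into the prompt, the sketch below reproduces its logic on two sample criteria:

```python
# Reproduces _build_custom_scores_format's output for two sample criteria.
custom_criteria = {"logical_coherence": "...", "citation_quality": "..."}

score_fields = []
for criterion_name in custom_criteria:
    score_fields.append(f' "{criterion_name}_score": <1-5 integer>,')
print("\n" + "\n".join(score_fields))
#  "logical_coherence_score": <1-5 integer>,
#  "citation_quality_score": <1-5 integer>,
```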
@@ -513,7 +573,8 @@ Provide a comprehensive global assessment of overall quality and recommendations
         criteria: JudgmentCriteria,
         focus: Optional[str],
         reference: Optional[str],
-        include_criteria: bool = False
+        include_criteria: bool = False,
+        custom_criteria: Optional[Dict[str, str]] = None
     ) -> str:
         """Build the evaluation prompt with chain-of-thought reasoning"""
 
@@ -565,6 +626,12 @@ Provide a comprehensive global assessment of overall quality and recommendations
                 active_criteria.append(focus_item)
                 criteria_descriptions.append(f"- **{focus_item.title()}**: PRIMARY FOCUS AREA - This is a key evaluation target")
 
+        # Add custom criteria with their specific descriptions
+        if custom_criteria:
+            for name, description in custom_criteria.items():
+                active_criteria.append(name)
+                criteria_descriptions.append(f"- **{name.replace('_', ' ').title()}**: {description}")
+
         criteria_text = "\n".join(criteria_descriptions)
 
         # Build reference section if provided
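Each custom criterion therefore appears in the prompt's criteria list as a bullet like the one produced here (sample input):

```python
# How one custom criterion is rendered into the criteria list (sample input).
name, description = "logical_coherence", "Are results logically consistent?"
print(f"- **{name.replace('_', ' ').title()}**: {description}")
# - **Logical Coherence**: Are results logically consistent?
```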
@@ -613,12 +680,26 @@ SCORING RUBRIC (1-5 scale):
 - **Score 2**: Poor - Falls short of expectations with significant issues
 - **Score 1**: Very Poor - Fails to meet basic standards in this dimension
 
+SCORING PRINCIPLES - CRITICAL:
+- **Be rigorous and avoid grade inflation**: Most adequate responses should be scored 2-3, not 3-4
+- **Context matters**: For routine tasks (e.g., basic arithmetic), criteria like "innovation" should be scored 1-2 unless truly creative
+- **If a criterion doesn't meaningfully apply to the task**, score it 1-2, not 3 (e.g., innovation for standard formula application = 1)
+- **Reserve 4-5 for genuinely excellent work**: Don't give high scores by default
+- **Apply task-appropriate expectations**:
+  * Routine calculations: innovation 1-2, soundness 4-5 (if correct)
+  * Creative explanations: innovation 3-4 if novel approach shown
+  * Complex problem-solving: innovation 4-5 if breakthrough thinking demonstrated
+- **Be appropriately critical**: Question whether the response truly meets each criterion
+
 EVALUATION PROCESS:
 1. **STEP 1**: Carefully analyze the content for each active criterion
-2. **STEP 2**: Identify specific strengths and weaknesses
-3. **STEP 3**: Provide actionable recommendations for improvement
-4. **STEP 4**: Assign scores based on the rubric (be fair but appropriately critical)
-5. **STEP 5**: Calculate overall score - PRIMARY FOCUS AREAS should heavily influence the final score
+2. **STEP 2**: Assess if each criterion meaningfully applies to this task (if not, score 1-2)
+3. **STEP 3**: Identify specific strengths and weaknesses
+4. **STEP 4**: Provide actionable recommendations for improvement
+5. **STEP 5**: Assign scores based on the rubric (be rigorous and appropriately critical)
+   - For standard criteria: populate the corresponding _score fields (e.g., clarity_score, soundness_score)
+   - For custom criteria: populate the custom_scores object with scores for EACH custom criterion listed in EVALUATION CRITERIA
+6. **STEP 6**: Calculate overall score - PRIMARY FOCUS AREAS should heavily influence the final score
 
 CRITICAL ASSESSMENT PRINCIPLES:
 - Be objective and evidence-based in your evaluation
@@ -628,6 +709,11 @@ CRITICAL ASSESSMENT PRINCIPLES:
 - Ensure recommendations are specific and implementable
 - PRIMARY FOCUS AREAS are the most important evaluation targets - weaknesses in these areas should significantly impact the overall score
 
+IMPORTANT - SCORING REQUIREMENTS:
+- You MUST provide individual scores (1-5) for EVERY criterion in the custom_scores object if custom criteria are present
+- Do NOT leave custom_scores as an empty object {{}} - populate it with scores for each custom criterion
+- Each custom criterion listed in EVALUATION CRITERIA must have a corresponding score in custom_scores
+
 RESPONSE FORMAT:
 Provide your assessment as a structured JSON response with the following format:
 
@@ -643,7 +729,7 @@ Provide your assessment as a structured JSON response with the following format:
  "effectiveness_score": <1-5 integer or null if not evaluated>,
  "relevance_score": <1-5 integer or null if not evaluated>,
  "completeness_score": <1-5 integer or null if not evaluated>,
- "coherence_score": <1-5 integer or null if not evaluated>,
+ "coherence_score": <1-5 integer or null if not evaluated>,{self._build_custom_scores_format(custom_criteria)}
  "strengths": ["list of specific strengths identified"],
  "weaknesses": ["list of specific areas for improvement"],
  "actionable_feedback": ["list of specific actionable recommendations"],
@@ -7,6 +7,8 @@ from .ollama_provider import OllamaProvider
 from .lmstudio_provider import LMStudioProvider
 from .huggingface_provider import HuggingFaceProvider
 from .mlx_provider import MLXProvider
+from .vllm_provider import VLLMProvider
+from .openai_compatible_provider import OpenAICompatibleProvider
 
 # Provider registry for centralized provider discovery and management
 from .registry import (
@@ -41,6 +43,8 @@ __all__ = [
     'LMStudioProvider',
     'HuggingFaceProvider',
     'MLXProvider',
+    'VLLMProvider',
+    'OpenAICompatibleProvider',
 
     # Provider registry
     'ProviderRegistry',
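The two new providers become importable alongside the existing ones; a minimal sketch, assuming the top-level package is `abstractcore` as the wheel name suggests (constructor arguments are provider-specific and not shown in this diff):

```python
# Assumed import path based on the wheel name; construction details are not in this diff.
from abstractcore.providers import VLLMProvider, OpenAICompatibleProvider

print(VLLMProvider.__name__, OpenAICompatibleProvider.__name__)
```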