llmasajudge 0.1.10__tar.gz → 0.1.11__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {llmasajudge-0.1.10 → llmasajudge-0.1.11}/PKG-INFO +1 -1
- {llmasajudge-0.1.10 → llmasajudge-0.1.11}/llmasajudge/__init__.py +153 -44
- {llmasajudge-0.1.10 → llmasajudge-0.1.11}/llmasajudge.egg-info/PKG-INFO +1 -1
- {llmasajudge-0.1.10 → llmasajudge-0.1.11}/pyproject.toml +1 -1
- {llmasajudge-0.1.10 → llmasajudge-0.1.11}/README.md +0 -0
- {llmasajudge-0.1.10 → llmasajudge-0.1.11}/llmasajudge.egg-info/SOURCES.txt +0 -0
- {llmasajudge-0.1.10 → llmasajudge-0.1.11}/llmasajudge.egg-info/dependency_links.txt +0 -0
- {llmasajudge-0.1.10 → llmasajudge-0.1.11}/llmasajudge.egg-info/requires.txt +0 -0
- {llmasajudge-0.1.10 → llmasajudge-0.1.11}/llmasajudge.egg-info/top_level.txt +0 -0
- {llmasajudge-0.1.10 → llmasajudge-0.1.11}/setup.cfg +0 -0
llmasajudge/__init__.py

@@ -206,7 +206,10 @@ import time
 import random
 import re
 from typing import Any, Callable, Dict, List, Optional, Tuple
+import litellm
 from litellm import completion
+from litellm.caching.caching import Cache
+
 
 __all__ = ["LLMAsAJudge", "OutputParsers"]
 
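The new imports above hook litellm's optional response cache into the judge. A minimal sketch of how those pieces fit together (illustrative only; the model string, prompt, and cache directory below are assumptions, not taken from the package):

    import litellm
    from litellm import completion
    from litellm.caching.caching import Cache

    # Configure a local disk cache; litellm can then serve identical requests from disk.
    litellm.cache = Cache(type="disk", disk_cache_dir="./.llm_cache")

    resp = completion(
        model="openai/gpt-4o-mini",
        messages=[{"role": "user", "content": "Answer 'yes' or 'no': is 2 + 2 equal to 4?"}],
        caching=True,  # opt this call in to the cache configured above
    )
    print((resp.choices[0].message.content or "").strip())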
@@ -322,10 +325,122 @@ Output only the number. No explanation. No extra text.""",
 
 
 
+    # def __init__(
+    #     self,
+    #     models: Optional[List[str]] = None,
+    #     config: Optional[Dict[str, Dict[str, Any]]] = None,  # one dict for providers and models
+    #     base_headers: Optional[Dict[str, str]] = None,
+    #     wandb_project: Optional[str] = None,
+    #     custom_template: Optional[str] = None,
+    #     use_fully_custom_prompt: bool = False,
+    #     notes: Optional[str] = None,
+    #     output_parser: Optional[str] = 'right/wrong',
+    #     fallback_comparison: bool = True,
+    #     default_temperature: float = 0.0,
+    #     verbose: bool = False,
+    #     num_retries: int = 2,  # per-call retries before giving up on that model
+    #     backoff_base: float = 0.5,  # seconds
+    #     backoff_max: float = 4.0,  # seconds
+    #     custom_generation_fns: Optional[List[Callable[[str], str]]] = None,
+    #     mode: str = "majority",  # "single", "majority", "all"
+    # ):
+    #     """
+    #     config keys can be a provider name ("wandb", "openai", "anthropic")
+    #     or a full model name ("openai/gpt-4o-mini", "wandb/deepseek-ai/DeepSeek-V3.1").
+
+    #     Values can include:
+    #       api_base: Optional[str]
+    #       headers: Dict[str, str]
+    #       temperature: float
+
+    #     Precedence:
+    #       base_headers < provider config < model config
+
+    #     Args:
+    #         models: List of litellm model strings (e.g., ["openai/gpt-4", "anthropic/claude-3"])
+    #         custom_template: Template with placeholders for input/output/ground_truth
+    #         use_fully_custom_prompt: If True, pass complete prompt to judge(prompt=...).
+    #             When True, input/output/ground_truth must NOT be passed to judge()
+    #         output_parser: Parser name ('right/wrong', 'yes/no', 'pass/fail', 'numeric')
+    #             or custom function with signature (str) -> Any
+    #         fallback_comparison: If True and parser returns None, falls back to string comparison
+    #         custom_generation_fns: List of custom inference functions with signature fn(prompt: str) -> str
+    #             These will be used in addition to litellm models for voting.
+    #         mode: Voting mode - "majority" (default), "single" (first judge only), or "all" (unanimous)
+    #     """
+    #     self.models = models or []
+    #     self.custom_generation_fns = custom_generation_fns or []
+
+    #     # Validate that at least one judge is provided
+    #     if not self.models and not self.custom_generation_fns:
+    #         raise ValueError("Must provide at least one of: models (litellm) or custom_generation_fns")
+
+    #     # Validate mode
+    #     if mode not in ("majority", "single", "all"):
+    #         raise ValueError("mode must be 'majority', 'single', or 'all'")
+
+    #     self.config = config or {}
+    #     self.base_headers = dict(base_headers or {})
+    #     self.wandb_project = wandb_project or os.getenv("WANDB_PROJECT")
+    #     self.notes = notes or ""
+    #     self.use_fully_custom_prompt = use_fully_custom_prompt
+    #     self.mode = mode
+
+    #     # Resolve output parser
+    #     parser_name = None
+    #     if isinstance(output_parser, str):
+    #         parser_map = {
+    #             'right/wrong': OutputParsers.right_wrong,
+    #             'pass/fail': OutputParsers.pass_fail,
+    #             'yes/no': OutputParsers.yes_no,
+    #             'numeric': OutputParsers.numeric_score,
+    #         }
+    #         if output_parser not in parser_map:
+    #             raise ValueError(f"Unknown parser '{output_parser}'. Available: {list(parser_map.keys())}")
+    #         self.output_parser = parser_map[output_parser]
+    #         parser_name = output_parser
+    #     else:
+    #         self.output_parser = output_parser
+
+    #     # Set template based on mode
+    #     if use_fully_custom_prompt:
+    #         self.template = None  # No template in fully custom mode
+    #     elif custom_template:
+    #         self.template = custom_template
+    #     elif parser_name and parser_name in self.PARSER_INSTRUCTIONS:
+    #         self.template = self.BASE_TEMPLATE.format(
+    #             instruction=self.PARSER_INSTRUCTIONS[parser_name],
+    #             notes_section="{notes_section}",
+    #             input_block="{input_block}",
+    #             model_output="{model_output}",
+    #             ground_truth="{ground_truth}",
+    #         )
+    #     else:
+    #         # Default to right/wrong for custom parsers
+    #         self.template = self.BASE_TEMPLATE.format(
+    #             instruction=self.PARSER_INSTRUCTIONS['right/wrong'],
+    #             notes_section="{notes_section}",
+    #             input_block="{input_block}",
+    #             model_output="{model_output}",
+    #             ground_truth="{ground_truth}",
+    #         )
+
+    #     self.fallback_comparison = fallback_comparison
+    #     self.default_temperature = float(default_temperature)
+    #     self.verbose = verbose
+    #     self.num_retries = int(num_retries)
+    #     self.backoff_base = float(backoff_base)
+    #     self.backoff_max = float(backoff_max)
+
+
+
+
+
+
     def __init__(
         self,
         models: Optional[List[str]] = None,
-        config: Optional[Dict[str, Dict[str, Any]]] = None,
+        config: Optional[Dict[str, Dict[str, Any]]] = None,
         base_headers: Optional[Dict[str, str]] = None,
         wandb_project: Optional[str] = None,
         custom_template: Optional[str] = None,
@@ -335,44 +450,19 @@ Output only the number. No explanation. No extra text.""",
         fallback_comparison: bool = True,
         default_temperature: float = 0.0,
         verbose: bool = False,
-        num_retries: int = 2,
-        backoff_base: float = 0.5,
-        backoff_max: float = 4.0,
+        num_retries: int = 2,
+        backoff_base: float = 0.5,
+        backoff_max: float = 4.0,
         custom_generation_fns: Optional[List[Callable[[str], str]]] = None,
-        mode: str = "majority",
+        mode: str = "majority",
+        litellm_cache_dir: Optional[str] = None,
     ):
-        """
-        config keys can be a provider name ("wandb", "openai", "anthropic")
-        or a full model name ("openai/gpt-4o-mini", "wandb/deepseek-ai/DeepSeek-V3.1").
-
-        Values can include:
-          api_base: Optional[str]
-          headers: Dict[str, str]
-          temperature: float
-
-        Precedence:
-          base_headers < provider config < model config
-
-        Args:
-            models: List of litellm model strings (e.g., ["openai/gpt-4", "anthropic/claude-3"])
-            custom_template: Template with placeholders for input/output/ground_truth
-            use_fully_custom_prompt: If True, pass complete prompt to judge(prompt=...).
-                When True, input/output/ground_truth must NOT be passed to judge()
-            output_parser: Parser name ('right/wrong', 'yes/no', 'pass/fail', 'numeric')
-                or custom function with signature (str) -> Any
-            fallback_comparison: If True and parser returns None, falls back to string comparison
-            custom_generation_fns: List of custom inference functions with signature fn(prompt: str) -> str
-                These will be used in addition to litellm models for voting.
-            mode: Voting mode - "majority" (default), "single" (first judge only), or "all" (unanimous)
-        """
         self.models = models or []
         self.custom_generation_fns = custom_generation_fns or []
 
-        # Validate that at least one judge is provided
         if not self.models and not self.custom_generation_fns:
             raise ValueError("Must provide at least one of: models (litellm) or custom_generation_fns")
 
-        # Validate mode
         if mode not in ("majority", "single", "all"):
             raise ValueError("mode must be 'majority', 'single', or 'all'")
 
@@ -382,8 +472,13 @@ Output only the number. No explanation. No extra text.""",
         self.notes = notes or ""
         self.use_fully_custom_prompt = use_fully_custom_prompt
         self.mode = mode
+        self.fallback_comparison = fallback_comparison
+        self.default_temperature = float(default_temperature)
+        self.verbose = verbose
+        self.num_retries = int(num_retries)
+        self.backoff_base = float(backoff_base)
+        self.backoff_max = float(backoff_max)
 
-        # Resolve output parser
         parser_name = None
         if isinstance(output_parser, str):
             parser_map = {
@@ -393,15 +488,14 @@ Output only the number. No explanation. No extra text.""",
                 'numeric': OutputParsers.numeric_score,
             }
             if output_parser not in parser_map:
-                raise ValueError(f"Unknown parser '{output_parser}'. Available: {list(parser_map.keys())}")
+                raise ValueError(f"Unknown parser '{output_parser}'")
             self.output_parser = parser_map[output_parser]
             parser_name = output_parser
         else:
             self.output_parser = output_parser
 
-        # Set template based on mode
         if use_fully_custom_prompt:
-            self.template = None
+            self.template = None
         elif custom_template:
             self.template = custom_template
         elif parser_name and parser_name in self.PARSER_INSTRUCTIONS:
@@ -413,7 +507,6 @@ Output only the number. No explanation. No extra text.""",
                 ground_truth="{ground_truth}",
             )
         else:
-            # Default to right/wrong for custom parsers
             self.template = self.BASE_TEMPLATE.format(
                 instruction=self.PARSER_INSTRUCTIONS['right/wrong'],
                 notes_section="{notes_section}",
@@ -422,12 +515,18 @@ Output only the number. No explanation. No extra text.""",
                 ground_truth="{ground_truth}",
             )
 
-
-        self.
-        self.
-
-
-
+        # optional local cache setup
+        self.cache_enabled = litellm_cache_dir is not None
+        if self.cache_enabled:
+            litellm.cache = Cache(type="disk", disk_cache_dir=litellm_cache_dir)
+
+
+
+
+
+
+
+
 
     def _build_prompt(self, input: Any, model_output: Any, ground_truth: Any) -> str:
         notes_section = f"notes:\n{self.notes}\n" if self.notes else ""
@@ -495,14 +594,24 @@ Output only the number. No explanation. No extra text.""",
             last_err = None
             for i in range(attempts):
                 try:
+                    # resp = completion(
+                    #     model=model,
+                    #     api_base=api_base,  # None uses provider default
+                    #     messages=[{"role": "user", "content": prompt}],
+                    #     temperature=temperature,
+                    #     max_tokens=max_tokens,
+                    #     extra_headers=headers,
+                    # )
+
                     resp = completion(
                         model=model,
-                        api_base=api_base,
+                        api_base=api_base,
                         messages=[{"role": "user", "content": prompt}],
                         temperature=temperature,
                         max_tokens=max_tokens,
                         extra_headers=headers,
-                    )
+                        caching=self.cache_enabled
+                    )
                     return (resp.choices[0].message.content or "").strip()
                 except Exception as e:
                     last_err = e
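Taken together, the __init__.py changes add an opt-in disk cache: the new litellm_cache_dir constructor argument configures litellm.cache, and every completion call now passes caching=self.cache_enabled so repeated judgements of identical prompts can be served from disk. A hedged usage sketch (the constructor keywords come from the signature shown in this diff; the model string and cache path are illustrative, and the judging call itself is omitted because its parameters are not part of this diff):

    from llmasajudge import LLMAsAJudge

    judge = LLMAsAJudge(
        models=["openai/gpt-4o-mini"],       # any litellm model string
        output_parser="right/wrong",
        litellm_cache_dir="./.judge_cache",  # new in 0.1.11: enables litellm's disk cache
    )
    # Omitting litellm_cache_dir (default None) keeps the previous, uncached behavior.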
pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "llmasajudge"
-version = "0.1.10"
+version = "0.1.11"
 description = "LLM Judge: simple right/wrong voting across models"
 authors = [{name="Brett Young", email="byyoung3@gmail.com"}]
 readme = "README.md"