llmasajudge 0.1.10__tar.gz → 0.1.11__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: llmasajudge
- Version: 0.1.10
+ Version: 0.1.11
  Summary: LLM Judge: simple right/wrong voting across models
  Author-email: Brett Young <byyoung3@gmail.com>
  Project-URL: Homepage, https://example.com
@@ -206,7 +206,10 @@ import time
  import random
  import re
  from typing import Any, Callable, Dict, List, Optional, Tuple
+ import litellm
  from litellm import completion
+ from litellm.caching.caching import Cache
+

  __all__ = ["LLMAsAJudge", "OutputParsers"]

@@ -322,10 +325,122 @@ Output only the number. No explanation. No extra text.""",



+     # def __init__(
+     #     self,
+     #     models: Optional[List[str]] = None,
+     #     config: Optional[Dict[str, Dict[str, Any]]] = None,  # one dict for providers and models
+     #     base_headers: Optional[Dict[str, str]] = None,
+     #     wandb_project: Optional[str] = None,
+     #     custom_template: Optional[str] = None,
+     #     use_fully_custom_prompt: bool = False,
+     #     notes: Optional[str] = None,
+     #     output_parser: Optional[str] = 'right/wrong',
+     #     fallback_comparison: bool = True,
+     #     default_temperature: float = 0.0,
+     #     verbose: bool = False,
+     #     num_retries: int = 2,  # per-call retries before giving up on that model
+     #     backoff_base: float = 0.5,  # seconds
+     #     backoff_max: float = 4.0,  # seconds
+     #     custom_generation_fns: Optional[List[Callable[[str], str]]] = None,
+     #     mode: str = "majority",  # "single", "majority", "all"
+     # ):
+     #     """
+     #     config keys can be a provider name ("wandb", "openai", "anthropic")
+     #     or a full model name ("openai/gpt-4o-mini", "wandb/deepseek-ai/DeepSeek-V3.1").
+
+     #     Values can include:
+     #         api_base: Optional[str]
+     #         headers: Dict[str, str]
+     #         temperature: float
+
+     #     Precedence:
+     #         base_headers < provider config < model config
+
+     #     Args:
+     #         models: List of litellm model strings (e.g., ["openai/gpt-4", "anthropic/claude-3"])
+     #         custom_template: Template with placeholders for input/output/ground_truth
+     #         use_fully_custom_prompt: If True, pass complete prompt to judge(prompt=...).
+     #             When True, input/output/ground_truth must NOT be passed to judge()
+     #         output_parser: Parser name ('right/wrong', 'yes/no', 'pass/fail', 'numeric')
+     #             or custom function with signature (str) -> Any
+     #         fallback_comparison: If True and parser returns None, falls back to string comparison
+     #         custom_generation_fns: List of custom inference functions with signature fn(prompt: str) -> str
+     #             These will be used in addition to litellm models for voting.
+     #         mode: Voting mode - "majority" (default), "single" (first judge only), or "all" (unanimous)
+     #     """
+     #     self.models = models or []
+     #     self.custom_generation_fns = custom_generation_fns or []
+
+     #     # Validate that at least one judge is provided
+     #     if not self.models and not self.custom_generation_fns:
+     #         raise ValueError("Must provide at least one of: models (litellm) or custom_generation_fns")
+
+     #     # Validate mode
+     #     if mode not in ("majority", "single", "all"):
+     #         raise ValueError("mode must be 'majority', 'single', or 'all'")
+
+     #     self.config = config or {}
+     #     self.base_headers = dict(base_headers or {})
+     #     self.wandb_project = wandb_project or os.getenv("WANDB_PROJECT")
+     #     self.notes = notes or ""
+     #     self.use_fully_custom_prompt = use_fully_custom_prompt
+     #     self.mode = mode
+
+     #     # Resolve output parser
+     #     parser_name = None
+     #     if isinstance(output_parser, str):
+     #         parser_map = {
+     #             'right/wrong': OutputParsers.right_wrong,
+     #             'pass/fail': OutputParsers.pass_fail,
+     #             'yes/no': OutputParsers.yes_no,
+     #             'numeric': OutputParsers.numeric_score,
+     #         }
+     #         if output_parser not in parser_map:
+     #             raise ValueError(f"Unknown parser '{output_parser}'. Available: {list(parser_map.keys())}")
+     #         self.output_parser = parser_map[output_parser]
+     #         parser_name = output_parser
+     #     else:
+     #         self.output_parser = output_parser
+
+     #     # Set template based on mode
+     #     if use_fully_custom_prompt:
+     #         self.template = None  # No template in fully custom mode
+     #     elif custom_template:
+     #         self.template = custom_template
+     #     elif parser_name and parser_name in self.PARSER_INSTRUCTIONS:
+     #         self.template = self.BASE_TEMPLATE.format(
+     #             instruction=self.PARSER_INSTRUCTIONS[parser_name],
+     #             notes_section="{notes_section}",
+     #             input_block="{input_block}",
+     #             model_output="{model_output}",
+     #             ground_truth="{ground_truth}",
+     #         )
+     #     else:
+     #         # Default to right/wrong for custom parsers
+     #         self.template = self.BASE_TEMPLATE.format(
+     #             instruction=self.PARSER_INSTRUCTIONS['right/wrong'],
+     #             notes_section="{notes_section}",
+     #             input_block="{input_block}",
+     #             model_output="{model_output}",
+     #             ground_truth="{ground_truth}",
+     #         )
+
+     #     self.fallback_comparison = fallback_comparison
+     #     self.default_temperature = float(default_temperature)
+     #     self.verbose = verbose
+     #     self.num_retries = int(num_retries)
+     #     self.backoff_base = float(backoff_base)
+     #     self.backoff_max = float(backoff_max)
+
+
+
+
+
+
      def __init__(
          self,
          models: Optional[List[str]] = None,
-         config: Optional[Dict[str, Dict[str, Any]]] = None,  # one dict for providers and models
+         config: Optional[Dict[str, Dict[str, Any]]] = None,
          base_headers: Optional[Dict[str, str]] = None,
          wandb_project: Optional[str] = None,
          custom_template: Optional[str] = None,
@@ -335,44 +450,19 @@ Output only the number. No explanation. No extra text.""",
          fallback_comparison: bool = True,
          default_temperature: float = 0.0,
          verbose: bool = False,
-         num_retries: int = 2,  # per-call retries before giving up on that model
-         backoff_base: float = 0.5,  # seconds
-         backoff_max: float = 4.0,  # seconds
+         num_retries: int = 2,
+         backoff_base: float = 0.5,
+         backoff_max: float = 4.0,
          custom_generation_fns: Optional[List[Callable[[str], str]]] = None,
-         mode: str = "majority",  # "single", "majority", "all"
+         mode: str = "majority",
+         litellm_cache_dir: Optional[str] = None,
      ):
-         """
-         config keys can be a provider name ("wandb", "openai", "anthropic")
-         or a full model name ("openai/gpt-4o-mini", "wandb/deepseek-ai/DeepSeek-V3.1").
-
-         Values can include:
-             api_base: Optional[str]
-             headers: Dict[str, str]
-             temperature: float
-
-         Precedence:
-             base_headers < provider config < model config
-
-         Args:
-             models: List of litellm model strings (e.g., ["openai/gpt-4", "anthropic/claude-3"])
-             custom_template: Template with placeholders for input/output/ground_truth
-             use_fully_custom_prompt: If True, pass complete prompt to judge(prompt=...).
-                 When True, input/output/ground_truth must NOT be passed to judge()
-             output_parser: Parser name ('right/wrong', 'yes/no', 'pass/fail', 'numeric')
-                 or custom function with signature (str) -> Any
-             fallback_comparison: If True and parser returns None, falls back to string comparison
-             custom_generation_fns: List of custom inference functions with signature fn(prompt: str) -> str
-                 These will be used in addition to litellm models for voting.
-             mode: Voting mode - "majority" (default), "single" (first judge only), or "all" (unanimous)
-         """
          self.models = models or []
          self.custom_generation_fns = custom_generation_fns or []

-         # Validate that at least one judge is provided
          if not self.models and not self.custom_generation_fns:
              raise ValueError("Must provide at least one of: models (litellm) or custom_generation_fns")

-         # Validate mode
          if mode not in ("majority", "single", "all"):
              raise ValueError("mode must be 'majority', 'single', or 'all'")

@@ -382,8 +472,13 @@ Output only the number. No explanation. No extra text.""",
          self.notes = notes or ""
          self.use_fully_custom_prompt = use_fully_custom_prompt
          self.mode = mode
+         self.fallback_comparison = fallback_comparison
+         self.default_temperature = float(default_temperature)
+         self.verbose = verbose
+         self.num_retries = int(num_retries)
+         self.backoff_base = float(backoff_base)
+         self.backoff_max = float(backoff_max)

-         # Resolve output parser
          parser_name = None
          if isinstance(output_parser, str):
              parser_map = {
@@ -393,15 +488,14 @@ Output only the number. No explanation. No extra text.""",
                  'numeric': OutputParsers.numeric_score,
              }
              if output_parser not in parser_map:
-                 raise ValueError(f"Unknown parser '{output_parser}'. Available: {list(parser_map.keys())}")
+                 raise ValueError(f"Unknown parser '{output_parser}'")
              self.output_parser = parser_map[output_parser]
              parser_name = output_parser
          else:
              self.output_parser = output_parser

-         # Set template based on mode
          if use_fully_custom_prompt:
-             self.template = None  # No template in fully custom mode
+             self.template = None
          elif custom_template:
              self.template = custom_template
          elif parser_name and parser_name in self.PARSER_INSTRUCTIONS:
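The parser map in the hunk above covers the four built-in names ('right/wrong', 'pass/fail', 'yes/no', 'numeric'); per the constructor docstring, anything else passed as output_parser must be a callable with signature (str) -> Any. A minimal sketch of such a custom parser; the function name grade_letter is illustrative and not part of the package:

    import re

    def grade_letter(response: str):
        # Pull a single letter grade out of the judge's reply. Returning None
        # lets fallback_comparison take over, as described in the docstring.
        match = re.search(r"\b([ABCDF])\b", response.strip().upper())
        if match is None:
            return None
        return {"A": 4, "B": 3, "C": 2, "D": 1, "F": 0}[match.group(1)]

    # judge = LLMAsAJudge(models=["openai/gpt-4"], output_parser=grade_letter)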
@@ -413,7 +507,6 @@ Output only the number. No explanation. No extra text.""",
                  ground_truth="{ground_truth}",
              )
          else:
-             # Default to right/wrong for custom parsers
              self.template = self.BASE_TEMPLATE.format(
                  instruction=self.PARSER_INSTRUCTIONS['right/wrong'],
                  notes_section="{notes_section}",
@@ -422,12 +515,18 @@ Output only the number. No explanation. No extra text.""",
                  ground_truth="{ground_truth}",
              )

-         self.fallback_comparison = fallback_comparison
-         self.default_temperature = float(default_temperature)
-         self.verbose = verbose
-         self.num_retries = int(num_retries)
-         self.backoff_base = float(backoff_base)
-         self.backoff_max = float(backoff_max)
+         # optional local cache setup
+         self.cache_enabled = litellm_cache_dir is not None
+         if self.cache_enabled:
+             litellm.cache = Cache(type="disk", disk_cache_dir=litellm_cache_dir)
+
+
+
+
+
+
+
+

      def _build_prompt(self, input: Any, model_output: Any, ground_truth: Any) -> str:
          notes_section = f"notes:\n{self.notes}\n" if self.notes else ""
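The cache wiring above is the substantive change in 0.1.11: when litellm_cache_dir is given, the constructor points litellm's global cache at a local disk directory, and the completion call in the next hunk passes caching=self.cache_enabled so repeated identical judge requests can be answered from disk. A minimal usage sketch, assuming the class is importable as llmasajudge.LLMAsAJudge and reusing the model names from the docstring; the cache directory path is illustrative:

    from llmasajudge import LLMAsAJudge  # import path assumed from the package/module name

    judge = LLMAsAJudge(
        models=["openai/gpt-4", "anthropic/claude-3"],
        mode="majority",
        litellm_cache_dir="./.judge_cache",  # new in 0.1.11: enables litellm's disk cache
    )
    # With the cache enabled, re-judging the same model/prompt/temperature combination
    # should be served from ./.judge_cache instead of triggering a fresh API call.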
@@ -495,14 +594,24 @@ Output only the number. No explanation. No extra text.""",
          last_err = None
          for i in range(attempts):
              try:
+                 # resp = completion(
+                 #     model=model,
+                 #     api_base=api_base,  # None uses provider default
+                 #     messages=[{"role": "user", "content": prompt}],
+                 #     temperature=temperature,
+                 #     max_tokens=max_tokens,
+                 #     extra_headers=headers,
+                 # )
+
                  resp = completion(
                      model=model,
-                     api_base=api_base,  # None uses provider default
+                     api_base=api_base,
                      messages=[{"role": "user", "content": prompt}],
                      temperature=temperature,
                      max_tokens=max_tokens,
                      extra_headers=headers,
-                 )
+                     caching=self.cache_enabled
+                 )
                  return (resp.choices[0].message.content or "").strip()
              except Exception as e:
                  last_err = e
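Only the top of the retry loop is visible in this hunk; the constructor's num_retries, backoff_base, and backoff_max (together with the module's time and random imports) indicate retries with capped backoff between failed attempts. A sketch of that pattern under those assumptions; backoff_sleep is an illustrative helper, not the package's actual code:

    import random
    import time

    def backoff_sleep(attempt: int, base: float = 0.5, cap: float = 4.0) -> None:
        # Capped exponential backoff with a little jitter between retries.
        delay = min(cap, base * (2 ** attempt))
        time.sleep(delay + random.uniform(0, delay / 2))

    # e.g. after a failed attempt inside a loop like the one shown above:
    # last_err = e
    # if i < attempts - 1:
    #     backoff_sleep(i, base=self.backoff_base, cap=self.backoff_max)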
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: llmasajudge
- Version: 0.1.10
+ Version: 0.1.11
  Summary: LLM Judge: simple right/wrong voting across models
  Author-email: Brett Young <byyoung3@gmail.com>
  Project-URL: Homepage, https://example.com
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "llmasajudge"
- version = "0.1.10"
+ version = "0.1.11"
  description = "LLM Judge: simple right/wrong voting across models"
  authors = [{name="Brett Young", email="byyoung3@gmail.com"}]
  readme = "README.md"