llmasajudge 0.1.10-py3-none-any.whl → 0.1.12-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
llmasajudge/__init__.py CHANGED
@@ -206,11 +206,133 @@ import time
 import random
 import re
 from typing import Any, Callable, Dict, List, Optional, Tuple
+import litellm
 from litellm import completion
+from litellm.caching.caching import Cache
+
 
 __all__ = ["LLMAsAJudge", "OutputParsers"]
 
 
+class UnlimitedDiskCache:
+    """
+    Drop-in replacement backend with 'unlimited' size for LiteLLM cache.
+
+    This wraps diskcache.Cache with a very large size limit (2^62 bytes ~ 4.6 exabytes)
+    to effectively disable automatic cache eviction, allowing the cache to grow
+    without size constraints.
+    """
+
+    def __init__(self, directory, size_limit=None):
+        """
+        Initialize unlimited disk cache.
+
+        Args:
+            directory: Path to cache directory
+            size_limit: Optional size limit in bytes. If None, uses 2^62 bytes (~4.6 exabytes)
+        """
+        import diskcache as dc
+
+        # Set to very large cap so culling never triggers (effectively unlimited)
+        cap = size_limit if size_limit is not None else (1 << 62)
+        self._dc = dc.Cache(directory, size_limit=cap)
+
+    # Sync API used by LiteLLM
+    def get_cache(self, key, **kwargs):
+        """Get value from cache by key."""
+        return self._dc.get(key)
+
+    def set_cache(self, key, value, ttl=None, **kwargs):
+        """Set value in cache with optional TTL."""
+        expire = None if ttl is None else float(ttl)
+        self._dc.set(key, value, expire=expire)
+
+    # Async API used by LiteLLM
+    async def async_get_cache(self, key, **kwargs):
+        """Async get value from cache by key."""
+        return self.get_cache(key, **kwargs)
+
+    async def async_set_cache(self, key, value, ttl=None, **kwargs):
+        """Async set value in cache with optional TTL."""
+        return self.set_cache(key, value, ttl=ttl, **kwargs)
+
+    async def async_set_cache_pipeline(self, cache_list, ttl=None, **kwargs):
+        """
+        Async batch set multiple cache entries.
+
+        Args:
+            cache_list: List of (key, value) tuples
+            ttl: Optional time-to-live in seconds
+        """
+        for k, v in cache_list:
+            self.set_cache(k, v, ttl=ttl)
+
+    async def batch_cache_write(self, key, value, ttl=None, **kwargs):
+        """Async batch write (single entry)."""
+        self.set_cache(key, value, ttl=ttl)
+
+    async def ping(self):
+        """Async ping check."""
+        return True
+
+    async def delete_cache_keys(self, keys):
+        """
+        Async delete multiple cache keys.
+
+        Args:
+            keys: List of keys to delete
+        """
+        for k in keys:
+            try:
+                del self._dc[k]
+            except KeyError:
+                pass
+        return True
+
+    async def disconnect(self):
+        """Async disconnect and close cache."""
+        self._dc.close()
+
+    def get_stats(self):
+        """
+        Get cache statistics.
+
+        Returns:
+            dict with size_limit, current_size, item_count, and percent_full
+        """
+        size_limit = self._dc.size_limit
+        volume = self._dc.volume()  # Current size in bytes
+        count = len(self._dc)  # Number of items
+
+        return {
+            "size_limit": size_limit,
+            "current_size": volume,
+            "item_count": count,
+            "percent_full": (volume / size_limit) * 100 if size_limit > 0 else 0.0,
+        }
+
+    def print_stats(self):
+        """Print human-readable cache statistics."""
+        stats = self.get_stats()
+
+        def human_size(bytes_val):
+            """Convert bytes to human readable format."""
+            for unit in ["B", "KB", "MB", "GB", "TB", "PB", "EB"]:
+                if bytes_val < 1024.0:
+                    return f"{bytes_val:.2f} {unit}"
+                bytes_val /= 1024.0
+            return f"{bytes_val:.2f} EB"
+
+        print("=" * 60)
+        print("CACHE STATISTICS")
+        print("=" * 60)
+        print(f"  Size limit:   {human_size(stats['size_limit'])} ({stats['size_limit']:,} bytes)")
+        print(f"  Current size: {human_size(stats['current_size'])} ({stats['current_size']:,} bytes)")
+        print(f"  Items cached: {stats['item_count']}")
+        print(f"  % full:       {stats['percent_full']:.6f}%")
+        print("=" * 60)
+
+
 class OutputParsers:
     """Stock output parsers for common judge output formats."""
 
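Note (not part of the diff): a minimal sketch of how the new backend behaves on its own, assuming diskcache is installed and a writable scratch directory. UnlimitedDiskCache is importable at module level even though it is not listed in __all__.

from llmasajudge import UnlimitedDiskCache

cache = UnlimitedDiskCache("/tmp/judge-cache")         # size_limit=None -> 2**62-byte cap
cache.set_cache("q1", {"verdict": "right"}, ttl=3600)  # ttl is forwarded to diskcache as expire
print(cache.get_cache("q1"))                           # {'verdict': 'right'}
print(cache.get_stats()["item_count"])                 # 1
cache.print_stats()                                    # human-readable size/count report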
@@ -322,10 +444,122 @@ Output only the number. No explanation. No extra text.""",
 
 
 
+    # def __init__(
+    #     self,
+    #     models: Optional[List[str]] = None,
+    #     config: Optional[Dict[str, Dict[str, Any]]] = None,  # one dict for providers and models
+    #     base_headers: Optional[Dict[str, str]] = None,
+    #     wandb_project: Optional[str] = None,
+    #     custom_template: Optional[str] = None,
+    #     use_fully_custom_prompt: bool = False,
+    #     notes: Optional[str] = None,
+    #     output_parser: Optional[str] = 'right/wrong',
+    #     fallback_comparison: bool = True,
+    #     default_temperature: float = 0.0,
+    #     verbose: bool = False,
+    #     num_retries: int = 2,  # per-call retries before giving up on that model
+    #     backoff_base: float = 0.5,  # seconds
+    #     backoff_max: float = 4.0,  # seconds
+    #     custom_generation_fns: Optional[List[Callable[[str], str]]] = None,
+    #     mode: str = "majority",  # "single", "majority", "all"
+    # ):
+    #     """
+    #     config keys can be a provider name ("wandb", "openai", "anthropic")
+    #     or a full model name ("openai/gpt-4o-mini", "wandb/deepseek-ai/DeepSeek-V3.1").
+
+    #     Values can include:
+    #         api_base: Optional[str]
+    #         headers: Dict[str, str]
+    #         temperature: float
+
+    #     Precedence:
+    #         base_headers < provider config < model config
+
+    #     Args:
+    #         models: List of litellm model strings (e.g., ["openai/gpt-4", "anthropic/claude-3"])
+    #         custom_template: Template with placeholders for input/output/ground_truth
+    #         use_fully_custom_prompt: If True, pass complete prompt to judge(prompt=...).
+    #             When True, input/output/ground_truth must NOT be passed to judge()
+    #         output_parser: Parser name ('right/wrong', 'yes/no', 'pass/fail', 'numeric')
+    #             or custom function with signature (str) -> Any
+    #         fallback_comparison: If True and parser returns None, falls back to string comparison
+    #         custom_generation_fns: List of custom inference functions with signature fn(prompt: str) -> str
+    #             These will be used in addition to litellm models for voting.
+    #         mode: Voting mode - "majority" (default), "single" (first judge only), or "all" (unanimous)
+    #     """
+    #     self.models = models or []
+    #     self.custom_generation_fns = custom_generation_fns or []
+
+    #     # Validate that at least one judge is provided
+    #     if not self.models and not self.custom_generation_fns:
+    #         raise ValueError("Must provide at least one of: models (litellm) or custom_generation_fns")
+
+    #     # Validate mode
+    #     if mode not in ("majority", "single", "all"):
+    #         raise ValueError("mode must be 'majority', 'single', or 'all'")
+
+    #     self.config = config or {}
+    #     self.base_headers = dict(base_headers or {})
+    #     self.wandb_project = wandb_project or os.getenv("WANDB_PROJECT")
+    #     self.notes = notes or ""
+    #     self.use_fully_custom_prompt = use_fully_custom_prompt
+    #     self.mode = mode
+
+    #     # Resolve output parser
+    #     parser_name = None
+    #     if isinstance(output_parser, str):
+    #         parser_map = {
+    #             'right/wrong': OutputParsers.right_wrong,
+    #             'pass/fail': OutputParsers.pass_fail,
+    #             'yes/no': OutputParsers.yes_no,
+    #             'numeric': OutputParsers.numeric_score,
+    #         }
+    #         if output_parser not in parser_map:
+    #             raise ValueError(f"Unknown parser '{output_parser}'. Available: {list(parser_map.keys())}")
+    #         self.output_parser = parser_map[output_parser]
+    #         parser_name = output_parser
+    #     else:
+    #         self.output_parser = output_parser
+
+    #     # Set template based on mode
+    #     if use_fully_custom_prompt:
+    #         self.template = None  # No template in fully custom mode
+    #     elif custom_template:
+    #         self.template = custom_template
+    #     elif parser_name and parser_name in self.PARSER_INSTRUCTIONS:
+    #         self.template = self.BASE_TEMPLATE.format(
+    #             instruction=self.PARSER_INSTRUCTIONS[parser_name],
+    #             notes_section="{notes_section}",
+    #             input_block="{input_block}",
+    #             model_output="{model_output}",
+    #             ground_truth="{ground_truth}",
+    #         )
+    #     else:
+    #         # Default to right/wrong for custom parsers
+    #         self.template = self.BASE_TEMPLATE.format(
+    #             instruction=self.PARSER_INSTRUCTIONS['right/wrong'],
+    #             notes_section="{notes_section}",
+    #             input_block="{input_block}",
+    #             model_output="{model_output}",
+    #             ground_truth="{ground_truth}",
+    #         )
+
+    #     self.fallback_comparison = fallback_comparison
+    #     self.default_temperature = float(default_temperature)
+    #     self.verbose = verbose
+    #     self.num_retries = int(num_retries)
+    #     self.backoff_base = float(backoff_base)
+    #     self.backoff_max = float(backoff_max)
+
+
+
+
+
+
     def __init__(
         self,
         models: Optional[List[str]] = None,
-        config: Optional[Dict[str, Dict[str, Any]]] = None,  # one dict for providers and models
+        config: Optional[Dict[str, Dict[str, Any]]] = None,
        base_headers: Optional[Dict[str, str]] = None,
         wandb_project: Optional[str] = None,
         custom_template: Optional[str] = None,
@@ -335,44 +569,20 @@ Output only the number. No explanation. No extra text.""",
         fallback_comparison: bool = True,
         default_temperature: float = 0.0,
         verbose: bool = False,
-        num_retries: int = 2,  # per-call retries before giving up on that model
-        backoff_base: float = 0.5,  # seconds
-        backoff_max: float = 4.0,  # seconds
+        num_retries: int = 2,
+        backoff_base: float = 0.5,
+        backoff_max: float = 4.0,
         custom_generation_fns: Optional[List[Callable[[str], str]]] = None,
-        mode: str = "majority",  # "single", "majority", "all"
+        mode: str = "majority",
+        litellm_cache_dir: Optional[str] = None,
+        cache_size_gb: Optional[float] = None,
     ):
-        """
-        config keys can be a provider name ("wandb", "openai", "anthropic")
-        or a full model name ("openai/gpt-4o-mini", "wandb/deepseek-ai/DeepSeek-V3.1").
-
-        Values can include:
-            api_base: Optional[str]
-            headers: Dict[str, str]
-            temperature: float
-
-        Precedence:
-            base_headers < provider config < model config
-
-        Args:
-            models: List of litellm model strings (e.g., ["openai/gpt-4", "anthropic/claude-3"])
-            custom_template: Template with placeholders for input/output/ground_truth
-            use_fully_custom_prompt: If True, pass complete prompt to judge(prompt=...).
-                When True, input/output/ground_truth must NOT be passed to judge()
-            output_parser: Parser name ('right/wrong', 'yes/no', 'pass/fail', 'numeric')
-                or custom function with signature (str) -> Any
-            fallback_comparison: If True and parser returns None, falls back to string comparison
-            custom_generation_fns: List of custom inference functions with signature fn(prompt: str) -> str
-                These will be used in addition to litellm models for voting.
-            mode: Voting mode - "majority" (default), "single" (first judge only), or "all" (unanimous)
-        """
         self.models = models or []
         self.custom_generation_fns = custom_generation_fns or []
 
-        # Validate that at least one judge is provided
         if not self.models and not self.custom_generation_fns:
             raise ValueError("Must provide at least one of: models (litellm) or custom_generation_fns")
 
-        # Validate mode
         if mode not in ("majority", "single", "all"):
             raise ValueError("mode must be 'majority', 'single', or 'all'")
 
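Note (not part of the diff): the docstring removed above still documents the config contract — keys are provider names or full model names, values may carry api_base / headers / temperature, with precedence base_headers < provider config < model config. A sketch of a conforming value, with hypothetical endpoint and header:

config = {
    "openai": {"temperature": 0.0},                  # provider-level default
    "openai/gpt-4o-mini": {                          # model-level entry wins over the provider one
        "api_base": "https://llm-proxy.example/v1",  # hypothetical endpoint
        "headers": {"X-Team": "eval"},               # hypothetical header
        "temperature": 0.2,
    },
}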
@@ -382,8 +592,13 @@ Output only the number. No explanation. No extra text.""",
         self.notes = notes or ""
         self.use_fully_custom_prompt = use_fully_custom_prompt
         self.mode = mode
+        self.fallback_comparison = fallback_comparison
+        self.default_temperature = float(default_temperature)
+        self.verbose = verbose
+        self.num_retries = int(num_retries)
+        self.backoff_base = float(backoff_base)
+        self.backoff_max = float(backoff_max)
 
-        # Resolve output parser
         parser_name = None
         if isinstance(output_parser, str):
             parser_map = {
@@ -393,15 +608,14 @@ Output only the number. No explanation. No extra text.""",
                 'numeric': OutputParsers.numeric_score,
             }
             if output_parser not in parser_map:
-                raise ValueError(f"Unknown parser '{output_parser}'. Available: {list(parser_map.keys())}")
+                raise ValueError(f"Unknown parser '{output_parser}'")
             self.output_parser = parser_map[output_parser]
             parser_name = output_parser
         else:
             self.output_parser = output_parser
 
-        # Set template based on mode
         if use_fully_custom_prompt:
-            self.template = None  # No template in fully custom mode
+            self.template = None
         elif custom_template:
             self.template = custom_template
         elif parser_name and parser_name in self.PARSER_INSTRUCTIONS:
@@ -413,7 +627,6 @@ Output only the number. No explanation. No extra text.""",
                 ground_truth="{ground_truth}",
             )
         else:
-            # Default to right/wrong for custom parsers
             self.template = self.BASE_TEMPLATE.format(
                 instruction=self.PARSER_INSTRUCTIONS['right/wrong'],
                 notes_section="{notes_section}",
@@ -422,12 +635,22 @@ Output only the number. No explanation. No extra text.""",
                 ground_truth="{ground_truth}",
             )
 
-        self.fallback_comparison = fallback_comparison
-        self.default_temperature = float(default_temperature)
-        self.verbose = verbose
-        self.num_retries = int(num_retries)
-        self.backoff_base = float(backoff_base)
-        self.backoff_max = float(backoff_max)
+        # optional local cache setup
+        self.cache_enabled = litellm_cache_dir is not None
+        if self.cache_enabled:
+            # Convert GB to bytes if specified, otherwise unlimited
+            size_limit_bytes = None if cache_size_gb is None else int(cache_size_gb * 1024 * 1024 * 1024)
+            cache_backend = UnlimitedDiskCache(litellm_cache_dir, size_limit=size_limit_bytes)
+            litellm.cache = Cache(disk_cache_dir=litellm_cache_dir)
+            litellm.cache.cache = cache_backend
+
+
+
+
+
+
+
+
 
     def _build_prompt(self, input: Any, model_output: Any, ground_truth: Any) -> str:
         notes_section = f"notes:\n{self.notes}\n" if self.notes else ""
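Note (not part of the diff): a sketch of the new caching knobs as wired above, assuming a configured provider key and a writable directory. cache_size_gb is converted as int(gb * 1024**3), so 2.0 becomes 2,147,483,648 bytes; leaving it None keeps the 2**62-byte "unlimited" cap.

from llmasajudge import LLMAsAJudge

judge = LLMAsAJudge(
    models=["openai/gpt-4o-mini"],       # any litellm model string
    litellm_cache_dir="./.judge_cache",  # passing this sets cache_enabled = True
    cache_size_gb=2.0,                   # optional hard cap; omit for effectively unlimited
)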
@@ -495,14 +718,24 @@ Output only the number. No explanation. No extra text.""",
         last_err = None
         for i in range(attempts):
             try:
+                # resp = completion(
+                #     model=model,
+                #     api_base=api_base,  # None uses provider default
+                #     messages=[{"role": "user", "content": prompt}],
+                #     temperature=temperature,
+                #     max_tokens=max_tokens,
+                #     extra_headers=headers,
+                # )
+
                 resp = completion(
                     model=model,
-                    api_base=api_base,  # None uses provider default
+                    api_base=api_base,
                     messages=[{"role": "user", "content": prompt}],
                     temperature=temperature,
                     max_tokens=max_tokens,
                     extra_headers=headers,
-                )
+                    caching=self.cache_enabled
+                )
                 return (resp.choices[0].message.content or "").strip()
             except Exception as e:
                 last_err = e
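Note (not part of the diff): the caching=self.cache_enabled flag only takes effect because the constructor installed a global litellm.cache. A minimal sketch of that mechanism, following LiteLLM's documented disk-cache usage (the constructor above immediately swaps in UnlimitedDiskCache as the backend, so the Cache object it builds is only a shell):

import litellm
from litellm import completion
from litellm.caching.caching import Cache

litellm.cache = Cache(type="disk", disk_cache_dir="./.judge_cache")

msgs = [{"role": "user", "content": "2+2?"}]
first = completion(model="openai/gpt-4o-mini", messages=msgs, caching=True)   # miss: calls the API, stores the response
second = completion(model="openai/gpt-4o-mini", messages=msgs, caching=True)  # hit: identical request served from disk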
llmasajudge-0.1.12.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: llmasajudge
-Version: 0.1.10
+Version: 0.1.12
 Summary: LLM Judge: simple right/wrong voting across models
 Author-email: Brett Young <byyoung3@gmail.com>
 Project-URL: Homepage, https://example.com
llmasajudge-0.1.12.dist-info/RECORD ADDED
@@ -0,0 +1,5 @@
+llmasajudge/__init__.py,sha256=cXxvx3shu0h40u1jXb-MqB0-mzkX1FWZElXnzBOE070,31957
+llmasajudge-0.1.12.dist-info/METADATA,sha256=efGYG1GCWizmcRoXS3zLEzdvQqqPB8JIRy_tlDOqpfM,515
+llmasajudge-0.1.12.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+llmasajudge-0.1.12.dist-info/top_level.txt,sha256=rRaIpM1llpEqahR9flT3RjpZHi2o16iOgnGYJ8cO4_0,12
+llmasajudge-0.1.12.dist-info/RECORD,,
llmasajudge-0.1.10.dist-info/RECORD REMOVED
@@ -1,5 +0,0 @@
-llmasajudge/__init__.py,sha256=IZk0rwFbH6cNa5GrzUeuBI6jdo92fhgu5ycHlJDfQjc,23496
-llmasajudge-0.1.10.dist-info/METADATA,sha256=1YVh8GW2_xT9EJvdVvlwuHwJzQ4PRaURZJZ1KRXUtSs,515
-llmasajudge-0.1.10.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-llmasajudge-0.1.10.dist-info/top_level.txt,sha256=rRaIpM1llpEqahR9flT3RjpZHi2o16iOgnGYJ8cO4_0,12
-llmasajudge-0.1.10.dist-info/RECORD,,