llmasajudge 0.1.10-py3-none-any.whl → 0.1.12-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
llmasajudge/__init__.py CHANGED
@@ -206,11 +206,133 @@ import time
 import random
 import re
 from typing import Any, Callable, Dict, List, Optional, Tuple
+import litellm
 from litellm import completion
+from litellm.caching.caching import Cache
+
 
 __all__ = ["LLMAsAJudge", "OutputParsers"]
 
 
+class UnlimitedDiskCache:
+    """
+    Drop-in replacement backend with 'unlimited' size for LiteLLM cache.
+
+    This wraps diskcache.Cache with a very large size limit (2^62 bytes ~ 4.6 exabytes)
+    to effectively disable automatic cache eviction, allowing the cache to grow
+    without size constraints.
+    """
+
+    def __init__(self, directory, size_limit=None):
+        """
+        Initialize unlimited disk cache.
+
+        Args:
+            directory: Path to cache directory
+            size_limit: Optional size limit in bytes. If None, uses 2^62 bytes (~4.6 exabytes)
+        """
+        import diskcache as dc
+
+        # Set to very large cap so culling never triggers (effectively unlimited)
+        cap = size_limit if size_limit is not None else (1 << 62)
+        self._dc = dc.Cache(directory, size_limit=cap)
+
+    # Sync API used by LiteLLM
+    def get_cache(self, key, **kwargs):
+        """Get value from cache by key."""
+        return self._dc.get(key)
+
+    def set_cache(self, key, value, ttl=None, **kwargs):
+        """Set value in cache with optional TTL."""
+        expire = None if ttl is None else float(ttl)
+        self._dc.set(key, value, expire=expire)
+
+    # Async API used by LiteLLM
+    async def async_get_cache(self, key, **kwargs):
+        """Async get value from cache by key."""
+        return self.get_cache(key, **kwargs)
+
+    async def async_set_cache(self, key, value, ttl=None, **kwargs):
+        """Async set value in cache with optional TTL."""
+        return self.set_cache(key, value, ttl=ttl, **kwargs)
+
+    async def async_set_cache_pipeline(self, cache_list, ttl=None, **kwargs):
+        """
+        Async batch set multiple cache entries.
+
+        Args:
+            cache_list: List of (key, value) tuples
+            ttl: Optional time-to-live in seconds
+        """
+        for k, v in cache_list:
+            self.set_cache(k, v, ttl=ttl)
+
+    async def batch_cache_write(self, key, value, ttl=None, **kwargs):
+        """Async batch write (single entry)."""
+        self.set_cache(key, value, ttl=ttl)
+
+    async def ping(self):
+        """Async ping check."""
+        return True
+
+    async def delete_cache_keys(self, keys):
+        """
+        Async delete multiple cache keys.
+
+        Args:
+            keys: List of keys to delete
+        """
+        for k in keys:
+            try:
+                del self._dc[k]
+            except KeyError:
+                pass
+        return True
+
+    async def disconnect(self):
+        """Async disconnect and close cache."""
+        self._dc.close()
+
+    def get_stats(self):
+        """
+        Get cache statistics.
+
+        Returns:
+            dict with size_limit, current_size, item_count, and percent_full
+        """
+        size_limit = self._dc.size_limit
+        volume = self._dc.volume()  # Current size in bytes
+        count = len(self._dc)  # Number of items
+
+        return {
+            "size_limit": size_limit,
+            "current_size": volume,
+            "item_count": count,
+            "percent_full": (volume / size_limit) * 100 if size_limit > 0 else 0.0,
+        }
+
+    def print_stats(self):
+        """Print human-readable cache statistics."""
+        stats = self.get_stats()
+
+        def human_size(bytes_val):
+            """Convert bytes to human readable format."""
+            for unit in ["B", "KB", "MB", "GB", "TB", "PB", "EB"]:
+                if bytes_val < 1024.0:
+                    return f"{bytes_val:.2f} {unit}"
+                bytes_val /= 1024.0
+            return f"{bytes_val:.2f} EB"
+
+        print("=" * 60)
+        print("CACHE STATISTICS")
+        print("=" * 60)
+        print(f"  Size limit:   {human_size(stats['size_limit'])} ({stats['size_limit']:,} bytes)")
+        print(f"  Current size: {human_size(stats['current_size'])} ({stats['current_size']:,} bytes)")
+        print(f"  Items cached: {stats['item_count']}")
+        print(f"  % full:       {stats['percent_full']:.6f}%")
+        print("=" * 60)
+
+
 class OutputParsers:
     """Stock output parsers for common judge output formats."""
 
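Note (not part of the diff): a minimal sketch of how the new backend behaves on its own, assuming diskcache is installed and a writable scratch directory. UnlimitedDiskCache is importable at module level even though it is not listed in __all__.

from llmasajudge import UnlimitedDiskCache

cache = UnlimitedDiskCache("/tmp/judge-cache")         # size_limit=None -> 2**62-byte cap
cache.set_cache("q1", {"verdict": "right"}, ttl=3600)  # ttl is forwarded to diskcache as expire
print(cache.get_cache("q1"))                           # {'verdict': 'right'}
print(cache.get_stats()["item_count"])                 # 1
cache.print_stats()                                    # human-readable size/count report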
@@ -322,10 +444,122 @@ Output only the number. No explanation. No extra text.""",
 
 
 
+    # def __init__(
+    #     self,
+    #     models: Optional[List[str]] = None,
+    #     config: Optional[Dict[str, Dict[str, Any]]] = None,  # one dict for providers and models
+    #     base_headers: Optional[Dict[str, str]] = None,
+    #     wandb_project: Optional[str] = None,
+    #     custom_template: Optional[str] = None,
+    #     use_fully_custom_prompt: bool = False,
+    #     notes: Optional[str] = None,
+    #     output_parser: Optional[str] = 'right/wrong',
+    #     fallback_comparison: bool = True,
+    #     default_temperature: float = 0.0,
+    #     verbose: bool = False,
+    #     num_retries: int = 2,  # per-call retries before giving up on that model
+    #     backoff_base: float = 0.5,  # seconds
+    #     backoff_max: float = 4.0,  # seconds
+    #     custom_generation_fns: Optional[List[Callable[[str], str]]] = None,
+    #     mode: str = "majority",  # "single", "majority", "all"
+    # ):
+    #     """
+    #     config keys can be a provider name ("wandb", "openai", "anthropic")
+    #     or a full model name ("openai/gpt-4o-mini", "wandb/deepseek-ai/DeepSeek-V3.1").
+
+    #     Values can include:
+    #         api_base: Optional[str]
+    #         headers: Dict[str, str]
+    #         temperature: float
+
+    #     Precedence:
+    #         base_headers < provider config < model config
+
+    #     Args:
+    #         models: List of litellm model strings (e.g., ["openai/gpt-4", "anthropic/claude-3"])
+    #         custom_template: Template with placeholders for input/output/ground_truth
+    #         use_fully_custom_prompt: If True, pass complete prompt to judge(prompt=...).
+    #             When True, input/output/ground_truth must NOT be passed to judge()
+    #         output_parser: Parser name ('right/wrong', 'yes/no', 'pass/fail', 'numeric')
+    #             or custom function with signature (str) -> Any
+    #         fallback_comparison: If True and parser returns None, falls back to string comparison
+    #         custom_generation_fns: List of custom inference functions with signature fn(prompt: str) -> str
+    #             These will be used in addition to litellm models for voting.
+    #         mode: Voting mode - "majority" (default), "single" (first judge only), or "all" (unanimous)
+    #     """
+    #     self.models = models or []
+    #     self.custom_generation_fns = custom_generation_fns or []
+
+    #     # Validate that at least one judge is provided
+    #     if not self.models and not self.custom_generation_fns:
+    #         raise ValueError("Must provide at least one of: models (litellm) or custom_generation_fns")
+
+    #     # Validate mode
+    #     if mode not in ("majority", "single", "all"):
+    #         raise ValueError("mode must be 'majority', 'single', or 'all'")
+
+    #     self.config = config or {}
+    #     self.base_headers = dict(base_headers or {})
+    #     self.wandb_project = wandb_project or os.getenv("WANDB_PROJECT")
+    #     self.notes = notes or ""
+    #     self.use_fully_custom_prompt = use_fully_custom_prompt
+    #     self.mode = mode
+
+    #     # Resolve output parser
+    #     parser_name = None
+    #     if isinstance(output_parser, str):
+    #         parser_map = {
+    #             'right/wrong': OutputParsers.right_wrong,
+    #             'pass/fail': OutputParsers.pass_fail,
+    #             'yes/no': OutputParsers.yes_no,
+    #             'numeric': OutputParsers.numeric_score,
+    #         }
+    #         if output_parser not in parser_map:
+    #             raise ValueError(f"Unknown parser '{output_parser}'. Available: {list(parser_map.keys())}")
+    #         self.output_parser = parser_map[output_parser]
+    #         parser_name = output_parser
+    #     else:
+    #         self.output_parser = output_parser
+
+    #     # Set template based on mode
+    #     if use_fully_custom_prompt:
+    #         self.template = None  # No template in fully custom mode
+    #     elif custom_template:
+    #         self.template = custom_template
+    #     elif parser_name and parser_name in self.PARSER_INSTRUCTIONS:
+    #         self.template = self.BASE_TEMPLATE.format(
+    #             instruction=self.PARSER_INSTRUCTIONS[parser_name],
+    #             notes_section="{notes_section}",
+    #             input_block="{input_block}",
+    #             model_output="{model_output}",
+    #             ground_truth="{ground_truth}",
+    #         )
+    #     else:
+    #         # Default to right/wrong for custom parsers
+    #         self.template = self.BASE_TEMPLATE.format(
+    #             instruction=self.PARSER_INSTRUCTIONS['right/wrong'],
+    #             notes_section="{notes_section}",
+    #             input_block="{input_block}",
+    #             model_output="{model_output}",
+    #             ground_truth="{ground_truth}",
+    #         )
+
+    #     self.fallback_comparison = fallback_comparison
+    #     self.default_temperature = float(default_temperature)
+    #     self.verbose = verbose
+    #     self.num_retries = int(num_retries)
+    #     self.backoff_base = float(backoff_base)
+    #     self.backoff_max = float(backoff_max)
+
+
+
+
+
+
     def __init__(
         self,
         models: Optional[List[str]] = None,
-        config: Optional[Dict[str, Dict[str, Any]]] = None,  # one dict for providers and models
+        config: Optional[Dict[str, Dict[str, Any]]] = None,
        base_headers: Optional[Dict[str, str]] = None,
         wandb_project: Optional[str] = None,
         custom_template: Optional[str] = None,
@@ -335,44 +569,20 @@ Output only the number. No explanation. No extra text.""",
         fallback_comparison: bool = True,
         default_temperature: float = 0.0,
         verbose: bool = False,
-        num_retries: int = 2,  # per-call retries before giving up on that model
-        backoff_base: float = 0.5,  # seconds
-        backoff_max: float = 4.0,  # seconds
+        num_retries: int = 2,
+        backoff_base: float = 0.5,
+        backoff_max: float = 4.0,
         custom_generation_fns: Optional[List[Callable[[str], str]]] = None,
-        mode: str = "majority",  # "single", "majority", "all"
+        mode: str = "majority",
+        litellm_cache_dir: Optional[str] = None,
+        cache_size_gb: Optional[float] = None,
     ):
-        """
-        config keys can be a provider name ("wandb", "openai", "anthropic")
-        or a full model name ("openai/gpt-4o-mini", "wandb/deepseek-ai/DeepSeek-V3.1").
-
-        Values can include:
-            api_base: Optional[str]
-            headers: Dict[str, str]
-            temperature: float
-
-        Precedence:
-            base_headers < provider config < model config
-
-        Args:
-            models: List of litellm model strings (e.g., ["openai/gpt-4", "anthropic/claude-3"])
-            custom_template: Template with placeholders for input/output/ground_truth
-            use_fully_custom_prompt: If True, pass complete prompt to judge(prompt=...).
-                When True, input/output/ground_truth must NOT be passed to judge()
-            output_parser: Parser name ('right/wrong', 'yes/no', 'pass/fail', 'numeric')
-                or custom function with signature (str) -> Any
-            fallback_comparison: If True and parser returns None, falls back to string comparison
-            custom_generation_fns: List of custom inference functions with signature fn(prompt: str) -> str
-                These will be used in addition to litellm models for voting.
-            mode: Voting mode - "majority" (default), "single" (first judge only), or "all" (unanimous)
-        """
         self.models = models or []
         self.custom_generation_fns = custom_generation_fns or []
 
-        # Validate that at least one judge is provided
         if not self.models and not self.custom_generation_fns:
             raise ValueError("Must provide at least one of: models (litellm) or custom_generation_fns")
 
-        # Validate mode
         if mode not in ("majority", "single", "all"):
             raise ValueError("mode must be 'majority', 'single', or 'all'")
 
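Note (not part of the diff): the docstring removed above still documents the config contract — keys are provider names or full model names, values may carry api_base / headers / temperature, with precedence base_headers < provider config < model config. A sketch of a conforming value, with hypothetical endpoint and header:

config = {
    "openai": {"temperature": 0.0},                  # provider-level default
    "openai/gpt-4o-mini": {                          # model-level entry wins over the provider one
        "api_base": "https://llm-proxy.example/v1",  # hypothetical endpoint
        "headers": {"X-Team": "eval"},               # hypothetical header
        "temperature": 0.2,
    },
}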
@@ -382,8 +592,13 @@ Output only the number. No explanation. No extra text.""",
         self.notes = notes or ""
         self.use_fully_custom_prompt = use_fully_custom_prompt
         self.mode = mode
+        self.fallback_comparison = fallback_comparison
+        self.default_temperature = float(default_temperature)
+        self.verbose = verbose
+        self.num_retries = int(num_retries)
+        self.backoff_base = float(backoff_base)
+        self.backoff_max = float(backoff_max)
 
-        # Resolve output parser
         parser_name = None
         if isinstance(output_parser, str):
             parser_map = {
@@ -393,15 +608,14 @@ Output only the number. No explanation. No extra text.""",
                 'numeric': OutputParsers.numeric_score,
             }
             if output_parser not in parser_map:
-                raise ValueError(f"Unknown parser '{output_parser}'. Available: {list(parser_map.keys())}")
+                raise ValueError(f"Unknown parser '{output_parser}'")
             self.output_parser = parser_map[output_parser]
             parser_name = output_parser
         else:
             self.output_parser = output_parser
 
-        # Set template based on mode
         if use_fully_custom_prompt:
-            self.template = None  # No template in fully custom mode
+            self.template = None
         elif custom_template:
             self.template = custom_template
         elif parser_name and parser_name in self.PARSER_INSTRUCTIONS:
@@ -413,7 +627,6 @@ Output only the number. No explanation. No extra text.""",
                 ground_truth="{ground_truth}",
             )
         else:
-            # Default to right/wrong for custom parsers
             self.template = self.BASE_TEMPLATE.format(
                 instruction=self.PARSER_INSTRUCTIONS['right/wrong'],
                 notes_section="{notes_section}",
@@ -422,12 +635,22 @@ Output only the number. No explanation. No extra text.""",
                 ground_truth="{ground_truth}",
             )
 
-        self.fallback_comparison = fallback_comparison
-        self.default_temperature = float(default_temperature)
-        self.verbose = verbose
-        self.num_retries = int(num_retries)
-        self.backoff_base = float(backoff_base)
-        self.backoff_max = float(backoff_max)
+        # optional local cache setup
+        self.cache_enabled = litellm_cache_dir is not None
+        if self.cache_enabled:
+            # Convert GB to bytes if specified, otherwise unlimited
+            size_limit_bytes = None if cache_size_gb is None else int(cache_size_gb * 1024 * 1024 * 1024)
+            cache_backend = UnlimitedDiskCache(litellm_cache_dir, size_limit=size_limit_bytes)
+            litellm.cache = Cache(disk_cache_dir=litellm_cache_dir)
+            litellm.cache.cache = cache_backend
+
+
+
+
+
+
+
+
 
     def _build_prompt(self, input: Any, model_output: Any, ground_truth: Any) -> str:
         notes_section = f"notes:\n{self.notes}\n" if self.notes else ""
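Note (not part of the diff): a sketch of the new caching knobs as wired above, assuming a configured provider key and a writable directory. cache_size_gb is converted as int(gb * 1024**3), so 2.0 becomes 2,147,483,648 bytes; leaving it None keeps the 2**62-byte "unlimited" cap.

from llmasajudge import LLMAsAJudge

judge = LLMAsAJudge(
    models=["openai/gpt-4o-mini"],       # any litellm model string
    litellm_cache_dir="./.judge_cache",  # passing this sets cache_enabled = True
    cache_size_gb=2.0,                   # optional hard cap; omit for effectively unlimited
)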
@@ -495,14 +718,24 @@ Output only the number. No explanation. No extra text.""",
         last_err = None
         for i in range(attempts):
             try:
+                # resp = completion(
+                #     model=model,
+                #     api_base=api_base,  # None uses provider default
+                #     messages=[{"role": "user", "content": prompt}],
+                #     temperature=temperature,
+                #     max_tokens=max_tokens,
+                #     extra_headers=headers,
+                # )
+
                 resp = completion(
                     model=model,
-                    api_base=api_base,  # None uses provider default
+                    api_base=api_base,
                     messages=[{"role": "user", "content": prompt}],
                     temperature=temperature,
                     max_tokens=max_tokens,
                     extra_headers=headers,
-                )
+                    caching=self.cache_enabled
+                )
                 return (resp.choices[0].message.content or "").strip()
             except Exception as e:
                 last_err = e
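Note (not part of the diff): the caching=self.cache_enabled flag only takes effect because the constructor installed a global litellm.cache. A minimal sketch of that mechanism, following LiteLLM's documented disk-cache usage (the constructor above immediately swaps in UnlimitedDiskCache as the backend, so the Cache object it builds is only a shell):

import litellm
from litellm import completion
from litellm.caching.caching import Cache

litellm.cache = Cache(type="disk", disk_cache_dir="./.judge_cache")

msgs = [{"role": "user", "content": "2+2?"}]
first = completion(model="openai/gpt-4o-mini", messages=msgs, caching=True)   # miss: calls the API, stores the response
second = completion(model="openai/gpt-4o-mini", messages=msgs, caching=True)  # hit: identical request served from disk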
llmasajudge-0.1.12.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: llmasajudge
-Version: 0.1.10
+Version: 0.1.12
 Summary: LLM Judge: simple right/wrong voting across models
 Author-email: Brett Young <byyoung3@gmail.com>
 Project-URL: Homepage, https://example.com
llmasajudge-0.1.12.dist-info/RECORD ADDED
@@ -0,0 +1,5 @@
+llmasajudge/__init__.py,sha256=cXxvx3shu0h40u1jXb-MqB0-mzkX1FWZElXnzBOE070,31957
+llmasajudge-0.1.12.dist-info/METADATA,sha256=efGYG1GCWizmcRoXS3zLEzdvQqqPB8JIRy_tlDOqpfM,515
+llmasajudge-0.1.12.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+llmasajudge-0.1.12.dist-info/top_level.txt,sha256=rRaIpM1llpEqahR9flT3RjpZHi2o16iOgnGYJ8cO4_0,12
+llmasajudge-0.1.12.dist-info/RECORD,,
llmasajudge-0.1.10.dist-info/RECORD REMOVED
@@ -1,5 +0,0 @@
-llmasajudge/__init__.py,sha256=IZk0rwFbH6cNa5GrzUeuBI6jdo92fhgu5ycHlJDfQjc,23496
-llmasajudge-0.1.10.dist-info/METADATA,sha256=1YVh8GW2_xT9EJvdVvlwuHwJzQ4PRaURZJZ1KRXUtSs,515
-llmasajudge-0.1.10.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-llmasajudge-0.1.10.dist-info/top_level.txt,sha256=rRaIpM1llpEqahR9flT3RjpZHi2o16iOgnGYJ8cO4_0,12
-llmasajudge-0.1.10.dist-info/RECORD,,