PyPI - crfm-helm - Versions diffs - 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl - Mend

crfm-helm 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (546) hide show

helm/common/cache.py CHANGED Viewed

@@ -1,19 +1,14 @@
-# mypy: check_untyped_defs = False
-from abc import abstractmethod
-import contextlib
+from collections import defaultdict
 from dataclasses import dataclass
+from typing import Dict, Callable, Generator, Mapping, Optional, Tuple
 import json
-from typing import Dict, Callable, Generator, Iterable, Optional, Tuple
-from collections import defaultdict
-import sqlite3
 import threading
-from sqlitedict import SqliteDict
+import sqlite3
 from helm.common.general import hlog, htrack
+from helm.common.key_value_store import BlackHoleKeyValueStore, KeyValueStore, SqliteKeyValueStore
 from helm.proxy.retry import get_retry_decorator
-from bson.son import SON
-from bson.errors import InvalidDocument
-from pymongo import MongoClient, ReplaceOne
 try:
     from cPickle import loads
@@ -21,31 +16,19 @@ except ImportError:
     from pickle import loads
-def request_to_key(request: Dict) -> str:
-    """Normalize a `request` into a `key` so that we can hash using it."""
-    return json.dumps(request, sort_keys=True)
-def key_to_request(key: str) -> Dict:
-    """Convert the normalized version to the request."""
-    return json.loads(key)
 def retry_if_write_failed(success: bool) -> bool:
     """Retries when the write fails."""
     return not success
 retry: Callable = get_retry_decorator(
-    "Write", max_attempts=10, wait_exponential_multiplier_seconds=2, retry_on_result=retry_if_write_failed
+    "Write", max_attempts=5, wait_exponential_multiplier_seconds=2, retry_on_result=retry_if_write_failed
 )
 class CacheConfig:
     """Configuration for a cache."""
-    pass
     @property
     def cache_stats_key(self) -> str:
         """The string key used by CacheStats to identify this cache."""
@@ -55,8 +38,6 @@ class CacheConfig:
 class KeyValueStoreCacheConfig(CacheConfig):
     """Configuration for a cache backed by a key-value store."""
-    pass
 @dataclass(frozen=True)
 class SqliteCacheConfig(KeyValueStoreCacheConfig):
@@ -70,6 +51,16 @@ class SqliteCacheConfig(KeyValueStoreCacheConfig):
         return self.path
+@dataclass(frozen=True)
+class BlackHoleCacheConfig(KeyValueStoreCacheConfig):
+    """Configuration for a cache that does not save any data."""
+    @property
+    def cache_stats_key(self) -> str:
+        """The string key used by CacheStats to identify this cache."""
+        return "disabled_cache"
 @dataclass(frozen=True)
 class MongoCacheConfig(KeyValueStoreCacheConfig):
     """Configuration for a cache backed by a MongoDB collection."""
@@ -105,156 +96,6 @@ class WithFollowerCacheConfig(CacheConfig):
         return self.main.cache_stats_key
-class KeyValueStore(contextlib.AbstractContextManager):
-    """Key value store that persists writes."""
-    @property
-    def path(self):
-        return self._path
-    @abstractmethod
-    def contains(self, key: Dict) -> bool:
-        pass
-    @abstractmethod
-    def get(self, key: Dict) -> Optional[Dict]:
-        pass
-    @abstractmethod
-    def get_all(self) -> Generator[Tuple[Dict, Dict], None, None]:
-        pass
-    @abstractmethod
-    def put(self, key: Dict, value: Dict) -> None:
-        pass
-    @abstractmethod
-    def multi_put(self, pairs: Iterable[Tuple[Dict, Dict]]) -> None:
-        pass
-    @abstractmethod
-    def remove(self, key: Dict) -> None:
-        pass
-class _SqliteKeyValueStore(KeyValueStore):
-    """Key value store backed by a SQLite file."""
-    def __init__(self, path: str):
-        self._sqlite_dict = SqliteDict(path)
-        super().__init__()
-    def __enter__(self) -> "_SqliteKeyValueStore":
-        self._sqlite_dict.__enter__()
-        return self
-    def __exit__(self, exc_type, exc_value, traceback) -> None:
-        self._sqlite_dict.__exit__(exc_type, exc_value, traceback)
-    def contains(self, key: Dict) -> bool:
-        return request_to_key(key) in self._sqlite_dict
-    def get(self, key: Dict) -> Optional[Dict]:
-        key_string = request_to_key(key)
-        result = self._sqlite_dict.get(key_string)
-        if result is not None:
-            assert isinstance(result, dict)
-            return result
-        return None
-    def get_all(self) -> Generator[Tuple[Dict, Dict], None, None]:
-        for key, value in self._sqlite_dict.items():
-            yield (key, value)
-    def put(self, key: Dict, value: Dict) -> None:
-        key_string = request_to_key(key)
-        self._sqlite_dict[key_string] = value
-        self._sqlite_dict.commit()
-    def multi_put(self, pairs: Iterable[Tuple[Dict, Dict]]) -> None:
-        for key, value in pairs:
-            self.put(key, value)
-    def remove(self, key: Dict) -> None:
-        del self._sqlite_dict[key]
-        self._sqlite_dict.commit()
-class _MongoKeyValueStore(KeyValueStore):
-    """Key value store backed by a MongoDB database."""
-    # The number of documents to return per batch.
-    _BATCH_SIZE: int = 8
-    _REQUEST_KEY = "request"
-    _RESPONSE_KEY = "response"
-    def __init__(self, uri: str, collection_name: str):
-        # TODO: Create client in __enter__ and clean up client in __exit__
-        self._mongodb_client: MongoClient = MongoClient(uri)
-        self._database = self._mongodb_client.get_default_database()
-        self._collection = self._database.get_collection(collection_name)
-        self._collection.create_index(self._REQUEST_KEY, unique=True)
-        super().__init__()
-    def __enter__(self) -> "_MongoKeyValueStore":
-        return self
-    def __exit__(self, exc_type, exc_value, traceback) -> None:
-        return
-    def _canonicalize_key(self, key: Dict) -> SON:
-        serialized = json.dumps(key, sort_keys=True)
-        return json.loads(serialized, object_pairs_hook=SON)
-    def contains(self, key: Dict) -> bool:
-        query = {self._REQUEST_KEY: self._canonicalize_key(key)}
-        return self._collection.find_one(query) is not None
-    def get(self, key: Dict) -> Optional[Dict]:
-        query = {self._REQUEST_KEY: self._canonicalize_key(key)}
-        document = self._collection.find_one(query)
-        if document is not None:
-            response = document[self._RESPONSE_KEY]
-            if isinstance(response, str):
-                return json.loads(response)
-            else:
-                return response
-        return None
-    def get_all(self) -> Generator[Tuple[Dict, Dict], None, None]:
-        for document in self._collection.find({}).batch_size(self._BATCH_SIZE):
-            request = document[self._REQUEST_KEY]
-            response = document[self._RESPONSE_KEY]
-            if isinstance(response, str):
-                yield (request, json.loads(response))
-            else:
-                yield (request, response)
-    def put(self, key: Dict, value: Dict) -> None:
-        request = self._canonicalize_key(key)
-        document = SON([(self._REQUEST_KEY, request), (self._RESPONSE_KEY, value)])
-        # The MongoDB collection should have a unique index on "request"
-        try:
-            self._collection.replace_one(filter={"request": request}, replacement=document, upsert=True)
-        except InvalidDocument:
-            # If the document is malformed e.g. because of null bytes in keys, instead store the response as a string.
-            alternate_document = SON([(self._REQUEST_KEY, request), (self._RESPONSE_KEY, json.dumps(value))])
-            self._collection.replace_one(filter={"request": request}, replacement=alternate_document, upsert=True)
-    def multi_put(self, pairs: Iterable[Tuple[Dict, Dict]]) -> None:
-        operations = []
-        for key, value in pairs:
-            request = self._canonicalize_key(key)
-            document = SON([(self._REQUEST_KEY, request), (self._RESPONSE_KEY, value)])
-            operations.append(ReplaceOne({self._REQUEST_KEY: request}, document, upsert=True))
-        # Note: unlike put, multi_put does not support documents with null bytes in keys.
-        self._collection.bulk_write(operations)
-    def remove(self, key: Dict) -> None:
-        self._collection.delete_one(key)
 def get_all_from_sqlite(path: str) -> Generator[Tuple[Dict, Dict], None, None]:
     """Yields all decoded key, value pairs from the SQLite cache.
@@ -277,15 +118,19 @@ def create_key_value_store(config: KeyValueStoreCacheConfig) -> KeyValueStore:
     """Create a key value store from the given configuration."""
     # TODO: Support creating _MongoKeyValueStore
     if isinstance(config, MongoCacheConfig):
-        return _MongoKeyValueStore(config.uri, config.collection_name)
+        from helm.common.mongo_key_value_store import MongoKeyValueStore
+        return MongoKeyValueStore(config.uri, config.collection_name)
     elif isinstance(config, SqliteCacheConfig):
-        return _SqliteKeyValueStore(config.path)
+        return SqliteKeyValueStore(config.path)
+    elif isinstance(config, BlackHoleCacheConfig):
+        return BlackHoleKeyValueStore()
     else:
         raise ValueError(f"KeyValueStoreCacheConfig with unknown type: {config}")
 @retry
-def write_to_key_value_store(key_value_store: KeyValueStore, key: Dict, response: Dict) -> bool:
+def write_to_key_value_store(key_value_store: KeyValueStore, key: Mapping, response: Dict) -> bool:
     """
     Write to the key value store with retry. Returns boolean indicating whether the write was successful or not.
     """
@@ -355,7 +200,7 @@ class Cache(object):
         else:
             raise ValueError(f"CacheConfig with unknown type: {config}")
-    def get(self, request: Dict, compute: Callable[[], Dict]) -> Tuple[Dict, bool]:
+    def get(self, request: Mapping, compute: Callable[[], Dict]) -> Tuple[Dict, bool]:
         """Get the result of `request` (by calling `compute` as needed)."""
         cache_stats.increment_query(self.config.cache_stats_key)

helm/common/cache_backend_config.py ADDED Viewed

@@ -0,0 +1,47 @@
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+import os
+from helm.common.cache import CacheConfig, MongoCacheConfig, BlackHoleCacheConfig, SqliteCacheConfig
+class CacheBackendConfig(ABC):
+    """Config for a cache backend."""
+    @abstractmethod
+    def get_cache_config(self, shard_name: str) -> CacheConfig:
+        """Get a CacheConfig for the given shard."""
+        pass
+@dataclass(frozen=True)
+class MongoCacheBackendConfig(CacheBackendConfig):
+    """Config for a MongoDB cache backend."""
+    uri: str
+    """URL for the MongoDB database that contains the collection.
+    Example format: mongodb://[username:password@]host1[:port1]/[dbname]
+    For full format, see: https://www.mongodb.com/docs/manual/reference/connection-string/"""
+    def get_cache_config(self, shard_name: str) -> CacheConfig:
+        return MongoCacheConfig(uri=self.uri, collection_name=shard_name)
+@dataclass(frozen=True)
+class BlackHoleCacheBackendConfig(CacheBackendConfig):
+    """Config for a cache backend that does not save any data."""
+    def get_cache_config(self, shard_name: str) -> CacheConfig:
+        return BlackHoleCacheConfig()
+@dataclass(frozen=True)
+class SqliteCacheBackendConfig(CacheBackendConfig):
+    """Config for a Sqlite cache backend."""
+    path: str
+    """Path for the directory that will contain Sqlite files for caches."""
+    def get_cache_config(self, shard_name: str) -> CacheConfig:
+        return SqliteCacheConfig(path=os.path.join(self.path, f"{shard_name}.sqlite"))

helm/common/clip_score_request.py ADDED Viewed

@@ -0,0 +1,41 @@
+from dataclasses import dataclass
+from typing import Optional
+DEFAULT_CLIP_SCORE_MODEL = "openai/clip-vit-large-patch14"
+@dataclass(frozen=True)
+class CLIPScoreRequest:
+    """
+    Computes a CLIPScore for a given caption and image.
+    """
+    # Caption to compute CLIPScore for
+    caption: str
+    # Location of the image
+    image_location: str
+    # Which CLIP model to use
+    model: str = DEFAULT_CLIP_SCORE_MODEL
+    # Compute multilingual CLIPScore
+    multilingual: bool = False
+@dataclass(frozen=True)
+class CLIPScoreResult:
+    """Result after sending a `CLIPScoreRequest`."""
+    # Whether the request was successful
+    success: bool
+    # Whether the request was cached
+    cached: bool
+    # The CLIPScore
+    score: float = 0.0
+    # If `success` is false, what was the error?
+    error: Optional[str] = None

helm/common/concurrency.py ADDED Viewed

@@ -0,0 +1,32 @@
+from contextlib import AbstractContextManager
+from threading import Lock
+from typing import TypeVar, Generic
+T = TypeVar("T")
+class ThreadSafeWrapper(AbstractContextManager, Generic[T]):
+    """A wrapper that makes thread-hostile objects thread-safe.
+    This provides a context manager that holds a lock for accessing the inner object.
+    Example usage:
+        wrapped_obj = wrapper(thread_hostile_obj)
+        with wrapped_obj as obj:
+            # Lock is automatically held in here
+            obj.do_stuff()
+    """
+    def __init__(self, wrapped: T):
+        self._wrapped = wrapped
+        self._lock = Lock()
+    def __enter__(self) -> T:
+        self._lock.__enter__()
+        return self._wrapped
+    def __exit__(self, exc_type, exc_value, traceback) -> None:
+        self._lock.__exit__(exc_type, exc_value, traceback)
+        pass

helm/common/credentials_utils.py ADDED Viewed

@@ -0,0 +1,28 @@
+"""Functions used for credentials."""
+from typing import Any, Mapping, Optional
+from helm.common.hierarchical_logger import hlog
+def provide_api_key(
+    credentials: Mapping[str, Any], host_organization: str, model: Optional[str] = None
+) -> Optional[str]:
+    api_key_name = host_organization + "ApiKey"
+    if api_key_name in credentials:
+        hlog(f"Using host_organization api key defined in credentials.conf: {api_key_name}")
+        return credentials[api_key_name]
+    if "deployments" not in credentials:
+        hlog(
+            "WARNING: Could not find key 'deployments' in credentials.conf, "
+            f"therefore the API key {api_key_name} should be specified."
+        )
+        return None
+    deployment_api_keys = credentials["deployments"]
+    if model is None:
+        hlog(f"WARNING: Could not find key '{host_organization}' in credentials.conf and no model provided")
+        return None
+    if model not in deployment_api_keys:
+        hlog(f"WARNING: Could not find key '{model}' under key 'deployments' in credentials.conf")
+        return None
+    return deployment_api_keys[model]

helm/common/file_caches/__init__.py ADDED Viewed

File without changes

helm/common/file_caches/file_cache.py ADDED Viewed

@@ -0,0 +1,16 @@
+from abc import ABC, abstractmethod
+from typing import Callable
+class FileCache(ABC):
+    """
+    Cache to store files.
+    """
+    @abstractmethod
+    def store(self, compute: Callable[[], bytes]) -> str:
+        """
+        Stores the output of `compute` as a file at a unique location.
+        Returns the location of the file.
+        """
+        pass

helm/common/file_caches/local_file_cache.py ADDED Viewed

@@ -0,0 +1,61 @@
+import os
+from typing import Callable
+from helm.common.general import ensure_directory_exists, generate_unique_id
+from .file_cache import FileCache
+from helm.common.optional_dependencies import handle_module_not_found_error
+try:
+    from PIL import Image
+except ModuleNotFoundError as e:
+    handle_module_not_found_error(e, ["images"])
+class LocalFileCache(FileCache):
+    def __init__(self, base_path: str, file_extension: str):
+        ensure_directory_exists(base_path)
+        self._location: str = base_path
+        self._file_extension: str = file_extension
+    def store(self, compute: Callable[[], bytes]) -> str:
+        """
+        Stores the output of `compute` as a file at a unique path.
+        Returns the file path.
+        """
+        file_path: str = self.generate_unique_new_file_path()
+        with open(file_path, "wb") as f:
+            f.write(compute())
+        return file_path
+    def generate_unique_new_file_path(self) -> str:
+        """Generate a unique file name at `base_path`"""
+        def generate_one() -> str:
+            file_name: str = f"{generate_unique_id()}.{self._file_extension}"
+            return os.path.join(self._location, file_name)
+        file_path: str
+        while True:
+            file_path = generate_one()
+            if not os.path.exists(file_path):
+                break
+        return file_path
+class LocalPILFileCache(LocalFileCache):
+    def __init__(self, base_path: str):
+        super().__init__(base_path, "png")
+    def store_image(self, compute: Callable[[], Image.Image]) -> str:
+        """
+        Stores the output of `compute` as a file at a unique path.
+        Returns the file path.
+        """
+        file_path: str = self.generate_unique_new_file_path()
+        compute().save(file_path)
+        return file_path
+    def load_image(self, file_path: str) -> Image.Image:
+        return Image.open(file_path).convert("RGB")

helm/common/file_caches/test_local_file_cache.py ADDED Viewed

@@ -0,0 +1,25 @@
+import os
+import shutil
+import tempfile
+import unittest
+from .local_file_cache import LocalFileCache
+class TestLocalFileCache(unittest.TestCase):
+    def setup_method(self, _):
+        self.path: str = tempfile.mkdtemp()
+    def teardown_method(self, _):
+        shutil.rmtree(self.path)
+    def test_get(self):
+        cache = LocalFileCache(self.path, file_extension="txt")
+        file_path1: str = cache.store(lambda: "hello.".encode())
+        # Verify the contents of the file
+        with open(file_path1, "r") as f:
+            assert f.read() == "hello."
+        cache.store(lambda: "bye.".encode())
+        assert len(os.listdir(self.path)) == 2

helm/common/file_upload_request.py ADDED Viewed

@@ -0,0 +1,27 @@
+from dataclasses import dataclass
+from typing import Optional
+@dataclass(frozen=True)
+class FileUploadRequest:
+    """Uploads a file at `path`."""
+    # Path of the file to upload
+    path: str
+@dataclass(frozen=True)
+class FileUploadResult:
+    """Result after sending a `FileUploadRequest`."""
+    # Whether the request was successful
+    success: bool
+    # Whether the request was cached
+    cached: bool
+    # URL of the uploaded file
+    url: str
+    # If `success` is false, what was the error?
+    error: Optional[str] = None

helm/common/general.py CHANGED Viewed

@@ -7,7 +7,8 @@ import urllib
 import uuid
 import zstandard
 from typing import Any, Callable, Dict, List, Optional, TypeVar
-from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
+from datetime import datetime, date
+from concurrent.futures import ThreadPoolExecutor
 from tqdm import tqdm
 import pyhocon
@@ -62,7 +63,7 @@ def shell(args: List[str]):
     hlog(f"Executing: {cmd}")
     exit_code = subprocess.call(args)
     if exit_code != 0:
-        hlog(f"Failed with exit code {exit_code}: {cmd}")
+        raise Exception(f"Failed with exit code {exit_code}: {cmd}")
 @htrack(None)
@@ -160,6 +161,13 @@ def asdict_without_nones(obj: Any) -> Dict[str, Any]:
     return asdict(obj, dict_factory=lambda x: {k: v for (k, v) in x if v is not None})
+def serialize_dates(obj):
+    """Serialize dates (pass default=serialize_dates into json.dumps)."""
+    if isinstance(obj, (datetime, date)):
+        return obj.isoformat()
+    raise TypeError(f"Type {type(obj)} is not serializable")
 def binarize_dict(d: Dict[str, int]) -> Dict[str, int]:
     """Binarize the dict by setting the values that are 1 to 0.
@@ -214,20 +222,14 @@ InT = TypeVar("InT")
 OutT = TypeVar("OutT")
-def parallel_map(
-    process: Callable[[InT], OutT], items: List[InT], parallelism: int, multiprocessing: bool = False
-) -> List[OutT]:
+def parallel_map(process: Callable[[InT], OutT], items: List[InT], parallelism: int) -> List[OutT]:
     """
     A wrapper for applying `process` to all `items`.
     """
-    units = "processes" if multiprocessing else "threads"
-    with htrack_block(f"Parallelizing computation on {len(items)} items over {parallelism} {units}"):
+    with htrack_block(f"Parallelizing computation on {len(items)} items over {parallelism} threads"):
         results: List
         if parallelism == 1:
             results = list(tqdm(map(process, items), total=len(items), disable=None))
-        elif multiprocessing:
-            with ProcessPoolExecutor(max_workers=parallelism) as executor:
-                results = list(tqdm(executor.map(process, items), total=len(items), disable=None))
         else:
             with ThreadPoolExecutor(max_workers=parallelism) as executor:
                 results = list(tqdm(executor.map(process, items), total=len(items), disable=None))
@@ -320,3 +322,20 @@ def safe_symlink(src: str, dest: str) -> None:
 def is_url(location: str) -> bool:
     """Return True if `location` is a url. False otherwise."""
     return urllib.parse.urlparse(location).scheme in ["http", "https"]
+def assert_is_str(val: Any) -> str:
+    assert isinstance(val, str)
+    return val
+def assert_is_str_list(val: Any) -> List[str]:
+    assert isinstance(val, list)
+    for v in val:
+        assert isinstance(v, str)
+    return val
+def assert_present(val: Optional[InT]) -> InT:
+    assert val is not None
+    return val

helm/common/image_generation_parameters.py ADDED Viewed

@@ -0,0 +1,25 @@
+from dataclasses import dataclass
+from typing import Optional
+@dataclass(frozen=True)
+class ImageGenerationParameters:
+    """
+    Parameters for image generation.
+    """
+    output_image_width: Optional[int] = None
+    """Width of the generated image. The model will generate images with the model's
+    default dimensions when unspecified."""
+    output_image_height: Optional[int] = None
+    """Height of the generated image. The model will generate images with the model's
+    default dimensions when unspecified."""
+    guidance_scale: Optional[float] = None
+    """A non-negative number determining how much importance is given to the prompt
+    when generating images. Higher values will generate images that follow more
+    closely to the prompt. Currently only for diffusion models."""
+    diffusion_denoising_steps: Optional[int] = None
+    """The number of denoising steps for diffusion models."""

crfm-helm 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

crfm-helm 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl