PyPI - eval-framework - Versions diffs - 0.2.7__py3-none-any.whl - Mend

eval-framework 0.2.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (170) hide show

eval_framework/__init__.py +7 -0
eval_framework/base_config.py +36 -0
eval_framework/context/__init__.py +0 -0
eval_framework/context/determined.py +177 -0
eval_framework/context/eval.py +121 -0
eval_framework/context/local.py +78 -0
eval_framework/evaluation_generator.py +234 -0
eval_framework/exceptions.py +2 -0
eval_framework/external/ifeval_impl/README.md +5 -0
eval_framework/external/ifeval_impl/instructions.py +1523 -0
eval_framework/external/ifeval_impl/instructions_registry.py +161 -0
eval_framework/external/ifeval_impl/instructions_util.py +1689 -0
eval_framework/external/ifeval_impl/utils.py +135 -0
eval_framework/llm/__init__.py +0 -0
eval_framework/llm/aleph_alpha.py +432 -0
eval_framework/llm/base.py +180 -0
eval_framework/llm/huggingface.py +418 -0
eval_framework/llm/mistral.py +88 -0
eval_framework/llm/models.py +28 -0
eval_framework/llm/openai.py +400 -0
eval_framework/llm/vllm.py +554 -0
eval_framework/logger.py +3 -0
eval_framework/main.py +166 -0
eval_framework/metrics/__init__.py +0 -0
eval_framework/metrics/base.py +40 -0
eval_framework/metrics/completion/__init__.py +1 -0
eval_framework/metrics/completion/accuracy_completion.py +16 -0
eval_framework/metrics/completion/aidanbench.py +28 -0
eval_framework/metrics/completion/bleu.py +76 -0
eval_framework/metrics/completion/chrf.py +62 -0
eval_framework/metrics/completion/code_assertion.py +44 -0
eval_framework/metrics/completion/code_execution_pass_at_one.py +126 -0
eval_framework/metrics/completion/comet.py +56 -0
eval_framework/metrics/completion/concordance_index.py +38 -0
eval_framework/metrics/completion/csv_format.py +102 -0
eval_framework/metrics/completion/cwe_accuracy.py +49 -0
eval_framework/metrics/completion/exponential_similarity.py +65 -0
eval_framework/metrics/completion/f1.py +42 -0
eval_framework/metrics/completion/format_checker.py +56 -0
eval_framework/metrics/completion/grid_difference.py +77 -0
eval_framework/metrics/completion/ifeval.py +73 -0
eval_framework/metrics/completion/json_format.py +179 -0
eval_framework/metrics/completion/language_checker.py +74 -0
eval_framework/metrics/completion/length_control.py +83 -0
eval_framework/metrics/completion/math_reasoning_completion.py +307 -0
eval_framework/metrics/completion/niah_accuracy.py +163 -0
eval_framework/metrics/completion/placeholder_checker.py +27 -0
eval_framework/metrics/completion/repetition.py +88 -0
eval_framework/metrics/completion/rouge_1.py +35 -0
eval_framework/metrics/completion/rouge_2.py +45 -0
eval_framework/metrics/completion/rouge_geometric_mean.py +36 -0
eval_framework/metrics/completion/rouge_l.py +52 -0
eval_framework/metrics/completion/struct_eval_metrics.py +248 -0
eval_framework/metrics/completion/ter.py +67 -0
eval_framework/metrics/completion/text_counter.py +182 -0
eval_framework/metrics/efficiency/__init__.py +0 -0
eval_framework/metrics/efficiency/bytes_per_sequence_position.py +48 -0
eval_framework/metrics/llm/__init__.py +0 -0
eval_framework/metrics/llm/base.py +34 -0
eval_framework/metrics/llm/graders/chatbot_style_grader.py +92 -0
eval_framework/metrics/llm/graders/coherence_grader.py +115 -0
eval_framework/metrics/llm/graders/comparison_grader.py +198 -0
eval_framework/metrics/llm/graders/conciseness_grader.py +93 -0
eval_framework/metrics/llm/graders/contains_names_grader.py +71 -0
eval_framework/metrics/llm/graders/format_correctness_grader.py +109 -0
eval_framework/metrics/llm/graders/instruction_grader.py +177 -0
eval_framework/metrics/llm/graders/language.py +56 -0
eval_framework/metrics/llm/graders/long_context_grader.py +72 -0
eval_framework/metrics/llm/graders/models.py +74 -0
eval_framework/metrics/llm/graders/refusal_grader.py +57 -0
eval_framework/metrics/llm/graders/sql_quality_grader.py +145 -0
eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +103 -0
eval_framework/metrics/llm/llm_judge_chatbot_style.py +36 -0
eval_framework/metrics/llm/llm_judge_coherence.py +44 -0
eval_framework/metrics/llm/llm_judge_completion_accuracy.py +39 -0
eval_framework/metrics/llm/llm_judge_conciseness.py +37 -0
eval_framework/metrics/llm/llm_judge_contains_names.py +36 -0
eval_framework/metrics/llm/llm_judge_format_correctness.py +43 -0
eval_framework/metrics/llm/llm_judge_instruction.py +58 -0
eval_framework/metrics/llm/llm_judge_mtbench_pair.py +306 -0
eval_framework/metrics/llm/llm_judge_mtbench_single.py +210 -0
eval_framework/metrics/llm/llm_judge_refusal.py +35 -0
eval_framework/metrics/llm/llm_judge_sql.py +394 -0
eval_framework/metrics/llm/llm_judge_world_knowledge.py +37 -0
eval_framework/metrics/llm/utils.py +20 -0
eval_framework/metrics/loglikelihood/__init__.py +0 -0
eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +51 -0
eval_framework/metrics/loglikelihood/base.py +50 -0
eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +25 -0
eval_framework/metrics/loglikelihood/dcs.py +43 -0
eval_framework/metrics/loglikelihood/probability_mass.py +53 -0
eval_framework/metrics/loglikelihood/ternary.py +42 -0
eval_framework/py.typed +0 -0
eval_framework/response_generator.py +351 -0
eval_framework/result_processors/__init__.py +0 -0
eval_framework/result_processors/base.py +88 -0
eval_framework/result_processors/hf_uploader.py +75 -0
eval_framework/result_processors/result_processor.py +129 -0
eval_framework/result_processors/wandb_uploader.py +137 -0
eval_framework/run.py +369 -0
eval_framework/run_direct.py +42 -0
eval_framework/shared/types.py +227 -0
eval_framework/tasks/__init__.py +6 -0
eval_framework/tasks/base.py +392 -0
eval_framework/tasks/benchmarks/__init__.py +0 -0
eval_framework/tasks/benchmarks/aidanbench.py +211 -0
eval_framework/tasks/benchmarks/arc.py +70 -0
eval_framework/tasks/benchmarks/arc_de.py +46 -0
eval_framework/tasks/benchmarks/arc_fi.py +46 -0
eval_framework/tasks/benchmarks/belebele.py +60 -0
eval_framework/tasks/benchmarks/bigcodebench.py +155 -0
eval_framework/tasks/benchmarks/casehold.py +47 -0
eval_framework/tasks/benchmarks/chembench.py +85 -0
eval_framework/tasks/benchmarks/copa.py +64 -0
eval_framework/tasks/benchmarks/duc.py +91 -0
eval_framework/tasks/benchmarks/flores200.py +133 -0
eval_framework/tasks/benchmarks/flores_plus.py +84 -0
eval_framework/tasks/benchmarks/gpqa.py +201 -0
eval_framework/tasks/benchmarks/gsm8k.py +150 -0
eval_framework/tasks/benchmarks/hellaswag.py +69 -0
eval_framework/tasks/benchmarks/hellaswag_de.py +52 -0
eval_framework/tasks/benchmarks/humaneval.py +97 -0
eval_framework/tasks/benchmarks/ifeval.py +78 -0
eval_framework/tasks/benchmarks/include.py +119 -0
eval_framework/tasks/benchmarks/infinitebench.py +302 -0
eval_framework/tasks/benchmarks/math_reasoning.py +580 -0
eval_framework/tasks/benchmarks/mbpp.py +192 -0
eval_framework/tasks/benchmarks/mmlu.py +215 -0
eval_framework/tasks/benchmarks/mmlu_de.py +109 -0
eval_framework/tasks/benchmarks/mmlu_pro.py +164 -0
eval_framework/tasks/benchmarks/mmmlu.py +529 -0
eval_framework/tasks/benchmarks/openbookqa.py +85 -0
eval_framework/tasks/benchmarks/opengptx_eu20.py +363 -0
eval_framework/tasks/benchmarks/pawsx.py +65 -0
eval_framework/tasks/benchmarks/piqa.py +64 -0
eval_framework/tasks/benchmarks/quality.py +56 -0
eval_framework/tasks/benchmarks/sciq.py +110 -0
eval_framework/tasks/benchmarks/sphyr.py +79 -0
eval_framework/tasks/benchmarks/squad.py +211 -0
eval_framework/tasks/benchmarks/struct_eval.py +116 -0
eval_framework/tasks/benchmarks/tablebench.py +117 -0
eval_framework/tasks/benchmarks/triviaqa.py +42 -0
eval_framework/tasks/benchmarks/truthfulqa.py +119 -0
eval_framework/tasks/benchmarks/winogender.py +64 -0
eval_framework/tasks/benchmarks/winogrande.py +69 -0
eval_framework/tasks/benchmarks/winox.py +57 -0
eval_framework/tasks/benchmarks/wmt.py +160 -0
eval_framework/tasks/benchmarks/zero_scrolls.py +197 -0
eval_framework/tasks/eval_config.py +136 -0
eval_framework/tasks/perturbation.py +83 -0
eval_framework/tasks/registry.py +186 -0
eval_framework/tasks/task_loader.py +81 -0
eval_framework/tasks/task_names.py +324 -0
eval_framework/tasks/utils.py +584 -0
eval_framework/utils/constants.py +9 -0
eval_framework/utils/file_ops.py +245 -0
eval_framework/utils/generate_task_docs.py +244 -0
eval_framework/utils/helpers.py +32 -0
eval_framework/utils/logging.py +62 -0
eval_framework/utils/packaging.py +52 -0
eval_framework/utils/tqdm_handler.py +14 -0
eval_framework-0.2.7.dist-info/METADATA +548 -0
eval_framework-0.2.7.dist-info/RECORD +170 -0
eval_framework-0.2.7.dist-info/WHEEL +4 -0
eval_framework-0.2.7.dist-info/entry_points.txt +3 -0
template_formatting/README.md +83 -0
template_formatting/__init__.py +0 -0
template_formatting/formatter.py +537 -0
template_formatting/mistral_formatter.py +159 -0
template_formatting/py.typed +0 -0

eval_framework/tasks/utils.py ADDED Viewed

@@ -0,0 +1,584 @@
+import base64
+import logging
+import os
+import random
+import re
+import string
+from collections.abc import Callable
+from pathlib import Path
+from typing import Any, Literal, NamedTuple
+import dill
+import numpy as np
+from llm_sandbox import SandboxSession
+logger = logging.getLogger(__name__)  # module-level logger named after this module
+RANDOM_SEED = 42  # default seed of the editors below; defined here as a hacky way to get around circular import
+redis_warning_printed = False  # NOTE(review): presumably a "warn only once" flag; not used in the visible code
+def raise_errors() -> bool:
+    """
+    Report whether the DEBUG environment variable is switched on.
+    The value is compared case-insensitively; an unset variable counts as "FALSE".
+    :return: True for "1" or "true", False for "0" or "false".
+    :raises ValueError: If DEBUG holds any other value.
+    """
+    flag = os.environ.get("DEBUG", "FALSE").lower()
+    recognised = {"1": True, "true": True, "0": False, "false": False}
+    if flag not in recognised:
+        raise ValueError(f"Invalid value for DEBUG environment variable: {flag}. Use one of 1, 0, true, false.")
+    return recognised[flag]
+def get_n_letters(n: int) -> list[str]:
+    """Return the first `n` uppercase ASCII letters, one per list entry (empty for n <= 0, at most 26)."""
+    count = max(0, n)
+    return list(string.ascii_uppercase[:count])
+def run_python_code(
+    code: str,
+    image: str | None = None,
+    input_files: list[tuple[str, str]] | None = None,
+    timeout: int = 60,
+    packages: list[str] | None = None,
+) -> str:
+    """
+    Run code in a sandboxed environment.
+    :param code: The code to run.
+    :param image: Docker image to use.
+    :param input_files: pairs of host and docker paths, host files will be copied to the docker.
+    :param timeout: Timeout in seconds, 0 if no timeout.
+    :param packages: List of python packages to install with pip.
+    :return: The output of the code, stripped of leading and trailing whitespace.
+    """
+    with SandboxSession(lang="python", image=image, keep_template=True, commit_container=False) as session:
+        for host_file, docker_file in input_files or []:
+            session.copy_to_runtime(host_file, docker_file)
+        if timeout > 0:  # hack-add timeout from coreutils to the command executed
+            session.orig_execute_command = session.execute_command
+            # Pass any additional positional/keyword arguments through untouched, so the wrapper does not
+            # raise TypeError when execute_command is invoked with more than the command string.
+            session.execute_command = lambda command, *args, **kwargs: session.orig_execute_command(
+                f"timeout {timeout} {command}", *args, **kwargs
+            )
+        return session.run(code, libraries=packages).text.strip()
+def unittest_merge_snippets(code: str, test_code: str) -> str:
+    """
+    Join implementation code and unittest code into a single script.
+    A guarded ``unittest.main()`` call is appended when the test code does not already contain one
+    (without the ``if`` guard it sometimes just reports "Ran 0 tests" erroneously).
+    :param code: The implementation under test.
+    :param test_code: The unittest test cases.
+    :return: Implementation followed by the tests, separated by an empty line.
+    """
+    runner = "" if "unittest.main(" in test_code else "\n\nif __name__ == '__main__':\n  unittest.main()"
+    return "\n\n".join((code, test_code + runner))
+class ExecutionResult(NamedTuple):
+    """
+    Outcome of one code execution, as a (success, output) pair.
+    Attributes:
+        success (bool): True when the execution counts as successful.
+        output (str): The output or the error messages produced by the execution.
+    """
+    success: bool
+    output: str
+def execute_python_code_with_tests(
+    code: str,
+    test_code: str,
+    package_mapping: dict[str, str | None],
+    merge_code_fn: Callable[[str, str], str],
+    image: str | None,
+    timeout: int,
+    parse_output_fn: Callable[[str], ExecutionResult],
+) -> ExecutionResult:
+    """
+    Run the given code together with its test cases in a sandboxed environment.
+    :param code: The code to be tested.
+    :param test_code: The test cases to run against the code.
+    :param package_mapping: Mapping from imported module name to the package to install (None: nothing to install).
+    :param merge_code_fn: Function that merges the code and the test code into one script.
+    :param image: Docker image to use.
+    :param timeout: Timeout for the execution in seconds.
+    :param parse_output_fn: Function that turns the sandbox output into an ExecutionResult.
+    :return: The ExecutionResult produced by `parse_output_fn`.
+    """
+    script = merge_code_fn(code, test_code)
+    # Only packages imported by the merged script are installed inside the sandbox.
+    raw_output = run_python_code(
+        script,
+        image=image,
+        timeout=timeout,
+        packages=get_external_dependencies(script, package_mapping),
+    )
+    return parse_output_fn(raw_output)
+class SerializationError(Exception):
+    """Common parent of the errors raised while encoding or decoding callables."""
+class EncodingError(SerializationError):
+    """Signals that a callable could not be encoded."""
+class DecodingError(SerializationError):
+    """Signals that a callable could not be decoded."""
+class CallableSerializer:
+    """Converts callables to and from base64 text using dill."""
+    @staticmethod
+    def encode(fn: Callable[..., Any]) -> str:
+        """
+        Serialize a callable with dill and return the bytes as a base64 string.
+        :raises EncodingError: If any step of the encoding fails.
+        """
+        try:
+            return base64.b64encode(dill.dumps(fn)).decode("utf-8")
+        except Exception as err:
+            raise EncodingError(f"Failed to encode callable {fn}: {err}") from err
+    @staticmethod
+    def decode(fn_str: str) -> Callable[..., Any]:
+        """
+        Rebuild a callable from a base64 string as produced by `encode`.
+        NOTE(review): dill.loads can execute arbitrary code, so only decode strings from trusted sources.
+        :raises DecodingError: If any step of the decoding fails.
+        """
+        try:
+            payload = base64.b64decode(fn_str.encode("utf-8"))
+            return dill.loads(payload)
+        except Exception as err:
+            raise DecodingError(f"Failed to decode callable from string: {err}") from err
+def _parse_unittest_output(output: str) -> ExecutionResult:
+    """
+    Classify raw unittest output as success or failure and build a readable summary.
+    NOTE(review): "OK" and "FAILED" are matched as plain substrings anywhere in the output, so text printed
+    by the tested code can influence the verdict.
+    :param output: The text produced by the test run.
+    :return: ExecutionResult with the verdict and a message (failures include the raw output).
+    """
+    # A failure marker takes precedence over any "OK" in the output.
+    if "FAILED" in output:
+        details = re.search(r"FAILED \((.+)\)", output)
+        if details is None:
+            return ExecutionResult(False, f"Tests failed: {output}")
+        return ExecutionResult(False, f"Tests failed: {details.group(1)}\n{output}")
+    if "OK" in output:
+        # Mention the number of tests when the "Ran N tests" line is present.
+        ran = re.search(r"Ran (\d+) tests? in [\d.]+s", output)
+        if ran is None:
+            return ExecutionResult(True, "All tests completed successfully.")
+        return ExecutionResult(True, f"All {ran.group(1)} tests completed successfully.")
+    # No unittest verdict at all: look for common error patterns.
+    if "AssertionError" in output:
+        return ExecutionResult(False, f"Test failed with assertion error: {output}")
+    if "Error:" in output or "Exception:" in output:
+        return ExecutionResult(False, f"Error during execution: {output}")
+    # Nothing recognisable, hand back the raw output.
+    return ExecutionResult(False, f"Could not determine test results, potentially due to timeout. Output: {output}")
+def get_external_dependencies(code: str, package_mapping: dict[str, str | None]) -> list[str]:
+    """
+    List the installable packages required by the imports found in the code.
+    Imported modules that are missing from `package_mapping` or mapped to None are left out.
+    """
+    _, detected = extract_imports(code)
+    candidates = (package_mapping.get(module) for module in detected)
+    return [package for package in candidates if package is not None]
+def extract_imports(code: str) -> tuple[list[str], set[str]]:
+    """
+    Extract all import statements and the imported top-level packages from code.
+    Lines are matched with regular expressions after stripping surrounding whitespace, so indented imports
+    are found as well and the code does not have to be syntactically valid.
+    :param code: Python source text.
+    :return: The stripped import lines in order of appearance, and the set of top-level package names.
+    """
+    # Pattern for 'import x' or 'import x, y, z'
+    import_re = re.compile(r"^import\s+([\w\s,.]+)")
+    # Pattern for 'from x import y'
+    from_re = re.compile(r"^from\s+([\w.]+)\s+import\s+")
+    imports: list[str] = []
+    packages: set[str] = set()
+    for raw_line in code.split("\n"):
+        line = raw_line.strip()
+        # Skip empty lines
+        if not line:
+            continue
+        # Handle 'import x' or 'import x, y, z'
+        import_match = import_re.match(line)
+        if import_match:
+            imports.append(line)
+            for item in import_match.group(1).split(","):
+                # The module path is the first whitespace-delimited token; this drops an "as alias" suffix
+                # no matter how many spaces or tabs separate it from the module name.
+                tokens = item.split()
+                if not tokens:
+                    continue
+                pkg = tokens[0].split(".")[0]
+                if pkg:
+                    packages.add(pkg)
+            continue
+        # Handle 'from x import y'
+        from_match = from_re.match(line)
+        if from_match:
+            imports.append(line)
+            # Relative imports ('from . import y') give an empty base name and add no package
+            pkg = from_match.group(1).split(".")[0]
+            if pkg:
+                packages.add(pkg)
+    return imports, packages
+def get_docker_address() -> str:
+    """Return the address under which the docker host is reached from the current process."""
+    # If it's docker-in-docker: the new docker actually started in host, so we need to use the host's IP
+    # See https://stackoverflow.com/questions/48546124/what-is-the-linux-equivalent-of-host-docker-internal
+    if Path("/.dockerenv").exists():
+        return "172.17.0.1"
+    return "localhost"
+class Editor:
+    """
+    Seeded generator of character-level typos for a sentence.
+    Calling an instance spreads a number of edits (insert, delete, transpose, casing change, word split)
+    over the words of the sentence and returns the perturbed text.
+    """
+    def __init__(self, language: Literal["en", "de"] = "en", seed: int = RANDOM_SEED) -> None:
+        """
+        :param language: Selects the alphabet from which inserted letters are drawn.
+        :param seed: Seed shared by both random generators so that edits are reproducible.
+        :raises NotImplementedError: If the language is neither "en" nor "de".
+        """
+        self.np_rng = np.random.RandomState(seed)
+        self.rng = random.Random(seed)
+        if language == "en":
+            self.letters = string.ascii_lowercase
+        elif language == "de":
+            self.letters = string.ascii_lowercase + "ßöäü"
+        else:
+            raise NotImplementedError
+    @staticmethod
+    def _split_sentence(sentence: str) -> tuple[list[str], list[str], bool]:
+        """
+        Split a sentence into its runs of word characters and its runs of non-word characters.
+        :return: (words, separators, flag); the flag is True when the sentence does not start with a word.
+        """
+        words = re.findall(r"\w+", sentence)
+        spaces = re.findall(r"[^\w]+", sentence)
+        has_leading_space = not words or sentence[: len(words[0])] != words[0]
+        return words, spaces, has_leading_space
+    @staticmethod
+    def _recombine(words: list[str], spaces: list[str], has_leading_space: bool) -> str:
+        """Interleave words and separators again, i.e. the inverse of `_split_sentence`."""
+        pieces: list[str] = []
+        if has_leading_space:
+            # The first separator precedes the first word
+            pieces.append(spaces[0])
+            following = spaces[1:]
+        else:
+            following = spaces
+        for word, separator in zip(words, following):
+            pieces.extend((word, separator))
+        # A sentence ending in a word has one word more than separators following the words
+        if len(words) > len(following):
+            pieces.append(words[-1])
+        return "".join(pieces)
+    @staticmethod
+    def _get_word_probs(words: list[str]) -> np.ndarray:
+        """
+        Return the probability of each word receiving an edit.
+        At least one word must be longer than one character, otherwise the weights sum to zero and
+        cannot be normalised.
+        """
+        # We sample words proportional to their length - 1,
+        # This means we ignore one-character words such as "I" and "a",
+        # because these can't be transposed or split
+        lengths = np.array([len(word) - 1 for word in words])
+        probs = lengths / np.sum(lengths)
+        return probs
+    @staticmethod
+    def _transpose(word: str, idx1: int, idx2: int) -> str:
+        """Swap the two neighbouring characters at positions idx1 and idx2."""
+        assert abs(idx2 - idx1) == 1, "idx1 and idx2 are not next to each other"
+        if idx1 > idx2:
+            idx1, idx2 = idx2, idx1
+        return word[:idx1] + word[idx2] + word[idx1] + word[idx2 + 1 :]
+    @staticmethod
+    def _delete(word: str, idx: int) -> str:
+        """Remove the character at position idx."""
+        return word[:idx] + word[idx + 1 :]
+    @staticmethod
+    def _insert(word: str, idx: int, letter: str) -> str:
+        """Insert a single character in front of position idx."""
+        assert len(letter) == 1, "`letter` is not a single character"
+        return word[:idx] + letter + word[idx:]
+    @staticmethod
+    def _change_casing(word: str, idx: int) -> str:
+        """Flip the case of the character at position idx (lower becomes upper, anything else is lowered)."""
+        character = word[idx]
+        if character.islower():
+            character = character.upper()
+        else:
+            character = character.lower()
+        return word[:idx] + character + word[idx + 1 :]
+    @staticmethod
+    def _split_word(word: str, idx: int) -> str:
+        """Insert a space in front of position idx."""
+        return word[:idx] + " " + word[idx:]
+    def _edit_word(self, word: str, num_edits: int) -> str:
+        """
+        Apply `num_edits` randomly chosen edits to a word, one after the other.
+        :param word: The word to perturb.
+        :param num_edits: Number of edits to apply; 0 returns the word unchanged.
+        :return: The edited word.
+        """
+        # NB: It could be that two edits cancel each other out
+        # but the chance of this is sufficiently small that it doesn't
+        # make sense to complicate the code to fix this
+        if num_edits == 0:
+            return word
+        for _ in range(num_edits):
+            # upweighted change casing
+            choices = ["insert", "change_casing", "change_casing"]
+            if len(word) > 1:
+                choices.extend(["transpose", "split_word"])
+            if len(word) > 4:
+                # use delete more sparingly since it has a big impact
+                choices.extend(["delete"])
+            edit_function = self.rng.choice(choices)
+            if edit_function == "transpose":
+                idx = self.rng.randint(0, len(word) - 2)
+                word = self._transpose(word, idx, idx + 1)
+            elif edit_function == "delete":
+                # never deletes the first or the last character
+                idx = self.rng.randint(1, len(word) - 2)
+                word = self._delete(word, idx)
+            elif edit_function == "insert":
+                idx = self.rng.randint(0, len(word) - 1)
+                letter = self.rng.choice(self.letters)
+                word = self._insert(word, idx, letter)
+            elif edit_function == "change_casing":
+                idx = self.rng.randint(0, len(word) - 1)
+                word = self._change_casing(word, idx)
+            elif edit_function == "split_word":
+                idx = self.rng.randint(1, len(word) - 1)
+                word = self._split_word(word, idx)
+        return word
+    def __call__(self, sentence: str, character_edit_change: float, unmodifiable_words: list[str] | None = None) -> str:
+        """
+        Return the sentence with random character-level edits applied.
+        :param sentence: The text to perturb.
+        :param character_edit_change: Number of edits, as a fraction of the number of word characters.
+        :param unmodifiable_words: Words (compared case-insensitively) that are never edited; edits that
+            fall on such a word are dropped.
+        :return: The perturbed sentence, or the unchanged sentence if there is nothing to edit.
+        """
+        words, spaces, has_leading_space = self._split_sentence(sentence)
+        num_characters = sum(map(len, words))
+        num_edits = int(num_characters * character_edit_change)
+        if num_edits == 0:
+            return sentence
+        # One-character words carry zero sampling weight. If there is no longer word, the weights cannot be
+        # normalised (0 / 0 gives NaN probabilities), so the sentence is returned as is.
+        if all(len(word) <= 1 for word in words):
+            return sentence
+        probs = self._get_word_probs(words)
+        edits_per_word = self.np_rng.multinomial(num_edits, probs)
+        unmodifiable_words_set = {w.lower() for w in unmodifiable_words or []}
+        edited_words = []
+        for edits, word in zip(edits_per_word, words):
+            if word.lower() not in unmodifiable_words_set:
+                edited_words.append(self._edit_word(word, int(edits)))
+            else:
+                edited_words.append(word)
+        return self._recombine(edited_words, spaces, has_leading_space)
+class HatPaperEditor:
+    """Seeded text perturbations that permute, replace or delete characters inside words, or upper-case text."""
+    # Used for Section 4.4 in the HAT paper (https://openreview.net/pdf?id=tU074jg2vS).
+    def __init__(self, seed: int = RANDOM_SEED) -> None:
+        """
+        :param seed: Seed of the random generator, so that perturbations are reproducible.
+        """
+        self.rng = random.Random(seed)
+    def _get_indices(self, input_text: str, pct: float, unmodifiable_words: list[str] | None = None) -> list[int]:
+        """
+        Sample the positions of the characters to perturb.
+        A position is eligible when its character and both of its neighbours are alphanumeric and it does
+        not lie inside an occurrence of an unmodifiable word.
+        :param input_text: The text to sample positions from.
+        :param pct: Fraction of the eligible positions to sample.
+        :param unmodifiable_words: Words whose occurrences (whole word, case-insensitive) are excluded.
+        :return: The sampled positions, in random order.
+        """
+        indices = [
+            i + 1
+            for i, c in enumerate(input_text[1:-1])
+            if c.isalnum() and input_text[i].isalnum() and input_text[i + 2].isalnum()
+        ]
+        for word in unmodifiable_words or []:
+            # Escape the word so that it is matched literally even if it contains regex metacharacters
+            for match in re.finditer(r"\b" + re.escape(word) + r"\b", input_text, re.IGNORECASE):
+                indices = [i for i in indices if i < match.start(0) or i >= match.end(0)]
+        return self.rng.sample(indices, int(len(indices) * pct))
+    def permute_chars_in_string(
+        self, input_text: str, permute_pct: float, unmodifiable_words: list[str] | None = None
+    ) -> str:
+        """
+        Randomly permute permute_pct characters in the input string.
+        Only permutes within words (whitespaces and first word chars are preserved).
+        """
+        chars_to_permute = self._get_indices(input_text, permute_pct, unmodifiable_words)
+        permuted_text = list(input_text)
+        for char_index in chars_to_permute:
+            # swap the sampled character with its right-hand neighbour
+            permuted_text[char_index], permuted_text[char_index + 1] = (
+                permuted_text[char_index + 1],
+                permuted_text[char_index],
+            )
+        return "".join(permuted_text)
+    def replace_chars_in_string(
+        self, input_text: str, replace_pct: float, unmodifiable_words: list[str] | None = None
+    ) -> str:
+        """
+        Randomly replace replace_pct characters in the input string with a random printable ASCII character.
+        Only replaces within words (whitespaces and first and last word chars are preserved).
+        """
+        chars_to_replace = self._get_indices(input_text, replace_pct, unmodifiable_words)
+        replaced_text = list(input_text)
+        for char_index in chars_to_replace:
+            replace_char = chr(self.rng.randint(33, 126))  # ASCII printable characters
+            replaced_text[char_index] = replace_char
+        return "".join(replaced_text)
+    def delete_chars_in_string(
+        self, input_text: str, delete_pct: float, unmodifiable_words: list[str] | None = None
+    ) -> str:
+        """
+        Randomly delete delete_pct characters in the input string.
+        Only deletes within words (whitespaces and first and last word chars are preserved).
+        """
+        chars_to_delete = self._get_indices(input_text, delete_pct, unmodifiable_words)
+        deleted_text = list(input_text)
+        for char_index in chars_to_delete:
+            deleted_text[char_index] = ""  # do not delete list entry since then the length of the list changes
+        return "".join(deleted_text)
+    def upper_case_string(self, input_text: str) -> str:
+        """
+        Upper case all characters in the input string.
+        """
+        return input_text.upper()
+# Maps every package imported in the BigCodeBench dataset to the package to install for it (None: nothing to install)
+BIG_CODE_BENCH_PACKAGE_MAPPING = {
+    # Standard library packages (built-in, mapped to None)
+    "array": None,
+    "ast": None,
+    "base64": None,
+    "binascii": None,
+    "bisect": None,
+    "calendar": None,
+    "cgi": None,
+    "cmath": None,
+    "codecs": None,
+    "collections": None,
+    "configparser": None,
+    "csv": None,
+    "ctypes": None,
+    "datetime": None,
+    "decimal": None,
+    "difflib": None,
+    "email": None,
+    "enum": None,
+    "errno": None,
+    "fnmatch": None,
+    "ftplib": None,
+    "functools": None,
+    "getpass": None,
+    "glob": None,
+    "gzip": None,
+    "hashlib": None,
+    "heapq": None,
+    "hmac": None,
+    "html": None,
+    "http": None,
+    "importlib": None,
+    "inspect": None,
+    "io": None,
+    "ipaddress": None,
+    "itertools": None,
+    "json": None,
+    "logging": None,
+    "math": None,
+    "mimetypes": None,
+    "multiprocessing": None,
+    "operator": None,
+    "os": None,
+    "pathlib": None,
+    "pickle": None,
+    "pkgutil": None,
+    "platform": None,
+    "queue": None,
+    "random": None,
+    "re": None,
+    "select": None,
+    "secrets": None,
+    "shlex": None,
+    "shutil": None,
+    "signal": None,
+    "smtplib": None,
+    "socket": None,
+    "sqlite3": None,
+    "ssl": None,
+    "statistics": None,
+    "string": None,
+    "struct": None,
+    "subprocess": None,
+    "sys": None,
+    "tarfile": None,
+    "textwrap": None,
+    "threading": None,
+    "time": None,
+    "turtle": None,
+    "types": None,
+    "typing": None,
+    "unicodedata": None,
+    "urllib": None,
+    "uuid": None,
+    "warnings": None,
+    "xml": None,
+    "zipfile": None,
+    "zlib": None,
+    "zoneinfo": None,
+    # External packages (need pip install; the value is the name of the package to install)
+    "PIL": "pillow",
+    "Crypto": "pycryptodome",
+    "Levenshtein": "python-Levenshtein",
+    "blake3": "blake3",
+    "bs4": "beautifulsoup4",
+    "chardet": "chardet",
+    "cryptography": "cryptography",
+    "cv2": "opencv-python",
+    "dateutil": "python-dateutil",
+    "django": "django",
+    "docx": "python-docx",
+    "faker": "Faker",
+    "flask": "flask",
+    "flask_login": "flask-login",
+    "flask_mail": "flask-mail",
+    "flask_restful": "flask-restful",
+    "flask_wtf": "flask-wtf",
+    "folium": "folium",
+    "gensim": "gensim",
+    "geopandas": "geopandas",
+    "geopy": "geopy",
+    "holidays": "holidays",
+    "keras": "keras",
+    "librosa": "librosa",
+    "lxml": "lxml",
+    "matplotlib": "matplotlib",
+    "mechanize": "mechanize",
+    "mpl_toolkits": "matplotlib",
+    "natsort": "natsort",
+    "nltk": "nltk",
+    "numpy": "numpy",
+    "openpyxl": "openpyxl",
+    "pandas": "pandas",
+    "prettytable": "prettytable",
+    "psutil": "psutil",
+    "pyquery": "pyquery",
+    "pytesseract": "pytesseract",
+    "python_http_client": "python-http-client",
+    "pytz": "pytz",
+    "regex": "regex",
+    "requests": "requests",
+    "rsa": "rsa",
+    "scipy": "scipy",
+    "seaborn": "seaborn",
+    "sendgrid": "sendgrid",
+    "shapely": "shapely",
+    "skimage": "scikit-image",
+    "sklearn": "scikit-learn",
+    "soundfile": "soundfile",
+    "statsmodels": "statsmodels",
+    "sympy": "sympy",
+    "tensorflow": "tensorflow",
+    "textblob": "textblob",
+    "texttable": "texttable",
+    "werkzeug": "werkzeug",
+    "wikipedia": "wikipedia",
+    "wordcloud": "wordcloud",
+    "wordninja": "wordninja",
+    "wtforms": "wtforms",
+    "xlwt": "xlwt",
+    "xmltodict": "xmltodict",
+    "yaml": "pyyaml",
+}

eval_framework/utils/constants.py ADDED Viewed

@@ -0,0 +1,9 @@
+from pathlib import Path
+RED = "\033[91m"  # ANSI escape sequence: bright red foreground
+YELLOW = "\033[93m"  # ANSI escape sequence: bright yellow foreground
+MAGENTA = "\033[1;35;40m"  # ANSI escape sequence: bold, magenta foreground, black background
+RESET = "\033[0m"  # ANSI escape sequence: reset all attributes
+GREEN = "\033[92m"  # ANSI escape sequence: bright green foreground
+ROOT_DIR = Path(__file__).parents[2]  # two levels above the directory that contains this file