wisent 0.1.1__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of wisent might be problematic.

Files changed (237)
  1. wisent/__init__.py +1 -8
  2. wisent/benchmarks/__init__.py +0 -0
  3. wisent/benchmarks/coding/__init__.py +0 -0
  4. wisent/benchmarks/coding/metrics/__init__.py +0 -0
  5. wisent/benchmarks/coding/metrics/core/__init__.py +0 -0
  6. wisent/benchmarks/coding/metrics/core/atoms.py +36 -0
  7. wisent/benchmarks/coding/metrics/evaluator.py +275 -0
  8. wisent/benchmarks/coding/metrics/passk.py +66 -0
  9. wisent/benchmarks/coding/output_sanitizer/__init__.py +0 -0
  10. wisent/benchmarks/coding/output_sanitizer/core/__init__.py +0 -0
  11. wisent/benchmarks/coding/output_sanitizer/core/atoms.py +27 -0
  12. wisent/benchmarks/coding/output_sanitizer/cpp_sanitizer.py +62 -0
  13. wisent/benchmarks/coding/output_sanitizer/java_sanitizer.py +78 -0
  14. wisent/benchmarks/coding/output_sanitizer/python_sanitizer.py +94 -0
  15. wisent/benchmarks/coding/output_sanitizer/utils.py +107 -0
  16. wisent/benchmarks/coding/providers/__init__.py +18 -0
  17. wisent/benchmarks/coding/providers/core/__init__.py +0 -0
  18. wisent/benchmarks/coding/providers/core/atoms.py +31 -0
  19. wisent/benchmarks/coding/providers/livecodebench/__init__.py +0 -0
  20. wisent/benchmarks/coding/providers/livecodebench/provider.py +53 -0
  21. wisent/benchmarks/coding/safe_docker/__init__.py +0 -0
  22. wisent/benchmarks/coding/safe_docker/core/__init__.py +0 -0
  23. wisent/benchmarks/coding/safe_docker/core/atoms.py +105 -0
  24. wisent/benchmarks/coding/safe_docker/core/runtime.py +118 -0
  25. wisent/benchmarks/coding/safe_docker/entrypoint.py +123 -0
  26. wisent/benchmarks/coding/safe_docker/recipes.py +60 -0
  27. wisent/classifiers/__init__.py +0 -0
  28. wisent/classifiers/core/__init__.py +0 -0
  29. wisent/classifiers/core/atoms.py +747 -0
  30. wisent/classifiers/models/__init__.py +0 -0
  31. wisent/classifiers/models/logistic.py +29 -0
  32. wisent/classifiers/models/mlp.py +47 -0
  33. wisent/cli/__init__.py +0 -0
  34. wisent/cli/classifiers/__init__.py +0 -0
  35. wisent/cli/classifiers/classifier_rotator.py +137 -0
  36. wisent/cli/cli_logger.py +142 -0
  37. wisent/cli/data_loaders/__init__.py +0 -0
  38. wisent/cli/data_loaders/data_loader_rotator.py +96 -0
  39. wisent/cli/evaluators/__init__.py +0 -0
  40. wisent/cli/evaluators/evaluator_rotator.py +148 -0
  41. wisent/cli/steering_methods/__init__.py +0 -0
  42. wisent/cli/steering_methods/steering_rotator.py +110 -0
  43. wisent/cli/wisent_cli/__init__.py +0 -0
  44. wisent/cli/wisent_cli/commands/__init__.py +0 -0
  45. wisent/cli/wisent_cli/commands/help_cmd.py +52 -0
  46. wisent/cli/wisent_cli/commands/listing.py +154 -0
  47. wisent/cli/wisent_cli/commands/train_cmd.py +322 -0
  48. wisent/cli/wisent_cli/main.py +93 -0
  49. wisent/cli/wisent_cli/shell.py +80 -0
  50. wisent/cli/wisent_cli/ui.py +69 -0
  51. wisent/cli/wisent_cli/util/__init__.py +0 -0
  52. wisent/cli/wisent_cli/util/aggregations.py +43 -0
  53. wisent/cli/wisent_cli/util/parsing.py +126 -0
  54. wisent/cli/wisent_cli/version.py +4 -0
  55. wisent/core/__init__.py +27 -0
  56. wisent/core/activations/__init__.py +0 -0
  57. wisent/core/activations/activations_collector.py +338 -0
  58. wisent/core/activations/core/__init__.py +0 -0
  59. wisent/core/activations/core/atoms.py +216 -0
  60. wisent/core/agent/__init__.py +18 -0
  61. wisent/core/agent/budget.py +638 -0
  62. wisent/core/agent/device_benchmarks.py +685 -0
  63. wisent/core/agent/diagnose/__init__.py +55 -0
  64. wisent/core/agent/diagnose/agent_classifier_decision.py +641 -0
  65. wisent/core/agent/diagnose/classifier_marketplace.py +554 -0
  66. wisent/core/agent/diagnose/create_classifier.py +1154 -0
  67. wisent/core/agent/diagnose/response_diagnostics.py +268 -0
  68. wisent/core/agent/diagnose/select_classifiers.py +506 -0
  69. wisent/core/agent/diagnose/synthetic_classifier_option.py +754 -0
  70. wisent/core/agent/diagnose/tasks/__init__.py +33 -0
  71. wisent/core/agent/diagnose/tasks/task_manager.py +1456 -0
  72. wisent/core/agent/diagnose/tasks/task_relevance.py +94 -0
  73. wisent/core/agent/diagnose/tasks/task_selector.py +151 -0
  74. wisent/core/agent/diagnose/test_synthetic_classifier.py +71 -0
  75. wisent/core/agent/diagnose.py +242 -0
  76. wisent/core/agent/steer.py +212 -0
  77. wisent/core/agent/timeout.py +134 -0
  78. wisent/core/autonomous_agent.py +1234 -0
  79. wisent/core/bigcode_integration.py +583 -0
  80. wisent/core/contrastive_pairs/__init__.py +15 -0
  81. wisent/core/contrastive_pairs/core/__init__.py +0 -0
  82. wisent/core/contrastive_pairs/core/atoms.py +45 -0
  83. wisent/core/contrastive_pairs/core/buliders.py +59 -0
  84. wisent/core/contrastive_pairs/core/pair.py +178 -0
  85. wisent/core/contrastive_pairs/core/response.py +152 -0
  86. wisent/core/contrastive_pairs/core/serialization.py +300 -0
  87. wisent/core/contrastive_pairs/core/set.py +133 -0
  88. wisent/core/contrastive_pairs/diagnostics/__init__.py +45 -0
  89. wisent/core/contrastive_pairs/diagnostics/activations.py +53 -0
  90. wisent/core/contrastive_pairs/diagnostics/base.py +73 -0
  91. wisent/core/contrastive_pairs/diagnostics/control_vectors.py +169 -0
  92. wisent/core/contrastive_pairs/diagnostics/coverage.py +79 -0
  93. wisent/core/contrastive_pairs/diagnostics/divergence.py +98 -0
  94. wisent/core/contrastive_pairs/diagnostics/duplicates.py +116 -0
  95. wisent/core/contrastive_pairs/lm_eval_pairs/__init__.py +0 -0
  96. wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +238 -0
  97. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +8 -0
  98. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +132 -0
  99. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/__init__.py +0 -0
  100. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +115 -0
  101. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +50 -0
  102. wisent/core/data_loaders/__init__.py +0 -0
  103. wisent/core/data_loaders/core/__init__.py +0 -0
  104. wisent/core/data_loaders/core/atoms.py +98 -0
  105. wisent/core/data_loaders/loaders/__init__.py +0 -0
  106. wisent/core/data_loaders/loaders/custom.py +120 -0
  107. wisent/core/data_loaders/loaders/lm_loader.py +218 -0
  108. wisent/core/detection_handling.py +257 -0
  109. wisent/core/download_full_benchmarks.py +1386 -0
  110. wisent/core/evaluators/__init__.py +0 -0
  111. wisent/core/evaluators/oracles/__init__.py +0 -0
  112. wisent/core/evaluators/oracles/interactive.py +73 -0
  113. wisent/core/evaluators/oracles/nlp_evaluator.py +440 -0
  114. wisent/core/evaluators/oracles/user_specified.py +67 -0
  115. wisent/core/hyperparameter_optimizer.py +429 -0
  116. wisent/core/lm_eval_harness_ground_truth.py +1396 -0
  117. wisent/core/log_likelihoods_evaluator.py +321 -0
  118. wisent/core/managed_cached_benchmarks.py +595 -0
  119. wisent/core/mixed_benchmark_sampler.py +364 -0
  120. wisent/core/model_config_manager.py +330 -0
  121. wisent/core/model_persistence.py +317 -0
  122. wisent/core/models/__init__.py +0 -0
  123. wisent/core/models/core/__init__.py +0 -0
  124. wisent/core/models/core/atoms.py +460 -0
  125. wisent/core/models/wisent_model.py +727 -0
  126. wisent/core/multi_steering.py +316 -0
  127. wisent/core/optuna/__init__.py +57 -0
  128. wisent/core/optuna/classifier/__init__.py +25 -0
  129. wisent/core/optuna/classifier/activation_generator.py +349 -0
  130. wisent/core/optuna/classifier/classifier_cache.py +509 -0
  131. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +606 -0
  132. wisent/core/optuna/steering/__init__.py +0 -0
  133. wisent/core/optuna/steering/bigcode_evaluator_wrapper.py +188 -0
  134. wisent/core/optuna/steering/data_utils.py +342 -0
  135. wisent/core/optuna/steering/metrics.py +474 -0
  136. wisent/core/optuna/steering/optuna_pipeline.py +1738 -0
  137. wisent/core/optuna/steering/steering_optimization.py +1111 -0
  138. wisent/core/parser.py +1668 -0
  139. wisent/core/prompts/__init__.py +0 -0
  140. wisent/core/prompts/core/__init__.py +0 -0
  141. wisent/core/prompts/core/atom.py +57 -0
  142. wisent/core/prompts/core/prompt_formater.py +157 -0
  143. wisent/core/prompts/prompt_stratiegies/__init__.py +0 -0
  144. wisent/core/prompts/prompt_stratiegies/direct_completion.py +24 -0
  145. wisent/core/prompts/prompt_stratiegies/instruction_following.py +24 -0
  146. wisent/core/prompts/prompt_stratiegies/multiple_choice.py +29 -0
  147. wisent/core/prompts/prompt_stratiegies/role_playing.py +31 -0
  148. wisent/core/representation.py +5 -0
  149. wisent/core/sample_size_optimizer.py +648 -0
  150. wisent/core/sample_size_optimizer_v2.py +355 -0
  151. wisent/core/save_results.py +277 -0
  152. wisent/core/steering.py +652 -0
  153. wisent/core/steering_method.py +26 -0
  154. wisent/core/steering_methods/__init__.py +0 -0
  155. wisent/core/steering_methods/core/__init__.py +0 -0
  156. wisent/core/steering_methods/core/atoms.py +153 -0
  157. wisent/core/steering_methods/methods/__init__.py +0 -0
  158. wisent/core/steering_methods/methods/caa.py +44 -0
  159. wisent/core/steering_optimizer.py +1297 -0
  160. wisent/core/task_interface.py +132 -0
  161. wisent/core/task_selector.py +189 -0
  162. wisent/core/tasks/__init__.py +175 -0
  163. wisent/core/tasks/aime_task.py +141 -0
  164. wisent/core/tasks/file_task.py +211 -0
  165. wisent/core/tasks/hle_task.py +180 -0
  166. wisent/core/tasks/hmmt_task.py +119 -0
  167. wisent/core/tasks/livecodebench_task.py +201 -0
  168. wisent/core/tasks/livemathbench_task.py +158 -0
  169. wisent/core/tasks/lm_eval_task.py +455 -0
  170. wisent/core/tasks/math500_task.py +84 -0
  171. wisent/core/tasks/polymath_task.py +146 -0
  172. wisent/core/tasks/supergpqa_task.py +220 -0
  173. wisent/core/time_estimator.py +149 -0
  174. wisent/core/timing_calibration.py +174 -0
  175. wisent/core/tracking/__init__.py +54 -0
  176. wisent/core/tracking/latency.py +618 -0
  177. wisent/core/tracking/memory.py +359 -0
  178. wisent/core/trainers/__init__.py +0 -0
  179. wisent/core/trainers/core/__init__.py +11 -0
  180. wisent/core/trainers/core/atoms.py +45 -0
  181. wisent/core/trainers/steering_trainer.py +271 -0
  182. wisent/core/user_model_config.py +158 -0
  183. wisent/opti/__init__.py +0 -0
  184. wisent/opti/core/__init__.py +0 -0
  185. wisent/opti/core/atoms.py +175 -0
  186. wisent/opti/methods/__init__.py +0 -0
  187. wisent/opti/methods/opti_classificator.py +172 -0
  188. wisent/opti/methods/opti_steering.py +138 -0
  189. wisent/synthetic/__init__.py +0 -0
  190. wisent/synthetic/cleaners/__init__.py +0 -0
  191. wisent/synthetic/cleaners/core/__init__.py +0 -0
  192. wisent/synthetic/cleaners/core/atoms.py +58 -0
  193. wisent/synthetic/cleaners/deduper_cleaner.py +53 -0
  194. wisent/synthetic/cleaners/methods/__init__.py +0 -0
  195. wisent/synthetic/cleaners/methods/base_dedupers.py +320 -0
  196. wisent/synthetic/cleaners/methods/base_refusalers.py +286 -0
  197. wisent/synthetic/cleaners/methods/core/__init__.py +0 -0
  198. wisent/synthetic/cleaners/methods/core/atoms.py +47 -0
  199. wisent/synthetic/cleaners/pairs_cleaner.py +90 -0
  200. wisent/synthetic/cleaners/refusaler_cleaner.py +133 -0
  201. wisent/synthetic/db_instructions/__init__.py +0 -0
  202. wisent/synthetic/db_instructions/core/__init__.py +0 -0
  203. wisent/synthetic/db_instructions/core/atoms.py +25 -0
  204. wisent/synthetic/db_instructions/mini_dp.py +37 -0
  205. wisent/synthetic/generators/__init__.py +0 -0
  206. wisent/synthetic/generators/core/__init__.py +0 -0
  207. wisent/synthetic/generators/core/atoms.py +73 -0
  208. wisent/synthetic/generators/diversities/__init__.py +0 -0
  209. wisent/synthetic/generators/diversities/core/__init__.py +0 -0
  210. wisent/synthetic/generators/diversities/core/core.py +68 -0
  211. wisent/synthetic/generators/diversities/methods/__init__.py +0 -0
  212. wisent/synthetic/generators/diversities/methods/fast_diversity.py +249 -0
  213. wisent/synthetic/generators/pairs_generator.py +179 -0
  214. wisent-0.5.1.dist-info/METADATA +67 -0
  215. wisent-0.5.1.dist-info/RECORD +218 -0
  216. {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info}/WHEEL +1 -1
  217. {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info/licenses}/LICENSE +2 -2
  218. wisent/activations/__init__.py +0 -9
  219. wisent/activations/client.py +0 -97
  220. wisent/activations/extractor.py +0 -251
  221. wisent/activations/models.py +0 -95
  222. wisent/client.py +0 -45
  223. wisent/control_vector/__init__.py +0 -9
  224. wisent/control_vector/client.py +0 -85
  225. wisent/control_vector/manager.py +0 -168
  226. wisent/control_vector/models.py +0 -70
  227. wisent/inference/__init__.py +0 -9
  228. wisent/inference/client.py +0 -103
  229. wisent/inference/inferencer.py +0 -250
  230. wisent/inference/models.py +0 -66
  231. wisent/utils/__init__.py +0 -3
  232. wisent/utils/auth.py +0 -30
  233. wisent/utils/http.py +0 -228
  234. wisent/version.py +0 -3
  235. wisent-0.1.1.dist-info/METADATA +0 -142
  236. wisent-0.1.1.dist-info/RECORD +0 -23
  237. {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info}/top_level.txt +0 -0
wisent/core/tasks/lm_eval_task.py
@@ -0,0 +1,455 @@
+ """
+ LM-Evaluation-Harness task wrapper for task-agnostic architecture.
+ """
+
+ from typing import Any, Dict, List, Optional
+
+ from ..benchmark_extractors import BenchmarkExtractor, get_extractor
+ from ..task_interface import TaskInterface
+
+
+ class LMEvalTask(TaskInterface):
+     """Wrapper for lm-evaluation-harness tasks."""
+
+     def __init__(self, task_name: str, description: str, categories: List[str]):
+         self.task_name = task_name
+         self._description = description
+         self._categories = categories
+         self._extractor = get_extractor(task_name)
+
+     def load_data(self, limit: Optional[int] = None) -> List[Dict[str, Any]]:
+         """Load data directly from lm-eval without Model dependency."""
+         try:
+             # Load data directly from lm-eval without creating a Model instance
+             from lm_eval.tasks import get_task_dict
+
+             # Get task directly from lm-eval
+             task_dict = get_task_dict([self.task_name])
+             if self.task_name not in task_dict:
+                 print(f"Warning: Task '{self.task_name}' not found in lm-eval")
+                 return []
+
+             task = task_dict[self.task_name]
+
+             # Get the task's test documents
+             docs = []
+             if hasattr(task, "test_docs"):
+                 # For lm-eval versions with test_docs method
+                 docs = list(task.test_docs())
+             elif hasattr(task, "dataset"):
+                 # For newer lm-eval versions
+                 dataset = task.dataset
+                 if hasattr(dataset, "test"):
+                     docs = list(dataset.test)
+                 elif hasattr(dataset, "validation"):
+                     docs = list(dataset.validation)
+                 else:
+                     # Fallback to the main dataset
+                     docs = list(dataset)
+
+             # Ensure docs are in dictionary format
+             processed_docs = []
+             for doc in docs:
+                 if isinstance(doc, dict):
+                     processed_docs.append(doc)
+                 elif isinstance(doc, str):
+                     # Handle string documents by wrapping them
+                     processed_docs.append({"text": doc})
+                 else:
+                     # Try to convert to dict if possible
+                     try:
+                         processed_docs.append(dict(doc))
+                     except:
+                         processed_docs.append({"data": str(doc)})
+
+             docs = processed_docs
+
+             # Apply limit if specified
+             if limit and len(docs) > limit:
+                 docs = docs[:limit]
+
+             return docs
+
+         except Exception as e:
+             print(f"Warning: Could not load lm-eval task '{self.task_name}': {e}")
+             return []
+
+     def get_extractor(self) -> BenchmarkExtractor:
+         """Get the benchmark extractor for this task."""
+         return self._extractor
+
+     def get_name(self) -> str:
+         """Get the task name."""
+         return self.task_name
+
+     def get_description(self) -> str:
+         """Get the task description."""
+         return self._description
+
+     def get_categories(self) -> List[str]:
+         """Get the task categories."""
+         return self._categories
+
+
+ class MBPPTask(LMEvalTask):
+     """MBPP task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="mbpp",
+             description="MBPP: Mostly Basic Python Problems coding benchmark",
+             categories=["coding", "reasoning", "python"],
+         )
+
+
+ class HumanEvalTask(LMEvalTask):
+     """HumanEval task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="humaneval",
+             description="HumanEval: Human Evaluation of Python coding problems",
+             categories=["coding", "reasoning", "python"],
+         )
+
+
+ class MBPPPlusTask(LMEvalTask):
+     """MBPP Plus task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="mbpp_plus",
+             description="MBPP Plus: Extended version of MBPP with additional test cases",
+             categories=["coding", "reasoning", "python"],
+         )
+
+
+ class GSM8KTask(LMEvalTask):
+     """GSM8K task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="gsm8k",
+             description="GSM8K: Grade School Math 8K problems",
+             categories=["mathematics", "reasoning", "arithmetic"],
+         )
+
+
+ class TruthfulQATask(LMEvalTask):
+     """TruthfulQA task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="truthfulqa_mc1",
+             description="TruthfulQA: Truthfulness evaluation benchmark",
+             categories=["hallucination", "general-knowledge", "reasoning"],
+         )
+
+     def load_data(self, limit: Optional[int] = None) -> List[Dict[str, Any]]:
+         """Load TruthfulQA data, which only has validation split."""
+         try:
+             from lm_eval.tasks import get_task_dict
+
+             # Get task directly from lm-eval
+             task_dict = get_task_dict([self.task_name])
+             if self.task_name not in task_dict:
+                 print(f"Warning: Task '{self.task_name}' not found in lm-eval")
+                 return []
+
+             task = task_dict[self.task_name]
+
+             # TruthfulQA only has validation split, access it directly
+             docs = []
+             if hasattr(task, "dataset") and "validation" in task.dataset:
+                 validation_data = task.dataset["validation"]
+                 docs = list(validation_data)
+
+             # Apply limit if specified
+             if limit and len(docs) > limit:
+                 docs = docs[:limit]
+
+             return docs
+
+         except Exception as e:
+             print(f"Warning: Could not load TruthfulQA task '{self.task_name}': {e}")
+             import traceback
+
+             traceback.print_exc()
+             return []
+
+
+ class MMLUTask(LMEvalTask):
+     """MMLU task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="mmlu",
+             description="MMLU: Massive Multitask Language Understanding",
+             categories=["general-knowledge", "science", "reasoning"],
+         )
+
+
+ # === CODING TASKS ===
+
+
+ class InstructHumanEvalTask(LMEvalTask):
+     """InstructHumanEval task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="instructhumaneval",
+             description="InstructHumanEval: Instruction-following HumanEval benchmark",
+             categories=["coding", "reasoning", "python", "instruction-following"],
+         )
+
+
+ class HumanEvalPlusTask(LMEvalTask):
+     """HumanEval Plus task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="humaneval_plus",
+             description="HumanEval Plus: Extended HumanEval with more tests",
+             categories=["coding", "reasoning", "python"],
+         )
+
+
+ class ConalaTask(LMEvalTask):
+     """Conala task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="conala",
+             description="Conala: Code generation from natural language",
+             categories=["coding", "reasoning", "python", "nl2code"],
+         )
+
+
+ class ConcodeTask(LMEvalTask):
+     """Concode task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="concode",
+             description="Concode: Code completion benchmark",
+             categories=["coding", "reasoning", "completion"],
+         )
+
+
+ class MercuryTask(LMEvalTask):
+     """Mercury task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="mercury",
+             description="Mercury: Code generation benchmark",
+             categories=["coding", "reasoning"],
+         )
+
+
+ class AppsTask(LMEvalTask):
+     """APPS task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="apps",
+             description="APPS: Automated Programming Problems Synthesis",
+             categories=["coding", "reasoning", "python", "competitive"],
+         )
+
+
+ class DS1000Task(LMEvalTask):
+     """DS1000 task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="ds1000",
+             description="DS1000: Data Science coding tasks",
+             categories=["coding", "reasoning", "python", "data-science"],
+         )
+
+
+ class MultiplePyTask(LMEvalTask):
+     """Multiple-Py task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="multiple_py",
+             description="Multiple-Py: Multi-language Python tasks",
+             categories=["coding", "reasoning", "python", "multi-language"],
+         )
+
+
+ class MultipleJsTask(LMEvalTask):
+     """Multiple-JS task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="multiple_js",
+             description="Multiple-JS: Multi-language JavaScript tasks",
+             categories=["coding", "reasoning", "javascript", "multi-language"],
+         )
+
+
+ class MultipleJavaTask(LMEvalTask):
+     """Multiple-Java task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="multiple_java",
+             description="Multiple-Java: Multi-language Java tasks",
+             categories=["coding", "reasoning", "java", "multi-language"],
+         )
+
+
+ class MultipleCppTask(LMEvalTask):
+     """Multiple-Cpp task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="multiple_cpp",
+             description="Multiple-Cpp: Multi-language C++ tasks",
+             categories=["coding", "reasoning", "cpp", "multi-language"],
+         )
+
+
+ class MultipleRsTask(LMEvalTask):
+     """Multiple-Rs task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="multiple_rs",
+             description="Multiple-Rs: Multi-language Rust tasks",
+             categories=["coding", "reasoning", "rust", "multi-language"],
+         )
+
+
+ class MultipleGoTask(LMEvalTask):
+     """Multiple-Go task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="multiple_go",
+             description="Multiple-Go: Multi-language Go tasks",
+             categories=["coding", "reasoning", "go", "multi-language"],
+         )
+
+
+ class CodexglueCodeToTextPythonTask(LMEvalTask):
+     """CodexGlue Code-to-Text Python task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="codexglue_code_to_text_python",
+             description="CodexGlue Code-to-Text Python: Python code summarization",
+             categories=["coding", "reasoning", "python", "code-to-text"],
+         )
+
+
+ class CodexglueCodeToTextGoTask(LMEvalTask):
+     """CodexGlue Code-to-Text Go task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="codexglue_code_to_text_go",
+             description="CodexGlue Code-to-Text Go: Go code summarization",
+             categories=["coding", "reasoning", "go", "code-to-text"],
+         )
+
+
+ class CodexglueCodeToTextRubyTask(LMEvalTask):
+     """CodexGlue Code-to-Text Ruby task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="codexglue_code_to_text_ruby",
+             description="CodexGlue Code-to-Text Ruby: Ruby code summarization",
+             categories=["coding", "reasoning", "ruby", "code-to-text"],
+         )
+
+
+ class CodexglueCodeToTextJavaTask(LMEvalTask):
+     """CodexGlue Code-to-Text Java task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="codexglue_code_to_text_java",
+             description="CodexGlue Code-to-Text Java: Java code summarization",
+             categories=["coding", "reasoning", "java", "code-to-text"],
+         )
+
+
+ class CodexglueCodeToTextJavascriptTask(LMEvalTask):
+     """CodexGlue Code-to-Text JavaScript task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="codexglue_code_to_text_javascript",
+             description="CodexGlue Code-to-Text JavaScript: JavaScript code summarization",
+             categories=["coding", "reasoning", "javascript", "code-to-text"],
+         )
+
+
+ class CodexglueCodeToTextPhpTask(LMEvalTask):
+     """CodexGlue Code-to-Text PHP task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="codexglue_code_to_text_php",
+             description="CodexGlue Code-to-Text PHP: PHP code summarization",
+             categories=["coding", "reasoning", "php", "code-to-text"],
+         )
+
+
+ class RecodeTask(LMEvalTask):
+     """Recode task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="recode",
+             description="Recode: Perturbed HumanEval natural generation",
+             categories=["coding", "reasoning", "python", "perturbation"],
+         )
+
+
+ class Squad2Task(LMEvalTask):
+     """SQuAD2 task implementation."""
+
+     def __init__(self):
+         super().__init__(
+             task_name="squadv2",
+             description="SQuAD2: Stanford Question Answering Dataset 2.0",
+             categories=["reading-comprehension", "qa", "natural-language"],
+         )
+
+     def load_data(self, limit: Optional[int] = None) -> List[Dict[str, Any]]:
+         """Load SQuAD2 data, which only has validation split."""
+         try:
+             from lm_eval.tasks import get_task_dict
+
+             # Get task directly from lm-eval
+             task_dict = get_task_dict([self.task_name])
+             if self.task_name not in task_dict:
+                 print(f"Warning: Task '{self.task_name}' not found in lm-eval")
+                 return []
+
+             task = task_dict[self.task_name]
+
+             # SQuAD2 only has validation split, access it directly
+             docs = []
+             if hasattr(task, "dataset") and "validation" in task.dataset:
+                 validation_data = task.dataset["validation"]
+                 docs = list(validation_data)
+
+             # Apply limit if specified
+             if limit and len(docs) > limit:
+                 docs = docs[:limit]
+
+             return docs
+
+         except Exception as e:
+             print(f"Warning: Could not load SQuAD2 task '{self.task_name}': {e}")
+             import traceback
+
+             traceback.print_exc()
+             return []
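For orientation, a minimal usage sketch of the wrappers added above (hypothetical driver code, not part of the package diff; the module path is inferred from the file list and lm-eval must be installed for load_data to return anything):

    # Hypothetical usage of the new LMEvalTask wrappers (assumed import path).
    from wisent.core.tasks.lm_eval_task import GSM8KTask

    task = GSM8KTask()
    print(task.get_name(), task.get_categories())
    docs = task.load_data(limit=5)   # prints a warning and returns [] if lm-eval can't resolve the task
    for doc in docs:
        print(doc)                   # each doc is normalized to a dict by load_data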
wisent/core/tasks/math500_task.py
@@ -0,0 +1,84 @@
+ """
+ MATH-500 task implementation for task-agnostic architecture.
+ """
+
+ from typing import Dict, Any, List, Optional
+ from ..task_interface import TaskInterface
+ from ..benchmark_extractors import GSM8KExtractor
+ import datasets
+
+
+ class Math500Task(TaskInterface):
+     """MATH-500 mathematical reasoning task implementation."""
+
+     def __init__(self, limit: Optional[int] = None):
+         self._limit = limit
+         self._data = None  # Cache for loaded data
+         self._extractor = GSM8KExtractor()  # Reuse GSM8K extractor
+
+     def load_data(self, limit: Optional[int] = None) -> List[Dict[str, Any]]:
+         """Load MATH-500 data from HuggingFace."""
+         dataset = datasets.load_dataset("HuggingFaceH4/MATH-500", split="test")
+
+         # Apply limit
+         effective_limit = limit or self._limit
+         if effective_limit:
+             dataset = dataset.select(range(min(effective_limit, len(dataset))))
+
+         # Convert to list of dictionaries
+         return [dict(item) for item in dataset]
+
+
+     def get_task_info(self) -> Dict[str, Any]:
+         """Get information about the MATH-500 task."""
+         return {
+             "task_name": "math500",
+             "description": "500 mathematical reasoning problems from OpenAI's MATH dataset",
+             "source": "HuggingFaceH4/MATH-500",
+             "task_type": "text_generation",
+             "evaluation_method": "mathematical_equivalence"
+         }
+
+     def validate_sample(self, sample: Dict[str, Any]) -> bool:
+         """Validate that a sample has required MATH-500 fields."""
+         required_fields = ["problem", "answer"]
+         return all(field in sample for field in required_fields)
+
+     def get_extractor(self) -> GSM8KExtractor:
+         """Get the benchmark extractor for this task."""
+         return self._extractor
+
+     def get_name(self) -> str:
+         """Get the task name."""
+         return "math500"
+
+     def get_description(self) -> str:
+         """Get the task description."""
+         return "500 mathematical reasoning problems from OpenAI's MATH dataset requiring multi-step solutions"
+
+     def get_categories(self) -> List[str]:
+         """Get the task categories."""
+         return ["mathematics", "reasoning", "text_generation"]
+
+     # Methods to match lm-eval interface
+     def has_validation_docs(self) -> bool:
+         """Check if task has validation documents."""
+         return False  # MATH-500 doesn't have separate validation sets
+
+     def has_test_docs(self) -> bool:
+         """Check if task has test documents."""
+         return True  # All samples are considered test docs
+
+     def test_docs(self) -> List[Dict[str, Any]]:
+         """Get test documents."""
+         if self._data is None:
+             self._data = self.load_data()
+         return self._data
+
+     def validation_docs(self) -> List[Dict[str, Any]]:
+         """Get validation documents."""
+         return []  # No separate validation set
+
+     def doc_to_text(self, doc: Dict[str, Any]) -> str:
+         """Convert document to text prompt."""
+         return doc.get('problem', '')
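A minimal usage sketch of Math500Task following the lm-eval-style interface it exposes (hypothetical driver code, not part of the package diff; the module path is inferred from the file list and the HuggingFace dataset must be reachable):

    # Hypothetical usage of Math500Task (assumed import path).
    from wisent.core.tasks.math500_task import Math500Task

    task = Math500Task(limit=3)
    for doc in task.test_docs():           # cached after the first load_data() call
        assert task.validate_sample(doc)   # requires "problem" and "answer" fields
        print(task.doc_to_text(doc))       # the "problem" field is used as the prompt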
wisent/core/tasks/polymath_task.py
@@ -0,0 +1,146 @@
+ """
+ PolyMath multilingual mathematical reasoning task implementation for task-agnostic architecture.
+ """
+
+ from typing import Dict, Any, List, Optional
+ from ..task_interface import TaskInterface
+ from ..benchmark_extractors import GSM8KExtractor
+ import datasets
+
+
+ class PolyMathTask(TaskInterface):
+     """PolyMath multilingual mathematical reasoning task implementation."""
+
+     # Dataset configurations for different language-difficulty combinations
+     DATASET_CONFIGS = {
+         "zh_medium": {
+             "source": "Qwen/PolyMath",
+             "language": "zh",
+             "split": "medium",
+             "fields": {"problem": "question", "answer": "answer"},
+             "description": "125 medium-difficulty mathematical problems in Chinese"
+         },
+         "en_medium": {
+             "source": "Qwen/PolyMath",
+             "language": "en",
+             "split": "medium",
+             "fields": {"problem": "question", "answer": "answer"},
+             "description": "125 medium-difficulty mathematical problems in English"
+         },
+         "zh_high": {
+             "source": "Qwen/PolyMath",
+             "language": "zh",
+             "split": "high",
+             "fields": {"problem": "question", "answer": "answer"},
+             "description": "125 high-difficulty mathematical problems in Chinese"
+         },
+         "en_high": {
+             "source": "Qwen/PolyMath",
+             "language": "en",
+             "split": "high",
+             "fields": {"problem": "question", "answer": "answer"},
+             "description": "125 high-difficulty mathematical problems in English"
+         }
+     }
+
+     def __init__(self, language: str = "en", difficulty: str = "medium", limit: Optional[int] = None):
+         """
+         Initialize PolyMath task for specified language and difficulty.
+
+         Args:
+             language: Language code ("en" for English, "zh" for Chinese). Default: "en"
+             difficulty: Difficulty level ("medium", "high"). Default: "medium"
+             limit: Maximum number of samples to load
+         """
+         config_key = f"{language}_{difficulty}"
+         if config_key not in self.DATASET_CONFIGS:
+             available = list(self.DATASET_CONFIGS.keys())
+             raise ValueError(f"PolyMath config '{config_key}' not supported. Available: {available}")
+
+         self.language = language
+         self.difficulty = difficulty
+         self.config_key = config_key
+         self.config = self.DATASET_CONFIGS[config_key]
+         self._limit = limit
+         self._data = None  # Cache for loaded data
+         self._extractor = GSM8KExtractor()  # Reuse enhanced GSM8K extractor
+
+     def load_data(self, limit: Optional[int] = None) -> List[Dict[str, Any]]:
+         """Load PolyMath data from HuggingFace for specified language and difficulty."""
+         # Load dataset based on language and difficulty configuration
+         dataset = datasets.load_dataset(
+             self.config["source"],
+             self.config["language"],
+             split=self.config["split"]
+         )
+
+         # Apply limit
+         effective_limit = limit or self._limit
+         if effective_limit:
+             dataset = dataset.select(range(min(effective_limit, len(dataset))))
+
+         # Convert to list and normalize field names
+         data = [dict(item) for item in dataset]
+
+         # Normalize field names for consistent processing
+         normalized_data = []
+         problem_field = self.config["fields"]["problem"]
+         answer_field = self.config["fields"]["answer"]
+
+         for item in data:
+             normalized_item = dict(item)  # Keep all original fields
+
+             # Ensure consistent field names for extractor
+             if problem_field in item:
+                 normalized_item["Problem"] = item[problem_field]
+                 normalized_item["question"] = item[problem_field]  # For question/answer format
+
+             if answer_field in item:
+                 normalized_item["Answer"] = item[answer_field]
+                 normalized_item["answer"] = item[answer_field]  # For question/answer format
+
+             normalized_data.append(normalized_item)
+
+         return normalized_data
+
+
+     def get_task_info(self) -> Dict[str, Any]:
+         """Get information about the PolyMath task."""
+         return {
+             "task_name": f"polymath_{self.config_key}",
+             "language": self.language,
+             "difficulty": self.difficulty,
+             "description": self.config["description"],
+             "source": self.config["source"],
+             "task_type": "text_generation",
+             "evaluation_method": "mathematical_equivalence"
+         }
+
+     def validate_sample(self, sample: Dict[str, Any]) -> bool:
+         """Validate that a sample has required PolyMath fields."""
+         problem_field = self.config["fields"]["problem"]
+         answer_field = self.config["fields"]["answer"]
+
+         return all(field in sample for field in [problem_field, answer_field])
+
+     def get_extractor(self) -> GSM8KExtractor:
+         """Get the benchmark extractor for this task."""
+         return self._extractor
+
+     def get_name(self) -> str:
+         """Get the task name."""
+         return f"polymath_{self.config_key}"
+
+     def get_description(self) -> str:
+         """Get the task description."""
+         lang_name = "Chinese" if self.language == "zh" else "English"
+         return f"PolyMath {self.difficulty}-difficulty mathematical problems in {lang_name}"
+
+     def get_categories(self) -> List[str]:
+         """Get the task categories."""
+         return ["mathematics", "reasoning", "multilingual", "text_generation"]
+
+     @classmethod
+     def get_supported_configs(cls) -> List[str]:
+         """Get list of supported PolyMath language-difficulty configurations."""
+         return list(cls.DATASET_CONFIGS.keys())
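A minimal usage sketch of PolyMathTask's language/difficulty configuration (hypothetical driver code, not part of the package diff; the module path is inferred from the file list):

    # Hypothetical usage of PolyMathTask (assumed import path).
    from wisent.core.tasks.polymath_task import PolyMathTask

    print(PolyMathTask.get_supported_configs())     # ['zh_medium', 'en_medium', 'zh_high', 'en_high']
    task = PolyMathTask(language="zh", difficulty="high", limit=2)
    print(task.get_name())                          # polymath_zh_high
    docs = task.load_data()                         # fields normalized to question/answer and Problem/Answer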