PyPI - wisent - Versions diffs - 0.7.379__py3-none-any.whl → 0.7.901__py3-none-any.whl - Mend

wisent 0.7.379__py3-none-any.whl → 0.7.901__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1020) hide show

wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mercury.py CHANGED Viewed

@@ -1,5 +1,6 @@
 from __future__ import annotations
+import json
 from typing import Any
 from wisent.core.cli_logger import setup_logger
@@ -13,43 +14,45 @@ log = setup_logger(__name__)
 class MercuryExtractor(HuggingFaceBenchmarkExtractor):
     """
-    Extractor for mercury dataset (code-to-code translation).
+    Extractor for Mercury - code efficiency benchmark.
-    Schema (code_x_glue_cc_code_to_code_trans):
-        - java: str (java code/prompt)
-        - cs: str (c# code/answer)
+    Dataset: Elfsong/Mercury
+    Paper: "Mercury: A Code Efficiency Benchmark for LLM Code Synthesis"
-    Note: This is a translation task, not code execution. Uses generation evaluator.
+    Mercury evaluates code efficiency by comparing different solutions
+    to the same problem based on runtime performance.
+    Schema:
+        - prompt: str (problem description)
+        - solutions: list[dict] with runtime and solution code
+        - test_cases: str (JSON with test inputs/outputs)
+        - difficulty: str
+    For code efficiency evaluation:
+    - Positive (correct) = Fastest solution
+    - Negative (incorrect) = Slowest solution
     """
-    evaluator_name = "generation"
+    evaluator_name = "code_efficiency"
     def extract_contrastive_pairs(
         self,
         limit: int | None = None,
     ) -> list[ContrastivePair]:
         """
-        Build contrastive pairs from mercury examples.
-        Args:
-            limit: Optional maximum number of pairs to produce.
-        Returns:
-            A list of ContrastivePair objects.
+        Build contrastive pairs from Mercury examples.
         """
         max_items = self._normalize_limit(limit)
-        # Load dataset - using code_x_glue as alternative since tau/code_translation doesn't exist
         docs = self.load_dataset(
-            dataset_name="code_x_glue_cc_code_to_code_trans",
-            dataset_config="default",
-            split="train",
+            dataset_name="Elfsong/Mercury",
+            split="eval",
             limit=max_items,
         )
         pairs: list[ContrastivePair] = []
-        log.info(f"Extracting contrastive pairs from {len(docs)} mercury examples")
+        log.info(f"Extracting contrastive pairs from {len(docs)} Mercury examples")
         for doc in docs:
             pair = self._extract_pair_from_doc(doc)
@@ -59,53 +62,73 @@ class MercuryExtractor(HuggingFaceBenchmarkExtractor):
                     break
         if not pairs:
-            log.warning("No valid mercury pairs extracted")
+            log.warning("No valid Mercury pairs extracted")
         return pairs
     def _extract_pair_from_doc(self, doc: dict[str, Any]) -> ContrastivePair | None:
         """
         Convert a single doc into a ContrastivePair.
-        Returns None when required fields are missing or malformed.
+        Uses fastest vs slowest solution as correct vs incorrect.
         """
         try:
-            question = doc.get("java", "").strip()
-            answer = doc.get("cs", "")
+            prompt = doc.get("prompt", "").strip()
+            solutions = doc.get("solutions", [])
+            difficulty = doc.get("difficulty", "")
+            slug_name = doc.get("slug_name", "")
+            pretty_content = doc.get("pretty_content", [])
-            if not question or not answer:
-                log.debug("Skipping: missing question or answer")
+            if not prompt or not solutions or len(solutions) < 2:
                 return None
-            # Convert answer to string
-            correct_answer = str(answer).strip()
+            # Sort solutions by runtime (fastest first)
+            # Runtime format is like "44ms", "36ms", etc.
+            def parse_runtime(sol):
+                runtime_str = sol.get("runtime", "999ms")
+                try:
+                    return int(runtime_str.replace("ms", ""))
+                except:
+                    return 999
+            sorted_solutions = sorted(solutions, key=parse_runtime)
+            fastest = sorted_solutions[0]
+            slowest = sorted_solutions[-1]
+            fastest_code = fastest.get("solution", "")
+            slowest_code = slowest.get("solution", "")
+            if not fastest_code or not slowest_code:
+                return None
+            # Use pretty_content if available for problem description
+            problem_desc = pretty_content[0] if pretty_content else prompt
-            # Create incorrect answer (modify or corrupt)
-            incorrect_answer = self._create_incorrect_answer(correct_answer)
+            formatted_question = f"""Code Efficiency Task:
-            # Format the question
-            formatted_question = f"Translate this Java code to C#:\n{question}"
+{problem_desc}
+Write an efficient Python solution."""
             metadata = {
                 "label": "mercury",
-                "source": "code_x_glue_cc_code_to_code_trans",
+                "source": "Elfsong/Mercury",
+                "slug_name": slug_name,
+                "difficulty": difficulty,
+                "fastest_runtime": fastest.get("runtime", ""),
+                "slowest_runtime": slowest.get("runtime", ""),
+                "is_code_efficiency_benchmark": True,
             }
             return self._build_pair(
                 question=formatted_question,
-                correct=correct_answer,
-                incorrect=incorrect_answer,
+                correct=f"```python\n{fastest_code}\n```",
+                incorrect=f"```python\n{slowest_code}\n```",
                 metadata=metadata,
             )
         except Exception as exc:
-            log.error(f"Error extracting pair from doc: {exc}", exc_info=True)
+            log.error(f"Error extracting Mercury pair: {exc}", exc_info=True)
             return None
-    def _create_incorrect_answer(self, correct: str) -> str:
-        """Create an incorrect answer by modifying the correct one."""
-        # For code, corrupt it slightly
-        if len(correct) > 10:
-            return correct[:len(correct)//2] + "# CORRUPTED" + correct[len(correct)//2:]
-        return f"{correct} # INCORRECT"

wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/olympiadbench.py CHANGED Viewed

@@ -111,18 +111,8 @@ class OlympiadBenchExtractor(HuggingFaceBenchmarkExtractor):
             )
             log.info(f"Loaded {len(docs)} examples from OlympiadBench ({self.config})")
         except Exception as e:
-            log.warning(f"Failed to load OlympiadBench with config {self.config}: {e}")
-            # Try alternative config
-            try:
-                docs = self.load_dataset(
-                    dataset_name="lmms-lab/OlympiadBench",
-                    split="test",
-                    limit=max_items,
-                )
-                log.info(f"Loaded {len(docs)} examples from lmms-lab/OlympiadBench")
-            except Exception as e2:
-                log.error(f"Failed to load any OlympiadBench: {e2}")
-                return []
+            log.error(f"Failed to load Hothan/OlympiadBench: {e}")
+            return []
         pairs: list[ContrastivePair] = []

wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/planbench.py CHANGED Viewed

@@ -1,6 +1,7 @@
 from __future__ import annotations
 from typing import Any
+from datasets import load_dataset
 from wisent.core.cli_logger import setup_logger
 from wisent.core.contrastive_pairs.core.pair import ContrastivePair
@@ -10,73 +11,61 @@ __all__ = ["PlanBenchExtractor"]
 log = setup_logger(__name__)
-# PlanBench domains
-PLANBENCH_DOMAINS = [
-    "blocksworld",   # Classic blocks world planning
-    "logistics",     # Package delivery logistics
-]
-# PlanBench task types
-PLANBENCH_TASKS = [
-    "plan_generation",           # Generate a valid plan
-    "cost_optimal_planning",     # Generate cost-optimal plan
-    "plan_verification",         # Verify if a plan is valid
-    "goal_recognition",          # Recognize the goal from actions
-    "plan_execution_reasoning",  # Predict outcome of action execution
-    "action_reordering",         # Reorder actions for valid plan
+PLANBENCH_CONFIGS = [
+    "task_1_plan_generation",
+    "task_2_plan_optimality",
+    "task_3_plan_verification",
+    "task_5_plan_generalization",
+    "task_7_plan_execution",
+    "task_8_1_goal_shuffling",
+    "task_8_2_full_to_partial",
 ]
 class PlanBenchExtractor(HuggingFaceBenchmarkExtractor):
     """
-    Extractor for PlanBench - Planning and Reasoning Benchmark.
+    Extractor for PlanBench - Planning and Reasoning Benchmark (NeurIPS 2023).
     PlanBench evaluates LLMs on planning and reasoning about actions
     and change, using domains from the International Planning Competition.
-    Domains:
-    - Blocksworld: Classic blocks stacking problems
-    - Logistics: Package delivery with trucks and planes
-    Task Types:
-    - Plan generation and cost-optimal planning
-    - Plan verification
-    - Goal recognition
-    - Plan execution reasoning
-    - Action reordering
+    Dataset: tasksource/planbench (HuggingFace)
-    Dataset: GitHub karthikv792/LLMs-Planning
+    Available configs:
+    - task_1_plan_generation: Generate a valid plan
+    - task_2_plan_optimality: Generate cost-optimal plan
+    - task_3_plan_verification: Verify if a plan is valid
+    - task_5_plan_generalization: Generalize plan to new instances
+    - task_7_plan_execution: Predict execution outcome
+    - task_8_1_goal_shuffling: Handle shuffled goals
+    - task_8_2_full_to_partial: Full to partial observability
     For planning evaluation:
-    - Positive (correct) = Valid plan or correct reasoning
-    - Negative (incorrect) = Invalid plan or incorrect reasoning
+    - Positive (correct) = Valid plan matching ground truth
+    - Negative (incorrect) = Invalid or wrong plan
     """
-    # Evaluator that should be used for this benchmark
     evaluator_name = "planning_reasoning"
-    def __init__(self, domain: str = "blocksworld", task: str = "plan_generation"):
+    def __init__(self, config: str = "task_1_plan_generation"):
         """
         Initialize PlanBench extractor.
         Args:
-            domain: Planning domain ("blocksworld", "logistics")
-            task: Task type (e.g., "plan_generation", "plan_verification")
+            config: PlanBench task config (default: task_1_plan_generation)
         """
         super().__init__()
-        self.domain = domain
-        self.task = task
+        if config not in PLANBENCH_CONFIGS:
+            log.warning(f"Unknown config '{config}', using task_1_plan_generation")
+            config = "task_1_plan_generation"
+        self.config = config
     def extract_contrastive_pairs(
         self,
         limit: int | None = None,
     ) -> list[ContrastivePair]:
         """
-        Build contrastive pairs from PlanBench examples.
-        Creates pairs for planning evaluation:
-        - Positive (correct) = Valid planning solution
-        - Negative (incorrect) = Invalid planning solution
+        Build contrastive pairs from PlanBench.
         Args:
             limit: Optional maximum number of pairs to produce.
@@ -86,9 +75,8 @@ class PlanBenchExtractor(HuggingFaceBenchmarkExtractor):
         """
         max_items = self._normalize_limit(limit)
-        # PlanBench is on GitHub, create examples based on documented structure
-        docs = self._create_planbench_examples(max_items or 50)
-        log.info(f"Created {len(docs)} PlanBench examples ({self.domain}, {self.task})")
+        docs = self._load_planbench_data()
+        log.info(f"Loaded {len(docs)} PlanBench examples (config: {self.config})")
         pairs: list[ContrastivePair] = []
@@ -104,161 +92,60 @@ class PlanBenchExtractor(HuggingFaceBenchmarkExtractor):
         return pairs
-    def _create_planbench_examples(self, count: int) -> list[dict[str, Any]]:
-        """Create examples based on PlanBench structure."""
-        examples = []
-        if self.domain == "blocksworld":
-            examples = self._create_blocksworld_examples(count)
-        elif self.domain == "logistics":
-            examples = self._create_logistics_examples(count)
-        else:
-            examples = self._create_blocksworld_examples(count)
-        return examples
-    def _create_blocksworld_examples(self, count: int) -> list[dict[str, Any]]:
-        """Create blocksworld planning examples."""
-        blocksworld_cases = [
-            {
-                "initial_state": "Block A is on the table. Block B is on Block A. Block C is on the table.",
-                "goal_state": "Block A is on Block B. Block B is on Block C.",
-                "valid_plan": [
-                    "1. Unstack B from A",
-                    "2. Put B on C",
-                    "3. Pick up A",
-                    "4. Stack A on B",
-                ],
-                "invalid_plan": [
-                    "1. Pick up A",  # Invalid - B is on A
-                    "2. Put A on B",
-                ],
-            },
-            {
-                "initial_state": "Block A is on Block B. Block B is on the table. Block C is on the table. The robot arm is empty.",
-                "goal_state": "Block B is on Block A. Block C is on Block B.",
-                "valid_plan": [
-                    "1. Unstack A from B",
-                    "2. Put A on the table",
-                    "3. Pick up B",
-                    "4. Stack B on A",
-                    "5. Pick up C",
-                    "6. Stack C on B",
-                ],
-                "invalid_plan": [
-                    "1. Stack B on A",  # Invalid - A is on B
-                ],
-            },
-            {
-                "initial_state": "Block A, B, and C are on the table. Block D is on Block A.",
-                "goal_state": "Block A is on Block B. Block B is on Block C. Block D is on Block A.",
-                "valid_plan": [
-                    "1. Unstack D from A",
-                    "2. Put D on table",
-                    "3. Pick up B",
-                    "4. Stack B on C",
-                    "5. Pick up A",
-                    "6. Stack A on B",
-                    "7. Pick up D",
-                    "8. Stack D on A",
-                ],
-                "invalid_plan": [
-                    "1. Pick up A",  # Invalid - D is on A
-                ],
-            },
-        ]
-        examples = []
-        for i in range(count):
-            case = blocksworld_cases[i % len(blocksworld_cases)].copy()
-            case["case_id"] = f"blocks_{i:03d}"
-            case["domain"] = "blocksworld"
-            examples.append(case)
-        return examples
-    def _create_logistics_examples(self, count: int) -> list[dict[str, Any]]:
-        """Create logistics planning examples."""
-        logistics_cases = [
-            {
-                "initial_state": "Package P1 is in City A. Truck T1 is in City A. Package needs to go to City B.",
-                "goal_state": "Package P1 is in City B.",
-                "valid_plan": [
-                    "1. Load P1 onto T1 in City A",
-                    "2. Drive T1 from City A to City B",
-                    "3. Unload P1 from T1 in City B",
-                ],
-                "invalid_plan": [
-                    "1. Drive T1 from City A to City B",
-                    "2. Unload P1 from T1",  # Invalid - P1 was never loaded
-                ],
-            },
-            {
-                "initial_state": "Package P1 is in City A. Package P2 is in City B. Plane A1 is in City A. Goal: P1 in City C, P2 in City A.",
-                "goal_state": "Package P1 is in City C. Package P2 is in City A.",
-                "valid_plan": [
-                    "1. Load P1 onto Plane A1 in City A",
-                    "2. Fly A1 from City A to City B",
-                    "3. Load P2 onto A1 in City B",
-                    "4. Fly A1 from City B to City A",
-                    "5. Unload P2 in City A",
-                    "6. Fly A1 from City A to City C",
-                    "7. Unload P1 in City C",
-                ],
-                "invalid_plan": [
-                    "1. Fly A1 to City B",
-                    "2. Unload P1",  # P1 was never loaded
-                ],
-            },
-        ]
-        examples = []
-        for i in range(count):
-            case = logistics_cases[i % len(logistics_cases)].copy()
-            case["case_id"] = f"logistics_{i:03d}"
-            case["domain"] = "logistics"
-            examples.append(case)
-        return examples
+    def _load_planbench_data(self) -> list[dict[str, Any]]:
+        """Load PlanBench data from HuggingFace."""
+        try:
+            ds = load_dataset("tasksource/planbench", self.config, split="train")
+            examples = []
+            for i, item in enumerate(ds):
+                examples.append({
+                    "case_id": f"planbench_{self.config}_{i:04d}",
+                    "task": item.get("task", ""),
+                    "prompt_type": item.get("prompt_type", ""),
+                    "domain": item.get("domain", ""),
+                    "instance_id": item.get("instance_id", ""),
+                    "query": item.get("query", ""),
+                    "ground_truth_plan": item.get("ground_truth_plan", ""),
+                })
+            return examples
+        except Exception as e:
+            log.error(f"Failed to load PlanBench from HuggingFace: {e}")
+            raise RuntimeError(f"Cannot load PlanBench data: {e}")
     def _extract_pair_from_doc(self, doc: dict[str, Any]) -> ContrastivePair | None:
         """
         Convert a single doc into a ContrastivePair.
+        PlanBench HuggingFace format:
+        {"task": "task_1_plan_generation", "prompt_type": "oneshot", "domain": "...",
+         "instance_id": 2, "query": "...", "ground_truth_plan": "..."}
         """
         try:
             case_id = doc.get("case_id", "")
-            initial_state = doc.get("initial_state", "").strip()
-            goal_state = doc.get("goal_state", "").strip()
-            valid_plan = doc.get("valid_plan", [])
-            invalid_plan = doc.get("invalid_plan", [])
-            domain = doc.get("domain", self.domain)
-            if not initial_state or not goal_state:
-                log.debug("Skipping: missing states")
-                return None
+            query = doc.get("query", "").strip()
+            ground_truth_plan = doc.get("ground_truth_plan", "").strip()
+            domain = doc.get("domain", "")
+            task = doc.get("task", "")
-            # Build the planning task prompt
-            task_prompt = self._build_planning_prompt(
-                initial_state, goal_state, domain
-            )
+            if not query or not ground_truth_plan:
+                log.debug("Skipping: missing query or ground_truth_plan")
+                return None
-            # Positive = valid plan
-            correct_response = self._create_valid_plan_response(valid_plan)
-            # Negative = invalid plan
-            incorrect_response = self._create_invalid_plan_response(invalid_plan)
+            correct_response = self._create_correct_response(ground_truth_plan)
+            incorrect_response = self._create_incorrect_response(ground_truth_plan)
             metadata = {
                 "label": "planbench",
-                "source": "karthikv792/LLMs-Planning",
+                "source": "tasksource/planbench",
                 "case_id": case_id,
                 "domain": domain,
-                "task": self.task,
-                "plan_length": len(valid_plan),
+                "task": task,
+                "config": self.config,
                 "is_planning_benchmark": True,
             }
             return self._build_pair(
-                question=task_prompt,
+                question=query,
                 correct=correct_response,
                 incorrect=incorrect_response,
                 metadata=metadata,
@@ -268,50 +155,22 @@ class PlanBenchExtractor(HuggingFaceBenchmarkExtractor):
             log.error(f"Error extracting pair from doc: {exc}", exc_info=True)
             return None
-    def _build_planning_prompt(
-        self, initial_state: str, goal_state: str, domain: str
-    ) -> str:
-        """Build the planning task prompt."""
-        domain_desc = ""
-        if domain == "blocksworld":
-            domain_desc = (
-                "In this blocks world domain, you can:\n"
-                "- Pick up a block (only if nothing is on it and arm is empty)\n"
-                "- Put down a block on the table\n"
-                "- Stack a block on another (only if target block is clear)\n"
-                "- Unstack a block from another\n\n"
-            )
-        elif domain == "logistics":
-            domain_desc = (
-                "In this logistics domain, you can:\n"
-                "- Load packages onto trucks/planes (at same location)\n"
-                "- Unload packages from trucks/planes\n"
-                "- Drive trucks between locations in same city\n"
-                "- Fly planes between cities\n\n"
-            )
+    def _create_correct_response(self, ground_truth_plan: str) -> str:
+        """Create a response with the correct plan."""
         return (
-            f"{domain_desc}"
-            f"Initial State:\n{initial_state}\n\n"
-            f"Goal State:\n{goal_state}\n\n"
-            "Generate a valid sequence of actions to achieve the goal state from "
-            "the initial state. Ensure each action's preconditions are satisfied."
+            f"Here is the plan to achieve the goal:\n\n{ground_truth_plan}\n\n"
+            "Each action in this sequence has its preconditions satisfied."
         )
-    def _create_valid_plan_response(self, plan: list[str]) -> str:
-        """Create a response with a valid plan."""
-        plan_str = "\n".join(plan)
-        return (
-            f"Here is a valid plan to achieve the goal:\n\n{plan_str}\n\n"
-            "Each action in this sequence has its preconditions satisfied by the "
-            "previous actions, and executing them in order will achieve the goal state."
-        )
-    def _create_invalid_plan_response(self, plan: list[str]) -> str:
-        """Create a response with an invalid plan."""
-        plan_str = "\n".join(plan) if plan else "1. [Incomplete plan]"
+    def _create_incorrect_response(self, ground_truth_plan: str) -> str:
+        """Create an incorrect response (wrong/incomplete plan)."""
+        lines = ground_truth_plan.strip().split("\n")
+        if len(lines) > 1:
+            wrong_plan = "\n".join(reversed(lines[:2]))
+        else:
+            wrong_plan = "(noop)"
         return (
-            f"Here's my plan:\n\n{plan_str}\n\n"
+            f"Here's my plan:\n\n{wrong_plan}\n\n"
             "This should work to reach the goal."
         )

wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/polymath.py CHANGED Viewed

@@ -130,13 +130,46 @@ class PolyMathExtractor(HuggingFaceBenchmarkExtractor):
             return None
     def _create_incorrect_answer(self, correct: str) -> str:
-        """Create an incorrect answer by modifying the correct one."""
+        """Create a meaningful incorrect answer using plausible wrong values."""
+        import random
+        import re
+        random.seed(hash(correct) % (2**32))
+        # Try symbolic parsing first
         try:
             parsed_correct = parse_latex(correct)
-            incorrect = str(latex(parsed_correct + 1))
-            return incorrect
+            transforms = [
+                parsed_correct * 2,
+                parsed_correct / 2,
+                parsed_correct - 1,
+                -parsed_correct,
+            ]
+            wrong = random.choice(transforms)
+            return str(latex(wrong))
         except Exception:
-            return f"{correct} + 1"
+            pass
+        # Try simple integer
+        try:
+            clean = correct.replace('$', '').replace(',', '').strip()
+            num = int(clean)
+            wrong_vals = [num * 2, num // 2 if num > 1 else num * 3, num - 1, -num]
+            return str(random.choice(wrong_vals))
+        except ValueError:
+            pass
+        # For fractions
+        frac_match = re.match(r'\\frac\{(\d+)\}\{(\d+)\}', correct)
+        if frac_match:
+            n, d = int(frac_match.group(1)), int(frac_match.group(2))
+            return random.choice([f"\\frac{{{d}}}{{{n}}}", f"\\frac{{{n*2}}}{{{d}}}"])
+        # For pi expressions
+        if '\\pi' in correct:
+            return correct.replace('\\pi', '2\\pi') if '2\\pi' not in correct else correct.replace('2\\pi', '\\pi')
+        # Fallback
+        return random.choice(['0', '1', '-1', '2'])
     @staticmethod

wisent 0.7.379__py3-none-any.whl → 0.7.901__py3-none-any.whl

wisent 0.7.379__py3-none-any.whl → 0.7.901__py3-none-any.whl