PyPI - wisent - Versions diffs - 0.7.379__py3-none-any.whl → 0.7.701__py3-none-any.whl - Mend

wisent 0.7.379__py3-none-any.whl → 0.7.701__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (725) hide show

wisent/examples/scripts/results/benchmark_test_summary_nov4.json DELETED Viewed

@@ -1,71 +0,0 @@
-{
-  "passed": [
-    "aime2024",
-    "aime2025",
-    "aime",
-    "arc_challenge",
-    "arc_easy",
-    "arithmetic",
-    "asdiv",
-    "cb",
-    "copa",
-    "coqa",
-    "drop",
-    "gsm8k",
-    "hendrycks_math",
-    "hmmt",
-    "hmmt_feb_2025",
-    "humaneval",
-    "humaneval_plus",
-    "lambada_standard",
-    "livemathbench_cnmo_en",
-    "livemathbench_cnmo_zh",
-    "mmlu",
-    "nq_open",
-    "penn_treebank",
-    "piqa",
-    "polymath_en_high",
-    "polymath_zh_high",
-    "ptb",
-    "record",
-    "squad2",
-    "squadv2",
-    "triviaqa",
-    "webqs",
-    "wikitext103",
-    "wikitext"
-  ],
-  "failed": [
-    "apps",
-    "boolq",
-    "codexglue_code_to_text_go",
-    "codexglue_code_to_text_java",
-    "codexglue_code_to_text_javascript",
-    "codexglue_code_to_text_php",
-    "codexglue_code_to_text_python",
-    "codexglue_code_to_text_ruby",
-    "conala",
-    "concode",
-    "ds1000",
-    "gpqa",
-    "hellaswag",
-    "instruct_humaneval",
-    "lambada_openai",
-    "livecodebench",
-    "math500",
-    "math",
-    "mbpp",
-    "mbpp_plus",
-    "mercury",
-    "openbookqa",
-    "polymath_en_medium",
-    "polymath_zh_medium",
-    "race",
-    "recode",
-    "swag",
-    "truthfulqa_mc1",
-    "truthfulqa_mc2",
-    "winogrande"
-  ],
-  "error": []
-}

wisent/examples/scripts/results/coding_benchmarks_test_code_status.json DELETED Viewed

@@ -1,150 +0,0 @@
-{
-  "summary": {
-    "total_coding_benchmarks": 18,
-    "benchmarks_with_test_code": 2,
-    "benchmarks_without_test_code": 16,
-    "model_tested": "meta-llama/Llama-3.2-1B-Instruct",
-    "test_date": "2025-11-03",
-    "test_script": "test_coding_benchmarks.py"
-  },
-  "evaluation_method_verification": {
-    "expected_method": "docker_code (code execution)",
-    "verification_status": "CONFIRMED",
-    "note": "After removing log-likelihood fallback, docker_code_evaluator now ONLY performs code execution. Benchmarks without test_code return UNKNOWN as expected."
-  },
-  "benchmarks_with_test_code": {
-    "humaneval": {
-      "status": "PASSED",
-      "has_test_code": true,
-      "evaluation_method": "docker_code",
-      "test_result": "Code executed successfully",
-      "note": "Python function completion with unit tests"
-    },
-    "humaneval_plus": {
-      "status": "PASSED",
-      "has_test_code": true,
-      "evaluation_method": "docker_code",
-      "test_result": "Code executed successfully",
-      "note": "Extended version of HumanEval with more tests"
-    }
-  },
-  "benchmarks_without_test_code": {
-    "mbpp": {
-      "status": "UNKNOWN",
-      "has_test_code": false,
-      "evaluation_method": "docker_code",
-      "test_result": "No test code provided",
-      "note": "Python programming problems - dataset may not include test_code in metadata"
-    },
-    "mbpp_plus": {
-      "status": "UNKNOWN",
-      "has_test_code": false,
-      "evaluation_method": "docker_code",
-      "test_result": "No test code provided",
-      "note": "Extended version of MBPP"
-    },
-    "instruct_humaneval": {
-      "status": "UNKNOWN",
-      "has_test_code": false,
-      "evaluation_method": "docker_code",
-      "test_result": "No test code provided",
-      "note": "Instruction-following version of HumanEval"
-    },
-    "apps": {
-      "status": "UNKNOWN",
-      "has_test_code": false,
-      "evaluation_method": "docker_code",
-      "test_result": "No test code provided",
-      "note": "Automated Programming Progress Standard"
-    },
-    "ds1000": {
-      "status": "UNKNOWN",
-      "has_test_code": false,
-      "evaluation_method": "docker_code",
-      "test_result": "No test code provided",
-      "note": "Data science code generation benchmark"
-    },
-    "livecodebench": {
-      "status": "UNKNOWN",
-      "has_test_code": false,
-      "evaluation_method": "docker_code",
-      "test_result": "No test code provided",
-      "note": "Live coding benchmark"
-    },
-    "conala": {
-      "status": "UNKNOWN",
-      "has_test_code": false,
-      "evaluation_method": "docker_code",
-      "test_result": "No test code provided",
-      "note": "Python code snippet generation"
-    },
-    "concode": {
-      "status": "UNKNOWN",
-      "has_test_code": false,
-      "evaluation_method": "docker_code",
-      "test_result": "No test code provided",
-      "note": "Java code generation"
-    },
-    "mercury": {
-      "status": "UNKNOWN",
-      "has_test_code": false,
-      "evaluation_method": "docker_code",
-      "test_result": "No test code provided",
-      "note": "Multi-language code translation"
-    },
-    "recode": {
-      "status": "UNKNOWN",
-      "has_test_code": false,
-      "evaluation_method": "docker_code",
-      "test_result": "No test code provided",
-      "note": "Code translation benchmark"
-    },
-    "codexglue_code_to_text_python": {
-      "status": "UNKNOWN",
-      "has_test_code": false,
-      "evaluation_method": "docker_code",
-      "test_result": "No test code provided",
-      "note": "Code-to-text for Python (summarization task, not code generation)"
-    },
-    "codexglue_code_to_text_go": {
-      "status": "UNKNOWN",
-      "has_test_code": false,
-      "evaluation_method": "docker_code",
-      "test_result": "No test code provided",
-      "note": "Code-to-text for Go"
-    },
-    "codexglue_code_to_text_ruby": {
-      "status": "UNKNOWN",
-      "has_test_code": false,
-      "evaluation_method": "docker_code",
-      "test_result": "No test code provided",
-      "note": "Code-to-text for Ruby"
-    },
-    "codexglue_code_to_text_java": {
-      "status": "UNKNOWN",
-      "has_test_code": false,
-      "evaluation_method": "docker_code",
-      "test_result": "No test code provided",
-      "note": "Code-to-text for Java"
-    },
-    "codexglue_code_to_text_javascript": {
-      "status": "UNKNOWN",
-      "has_test_code": false,
-      "evaluation_method": "docker_code",
-      "test_result": "No test code provided",
-      "note": "Code-to-text for JavaScript"
-    },
-    "codexglue_code_to_text_php": {
-      "status": "UNKNOWN",
-      "has_test_code": false,
-      "evaluation_method": "docker_code",
-      "test_result": "No test code provided",
-      "note": "Code-to-text for PHP"
-    }
-  },
-  "implications": {
-    "current_limitation": "Only 2 out of 18 coding benchmarks (11.1%) have test code available for actual code execution evaluation. The remaining 16 benchmarks return UNKNOWN because they lack unit tests in their dataset metadata.",
-    "recommendation": "For benchmarks without test code, either: (1) add test code to the dataset extractors, (2) use a different evaluation method (e.g., log-likelihood comparison for code-to-text tasks), or (3) accept UNKNOWN as the expected result for these benchmarks.",
-    "paper_alignment": "The paper specifies 'CE' (Code Execution) as the evaluation method for these benchmarks. However, the dataset metadata for most benchmarks does not include test_code, making code execution impossible without additional data engineering."
-  }
-}

wisent 0.7.379__py3-none-any.whl → 0.7.701__py3-none-any.whl

wisent 0.7.379__py3-none-any.whl → 0.7.701__py3-none-any.whl