PyPI - wisent - Versions diffs - 0.7.701__py3-none-any.whl → 0.7.1045__py3-none-any.whl - Mend

wisent 0.7.701__py3-none-any.whl → 0.7.1045__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (391) hide show

wisent/parameters/lm_eval/broken_in_lm_eval.json CHANGED Viewed

@@ -1,10 +1,187 @@
 [
+  "20_newsgroups",
+  "acp_bench_hard",
+  "acpbench",
+  "african_flores",
+  "afrimgsm",
+  "afrobench_adr",
+  "ag_news",
+  "agentharm",
+  "agieval",
+  "aime2024",
+  "aime2025",
+  "alpaca_eval",
+  "anagrams1",
+  "anagrams2",
+  "aradice",
+  "argument_topic",
+  "assin_entailment",
+  "banking77",
+  "basque_bench",
+  "benchmarks",
+  "bfcl",
+  "bhs",
+  "bhtc",
+  "bigbench",
+  "bigbench_generate_until",
+  "cabbq",
+  "cabreu_abstractive",
+  "chain",
+  "claim_stance_topic",
+  "click",
+  "cluewsc",
+  "cnn_dailymail",
+  "cocoteros_va",
+  "code2text",
   "code_x_glue",
+  "codeforces",
+  "codexglue_code_to_text_go",
+  "codexglue_code_to_text_java",
+  "codexglue_code_to_text_javascript",
+  "codexglue_code_to_text_php",
+  "codexglue_code_to_text_python",
+  "codexglue_code_to_text_ruby",
+  "coedit",
+  "copa_ca",
+  "cycle_letters",
+  "dbpedia_14",
+  "discrim_eval",
+  "doc",
+  "egyhellaswag",
+  "egymmlu",
+  "epec",
   "epec_koref_bin",
+  "esbbq",
+  "escola",
+  "ethos_binary",
+  "evalita-mp",
+  "evalita-sp",
+  "financial_tweets",
+  "flan",
   "flan_held_in",
+  "flores_ca-eu",
+  "gpt3_translation_benchmarks",
+  "harmbench",
+  "hle",
+  "icelandic_winogrande",
+  "ifeval",
+  "instruct_humaneval",
+  "instructhumaneval",
+  "jailbreakbench",
+  "japanese_leaderboard",
+  "jsonschema_bench",
+  "kmmlu",
+  "kmmlu_accounting",
+  "law_stack_exchange",
+  "leaderboard",
+  "ledgar",
+  "libra",
+  "librusec_history",
+  "livemathbench_cnmo_zh",
+  "llama3",
+  "lm_syneval",
+  "long_context_multiq",
+  "longbench",
+  "longbenchv2",
+  "matreshka_names",
+  "mbpp",
+  "mbpp_plus",
+  "mc-taco",
+  "medical_abstracts",
+  "mediqa_qa2019",
+  "medtext",
+  "meqsum",
+  "mgsm_direct_eu",
+  "mimic_repsum",
+  "minerva_math",
+  "mmlu_redux",
+  "mmmlu",
+  "mts_dialog",
+  "multi_swe_bench",
+  "multiblimp",
+  "multilingual",
+  "ncb",
+  "niah_single_1",
+  "norbelebele_p0",
+  "norec_document_p0",
+  "noreval",
+  "noropenbookqa_nno_p0",
+  "norrewrite_instruct",
+  "norsumm_nno_p0",
+  "norsummarize_instruct",
+  "okapi",
+  "okapi/arc_multilingual",
+  "olaph",
+  "olympiadbench",
+  "openbookqa_ca",
+  "or_bench",
+  "parafraseja",
+  "parafrases_gl",
+  "passkey",
+  "penn_treebank",
+  "pile",
+  "pile_10k",
+  "piqa_eu",
+  "portuguese_bench",
+  "ptb",
+  "qnlieu",
+  "quac",
+  "random",
+  "reversed",
+  "ru_2wikimultihopqa",
   "ruler",
+  "scrolls",
+  "sglue",
+  "social_iqa",
+  "sorry_bench",
+  "squad2",
+  "summarization_gl",
+  "super-glue-t5-prompt",
+  "super_glue",
+  "super_glue-wsc-t5-prompt",
+  "superglue",
+  "supergpqa",
+  "swe_bench_multilingual",
+  "sycophancy_eval",
+  "t0",
   "t0_eval",
+  "tatoeba_eng_nno_p0",
+  "teca",
+  "tinyArc",
+  "tinyBenchmarks",
+  "tinyGSM8k",
+  "tinyHellaswag",
+  "tinyMMLU",
+  "tinyTruthfulQA",
+  "tinyWinogrande",
+  "tinyarc",
+  "tinybenchmarks",
+  "tinygsm8k",
+  "tinyhellaswag",
+  "tinymmlu",
+  "tinytruthfulqa",
+  "tinywinogrande",
   "tmlu",
+  "trasnlation_all_flores",
+  "truthfulqa-multi",
+  "truthfulqa_gl_mc1",
+  "turblimp_core",
+  "turkishmmlu",
+  "turkishmmlu_biology",
+  "twenty_newsgroups",
+  "unfair_tos",
+  "unitxt",
+  "unscramble",
+  "vaxx",
   "vaxx_stance",
-  "wiceu"
-]
+  "wiceu",
+  "wikitext103",
+  "wildguard",
+  "wmt16-en-ro",
+  "wmt16-ro-en",
+  "wmt16_en_ro",
+  "wmt16_ro_en",
+  "wmt2014",
+  "xlsum_es",
+  "xnli_gl"
+]

wisent/parameters/lm_eval/category_directions.json ADDED Viewed

@@ -0,0 +1,137 @@
+{
+  "coding": {
+    "description": "Code generation, understanding, and debugging capabilities",
+    "hypothesized_directions": [
+      "code_correctness",
+      "code_completeness",
+      "algorithmic_thinking",
+      "code_style"
+    ]
+  },
+  "math": {
+    "description": "Mathematical reasoning and computation",
+    "hypothesized_directions": [
+      "numerical_accuracy",
+      "algebraic_reasoning",
+      "problem_decomposition",
+      "mathematical_rigor"
+    ]
+  },
+  "reasoning_logic": {
+    "description": "Logical deduction and multi-step reasoning",
+    "hypothesized_directions": [
+      "deductive_reasoning",
+      "causal_reasoning",
+      "planning",
+      "constraint_satisfaction"
+    ]
+  },
+  "hallucination_factuality": {
+    "description": "Truthfulness and factual accuracy",
+    "hypothesized_directions": [
+      "factual_recall",
+      "uncertainty_awareness",
+      "source_grounding",
+      "confabulation_resistance"
+    ]
+  },
+  "safety_bias": {
+    "description": "Safety, fairness, and bias mitigation",
+    "hypothesized_directions": [
+      "harm_avoidance",
+      "stereotype_resistance",
+      "fairness",
+      "toxicity_avoidance"
+    ]
+  },
+  "multilingual": {
+    "description": "Cross-lingual and language-specific capabilities",
+    "hypothesized_directions": [
+      "language_transfer",
+      "cultural_awareness",
+      "script_handling",
+      "cross_lingual_consistency"
+    ]
+  },
+  "knowledge_qa": {
+    "description": "World knowledge and question answering",
+    "hypothesized_directions": [
+      "factual_knowledge",
+      "knowledge_retrieval",
+      "answer_precision",
+      "domain_expertise"
+    ]
+  },
+  "reading_comprehension": {
+    "description": "Understanding and extracting information from text",
+    "hypothesized_directions": [
+      "information_extraction",
+      "inference_making",
+      "context_tracking",
+      "summarization"
+    ]
+  },
+  "commonsense": {
+    "description": "Everyday reasoning and world understanding",
+    "hypothesized_directions": [
+      "physical_intuition",
+      "social_reasoning",
+      "temporal_reasoning",
+      "spatial_reasoning"
+    ]
+  },
+  "science_medical": {
+    "description": "Scientific and medical domain knowledge",
+    "hypothesized_directions": [
+      "scientific_accuracy",
+      "medical_knowledge",
+      "technical_precision",
+      "evidence_based_reasoning"
+    ]
+  },
+  "instruction_following": {
+    "description": "Following complex instructions and user intent",
+    "hypothesized_directions": [
+      "instruction_adherence",
+      "format_compliance",
+      "constraint_following",
+      "intent_understanding"
+    ]
+  },
+  "tool_use_agents": {
+    "description": "Using tools and acting as an agent",
+    "hypothesized_directions": [
+      "tool_selection",
+      "api_usage",
+      "action_planning",
+      "error_recovery"
+    ]
+  },
+  "language_understanding": {
+    "description": "Core linguistic competence",
+    "hypothesized_directions": [
+      "syntactic_knowledge",
+      "semantic_understanding",
+      "pragmatic_competence",
+      "lexical_knowledge"
+    ]
+  },
+  "translation": {
+    "description": "Cross-lingual translation",
+    "hypothesized_directions": [
+      "translation_accuracy",
+      "fluency",
+      "terminology_handling",
+      "style_preservation"
+    ]
+  },
+  "ethics_values": {
+    "description": "Ethical reasoning and value alignment",
+    "hypothesized_directions": [
+      "moral_reasoning",
+      "value_consistency",
+      "norm_awareness",
+      "ethical_sensitivity"
+    ]
+  }
+}

wisent/parameters/lm_eval/repair_plan.json ADDED Viewed

@@ -0,0 +1,282 @@
+{
+  "summary": {
+    "total_tested": 321,
+    "ok": 179,
+    "failed": 142,
+    "success_rate": "56%"
+  },
+  "case_sensitive_fix": {
+    "description": "Task exists in lm-eval but our code uses wrong case",
+    "repair": "Fix case in manifest or loader",
+    "tasks": [
+      {"our_name": "AraDiCE_ArabicMMLU_lev", "correct_name": "AraDiCE_ArabicMMLU_lev", "note": "timeout issue, not case"},
+      {"our_name": "aexams_IslamicStudies", "correct_name": "aexams_IslamicStudies", "note": "our code lowercases to aexams_islamicstudies"}
+    ]
+  },
+  "task_name_mappings": {
+    "description": "Task exists in lm-eval but under different name - verified with lm_eval --tasks",
+    "repair": "Update manifest to use correct lm-eval task name",
+    "tasks": [
+      {
+        "our_name": "mc-taco",
+        "correct_name": "mc_taco",
+        "reason": "lm_eval --tasks mc-taco fails, lm_eval --tasks mc_taco works (hyphen vs underscore)"
+      }
+    ]
+  },
+  "lm_eval_bugs": {
+    "description": "Folder exists in lm-eval/tasks but group/task name not registered - bug in lm-eval harness",
+    "repair": "Use one of the existing subtasks, or wait for lm-eval fix, or move to HF_EXTRACTORS",
+    "tasks": [
+      {
+        "our_name": "acpbench",
+        "folder": "lm_eval/tasks/acpbench",
+        "note": "Folder exists but task 'acpbench' not registered. README says task is 'acp_bench'",
+        "available_groups": ["acp_bench", "acp_bench_hard", "acp_bench_hard_with_pddl"],
+        "available_subtasks": ["acp_bool_cot_2shot", "acp_gen_2shot", "acp_mcq_cot_2shot", "acp_app_bool", "etc."]
+      },
+      {
+        "our_name": "afrimgsm",
+        "folder": "lm_eval/tasks/afrimgsm",
+        "note": "Folder README says 'afrimgsm: All afrimgsm tasks' but this group is NOT defined in any YAML. YAMLs define 'afrimgsm-irokobench' etc.",
+        "available_groups": ["afrimgsm-irokobench", "afrimgsm_cot-irokobench", "afrimgsm_tt-irokobench", "afrimgsm_tt_cot-irokobench"],
+        "available_subtasks": ["afrimgsm_tasks_prompt_1", "afrimgsm_amh_prompt_1", "etc."]
+      },
+      {
+        "our_name": "afrimmlu",
+        "folder": "lm_eval/tasks/afrimmlu",
+        "note": "Same issue as afrimgsm - folder exists, group 'afrimmlu' not registered",
+        "available_groups": ["afrimmlu-irokobench"]
+      },
+      {
+        "our_name": "llama3",
+        "folder": "lm_eval/tasks/llama3",
+        "note": "Folder exists but no 'llama3' group. Contains MMLU variants for Llama evaluation",
+        "available_groups": ["mmlu_llama", "mmlu_cot_llama", "mmlu_de_llama", "mmlu_es_llama", "mmlu_fr_llama", "etc."]
+      },
+      {
+        "our_name": "mmmlu",
+        "folder": "lm_eval/tasks/okapi/mmlu_multilingual",
+        "note": "No 'mmmlu' task. Multilingual MMLU is 'm_mmlu' group in okapi folder",
+        "available_groups": ["m_mmlu"],
+        "available_subtasks": ["m_mmlu_ar", "m_mmlu_de", "m_mmlu_es", "m_mmlu_fr", "etc."]
+      },
+      {
+        "our_name": "okapi",
+        "folder": "lm_eval/tasks/okapi",
+        "note": "Folder exists but no 'okapi' group. Contains multilingual variants of benchmarks",
+        "available_groups": ["m_arc", "m_hellaswag", "m_mmlu", "m_truthfulqa"],
+        "subfolders": ["arc_multilingual", "hellaswag_multilingual", "mmlu_multilingual", "truthfulqa_multilingual"]
+      },
+      {
+        "our_name": "sglue",
+        "folder": "lm_eval/tasks/super_glue",
+        "note": "No 'sglue' group. Super GLUE tasks are registered individually or as 'super-glue-lm-eval-v1'",
+        "available_groups": ["super-glue-lm-eval-v1", "super-glue-lm-eval-v1-seq2seq", "super-glue-t5-prompt"],
+        "available_subtasks": ["boolq", "cb", "copa", "multirc", "record", "sglue_rte", "wic", "wsc"]
+      },
+      {
+        "our_name": "superglue",
+        "folder": "lm_eval/tasks/super_glue",
+        "note": "Same as sglue - no 'superglue' group",
+        "available_groups": ["super-glue-lm-eval-v1", "super-glue-lm-eval-v1-seq2seq", "super-glue-t5-prompt"]
+      }
+    ]
+  },
+  "truly_not_in_lmeval": {
+    "description": "Task truly does not exist in lm-eval harness - need to move to HF_EXTRACTORS or remove",
+    "repair": "Move to HF_EXTRACTORS with custom data loader, or remove from manifest",
+    "tasks": [
+      "bhs",
+      "bhtc",
+      "cabbq",
+      "chain",
+      "click",
+      "code2text",
+      "coedit",
+      "discrim_eval",
+      "doc",
+      "egyhellaswag",
+      "egymmlu",
+      "epec",
+      "esbbq",
+      "evalita-sp",
+      "flan",
+      "icelandic_winogrande",
+      "libra",
+      "librusec_history",
+      "lm_syneval",
+      "long_context_multiq",
+      "longbenchv2",
+      "matreshka_names",
+      "multiblimp",
+      "multilingual",
+      "passkey",
+      "quac",
+      "random",
+      "reversed",
+      "ru_2wikimultihopqa",
+      "t0",
+      "tinybenchmarks",
+      "truthfulqa-multi",
+      "turblimp_core",
+      "twenty_newsgroups",
+      "vaxx",
+      "wmt2014"
+    ]
+  },
+  "needs_unitxt": {
+    "description": "Requires unitxt package installation",
+    "repair": "pip install unitxt",
+    "tasks": [
+      "20_newsgroups",
+      "ag_news",
+      "argument_topic",
+      "banking77",
+      "claim_stance_topic",
+      "cnn_dailymail",
+      "dbpedia_14",
+      "ethos_binary",
+      "financial_tweets",
+      "law_stack_exchange",
+      "ledgar",
+      "medical_abstracts",
+      "unfair_tos",
+      "unitxt"
+    ]
+  },
+  "import_error": {
+    "description": "Extractor module cannot be imported - likely missing dependencies or syntax error",
+    "repair": "Fix the extractor module import issues",
+    "tasks": [
+      {"name": "african_flores", "module": "flores"},
+      {"name": "afrobench_adr", "module": "afrobench"},
+      {"name": "agieval", "module": "agieval"},
+      {"name": "evalita-mp", "module": "evalita_mp"},
+      {"name": "flores_ca-eu", "module": "flores"},
+      {"name": "super-glue-t5-prompt", "module": "super_glue_t5_prompt"},
+      {"name": "super_glue-wsc-t5-prompt", "module": "super_glue_t5_prompt"},
+      {"name": "trasnlation_all_flores", "module": "flores"}
+    ]
+  },
+  "no_pairs_returned": {
+    "description": "Extractor ran but returned no contrastive pairs - likely data loading or extraction logic issue",
+    "repair": "Debug extractor to ensure pairs are generated from the dataset",
+    "tasks": [
+      "anagrams1",
+      "anagrams2",
+      "assin_entailment",
+      "cabreu_abstractive",
+      "cocoteros_va",
+      "copa_ca",
+      "cycle_letters",
+      "escola",
+      "gpt3_translation_benchmarks",
+      "kmmlu_accounting",
+      "mgsm_direct_eu",
+      "ncb",
+      "norbelebele_p0",
+      "norec_document_p0",
+      "noropenbookqa_nno_p0",
+      "norrewrite_instruct",
+      "norsummarize_instruct",
+      "openbookqa_ca",
+      "parafraseja",
+      "parafrases_gl",
+      "piqa_eu",
+      "qnlieu",
+      "summarization_gl",
+      "tatoeba_eng_nno_p0",
+      "teca",
+      "truthfulqa_gl_mc1",
+      "turkishmmlu_biology",
+      "unscramble",
+      "wikitext103",
+      "wmt16-en-ro",
+      "wmt16-ro-en",
+      "xlsum_es",
+      "xnli_gl"
+    ]
+  },
+  "timeout": {
+    "description": "Task loading exceeded 30 second timeout",
+    "repair": "Increase timeout or optimize data loading",
+    "tasks": [
+      "bigbench_generate_until",
+      "portuguese_bench",
+      "scrolls"
+    ]
+  },
+  "missing_dependencies": {
+    "description": "Missing Python packages required by lm-eval tasks",
+    "repair_commands": {
+      "tinyBenchmarks": "pip install git+https://github.com/felipemaiapolo/tinyBenchmarks",
+      "langdetect": "pip install langdetect",
+      "emoji": "pip install lm_eval[japanese_leaderboard]",
+      "jsonschema": "pip install jsonschema[format]",
+      "longbench": "pip install lm_eval[longbench]",
+      "bert_score": "pip install evaluate bert-score",
+      "minerva_math": "pip install sympy math_verify antlr4-python3-runtime==4.11",
+      "ruler": "pip install lm_eval[ruler]",
+      "noreval": "pip install sacrebleu bert_score rouge_score"
+    },
+    "tasks": [
+      {"name": "benchmarks", "dep": "tinyBenchmarks"},
+      {"name": "ifeval", "dep": "langdetect"},
+      {"name": "japanese_leaderboard", "dep": "emoji"},
+      {"name": "jsonschema_bench", "dep": "jsonschema"},
+      {"name": "longbench", "dep": "longbench"},
+      {"name": "mediqa_qa2019", "dep": "bert_score"},
+      {"name": "medtext", "dep": "bert_score"},
+      {"name": "meqsum", "dep": "bert_score"},
+      {"name": "mimic_repsum", "dep": "bert_score"},
+      {"name": "minerva_math", "dep": "minerva_math"},
+      {"name": "mts_dialog", "dep": "bert_score"},
+      {"name": "niah_single_1", "dep": "ruler"},
+      {"name": "olaph", "dep": "bert_score"},
+      {"name": "ruler", "dep": "ruler"},
+      {"name": "tinyArc", "dep": "tinyBenchmarks"},
+      {"name": "tinyBenchmarks", "dep": "tinyBenchmarks"},
+      {"name": "tinyGSM8k", "dep": "tinyBenchmarks"},
+      {"name": "tinyHellaswag", "dep": "tinyBenchmarks"},
+      {"name": "tinyMMLU", "dep": "tinyBenchmarks"},
+      {"name": "tinyTruthfulQA", "dep": "tinyBenchmarks"},
+      {"name": "tinyWinogrande", "dep": "tinyBenchmarks"},
+      {"name": "tinyarc", "dep": "tinyBenchmarks"},
+      {"name": "tinygsm8k", "dep": "tinyBenchmarks"},
+      {"name": "tinyhellaswag", "dep": "tinyBenchmarks"},
+      {"name": "tinymmlu", "dep": "tinyBenchmarks"},
+      {"name": "tinytruthfulqa", "dep": "tinyBenchmarks"},
+      {"name": "tinywinogrande", "dep": "tinyBenchmarks"}
+    ]
+  },
+  "other_errors": {
+    "description": "Various other errors requiring individual investigation",
+    "tasks": [
+      {"name": "aradice", "error": "HF dataset file not found"},
+      {"name": "basque_bench", "error": "HF dataset file not found"},
+      {"name": "hle", "error": "Fallback dataset loading not permitted"},
+      {"name": "mbpp", "error": "Multiprocessing bootstrap error"},
+      {"name": "noreval", "error": "Missing sacrebleu, bert_score, rouge_score"},
+      {"name": "norsumm_nno_p0", "error": "Missing sacrebleu, bert_score, rouge_score"},
+      {"name": "pile", "error": "Dataset disabled - the-eye.eu unavailable"},
+      {"name": "social_iqa", "error": "Dataset scripts no longer supported"},
+      {"name": "supergpqa", "error": "Fallback dataset loading not permitted"},
+      {"name": "tmlu", "error": "Extractor interface mismatch - needs lm_eval_task_data argument"}
+    ]
+  },
+  "working_benchmarks": {
+    "count": 179,
+    "examples": [
+      "ArabCulture", "aclue", "acp_bench", "advanced_ai_risk", "afrixnli",
+      "ai2_arc", "anli", "apps", "arabic_leaderboard_acva", "arabicmmlu",
+      "arithmetic", "asdiv", "babi", "bbh", "bbq", "belebele", "blimp",
+      "boolq", "cb", "ceval", "cmmlu_agronomy", "cola", "commonsense_qa",
+      "coqa", "drop", "glue", "gpqa", "gsm8k", "hellaswag", "hendrycks_ethics",
+      "humaneval", "lambada_openai", "logiqa", "math", "mathqa", "medmcqa",
+      "mmlu", "mrpc", "multirc", "mutual", "openbookqa", "piqa", "pubmedqa",
+      "qasper", "race", "sciq", "siqa", "squad_completion", "storycloze",
+      "swag", "triviaqa", "truthfulqa", "winogrande", "wsc273", "xcopa", "xnli"
+    ]
+  }
+}

wisent/parameters/lm_eval/weak_contrastive_pairs.json ADDED Viewed

@@ -0,0 +1,38 @@
+{
+  "description": "Benchmarks with weak or problematic contrastive pair generation",
+  "categories": {
+    "identical_pairs": {
+      "description": "Positive and negative responses are identical - extractor bug",
+      "benchmarks": [
+        "paloma"
+      ]
+    },
+    "lazy_math_negative": {
+      "description": "Negative is just 'correct_answer + 1' instead of meaningful wrong answer",
+      "benchmarks": [
+        "hendrycks_math",
+        "math500",
+        "livemathbench_cnmo_en",
+        "polymath_en_medium",
+        "polymath_zh_medium",
+        "polymath_en_high",
+        "polymath_zh_high"
+      ]
+    },
+    "hedging_negative": {
+      "description": "Negative is 'I believe the answer is not X' instead of actual wrong answer",
+      "benchmarks": [
+        "simpleqa",
+        "frames"
+      ]
+    },
+    "negation_pattern": {
+      "description": "Negative is 'not X' pattern - acceptable for some benchmarks but weak",
+      "benchmarks": [
+        "babi"
+      ]
+    }
+  },
+  "total_weak_benchmarks": 11,
+  "notes": "These benchmarks technically work but have suboptimal contrastive pair quality. Consider improving extractors to generate more meaningful negative examples."
+}

wisent 0.7.701__py3-none-any.whl → 0.7.1045__py3-none-any.whl

wisent 0.7.701__py3-none-any.whl → 0.7.1045__py3-none-any.whl