PyPI - wisent - Versions diffs - 0.7.379__py3-none-any.whl → 0.7.901__py3-none-any.whl - Mend

wisent 0.7.379__py3-none-any.whl → 0.7.901__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1020) hide show

wisent/parameters/lm_eval/track_progress_not_lm_eval_tasks.json CHANGED Viewed

@@ -3,7 +3,9 @@
   "aime2024", "aime2025", "arabic_exams", "argument_topic", "banking77", "babilong", "bangla_mmlu", "boolq", "boolq-seq2seq", "cb",
   "claim_stance_topic", "squad2", "unitxt", "copa", "glianorex", "global_mmlu_ar", "gsm_plus",  "logieval", "m_mmlu",   "mela",   "noticia",
   "penn_treebank", "phrases_ca-va", "record", "stsb", "wikitext103", "wmt14_en_fr", "wmt14_fr_en", "wmt16_de_en", "wmt16_en_de", "wmt16_en_ro",
-  "wmt16_ro_en", "sglue_rte"
+  "wmt16_ro_en", "sglue_rte", "humaneval", "codexglue_code_to_text_go", "codexglue_code_to_text_java", "codexglue_code_to_text_javascript",
+  "codexglue_code_to_text_php", "codexglue_code_to_text_python", "codexglue_code_to_text_ruby", "prompt_robustness_agieval_aqua_rat",
+  "option_order_robustness_agieval_aqua_rat",  "non_greedy_robustness_agieval_aqua_rat"
   ],
@@ -32,7 +34,8 @@
     {"noticia": ["generation"]},
     {"phrases_ca-va": ["generation"]},
     {"record": ["log_likelihoods"]},
-    {"sglue_rte": ["log_likelihoods"]}
+    {"sglue_rte": ["log_likelihoods"]},
+    {"humaneval": []}
   ],
   "completed_hf_with_version_on_lm_git": [
@@ -41,88 +44,34 @@
   "completed_hf_without_version_on_lm_git": [
       "hmmt", "hmmt_feb_2025", "penn_treebank", "stsb", "wikitext103",  "math", "math500", "polymath_en_high", "polymath_en_medium", "polymath_zh_high",
-      "polymath_zh_medium", "livemathbench"
+      "polymath_zh_medium", "livemathbench", "conala"
   ],
   "too complex": ["evalita: many different prompts structure, could mess up our prompt strategies",
-                  "babilong: many splits and subsets, long contexts"],
+                  "babilong: many splits and subsets, long contexts",
+                  "flores: over 400 tasks"],
   "broken": ["vaxx_stance", "wiceu", "tmlu", "t0_eval", "flan_held_in"],
-  "all_tasks": [
-  "Tag",
-  "apps",
-  "humaneval_plus",
-  "multipl_e",
-  "chain_of_thought",
-  "codexglue_code_to_text_go",
-  "codexglue_code_to_text_java",
-  "codexglue_code_to_text_javascript",
-  "codexglue_code_to_text_php",
-  "codexglue_code_to_text_python",
-  "codexglue_code_to_text_ruby",
-  "conala",
-  "concode",
-  "ds1000",
+  "long to evaluate and we have individual subtasks": ["pythia"],
-  "evalita-mp",
-  "evalita-sp_sum_task_fp-small_p1",
+  "run custom code": ["multimedqa", "iwslt2017-ar-en", "iwslt2017-en-ar"],
-  "flores",
+  "idk what is it": ["multiple_choice", "Tag"],
-  "freebase",
+  "all_tasks": [
-  "gpt3_translation_benchmarks",
+  "multipl_e",
-  "humaneval_64_instruct",
-  "humaneval_instruct",
-  "humanevalpack",
-  "instruct_humaneval",
-  "instructhumaneval",
-  "iwslt2017-ar-en",
-  "iwslt2017-en-ar",
+  "concode",
+  "ds1000",
   "livecodebench",
-  "livemathbench_cnmo_en",
-  "livemathbench_cnmo_zh",
-  "llama",
-  "mbpp_plus",
-  "mercury",
-  "multimedqa",
-  "multiple_choice",
-  "non_greedy_robustness_agieval_aqua_rat",
-  "openllm",
-  "option_order_robustness_agieval_aqua_rat",
-  "prompt_robustness_agieval_aqua_rat",
-  "ptb",
-  "pythia",
-  "recode",
-  "self_consistency",
-  "super-glue-lm-eval-v1",
-  "super-glue-lm-eval-v1-seq2seq",
-  "super-glue-t5-prompt"
+  "llama", "llama3",
+  "humanevalpack",
+  "mercury",
+  "recode"
   ]
 }

wisent/parameters/lm_eval/weak_contrastive_pairs.json ADDED Viewed

@@ -0,0 +1,38 @@
+{
+  "description": "Benchmarks with weak or problematic contrastive pair generation",
+  "categories": {
+    "identical_pairs": {
+      "description": "Positive and negative responses are identical - extractor bug",
+      "benchmarks": [
+        "paloma"
+      ]
+    },
+    "lazy_math_negative": {
+      "description": "Negative is just 'correct_answer + 1' instead of meaningful wrong answer",
+      "benchmarks": [
+        "hendrycks_math",
+        "math500",
+        "livemathbench_cnmo_en",
+        "polymath_en_medium",
+        "polymath_zh_medium",
+        "polymath_en_high",
+        "polymath_zh_high"
+      ]
+    },
+    "hedging_negative": {
+      "description": "Negative is 'I believe the answer is not X' instead of actual wrong answer",
+      "benchmarks": [
+        "simpleqa",
+        "frames"
+      ]
+    },
+    "negation_pattern": {
+      "description": "Negative is 'not X' pattern - acceptable for some benchmarks but weak",
+      "benchmarks": [
+        "babi"
+      ]
+    }
+  },
+  "total_weak_benchmarks": 12,
+  "notes": "These benchmarks technically work but have suboptimal contrastive pair quality. Consider improving extractors to generate more meaningful negative examples."
+}

wisent/parameters/lm_eval/working_benchmarks.json ADDED Viewed

@@ -0,0 +1,206 @@
+[
+  "AraDiCE",
+  "ArabCulture",
+  "Tag",
+  "aclue",
+  "acp_bench",
+  "aexams",
+  "afrimgsm_direct_amh",
+  "afrimmlu_direct_amh",
+  "agentbench",
+  "aider_polyglot",
+  "aime",
+  "anli",
+  "apps",
+  "arabic_leaderboard_complete",
+  "arabic_leaderboard_light",
+  "arabicmmlu",
+  "arena_hard",
+  "arithmetic",
+  "asdiv",
+  "babi",
+  "babilong",
+  "bangla_mmlu",
+  "basqueglue",
+  "bbh",
+  "bbq",
+  "belebele",
+  "bertaqa",
+  "blimp",
+  "browsecomp",
+  "c4",
+  "careqa",
+  "catalan_bench",
+  "ceval",
+  "chartqa",
+  "chinese_simpleqa",
+  "cmmlu",
+  "cnmo",
+  "cnmo_2024",
+  "commonsense_qa",
+  "conala",
+  "concode",
+  "copal_id",
+  "coqa",
+  "crows_pairs",
+  "csatqa",
+  "curate",
+  "darija_bench",
+  "darijahellaswag",
+  "darijammlu",
+  "donotanswer",
+  "drop",
+  "ds1000",
+  "eq_bench",
+  "eus_exams",
+  "eus_proficiency",
+  "eus_reading",
+  "eus_trivia",
+  "evalita_LLM",
+  "facts_grounding",
+  "faithbench",
+  "fda",
+  "finsearchcomp",
+  "flames",
+  "fld",
+  "frames",
+  "french_bench",
+  "galician_bench",
+  "global_mmlu",
+  "gpqa",
+  "groundcocoa",
+  "gsm8k",
+  "haerae",
+  "hallucinations_leaderboard",
+  "halueval",
+  "halulens",
+  "headqa",
+  "healthbench",
+  "hellaswag",
+  "hendrycks_ethics",
+  "hendrycks_math",
+  "histoires_morales",
+  "hmmt",
+  "hmmt_feb_2025",
+  "hrm8k",
+  "humaneval",
+  "humaneval_plus",
+  "humanevalpack",
+  "inverse_scaling",
+  "kbl",
+  "kobest",
+  "kormedmcqa",
+  "lambada",
+  "lambada_cloze",
+  "lambada_multilingual",
+  "lambada_multilingual_stablelm",
+  "lingoly",
+  "livecodebench",
+  "livecodebench_lite",
+  "livecodebench_v5",
+  "livecodebench_v6",
+  "livemathbench_cnmo_en",
+  "logiqa",
+  "logiqa2",
+  "longform",
+  "longform_writing",
+  "mastermind",
+  "math",
+  "math500",
+  "mathqa",
+  "mc_taco",
+  "med_concepts_qa",
+  "meddialog",
+  "medmcqa",
+  "medqa",
+  "mercury",
+  "metabench",
+  "mgsm",
+  "mlqa",
+  "mmlu",
+  "mmlu-pro-plus",
+  "mmlu_pro",
+  "mmlu_prox",
+  "mmlusr",
+  "mmmu",
+  "model_written_evals",
+  "moral_stories",
+  "multipl_e",
+  "multiple_cpp",
+  "multiple_go",
+  "multiple_java",
+  "multiple_js",
+  "multiple_py",
+  "multiple_rs",
+  "mutual",
+  "nq_open",
+  "oj_bench",
+  "okapi/hellaswag_multilingual",
+  "okapi/mmlu_multilingual",
+  "okapi/truthfulqa_multilingual",
+  "openbookqa",
+  "paloma",
+  "paws-x",
+  "piqa",
+  "planbench",
+  "polemo2",
+  "politicalbias_qa",
+  "polyglottoxicityprompts",
+  "polymath_en_high",
+  "polymath_en_medium",
+  "polymath_zh_high",
+  "polymath_zh_medium",
+  "prost",
+  "pubmedqa",
+  "qa4mre",
+  "qasper",
+  "race",
+  "realtoxicityprompts",
+  "recode",
+  "refusalbench",
+  "scicode",
+  "sciq",
+  "score",
+  "seal",
+  "seal_0",
+  "simple_cooccurrence_bias",
+  "simpleqa",
+  "siqa",
+  "spanish_bench",
+  "squad_completion",
+  "squadv2",
+  "storycloze",
+  "swag",
+  "swde",
+  "swe_bench_verified",
+  "swe_verified",
+  "tau_bench",
+  "terminal_bench",
+  "tmmluplus",
+  "toolbench",
+  "toolemu",
+  "toolllm",
+  "toxigen",
+  "translation",
+  "travelplanner",
+  "triviaqa",
+  "truthfulqa",
+  "truthfulqa_generation",
+  "webqs",
+  "wikitext",
+  "winogender",
+  "winogrande",
+  "wmdp",
+  "wmt14_en_fr",
+  "wmt14_fr_en",
+  "wmt16_de_en",
+  "wmt16_en_de",
+  "wmt2016",
+  "wsc273",
+  "xcopa",
+  "xnli",
+  "xnli_eu",
+  "xquad",
+  "xstorycloze",
+  "xwinograd"
+]

wisent/parameters/lm_eval/working_benchmarks_categorized.json ADDED Viewed

@@ -0,0 +1,236 @@
+{
+  "coding": [
+    "aider_polyglot",
+    "apps",
+    "conala",
+    "concode",
+    "donotanswer",
+    "ds1000",
+    "humaneval",
+    "humaneval_plus",
+    "humanevalpack",
+    "livecodebench",
+    "livecodebench_lite",
+    "livecodebench_v5",
+    "livecodebench_v6",
+    "mercury",
+    "multipl_e",
+    "multiple_cpp",
+    "multiple_go",
+    "multiple_java",
+    "multiple_js",
+    "multiple_py",
+    "multiple_rs",
+    "oj_bench",
+    "recode",
+    "scicode",
+    "swe_bench_verified",
+    "swe_verified",
+    "terminal_bench"
+  ],
+  "commonsense": [
+    "anli",
+    "hellaswag",
+    "piqa",
+    "prost",
+    "siqa",
+    "storycloze",
+    "swag",
+    "winogender",
+    "winogrande",
+    "wsc273"
+  ],
+  "ethics_values": [
+    "hendrycks_ethics",
+    "histoires_morales",
+    "model_written_evals",
+    "moral_stories"
+  ],
+  "hallucination_factuality": [
+    "browsecomp",
+    "chinese_simpleqa",
+    "facts_grounding",
+    "faithbench",
+    "hallucinations_leaderboard",
+    "halueval",
+    "halulens",
+    "okapi/truthfulqa_multilingual",
+    "simpleqa",
+    "truthfulqa",
+    "truthfulqa_generation"
+  ],
+  "instruction_following": [
+    "arena_hard",
+    "chartqa",
+    "eq_bench",
+    "groundcocoa",
+    "longform",
+    "longform_writing",
+    "mmmu",
+    "travelplanner"
+  ],
+  "knowledge_qa": [
+    "aclue",
+    "commonsense_qa",
+    "metabench",
+    "mmlu",
+    "mmlu-pro-plus",
+    "mmlu_pro",
+    "mmlu_prox",
+    "mmlusr",
+    "nq_open",
+    "openbookqa",
+    "triviaqa",
+    "webqs"
+  ],
+  "language_understanding": [
+    "blimp",
+    "lambada",
+    "lambada_cloze",
+    "mc_taco",
+    "mutual",
+    "paloma",
+    "paws-x",
+    "wikitext"
+  ],
+  "math": [
+    "afrimgsm_direct_amh",
+    "aime",
+    "arithmetic",
+    "asdiv",
+    "cnmo",
+    "cnmo_2024",
+    "gsm8k",
+    "hendrycks_math",
+    "hmmt",
+    "hmmt_feb_2025",
+    "hrm8k",
+    "livemathbench_cnmo_en",
+    "math",
+    "math500",
+    "mathqa",
+    "mgsm",
+    "polymath_en_high",
+    "polymath_en_medium",
+    "polymath_zh_high",
+    "polymath_zh_medium"
+  ],
+  "multilingual": [
+    "AraDiCE",
+    "ArabCulture",
+    "Tag",
+    "aexams",
+    "afrimmlu_direct_amh",
+    "arabic_leaderboard_complete",
+    "arabic_leaderboard_light",
+    "arabicmmlu",
+    "bangla_mmlu",
+    "basqueglue",
+    "belebele",
+    "bertaqa",
+    "catalan_bench",
+    "ceval",
+    "cmmlu",
+    "copal_id",
+    "csatqa",
+    "darija_bench",
+    "darijahellaswag",
+    "darijammlu",
+    "eus_exams",
+    "eus_proficiency",
+    "eus_reading",
+    "eus_trivia",
+    "evalita_LLM",
+    "french_bench",
+    "galician_bench",
+    "global_mmlu",
+    "haerae",
+    "kbl",
+    "kobest",
+    "lambada_multilingual",
+    "lambada_multilingual_stablelm",
+    "mlqa",
+    "okapi/hellaswag_multilingual",
+    "okapi/mmlu_multilingual",
+    "polemo2",
+    "spanish_bench",
+    "tmmluplus",
+    "xcopa",
+    "xnli",
+    "xnli_eu",
+    "xquad",
+    "xstorycloze",
+    "xwinograd"
+  ],
+  "reading_comprehension": [
+    "c4",
+    "coqa",
+    "drop",
+    "qa4mre",
+    "qasper",
+    "race",
+    "squad_completion",
+    "squadv2",
+    "swde"
+  ],
+  "reasoning_logic": [
+    "acp_bench",
+    "babi",
+    "babilong",
+    "bbh",
+    "fld",
+    "frames",
+    "inverse_scaling",
+    "lingoly",
+    "logiqa",
+    "logiqa2",
+    "mastermind",
+    "planbench",
+    "score"
+  ],
+  "safety_bias": [
+    "bbq",
+    "crows_pairs",
+    "curate",
+    "flames",
+    "politicalbias_qa",
+    "polyglottoxicityprompts",
+    "realtoxicityprompts",
+    "refusalbench",
+    "simple_cooccurrence_bias",
+    "toxigen",
+    "wmdp"
+  ],
+  "science_medical": [
+    "careqa",
+    "fda",
+    "gpqa",
+    "headqa",
+    "healthbench",
+    "kormedmcqa",
+    "med_concepts_qa",
+    "meddialog",
+    "medmcqa",
+    "medqa",
+    "pubmedqa",
+    "sciq"
+  ],
+  "tool_use_agents": [
+    "agentbench",
+    "finsearchcomp",
+    "seal",
+    "seal_0",
+    "tau_bench",
+    "toolbench",
+    "toolemu",
+    "toolllm"
+  ],
+  "translation": [
+    "translation",
+    "wmt14_en_fr",
+    "wmt14_fr_en",
+    "wmt16_de_en",
+    "wmt16_en_de",
+    "wmt2016"
+  ]
+}

wisent/scripts/run_quality_metrics_sweep.sh CHANGED Viewed

@@ -149,32 +149,29 @@ for BENCHMARK in "${BENCHMARKS[@]}"; do
     BENCHMARK_START=$(date +%s)
-    # Run the optimization pipeline (don't exit on failure)
-    if python3 -m wisent.core.optuna.steering.optuna_pipeline \
+    # Run the optimization using wisent CLI with baseline comparison
+    if wisent optimize-steering comprehensive "$MODEL" \
+        --tasks "$BENCHMARK" \
+        --limit "$TRAIN_LIMIT" \
+        --compute-baseline \
+        --device cuda \
         --output-dir "$OUTPUT_DIR/$BENCHMARK" \
-        --model "$MODEL" \
-        --task "$BENCHMARK" \
-        --n-trials "$N_TRIALS" \
-        --train-limit "$TRAIN_LIMIT" \
-        --val-limit "$VAL_LIMIT" \
-        --test-limit "$TEST_LIMIT" \
-        --layer-range "$LAYER_RANGE" \
         2>&1 | tee "$OUTPUT_DIR/${BENCHMARK}_log.txt"; then
         BENCHMARK_END=$(date +%s)
         DURATION=$((BENCHMARK_END - BENCHMARK_START))
         echo "Completed $BENCHMARK in ${DURATION}s"
-        # Find and copy the metrics file
-        METRICS_FILE=$(find "$OUTPUT_DIR/$BENCHMARK" -name "all_trials_metrics_*.json" -type f 2>/dev/null | head -1)
+        # Find and copy the results file
+        RESULTS_FILE=$(find "$OUTPUT_DIR/$BENCHMARK" -name "steering_comprehensive_*.json" -type f 2>/dev/null | head -1)
-        if [ -n "$METRICS_FILE" ]; then
-            echo "Metrics saved to: $METRICS_FILE"
-            cp "$METRICS_FILE" "$OUTPUT_DIR/${BENCHMARK}_metrics.json"
+        if [ -n "$RESULTS_FILE" ]; then
+            echo "Results saved to: $RESULTS_FILE"
+            cp "$RESULTS_FILE" "$OUTPUT_DIR/${BENCHMARK}_metrics.json"
             mark_benchmark_completed "$BENCHMARK"
             COMPLETED_BENCHMARKS+=("$BENCHMARK")
         else
-            echo "WARNING: No metrics file found for $BENCHMARK"
+            echo "WARNING: No results file found for $BENCHMARK"
             FAILED_BENCHMARKS+=("$BENCHMARK")
         fi
     else
@@ -211,30 +208,28 @@ for SYNTHETIC_TYPE in "${SYNTHETIC_TYPES[@]}"; do
     SYNTHETIC_START=$(date +%s)
     SYNTHETIC_DIR="$OUTPUT_DIR/synthetic_${SYNTHETIC_TYPE}"
-    # Run the optimization pipeline with personalization task
-    if python3 -m wisent.core.optuna.steering.optuna_pipeline \
-        --output-dir "$SYNTHETIC_DIR" \
+    # Run the optimization with personalization task
+    if wisent optimize-steering personalization \
         --model "$MODEL" \
-        --task personalization \
         --trait "$SYNTHETIC_TYPE" \
-        --n-trials "$N_TRIALS" \
         --num-pairs 50 \
-        --layer-range "$LAYER_RANGE" \
+        --output-dir "$SYNTHETIC_DIR" \
+        --device cuda \
         2>&1 | tee "$OUTPUT_DIR/synthetic_${SYNTHETIC_TYPE}_log.txt"; then
         SYNTHETIC_END=$(date +%s)
         DURATION=$((SYNTHETIC_END - SYNTHETIC_START))
         echo "Completed synthetic $SYNTHETIC_TYPE in ${DURATION}s"
-        # Find the metrics file
-        METRICS_FILE=$(find "$SYNTHETIC_DIR" -name "all_trials_metrics_*.json" -type f 2>/dev/null | head -1)
+        # Find the results file
+        RESULTS_FILE=$(find "$SYNTHETIC_DIR" -name "*.json" -type f 2>/dev/null | head -1)
-        if [ -n "$METRICS_FILE" ]; then
-            echo "Metrics saved to: $METRICS_FILE"
-            cp "$METRICS_FILE" "$OUTPUT_DIR/synthetic_${SYNTHETIC_TYPE}_metrics.json"
+        if [ -n "$RESULTS_FILE" ]; then
+            echo "Results saved to: $RESULTS_FILE"
+            cp "$RESULTS_FILE" "$OUTPUT_DIR/synthetic_${SYNTHETIC_TYPE}_metrics.json"
             COMPLETED_BENCHMARKS+=("synthetic_$SYNTHETIC_TYPE")
         else
-            echo "WARNING: No metrics file found for synthetic_$SYNTHETIC_TYPE"
+            echo "WARNING: No results file found for synthetic_$SYNTHETIC_TYPE"
             FAILED_BENCHMARKS+=("synthetic_$SYNTHETIC_TYPE")
         fi
     else

wisent 0.7.379__py3-none-any.whl → 0.7.901__py3-none-any.whl

wisent 0.7.379__py3-none-any.whl → 0.7.901__py3-none-any.whl