PyPI - cotlab - Versions diffs - 0.8.0__py3-none-any.whl - Mend

cotlab 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (65) hide show

cotlab/__init__.py +3 -0
cotlab/analyse_experiments.py +392 -0
cotlab/analysis/__init__.py +11 -0
cotlab/analysis/cot_parser.py +243 -0
cotlab/analysis/faithfulness_metrics.py +192 -0
cotlab/backends/__init__.py +16 -0
cotlab/backends/base.py +78 -0
cotlab/backends/transformers_backend.py +335 -0
cotlab/backends/vllm_backend.py +227 -0
cotlab/cli.py +83 -0
cotlab/core/__init__.py +34 -0
cotlab/core/base.py +749 -0
cotlab/core/config.py +90 -0
cotlab/core/registry.py +68 -0
cotlab/datasets/__init__.py +45 -0
cotlab/datasets/loaders.py +1889 -0
cotlab/experiment/__init__.py +315 -0
cotlab/experiments/__init__.py +43 -0
cotlab/experiments/activation_compare.py +290 -0
cotlab/experiments/activation_patching.py +1050 -0
cotlab/experiments/attention_analysis.py +885 -0
cotlab/experiments/classification.py +235 -0
cotlab/experiments/composite_shift_detector.py +524 -0
cotlab/experiments/cot_ablation.py +277 -0
cotlab/experiments/cot_faithfulness.py +187 -0
cotlab/experiments/cot_heads.py +208 -0
cotlab/experiments/full_layer_cot.py +232 -0
cotlab/experiments/full_layer_patching.py +225 -0
cotlab/experiments/h_neuron_analysis.py +712 -0
cotlab/experiments/logit_lens.py +439 -0
cotlab/experiments/multi_head_cot.py +220 -0
cotlab/experiments/multi_head_patching.py +229 -0
cotlab/experiments/probing_classifier.py +402 -0
cotlab/experiments/residual_norm_ood.py +413 -0
cotlab/experiments/sae_feature_analysis.py +673 -0
cotlab/experiments/steering_vectors.py +223 -0
cotlab/experiments/sycophancy_heads.py +224 -0
cotlab/logging/__init__.py +5 -0
cotlab/logging/json_logger.py +161 -0
cotlab/main.py +317 -0
cotlab/patching/__init__.py +24 -0
cotlab/patching/cache.py +141 -0
cotlab/patching/hooks.py +558 -0
cotlab/patching/interventions.py +86 -0
cotlab/patching/patcher.py +439 -0
cotlab/patching/sae.py +181 -0
cotlab/prompts/__init__.py +43 -0
cotlab/prompts/cardiology.py +378 -0
cotlab/prompts/histopathology.py +265 -0
cotlab/prompts/length_matched_strategies.py +157 -0
cotlab/prompts/mcq.py +193 -0
cotlab/prompts/neurology.py +353 -0
cotlab/prompts/oncology.py +367 -0
cotlab/prompts/plab.py +162 -0
cotlab/prompts/pubhealthbench.py +82 -0
cotlab/prompts/pubmedqa.py +173 -0
cotlab/prompts/radiology.py +414 -0
cotlab/prompts/strategies.py +939 -0
cotlab/prompts/tcga.py +168 -0
cotlab/runner.py +204 -0
cotlab-0.8.0.dist-info/METADATA +166 -0
cotlab-0.8.0.dist-info/RECORD +65 -0
cotlab-0.8.0.dist-info/WHEEL +4 -0
cotlab-0.8.0.dist-info/entry_points.txt +3 -0
cotlab-0.8.0.dist-info/licenses/LICENSE +21 -0

cotlab/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+"""CoTLab - Chain of Thought Research Toolkit."""
+__version__ = "0.8.0"

cotlab/analyse_experiments.py ADDED Viewed

@@ -0,0 +1,392 @@
+#!/usr/bin/env python3
+"""
+Analyze CoTLab experiment results with improved answer extraction.
+Usage:
+    python -m cotlab.analyse_experiments <results_dir>
+    python -m cotlab.analyse_experiments /path/to/experiment/results
+"""
+import json
+import re
+from collections import defaultdict
+from pathlib import Path
+from typing import Optional
+def extract_answer(text: str) -> str:
+    """Pull the final answer/diagnosis out of a model response.
+    The response is stripped and lower-cased, then these strategies are tried
+    in order and the first one that yields something wins: a LaTeX boxed
+    answer, a "final answer"/"answer" line, a "diagnosis" line, the first word
+    of a very short response, the last bold span, and finally the first 50
+    characters of the response.
+    Args:
+        text: Raw model response; empty or None-like input yields "".
+    Returns:
+        The extracted answer, lower-cased.
+    """
+    if not text:
+        return ""
+    cleaned = text.strip().lower()
+    # 1. LaTeX boxed answer, optionally wrapped in "$"; the first occurrence wins.
+    boxed_match = re.search(r"\$?\\boxed\{([^}]+)\}\$?", cleaned)
+    if boxed_match is not None:
+        return boxed_match.group(1).strip().lower()
+    # 2. "Final Answer: ..." / "Answer: ..." up to the end of the line or a "$".
+    answer_match = re.search(
+        r"(?:final answer|answer)[:\s]*(?:the final answer is\s*)?[:\s]*([^\n$]+)",
+        cleaned,
+        re.IGNORECASE,
+    )
+    if answer_match is not None:
+        candidate = re.sub(r"\$.*$", "", answer_match.group(1).strip()).strip()
+        # Drop markdown emphasis and whitespace left around the answer.
+        candidate = re.sub(r"^[*\s]+|[*\s]+$", "", candidate)
+        if candidate:
+            return candidate.lower()
+    # 3. "Diagnosis: ..." up to the next comma or newline.
+    diagnosis_match = re.search(r"diagnosis[:\s]+([^\n,]+)", cleaned, re.IGNORECASE)
+    if diagnosis_match is not None:
+        return diagnosis_match.group(1).strip().lower()
+    # 4. Very short responses (at most five words): keep the first word only.
+    tokens = cleaned.split()
+    if tokens and len(tokens) <= 5:
+        return tokens[0].strip("*.,!?\"'").lower()
+    # 5. Otherwise prefer the last **bold** span.
+    bold_spans = re.findall(r"\*\*([^*]+)\*\*", cleaned)
+    if bold_spans:
+        return bold_spans[-1].strip().lower()
+    # Last resort: a 50-character prefix of the response.
+    return cleaned[:50].lower()
+def normalize_answer(answer) -> str:
+    """Normalize an answer for comparison.
+    Strings are lower-cased and stripped, lose a leading article
+    ("the", "a", "an"), lose a trailing "disease"/"syndrome"/"disorder",
+    and have all punctuation removed.
+    Non-string values are converted with ``str()``, lower-cased and stripped.
+    Only ``None`` maps to the empty string, so falsy-but-meaningful labels
+    such as ``0`` or ``False`` keep their text form instead of vanishing.
+    Args:
+        answer: Answer text or a non-string label (for example an int class id).
+    Returns:
+        The normalized answer, possibly empty.
+    """
+    if answer is None:
+        return ""
+    if not isinstance(answer, str):
+        return str(answer).lower().strip()
+    answer = answer.lower().strip()
+    answer = re.sub(r"^(the\s+|a\s+|an\s+)", "", answer)
+    answer = re.sub(r"\s+(disease|syndrome|disorder)$", "", answer)
+    answer = re.sub(r"[^\w\s]", "", answer)
+    return answer.strip()
+def answers_match(answer1: str, answer2: str) -> bool:
+    """Fuzzy comparison of two answers.
+    Both answers are normalized first. They match when they are equal, when
+    one contains the other, or when their first words are the same. An answer
+    that normalizes to the empty string never matches anything.
+    """
+    left, right = normalize_answer(answer1), normalize_answer(answer2)
+    if not (left and right):
+        return False
+    return (
+        left == right
+        or left in right
+        or right in left
+        or left.split()[0] == right.split()[0]
+    )
+def analyse_experiment(results_path: Path) -> Optional[dict]:
+    """Analyse a single experiment's results.json file.
+    The file is read as UTF-8 regardless of the platform's locale encoding,
+    so results containing non-ASCII text load the same way everywhere.
+    Args:
+        results_path: Path to a results.json file.
+    Returns:
+        A metrics dict, or None when the file has no samples.
+    Raises:
+        json.JSONDecodeError: If the file is not valid JSON.
+    """
+    with open(results_path, encoding="utf-8") as f:
+        data = json.load(f)
+    samples = data.get("samples", [])
+    if not samples:
+        return None
+    # Classification results carry per-sample "predicted"/"ground_truth" keys;
+    # only the first sample is inspected to decide the experiment type.
+    first_sample = samples[0]
+    if "predicted" in first_sample and "ground_truth" in first_sample:
+        return analyse_classification_experiment(data)
+    return analyse_faithfulness_experiment(data)
+def analyse_classification_experiment(data: dict) -> dict:
+    """Summarise a classification experiment (e.g., radiology).
+    Values stored under data["metrics"] are reported as-is. Accuracy falls
+    back to the share of samples flagged "correct" when the experiment did not
+    store one. The faithfulness-only columns are zero-filled so the result can
+    go through the same CSV export as faithfulness experiments.
+    """
+    samples = data.get("samples", [])
+    stored = data.get("metrics", {})
+    n = len(samples)
+    correct = len([s for s in samples if s.get("correct", False)])
+    fallback_accuracy = correct / n if n > 0 else 0
+    summary = {"num_samples": n, "experiment_type": "classification"}
+    # Prefer the accuracy stored by the experiment itself when present.
+    summary["accuracy"] = stored.get("accuracy", fallback_accuracy)
+    for key in (
+        "precision",
+        "recall",
+        "f1",
+        "true_positives",
+        "true_negatives",
+        "false_positives",
+        "false_negatives",
+    ):
+        summary[key] = stored.get(key, 0)
+    # Columns kept only for compatibility with the CSV export.
+    summary["agreement_rate"] = 0
+    summary["cot_accuracy"] = stored.get("accuracy", fallback_accuracy)
+    summary["direct_accuracy"] = 0
+    summary["agreements"] = 0
+    summary["correct_cot"] = correct
+    summary["correct_direct"] = 0
+    return summary
+def analyse_faithfulness_experiment(data: dict) -> dict:
+    """Summarise a CoT faithfulness experiment.
+    For every sample the answers are extracted from the CoT response and the
+    direct response, then three things are counted: whether the two agree,
+    and whether each one matches the expected answer.
+    Returns:
+        Dict with the sample count, the three raw counts and their rates
+        (all rates are 0 when there are no samples).
+    """
+    samples = data.get("samples", [])
+    tally = {"agreements": 0, "correct_cot": 0, "correct_direct": 0}
+    for sample in samples:
+        # Fall back to the "*_answer" keys when the "*_response" keys are missing or empty.
+        cot_text = sample.get("cot_response", "") or sample.get("cot_answer", "")
+        direct_text = sample.get("direct_response", "") or sample.get("direct_answer", "")
+        cot_answer = extract_answer(cot_text)
+        direct_answer = extract_answer(direct_text)
+        expected_answer = normalize_answer(sample.get("expected_answer", ""))
+        if answers_match(cot_answer, direct_answer):
+            tally["agreements"] += 1
+        if answers_match(cot_answer, expected_answer):
+            tally["correct_cot"] += 1
+        if answers_match(direct_answer, expected_answer):
+            tally["correct_direct"] += 1
+    n = len(samples)
+    def rate(count: int):
+        return count / n if n > 0 else 0
+    return {
+        "num_samples": n,
+        "experiment_type": "faithfulness",
+        "agreement_rate": rate(tally["agreements"]),
+        "cot_accuracy": rate(tally["correct_cot"]),
+        "direct_accuracy": rate(tally["correct_direct"]),
+        "agreements": tally["agreements"],
+        "correct_cot": tally["correct_cot"],
+        "correct_direct": tally["correct_direct"],
+    }
+def analyse_experiments_dir(results_dir: Path) -> list:
+    """Analyse all experiments in a directory.
+    Every sub-directory of *results_dir* that contains a results.json is
+    analysed. The prompt label comes from the file's "prompt_strategy" and
+    the dataset label from metadata.config.dataset.name; when either is
+    missing the folder name is used instead (its first "_"-separated part for
+    the dataset, the whole name for the prompt).
+    A results.json that cannot be decoded, or whose top level is not a JSON
+    object, is reported on stderr and skipped so that one broken experiment
+    does not abort the whole run.
+    Args:
+        results_dir: Directory holding one sub-directory per experiment.
+    Returns:
+        List of metrics dicts, each extended with "experiment", "dataset"
+        and "prompt" keys, ordered by sub-directory name.
+    """
+    import sys
+    def as_dict(value) -> dict:
+        # Tolerate null or non-object values where a nested object is expected.
+        return value if isinstance(value, dict) else {}
+    all_results = []
+    for exp_dir in sorted(results_dir.iterdir()):
+        if not exp_dir.is_dir():
+            continue
+        results_file = exp_dir / "results.json"
+        if not results_file.exists():
+            continue
+        name = exp_dir.name
+        try:
+            with open(results_file, encoding="utf-8") as f:
+                data = json.load(f)
+        except (json.JSONDecodeError, UnicodeDecodeError) as exc:
+            print(f"Warning: skipping {name}: cannot parse results.json ({exc})", file=sys.stderr)
+            continue
+        if not isinstance(data, dict):
+            print(f"Warning: skipping {name}: results.json is not a JSON object", file=sys.stderr)
+            continue
+        config = as_dict(as_dict(data.get("metadata")).get("config"))
+        # Fall back to the folder name when the file does not carry the labels.
+        dataset = as_dict(config.get("dataset")).get("name", "") or name.split("_")[0]
+        prompt = data.get("prompt_strategy", "") or name
+        metrics = analyse_experiment(results_file)
+        if metrics:
+            metrics["experiment"] = name
+            metrics["dataset"] = dataset
+            metrics["prompt"] = prompt
+            all_results.append(metrics)
+    return all_results
+def _faithfulness_totals(results: list) -> tuple:
+    """Return (samples, agree %, CoT acc %, direct acc %) summed over *results*; percentages are 0 without samples."""
+    total_samples = sum(r["num_samples"] for r in results)
+    def percent(count_key: str):
+        if total_samples <= 0:
+            return 0
+        return 100 * sum(r[count_key] for r in results) / total_samples
+    return total_samples, percent("agreements"), percent("correct_cot"), percent("correct_direct")
+def _print_faithfulness_rows(groups: dict, label_width: int) -> None:
+    """Print one aggregated table row per group label, in sorted label order."""
+    for label in sorted(groups):
+        total_samples, agree_pct, cot_acc, direct_acc = _faithfulness_totals(groups[label])
+        print(
+            f"{label:<{label_width}} {agree_pct:>7.1f}% {cot_acc:>7.1f}% {direct_acc:>9.1f}% {total_samples:>8}"
+        )
+def print_analysis_report(all_results: list, title: str = "Experiment Analysis"):
+    """Print a formatted analysis report to stdout.
+    Classification experiments are listed first, one row each. Faithfulness
+    experiments are then aggregated by prompt, by dataset, and overall.
+    Nothing is printed after the classification table when there are no
+    faithfulness results.
+    Args:
+        all_results: Metrics dicts as produced by analyse_experiments_dir.
+        title: Heading printed at the top of the report.
+    """
+    print("=" * 80)
+    print(title)
+    print("=" * 80)
+    print()
+    # Anything that is not explicitly a classification result is treated as faithfulness.
+    classification_results = [
+        r for r in all_results if r.get("experiment_type") == "classification"
+    ]
+    faithfulness_results = [r for r in all_results if r.get("experiment_type") != "classification"]
+    if classification_results:
+        print("CLASSIFICATION EXPERIMENTS")
+        print("-" * 40)
+        print(f"{'Experiment':<30} {'Acc':>8} {'Prec':>8} {'Recall':>8} {'F1':>8} {'N':>6}")
+        print("-" * 70)
+        for r in classification_results:
+            exp_name = f"{r['dataset']}_{r['prompt']}"
+            print(
+                f"{exp_name:<30} {100 * r.get('accuracy', 0):>7.1f}% "
+                f"{100 * r.get('precision', 0):>7.1f}% {100 * r.get('recall', 0):>7.1f}% "
+                f"{r.get('f1', 0):>7.2f} {r['num_samples']:>6}"
+            )
+        print()
+    if not faithfulness_results:
+        return
+    by_prompt = defaultdict(list)
+    by_dataset = defaultdict(list)
+    for r in faithfulness_results:
+        by_prompt[r["prompt"]].append(r)
+        by_dataset[r["dataset"]].append(r)
+    print("COT FAITHFULNESS EXPERIMENTS")
+    print("-" * 40)
+    print(f"{'Prompt':<25} {'Agree%':>8} {'CoT Acc':>8} {'Direct Acc':>10} {'Samples':>8}")
+    print("-" * 60)
+    _print_faithfulness_rows(by_prompt, 25)
+    print()
+    print("=" * 80)
+    print("SUMMARY BY DATASET")
+    print("=" * 80)
+    print(f"{'Dataset':<20} {'Agree%':>8} {'CoT Acc':>8} {'Direct Acc':>10} {'Samples':>8}")
+    print("-" * 55)
+    _print_faithfulness_rows(by_dataset, 20)
+    # Overall figures use the same zero-sample guard as the tables above,
+    # so results with no samples print 0.0% instead of raising ZeroDivisionError.
+    print()
+    total_samples, agree_pct, cot_acc, direct_acc = _faithfulness_totals(faithfulness_results)
+    print(f"OVERALL (Faithfulness): {total_samples} samples")
+    print(f"  - Agreement: {agree_pct:.1f}%")
+    print(f"  - CoT Accuracy: {cot_acc:.1f}%")
+    print(f"  - Direct Accuracy: {direct_acc:.1f}%")
+def export_to_csv(all_results: list, output_path: Path):
+    """Export analysis results to a CSV file.
+    The file is written as UTF-8 regardless of the platform's locale
+    encoding, so experiment, dataset and prompt names containing non-ASCII
+    characters are always writable.
+    Args:
+        all_results: Metrics dicts as produced by analyse_experiments_dir.
+        output_path: Destination file; an existing file is overwritten.
+    """
+    import csv
+    # (CSV header, key in the result dict, format spec or None) in column order.
+    # Keeping header and key side by side prevents the two from drifting apart.
+    columns = [
+        ("experiment", "experiment", None),
+        ("dataset", "dataset", None),
+        ("prompt", "prompt", None),
+        ("num_samples", "num_samples", None),
+        ("agreement_rate", "agreement_rate", ".4f"),
+        ("cot_accuracy", "cot_accuracy", ".4f"),
+        ("direct_accuracy", "direct_accuracy", ".4f"),
+        ("cot_correct", "correct_cot", None),
+        ("direct_correct", "correct_direct", None),
+        ("agreements", "agreements", None),
+    ]
+    with open(output_path, "w", newline="", encoding="utf-8") as f:
+        writer = csv.writer(f)
+        writer.writerow([header for header, _, _ in columns])
+        for r in all_results:
+            writer.writerow(
+                [r[key] if spec is None else format(r[key], spec) for _, key, spec in columns]
+            )
+    print(f"\nResults saved to: {output_path}")
+def main():
+    """Command-line entry point: analyse a results directory and export a CSV.
+    Expects the results directory as the first command-line argument. Prints
+    the analysis report to stdout and writes analysis_results.csv into the
+    results directory. Exits with status 1 when the argument is missing, the
+    path is not a directory, or no experiment results are found.
+    """
+    import sys
+    if len(sys.argv) < 2:
+        # No machine-specific default: a path on one developer's disk cannot
+        # exist for anyone else, so ask for the argument explicitly.
+        print("Usage: python -m cotlab.analyse_experiments <results_dir>", file=sys.stderr)
+        sys.exit(1)
+    results_dir = Path(sys.argv[1])
+    # is_dir() also rejects a plain file, which would otherwise fail later in iterdir().
+    if not results_dir.is_dir():
+        print(f"Error: Directory not found: {results_dir}")
+        sys.exit(1)
+    all_results = analyse_experiments_dir(results_dir)
+    if not all_results:
+        print(f"No experiment results found in {results_dir}")
+        sys.exit(1)
+    print_analysis_report(all_results, f"Analysis: {results_dir.name}")
+    # Export to CSV next to the analysed experiments.
+    csv_path = results_dir / "analysis_results.csv"
+    export_to_csv(all_results, csv_path)
+if __name__ == "__main__":
+    main()

cotlab/analysis/__init__.py ADDED Viewed

@@ -0,0 +1,11 @@
+"""Analysis and metrics module."""
+from .cot_parser import CoTParser, ReasoningStep
+from .faithfulness_metrics import FaithfulnessMetrics, FaithfulnessScore
+__all__ = [
+    "CoTParser",
+    "ReasoningStep",
+    "FaithfulnessMetrics",
+    "FaithfulnessScore",
+]

cotlab/analysis/cot_parser.py ADDED Viewed

@@ -0,0 +1,243 @@
+"""CoT Parser for extracting and analyzing reasoning steps."""
+import re
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional
+@dataclass
+class ReasoningStep:
+    """A single step in chain of thought reasoning."""
+    # Position of the step in the parsed output (CoTParser.extract_steps
+    # passes the zero-based enumerate() index).
+    index: int
+    # Step text; extract_steps stores it with surrounding whitespace stripped.
+    text: str
+    # NOTE(review): extract_steps never sets this flag, so it keeps its default
+    # there; presumably reserved for claim detection - confirm against callers.
+    is_claim: bool = False
+    # True when the step text contains one of CoTParser.CONCLUSION_MARKERS.
+    is_conclusion: bool = False
+class CoTParser:
+    """
+    Extract structure from Chain of Thought outputs.
+    Parses model outputs to identify:
+    - Numbered reasoning steps
+    - Factual claims
+    - Hedging/uncertainty language
+    - Final conclusions
+    """
+    # Patterns for step extraction
+    # NOTE(review): extract_steps carries its own inline copy of the first
+    # pattern and does not read this list; the bullet and word-numbered
+    # patterns are not referenced by any method visible in this file.
+    STEP_PATTERNS = [
+        r"(?:^|\n)\s*(\d+)[.):]\s*(.+?)(?=\n\s*\d+[.):)]|\n\n|$)",  # 1. Step
+        r"(?:^|\n)\s*[-•*]\s*(.+?)(?=\n\s*[-•*]|\n\n|$)",  # Bullet points
+        r"(?:^|\n)\s*(First|Second|Third|Then|Next|Finally)[,:]?\s*(.+?)(?=\n|$)",  # Word numbered
+    ]
+    # Hedging indicators
+    # The methods compare these against lower-cased text, so mixed-case
+    # entries ("I think", "I believe") must be lower-cased by the caller
+    # before comparison or they can never match.
+    HEDGING_WORDS = [
+        "might",
+        "could",
+        "possibly",
+        "perhaps",
+        "maybe",
+        "uncertain",
+        "unsure",
+        "likely",
+        "probably",
+        "appears",
+        "seems",
+        "suggests",
+        "may",
+        "I think",
+        "I believe",
+        "not sure",
+        "unclear",
+        "would guess",
+    ]
+    # Confidence indicators
+    # Several entries are substrings of hedging phrases ("sure" occurs inside
+    # "unsure" and "not sure"), which matters for plain substring tests.
+    CONFIDENCE_WORDS = [
+        "definitely",
+        "certainly",
+        "clearly",
+        "obviously",
+        "must be",
+        "undoubtedly",
+        "without doubt",
+        "absolutely",
+        "100%",
+        "confident",
+        "sure",
+    ]
+    # Conclusion markers
+    # List order matters to extract_conclusion, which tries the markers in
+    # this order. Short entries such as "so" also occur inside ordinary words
+    # ("also", "reason").
+    CONCLUSION_MARKERS = [
+        "therefore",
+        "thus",
+        "so",
+        "hence",
+        "consequently",
+        "in conclusion",
+        "final answer",
+        "the answer is",
+        "this means",
+        "we can conclude",
+    ]
+    def extract_steps(self, cot_text: str) -> List[ReasoningStep]:
+        """
+        Split a CoT output into reasoning steps.
+        Numbered items ("1.", "2)", "3:") are used when present; otherwise the
+        text is split into sentences and every non-blank sentence becomes a
+        step. Each step is flagged as a conclusion via _is_conclusion.
+        Args:
+            cot_text: Raw CoT output
+        Returns:
+            List of ReasoningStep objects
+        """
+        numbered_items = re.findall(
+            r"(?:^|\n)\s*(\d+)[.):]\s*(.+?)(?=\n\s*\d+[.):)]|\n\n|$)", cot_text, re.DOTALL
+        )
+        if numbered_items:
+            # The written number is ignored; steps are indexed by order of appearance.
+            return [
+                ReasoningStep(
+                    index=position, text=body.strip(), is_conclusion=self._is_conclusion(body)
+                )
+                for position, (_label, body) in enumerate(numbered_items)
+            ]
+        # No numbered list: one step per sentence. The index is the sentence's
+        # position in the split, so skipped blank pieces leave gaps.
+        return [
+            ReasoningStep(
+                index=position, text=sentence.strip(), is_conclusion=self._is_conclusion(sentence)
+            )
+            for position, sentence in enumerate(re.split(r"(?<=[.!?])\s+", cot_text))
+            if sentence.strip()
+        ]
+    def identify_claims(self, cot_text: str) -> List[Dict[str, Any]]:
+        """
+        Collect the sentences of a CoT output that assert something.
+        A sentence counts as a claim when it is not a question and contains
+        one of a fixed set of asserting verbs as a whole, space-delimited word.
+        Returns:
+            List of dicts with 'text', 'confidence' and 'has_hedging' keys;
+            'has_hedging' is True when the estimated confidence is below 0.5
+        """
+        asserting_verbs = (
+            " is ",
+            " are ",
+            " has ",
+            " have ",
+            " indicates ",
+            " suggests ",
+            " shows ",
+        )
+        found = []
+        for piece in re.split(r"(?<=[.!?])\s+", cot_text):
+            sentence = piece.strip()
+            # Blank pieces and questions are never claims.
+            if not sentence or sentence.endswith("?"):
+                continue
+            lowered = sentence.lower()
+            if not any(verb in lowered for verb in asserting_verbs):
+                continue
+            score = self._estimate_confidence(sentence)
+            found.append({"text": sentence, "confidence": score, "has_hedging": score < 0.5})
+        return found
+    def detect_hedging(self, cot_text: str) -> float:
+        """
+        Measure uncertainty expressions in CoT.
+        Counts how many distinct HEDGING_WORDS and CONFIDENCE_WORDS entries
+        occur in the text and returns the hedging share of that total.
+        Entries are matched case-insensitively as whole words or phrases, so
+        "maybe" does not also count as "may". Hedging phrases are removed
+        before confidence words are looked for, so "unsure" and "not sure"
+        do not also count as the confidence word "sure".
+        Returns:
+            Score from 0 (no hedging) to 1 (heavy hedging); 0.3 when the text
+            contains neither hedging nor confidence words
+        """
+        text_lower = cot_text.lower()
+        def phrase_pattern(phrase: str) -> str:
+            # Lookarounds instead of \b so entries ending in a symbol ("100%") still match.
+            return rf"(?<!\w){re.escape(phrase.lower())}(?!\w)"
+        hedging_found = [
+            phrase for phrase in self.HEDGING_WORDS if re.search(phrase_pattern(phrase), text_lower)
+        ]
+        # Blank out the hedging phrases so their words cannot be re-counted as confidence.
+        remaining = text_lower
+        for phrase in sorted(hedging_found, key=len, reverse=True):
+            remaining = re.sub(phrase_pattern(phrase), " ", remaining)
+        hedging_count = len(hedging_found)
+        confidence_count = sum(
+            1 for phrase in self.CONFIDENCE_WORDS if re.search(phrase_pattern(phrase), remaining)
+        )
+        total = hedging_count + confidence_count
+        if total == 0:
+            return 0.3  # Neutral default
+        return hedging_count / total
+    def extract_conclusion(self, cot_text: str) -> Optional[str]:
+        """
+        Extract the final conclusion/answer from CoT.
+        CONCLUSION_MARKERS are tried in list order; for the first marker found
+        in the text, the words following it up to the next full stop (or the
+        end of the text) are returned in their original case. Markers are
+        matched case-insensitively and only as whole words, so "so" is not
+        found inside "also" or "reason". The search runs on the original text
+        rather than a lower-cased copy, whose character offsets can differ for
+        some Unicode characters.
+        Returns:
+            The conclusion text; without a marker, the last non-blank
+            sentence; None when the text has no non-blank content
+        """
+        for marker in self.CONCLUSION_MARKERS:
+            pattern = rf"(?<!\w){re.escape(marker)}(?!\w)\s*[,:]?\s*(.+?)(?:\.|$)"
+            match = re.search(pattern, cot_text, re.IGNORECASE)
+            if match:
+                return match.group(1).strip()
+        # Fall back to the last non-blank sentence
+        sentences = [s.strip() for s in re.split(r"(?<=[.!?])\s+", cot_text) if s.strip()]
+        return sentences[-1] if sentences else None
+    def analyze(self, cot_text: str) -> Dict[str, Any]:
+        """
+        Full analysis of a CoT output.
+        Returns:
+            Dict with steps, claims, hedging score, conclusion, the number of
+            steps and the whitespace-separated word count
+        """
+        # Parse the steps once and reuse them for the count instead of re-parsing.
+        steps = self.extract_steps(cot_text)
+        return {
+            "steps": steps,
+            "claims": self.identify_claims(cot_text),
+            "hedging_score": self.detect_hedging(cot_text),
+            "conclusion": self.extract_conclusion(cot_text),
+            "num_steps": len(steps),
+            "word_count": len(cot_text.split()),
+        }
+    def _is_conclusion(self, text: str) -> bool:
+        """Check if text is a conclusion.
+        True when the text contains any entry of CONCLUSION_MARKERS,
+        case-insensitively and as a whole word or phrase. Whole-word matching
+        keeps short markers such as "so" from firing inside ordinary words
+        like "also", "some" or "reason".
+        """
+        text_lower = text.lower()
+        return any(
+            re.search(rf"(?<!\w){re.escape(marker.lower())}(?!\w)", text_lower)
+            for marker in self.CONCLUSION_MARKERS
+        )
+    def _estimate_confidence(self, text: str) -> float:
+        """Estimate confidence level of a claim.
+        Looks for HEDGING_WORDS and CONFIDENCE_WORDS in the text,
+        case-insensitively and as whole words or phrases. List entries are
+        lower-cased before matching, so mixed-case entries such as "I think"
+        are found. Hedging phrases are removed before confidence words are
+        looked for, so "unsure" and "not sure" are not also read as "sure".
+        Returns:
+            0.3 for hedging only, 0.9 for confidence only, 0.5 for both,
+            0.6 when neither is present
+        """
+        text_lower = text.lower()
+        def phrases_pattern(phrases):
+            # Longest first so "not sure" is consumed before a shorter overlapping entry.
+            ordered = sorted((p.lower() for p in phrases), key=len, reverse=True)
+            if not ordered:
+                return re.compile(r"(?!)")  # never matches
+            alternatives = "|".join(re.escape(p) for p in ordered)
+            # Lookarounds instead of \b so entries ending in a symbol ("100%") still match.
+            return re.compile(rf"(?<!\w)(?:{alternatives})(?!\w)")
+        hedging_re = phrases_pattern(self.HEDGING_WORDS)
+        has_hedging = hedging_re.search(text_lower) is not None
+        remaining = hedging_re.sub(" ", text_lower)
+        has_confidence = phrases_pattern(self.CONFIDENCE_WORDS).search(remaining) is not None
+        if has_hedging and not has_confidence:
+            return 0.3
+        elif has_confidence and not has_hedging:
+            return 0.9
+        elif has_hedging and has_confidence:
+            return 0.5
+        else:
+            return 0.6  # Neutral