claude-turing 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104)
  1. package/.claude-plugin/plugin.json +34 -0
  2. package/LICENSE +21 -0
  3. package/README.md +457 -0
  4. package/agents/ml-evaluator.md +43 -0
  5. package/agents/ml-researcher.md +74 -0
  6. package/bin/cli.js +46 -0
  7. package/bin/turing-init.sh +57 -0
  8. package/commands/brief.md +83 -0
  9. package/commands/compare.md +24 -0
  10. package/commands/design.md +97 -0
  11. package/commands/init.md +123 -0
  12. package/commands/logbook.md +51 -0
  13. package/commands/mode.md +43 -0
  14. package/commands/poster.md +89 -0
  15. package/commands/preflight.md +75 -0
  16. package/commands/report.md +97 -0
  17. package/commands/rules/loop-protocol.md +91 -0
  18. package/commands/status.md +24 -0
  19. package/commands/suggest.md +95 -0
  20. package/commands/sweep.md +45 -0
  21. package/commands/train.md +66 -0
  22. package/commands/try.md +63 -0
  23. package/commands/turing.md +54 -0
  24. package/commands/validate.md +34 -0
  25. package/config/defaults.yaml +45 -0
  26. package/config/experiment_archetypes.yaml +127 -0
  27. package/config/lifecycle.toml +31 -0
  28. package/config/novelty_aliases.yaml +107 -0
  29. package/config/relationships.toml +125 -0
  30. package/config/state.toml +24 -0
  31. package/config/task_taxonomy.yaml +110 -0
  32. package/config/taxonomy.toml +37 -0
  33. package/package.json +54 -0
  34. package/src/claude-md.js +55 -0
  35. package/src/install.js +107 -0
  36. package/src/paths.js +20 -0
  37. package/src/postinstall.js +22 -0
  38. package/src/verify.js +109 -0
  39. package/templates/MEMORY.md +36 -0
  40. package/templates/README.md +93 -0
  41. package/templates/__pycache__/evaluate.cpython-314.pyc +0 -0
  42. package/templates/__pycache__/prepare.cpython-314.pyc +0 -0
  43. package/templates/config.yaml +48 -0
  44. package/templates/evaluate.py +237 -0
  45. package/templates/features/__init__.py +0 -0
  46. package/templates/features/__pycache__/__init__.cpython-314.pyc +0 -0
  47. package/templates/features/__pycache__/featurizers.cpython-314.pyc +0 -0
  48. package/templates/features/featurizers.py +138 -0
  49. package/templates/prepare.py +171 -0
  50. package/templates/program.md +216 -0
  51. package/templates/pyproject.toml +8 -0
  52. package/templates/requirements.txt +8 -0
  53. package/templates/scripts/__init__.py +0 -0
  54. package/templates/scripts/__pycache__/__init__.cpython-314.pyc +0 -0
  55. package/templates/scripts/__pycache__/check_convergence.cpython-314.pyc +0 -0
  56. package/templates/scripts/__pycache__/classify_task.cpython-314.pyc +0 -0
  57. package/templates/scripts/__pycache__/critique_hypothesis.cpython-314.pyc +0 -0
  58. package/templates/scripts/__pycache__/experiment_index.cpython-314.pyc +0 -0
  59. package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
  60. package/templates/scripts/__pycache__/generate_logbook.cpython-314.pyc +0 -0
  61. package/templates/scripts/__pycache__/log_experiment.cpython-314.pyc +0 -0
  62. package/templates/scripts/__pycache__/manage_hypotheses.cpython-314.pyc +0 -0
  63. package/templates/scripts/__pycache__/novelty_guard.cpython-314.pyc +0 -0
  64. package/templates/scripts/__pycache__/parse_metrics.cpython-314.pyc +0 -0
  65. package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
  66. package/templates/scripts/__pycache__/show_experiment_tree.cpython-314.pyc +0 -0
  67. package/templates/scripts/__pycache__/show_families.cpython-314.pyc +0 -0
  68. package/templates/scripts/__pycache__/statistical_compare.cpython-314.pyc +0 -0
  69. package/templates/scripts/__pycache__/suggest_next.cpython-314.pyc +0 -0
  70. package/templates/scripts/__pycache__/sweep.cpython-314.pyc +0 -0
  71. package/templates/scripts/__pycache__/synthesize_decision.cpython-314.pyc +0 -0
  72. package/templates/scripts/__pycache__/turing_io.cpython-314.pyc +0 -0
  73. package/templates/scripts/__pycache__/update_state.cpython-314.pyc +0 -0
  74. package/templates/scripts/__pycache__/verify_placeholders.cpython-314.pyc +0 -0
  75. package/templates/scripts/check_convergence.py +230 -0
  76. package/templates/scripts/compare_runs.py +124 -0
  77. package/templates/scripts/critique_hypothesis.py +350 -0
  78. package/templates/scripts/experiment_index.py +288 -0
  79. package/templates/scripts/generate_brief.py +389 -0
  80. package/templates/scripts/generate_logbook.py +423 -0
  81. package/templates/scripts/log_experiment.py +243 -0
  82. package/templates/scripts/manage_hypotheses.py +543 -0
  83. package/templates/scripts/novelty_guard.py +343 -0
  84. package/templates/scripts/parse_metrics.py +139 -0
  85. package/templates/scripts/post-train-hook.sh +74 -0
  86. package/templates/scripts/preflight.py +549 -0
  87. package/templates/scripts/scaffold.py +409 -0
  88. package/templates/scripts/show_environment.py +92 -0
  89. package/templates/scripts/show_experiment_tree.py +144 -0
  90. package/templates/scripts/show_families.py +133 -0
  91. package/templates/scripts/show_metrics.py +157 -0
  92. package/templates/scripts/statistical_compare.py +259 -0
  93. package/templates/scripts/stop-hook.sh +34 -0
  94. package/templates/scripts/suggest_next.py +301 -0
  95. package/templates/scripts/sweep.py +276 -0
  96. package/templates/scripts/synthesize_decision.py +300 -0
  97. package/templates/scripts/turing_io.py +76 -0
  98. package/templates/scripts/update_state.py +296 -0
  99. package/templates/scripts/validate_stability.py +167 -0
  100. package/templates/scripts/verify_placeholders.py +119 -0
  101. package/templates/sweep_config.yaml +14 -0
  102. package/templates/tests/__init__.py +0 -0
  103. package/templates/tests/conftest.py +91 -0
  104. package/templates/train.py +240 -0
package/templates/evaluate.py
@@ -0,0 +1,237 @@
+"""Evaluation harness for the {{PROJECT_NAME}} ML pipeline.
+
+HIDDEN — MEASUREMENT APPARATUS.
+
+This file is hidden from the autoresearch agent. The agent cannot
+read, modify, or reference this file. This prevents metric gaming,
+seed exploitation, and evaluation function reverse-engineering.
+
+The platform runs this file automatically. The agent knows only:
+- The primary metric name (from config.yaml)
+- Whether higher or lower is better (from config.yaml)
+- The metric value (from parsed run.log output)
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+import numpy as np
+from sklearn.metrics import (
+    accuracy_score,
+    f1_score,
+    mean_absolute_error,
+    mean_squared_error,
+    precision_score,
+    recall_score,
+    roc_auc_score,
+)
+
+
+def evaluate_model(
+    predictions: np.ndarray,
+    ground_truth: np.ndarray,
+    config: dict | None = None,
+) -> dict:
+    """Compute evaluation metrics from predictions vs ground truth.
+
+    Customize this function for your specific ML task. Add or remove
+    metrics as needed. The autoresearch agent reads these via format_metrics().
+
+    Args:
+        predictions: Model predictions (numpy array).
+        ground_truth: Ground truth values (numpy array).
+        config: Optional config dict for metric selection.
+
+    Returns:
+        Dict with metric names and values.
+    """
+    if len(predictions) != len(ground_truth):
+        raise ValueError(
+            f"Length mismatch: {len(predictions)} predictions vs "
+            f"{len(ground_truth)} ground truth"
+        )
+
+    # Determine which metrics to compute from config
+    eval_cfg = config.get("evaluation", {}) if config else {}
+    metric_names = eval_cfg.get("metrics", ["accuracy", "f1_weighted"])
+
+    results = {}
+
+    for metric_name in metric_names:
+        if metric_name == "accuracy":
+            results["accuracy"] = round(float(accuracy_score(ground_truth, predictions)), 4)
+        elif metric_name == "f1_weighted":
+            results["f1_weighted"] = round(float(f1_score(ground_truth, predictions, average="weighted")), 4)
+        elif metric_name == "f1_macro":
+            results["f1_macro"] = round(float(f1_score(ground_truth, predictions, average="macro")), 4)
+        elif metric_name == "f1_micro":
+            results["f1_micro"] = round(float(f1_score(ground_truth, predictions, average="micro")), 4)
+        elif metric_name == "precision":
+            results["precision"] = round(float(precision_score(ground_truth, predictions, average="weighted")), 4)
+        elif metric_name == "recall":
+            results["recall"] = round(float(recall_score(ground_truth, predictions, average="weighted")), 4)
+        elif metric_name == "mae":
+            results["mae"] = round(float(mean_absolute_error(ground_truth, predictions)), 4)
+        elif metric_name == "mse":
+            results["mse"] = round(float(mean_squared_error(ground_truth, predictions)), 4)
+        elif metric_name == "rmse":
+            results["rmse"] = round(float(np.sqrt(mean_squared_error(ground_truth, predictions))), 4)
+
+    return results
+
+
+def evaluate_detailed(
+    predictions: np.ndarray,
+    ground_truth: np.ndarray,
+    config: dict | None = None,
+) -> dict:
+    """Compute detailed evaluation metrics including per-class breakdown.
+
+    Extends evaluate_model with per-class precision/recall/F1 and a
+    confusion matrix. The agent uses this to understand WHERE the model
+    fails, not just that it fails.
+
+    Args:
+        predictions: Model predictions (numpy array).
+        ground_truth: Ground truth values (numpy array).
+        config: Optional config dict for metric selection.
+
+    Returns:
+        Dict with 'aggregate' (same as evaluate_model), 'per_class'
+        (dict of class -> {precision, recall, f1, support}), and
+        'confusion_matrix' (dict representation).
+    """
+    aggregate = evaluate_model(predictions, ground_truth, config)
+
+    # Per-class breakdown
+    classes = sorted(set(ground_truth.tolist()))
+    per_class = {}
+
+    for cls in classes:
+        cls_mask = ground_truth == cls
+        n_support = int(cls_mask.sum())
+        cls_preds = predictions[cls_mask]
+
+        tp = int((cls_preds == cls).sum())
+        precision_denom = int((predictions == cls).sum())
+        cls_precision = round(tp / precision_denom, 4) if precision_denom > 0 else 0.0
+        cls_recall = round(tp / n_support, 4) if n_support > 0 else 0.0
+
+        if cls_precision + cls_recall > 0:
+            cls_f1 = round(2 * cls_precision * cls_recall / (cls_precision + cls_recall), 4)
+        else:
+            cls_f1 = 0.0
+
+        per_class[str(cls)] = {
+            "precision": cls_precision,
+            "recall": cls_recall,
+            "f1": cls_f1,
+            "support": n_support,
+        }
+
+    # Confusion matrix as dict
+    confusion = {}
+    for true_cls in classes:
+        row = {}
+        for pred_cls in classes:
+            row[str(pred_cls)] = int(((ground_truth == true_cls) & (predictions == pred_cls)).sum())
+        confusion[str(true_cls)] = row
+
+    return {
+        "aggregate": aggregate,
+        "per_class": per_class,
+        "confusion_matrix": confusion,
+    }
+
+
+def format_metrics(metrics: dict) -> str:
+    """Format metrics in a parseable delimited format.
+
+    Output format (for the autoresearch agent to parse):
+        ---
+        metric_name: value
+        ...
+        ---
+
+    The agent reads metrics by grepping between --- delimiters.
+
+    Args:
+        metrics: Dict with metric names and values.
+
+    Returns:
+        Formatted string.
+    """
+    # Separate known metadata keys from actual metrics
+    metadata_keys = {"model_type", "train_seconds"}
+    metric_keys = [k for k in metrics if k not in metadata_keys]
+    all_keys = metric_keys + [k for k in metadata_keys if k in metrics]
+
+    lines = ["---"]
+    for key in all_keys:
+        padding = " " * max(1, 15 - len(key))
+        lines.append(f"{key}:{padding}{metrics[key]}")
+    lines.append("---")
+    return "\n".join(lines)
+
+
+def validate_training_behavior(config: dict | None = None) -> tuple[bool, str]:
+    """Validate that real training work was performed.
+
+    HIDDEN — this function runs automatically. The agent cannot see it.
+
+    Checks:
+    1. Training took a minimum amount of time (prevents task avoidance)
+    2. Model artifact has non-trivial size (prevents empty model saves)
+    3. Predictions have diversity (prevents constant-prediction shortcuts)
+
+    Returns:
+        (passed, message) tuple.
+    """
+    constraints = (config or {}).get("constraints", {})
+    min_train_time = constraints.get("min_train_time", 5)
+    min_model_size = constraints.get("min_model_size_bytes", 100)
+
+    meta_path = Path("train_metadata.json")
+    if not meta_path.exists():
+        return True, "no metadata — skipping behavioral checks"
+
+    with open(meta_path) as f:
+        meta = json.load(f)
+
+    train_time = meta.get("train_time_sec", 0)
+    if train_time < min_train_time:
+        return False, f"PROBE FAIL: train_time={train_time:.1f}s < minimum {min_train_time}s — training may have been skipped"
+
+    model_size = meta.get("model_size_bytes", 0)
+    if model_size < min_model_size:
+        return False, f"PROBE FAIL: model_size={model_size} bytes < minimum {min_model_size} — model may be empty"
+
+    pred_unique = meta.get("predictions_unique", 0)
+    if pred_unique <= 1:
+        return False, f"PROBE FAIL: predictions_unique={pred_unique} — model may predict a constant value"
+
+    return True, f"behavioral_ok: train_time={train_time:.1f}s, model_size={model_size}, pred_diversity={pred_unique}"
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Evaluate {{PROJECT_NAME}} model predictions"
+    )
+    parser.add_argument("predictions", help="Path to predictions JSONL file")
+    parser.add_argument("ground_truth", help="Path to ground truth JSONL file")
+    args = parser.parse_args()
+
+    with open(args.predictions) as f:
+        preds = [json.loads(line) for line in f if line.strip()]
+    with open(args.ground_truth) as f:
+        truth = [json.loads(line) for line in f if line.strip()]
+
+    pred_values = np.array([p.get("prediction") for p in preds])
+    truth_values = np.array([t.get("label") for t in truth])
+
+    result = evaluate_model(pred_values, truth_values)
+    print(format_metrics(result))
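
For reference, the delimited block that format_metrics() emits can be read back with a few lines of standard-library Python. The sketch below is illustrative only and is not part of the package (the package's own parser lives in templates/scripts/parse_metrics.py); the helper name and the run.log path are assumptions.

    # Illustrative only: parse the "---" delimited metrics block emitted by format_metrics().
    # Helper name and log path are hypothetical, not shipped with claude-turing.
    import re

    def parse_metrics_block(text: str) -> dict[str, float]:
        """Return {metric_name: value} from the last ----delimited block in text."""
        blocks = re.findall(r"^---$(.*?)^---$", text, flags=re.MULTILINE | re.DOTALL)
        metrics: dict[str, float] = {}
        if not blocks:
            return metrics
        for line in blocks[-1].strip().splitlines():
            key, _, value = line.partition(":")
            try:
                metrics[key.strip()] = float(value.strip())
            except ValueError:
                pass  # skip non-numeric metadata entries such as model_type
        return metrics

    # Example: parse_metrics_block(open("run.log").read()) -> {"accuracy": 0.8123, ...}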
package/templates/features/featurizers.py
@@ -0,0 +1,138 @@
+"""Feature engineering strategies for the {{PROJECT_NAME}} ML pipeline.
+
+READ-ONLY — INFRASTRUCTURE.
+
+The autoresearch agent does not modify this file directly. Instead, it
+modifies how train.py *uses* the featurizers — composing them differently,
+selecting different column subsets, or adding preprocessing in train.py.
+
+Provides pluggable featurizers following a scikit-learn-like fit/transform
+interface. The CompositeFeaturizer chains multiple featurizers, concatenating
+their output columns.
+
+Exports:
+- BaseFeaturizer: Abstract base class.
+- NumericFeaturizer: Passes through numeric columns.
+- CategoricalFeaturizer: One-hot encodes categorical columns.
+- CompositeFeaturizer: Chains multiple featurizers.
+- get_default_featurizer: Returns the standard composite featurizer.
+"""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+
+import pandas as pd
+
+
+class BaseFeaturizer(ABC):
+    """Abstract base class for feature extraction."""
+
+    @abstractmethod
+    def fit(self, df: pd.DataFrame) -> "BaseFeaturizer":
+        """Fit the featurizer to training data.
+
+        Args:
+            df: Training DataFrame.
+
+        Returns:
+            self
+        """
+
+    @abstractmethod
+    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Transform data into feature DataFrame.
+
+        Args:
+            df: Input DataFrame.
+
+        Returns:
+            DataFrame with extracted features (numeric columns only).
+        """
+
+    def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Fit and transform in one step."""
+        return self.fit(df).transform(df)
+
+
+class NumericFeaturizer(BaseFeaturizer):
+    """Passes through numeric columns as features.
+
+    Customize the column list for your dataset.
+    """
+
+    def __init__(self, columns: list[str] | None = None) -> None:
+        self.columns = columns
+        self._fitted_columns: list[str] = []
+
+    def fit(self, df: pd.DataFrame) -> "NumericFeaturizer":
+        if self.columns:
+            self._fitted_columns = [c for c in self.columns if c in df.columns]
+        else:
+            self._fitted_columns = df.select_dtypes(include=["number"]).columns.tolist()
+        return self
+
+    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+        return df[self._fitted_columns].copy()
+
+
+class CategoricalFeaturizer(BaseFeaturizer):
+    """One-hot encodes categorical columns.
+
+    Customize the column list for your dataset.
+    """
+
+    def __init__(self, columns: list[str] | None = None) -> None:
+        self.columns = columns
+        self._categories: dict[str, list[str]] = {}
+
+    def fit(self, df: pd.DataFrame) -> "CategoricalFeaturizer":
+        cols = self.columns or df.select_dtypes(include=["object", "category"]).columns.tolist()
+        for col in cols:
+            if col in df.columns:
+                self._categories[col] = sorted(df[col].unique().tolist())
+        return self
+
+    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+        features = pd.DataFrame(index=df.index)
+        for col, categories in self._categories.items():
+            if col in df.columns:
+                for cat in categories:
+                    features[f"{col}_{cat}"] = (df[col] == cat).astype(int)
+        return features
+
+
+class CompositeFeaturizer(BaseFeaturizer):
+    """Chains multiple featurizers, concatenating their output columns."""
+
+    def __init__(self, featurizers: list[BaseFeaturizer]) -> None:
+        self.featurizers = featurizers
+
+    def fit(self, df: pd.DataFrame) -> "CompositeFeaturizer":
+        for f in self.featurizers:
+            f.fit(df)
+        return self
+
+    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+        parts = [f.transform(df) for f in self.featurizers]
+        return pd.concat(parts, axis=1)
+
+    def __repr__(self) -> str:
+        names = [type(f).__name__ for f in self.featurizers]
+        return f"CompositeFeaturizer({names})"
+
+
+def get_default_featurizer() -> CompositeFeaturizer:
+    """Return the standard composite featurizer.
+
+    Customize this function for your dataset. Add or remove featurizers
+    to match your feature engineering needs.
+
+    The autoresearch agent calls this from train.py. To experiment with
+    different feature sets, the agent modifies how train.py calls this
+    function or composes featurizers differently.
+    """
+    return CompositeFeaturizer([
+        NumericFeaturizer(),
+        CategoricalFeaturizer(),
+    ])
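
To make the fit/transform contract concrete, here is a minimal usage sketch of train.py-style code restricting the featurizers to an explicit column subset. It is not part of the package: the toy DataFrame and column names are invented, and the import path assumes the scaffolded project exposes the module as features.featurizers.

    # Illustrative usage of the featurizers; DataFrame contents and column names are invented.
    import pandas as pd
    from features.featurizers import CategoricalFeaturizer, CompositeFeaturizer, NumericFeaturizer

    train_df = pd.DataFrame({
        "age": [34, 51, 27],            # numeric column, passed through as-is
        "country": ["de", "us", "de"],  # categorical column, one-hot encoded
        "label": [0, 1, 0],
    })

    # Compose explicitly instead of calling get_default_featurizer(), choosing columns by hand.
    featurizer = CompositeFeaturizer([
        NumericFeaturizer(columns=["age"]),
        CategoricalFeaturizer(columns=["country"]),
    ])
    X_train = featurizer.fit_transform(train_df)
    # X_train columns: age, country_de, country_us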
package/templates/prepare.py
@@ -0,0 +1,171 @@
+"""Data preparation module for the {{PROJECT_NAME}} ML pipeline.
+
+READ-ONLY — MEASUREMENT APPARATUS.
+
+This file is part of the immutable evaluation infrastructure. The autoresearch
+agent MUST NOT modify this file under any circumstances. Consistent data
+preparation across experiments ensures that observed metric changes reflect
+genuine model improvements, not data handling artifacts.
+
+Provides:
+- load_config: Load YAML experiment configuration.
+- load_data: Load training data into a DataFrame.
+- create_splits: Stratified train/val/test split.
+- load_splits: Load pre-created split files.
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+import pandas as pd
+import yaml
+
+
+def load_config(path: str = "config.yaml") -> dict:
+    """Load YAML experiment configuration.
+
+    Args:
+        path: Path to the YAML config file.
+
+    Returns:
+        Configuration dictionary.
+    """
+    with open(path) as f:
+        return yaml.safe_load(f)
+
+
+def load_data(path: str) -> pd.DataFrame:
+    """Load training data into a DataFrame.
+
+    Supports JSONL (.jsonl) and CSV (.csv) formats.
+
+    Args:
+        path: Path to the data file.
+
+    Returns:
+        DataFrame with training data.
+
+    Raises:
+        FileNotFoundError: If path does not exist.
+        ValueError: If file format is unsupported.
+    """
+    p = Path(path)
+    if not p.exists():
+        raise FileNotFoundError(f"Data file not found: {path}")
+
+    if p.suffix == ".jsonl":
+        records = []
+        with open(path) as f:
+            for line in f:
+                line = line.strip()
+                if line:
+                    records.append(json.loads(line))
+        if not records:
+            return pd.DataFrame()
+        return pd.DataFrame(records)
+    elif p.suffix == ".csv":
+        return pd.read_csv(path)
+    else:
+        raise ValueError(
+            f"Unsupported file format: {p.suffix}. Use .jsonl or .csv"
+        )
+
+
+def create_splits(
+    data_path: str,
+    output_dir: str,
+    target_column: str = "label",
+    test_size: float = 0.15,
+    val_size: float = 0.15,
+    random_state: int = 42,
+) -> dict[str, Path]:
+    """Create stratified train/val/test splits from training data.
+
+    Stratifies by target_column to preserve label distribution.
+
+    Args:
+        data_path: Path to the source data file.
+        output_dir: Directory to write train.jsonl, val.jsonl, test.jsonl.
+        target_column: Column to stratify on.
+        test_size: Fraction of data for test set.
+        val_size: Fraction of data for validation set.
+        random_state: Random seed for reproducibility.
+
+    Returns:
+        Dict mapping split name to output file path.
+    """
+    from sklearn.model_selection import train_test_split
+
+    df = load_data(data_path)
+    if df.empty:
+        raise ValueError(f"No data found in {data_path}")
+
+    out = Path(output_dir)
+    out.mkdir(parents=True, exist_ok=True)
+
+    # First split: separate test set
+    stratify_col = df[target_column] if target_column in df.columns else None
+    train_val, test = train_test_split(
+        df,
+        test_size=test_size,
+        random_state=random_state,
+        stratify=stratify_col,
+    )
+
+    # Second split: separate val from train
+    val_relative = val_size / (1.0 - test_size)
+    stratify_col_tv = train_val[target_column] if target_column in train_val.columns else None
+    train, val = train_test_split(
+        train_val,
+        test_size=val_relative,
+        random_state=random_state,
+        stratify=stratify_col_tv,
+    )
+
+    paths = {}
+    for name, split_df in [("train", train), ("val", val), ("test", test)]:
+        path = out / f"{name}.jsonl"
+        with open(path, "w") as f:
+            for _, row in split_df.iterrows():
+                f.write(json.dumps(row.to_dict()) + "\n")
+        paths[name] = path
+
+    return paths
+
+
+def load_splits(splits_dir: str) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+    """Load pre-created train/val/test splits.
+
+    Args:
+        splits_dir: Directory containing train.jsonl, val.jsonl, test.jsonl.
+
+    Returns:
+        Tuple of (train_df, val_df, test_df).
+
+    Raises:
+        FileNotFoundError: If any split file is missing.
+    """
+    splits_path = Path(splits_dir)
+    train = load_data(str(splits_path / "train.jsonl"))
+    val = load_data(str(splits_path / "val.jsonl"))
+    test = load_data(str(splits_path / "test.jsonl"))
+    return train, val, test
+
+
+if __name__ == "__main__":
+    config = load_config()
+    data_cfg = config["data"]
+    print(f"Creating splits from {data_cfg['source']}...")
+    paths = create_splits(
+        data_path=data_cfg["source"],
+        output_dir=data_cfg["splits_dir"],
+        target_column=data_cfg.get("target_column", "label"),
+        test_size=data_cfg["split_ratios"]["test"],
+        val_size=data_cfg["split_ratios"]["val"],
+        random_state=data_cfg["random_state"],
+    )
+    for name, path in paths.items():
+        print(f" {name}: {path}")
+    print("Done.")