graflag-evaluator 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,14 @@
+ Metadata-Version: 2.4
+ Name: graflag_evaluator
+ Version: 1.0.0
+ Summary: Evaluation framework for graph anomaly detection methods in GraFlag
+ Author: GraFlag Team
+ Requires-Python: >=3.7
+ Requires-Dist: numpy>=1.21.0
+ Requires-Dist: scikit-learn>=1.0.0
+ Requires-Dist: matplotlib>=3.5.0
+ Requires-Dist: pandas>=1.3.0
+ Dynamic: author
+ Dynamic: requires-dist
+ Dynamic: requires-python
+ Dynamic: summary
@@ -0,0 +1,132 @@
+ # GraFlag Evaluator
+
+ Docker-based evaluation system for graph anomaly detection experiments.
+
+ ## Features
+
+ - Automatic metric computation based on result type
+ - Plot generation: ROC curves, PR curves, score distributions, spot curves
+ - Spot file integration: detects and plots training/validation metrics
+ - Standardized output: evaluation.json with all metrics and metadata
+ - Docker-based: isolated environment with all dependencies
+
+ ## Usage
+
+ ### From CLI
+
+ ```bash
+ # Evaluate an experiment (builds the Docker image on first run)
+ graflag evaluate -e exp__generaldyg__btc_alpha__20251211_120000
+
+ # Copy results locally
+ graflag copy --from-remote -s experiments/<exp_name>/eval -d ./eval_results
+ ```
+
+ ### Manual Docker Usage
+
+ ```bash
+ # Build image (done automatically by the CLI)
+ cd graflag-shared/libs/graflag_evaluator
+ docker build -t graflag-evaluator:latest .
+
+ # Run evaluation
+ docker run --rm -v /shared:/shared graflag-evaluator:latest /shared/experiments/<exp_name>
+ ```
+
+ ### From Python
+
+ ```python
+ from pathlib import Path
+
+ from graflag_evaluator import Evaluator
+
+ evaluator = Evaluator(Path("experiments/exp__generaldyg__btc_alpha__20251211_120000"))
+ eval_path = evaluator.evaluate()
+ ```
+
+ ## Supported Metrics
+
+ All result types receive:
+
+ - **AUC-ROC**: Area under the ROC curve
+ - **AUC-PR**: Area under the Precision-Recall curve
+ - **Precision@K**: Precision over the top K predictions
+ - **Recall@K**: Recall over the top K predictions
+ - **F1@K**: F1 score over the top K predictions
+ - **Best F1**: Best F1 score across all thresholds
+
+ Additional metrics are computed based on result type (edge counts, temporal span, etc.).
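+
+ As a hedged illustration of how the shared metrics are derived (mirroring `compute_classification_metrics` in `metrics.py`; the data here is made up):
+
+ ```python
+ import numpy as np
+ from sklearn import metrics
+
+ scores = np.array([0.9, 0.2, 0.7, 0.1, 0.8])  # illustrative anomaly scores
+ ground_truth = np.array([1, 0, 1, 0, 0])      # illustrative labels
+
+ # Precision@K: K = number of true anomalies; flag the K highest-scoring samples
+ k = int(ground_truth.sum())
+ predictions = np.zeros_like(ground_truth)
+ predictions[np.argsort(scores)[-k:]] = 1
+ print(metrics.precision_score(ground_truth, predictions, zero_division=0))
+
+ # Best F1: sweep every threshold on the precision-recall curve
+ precision, recall, _ = metrics.precision_recall_curve(ground_truth, scores)
+ print(np.max(2 * precision * recall / (precision + recall + 1e-10)))
+ ```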
+
+ ## Output Structure
+
+ ```
+ experiments/exp_name/
+ +-- results.json    (input)
+ +-- training.csv    (optional spot file)
+ +-- validation.csv  (optional spot file)
+ +-- eval/
+     +-- evaluation.json         (computed metrics)
+     +-- roc_curve.png
+     +-- pr_curve.png
+     +-- score_distribution.png
+     +-- training_curves.png     (one <name>_curves.png per spot file)
+     +-- validation_curves.png
+ ```
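+
+ Spot files are plain CSVs; every column except `timestamp` and `epoch` is drawn as a curve (see `plot_spot_curves` in `plots.py`). A minimal hypothetical `training.csv`:
+
+ ```
+ epoch,loss,auc
+ 1,0.693,0.51
+ 2,0.541,0.68
+ 3,0.472,0.74
+ ```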
+
+ ### evaluation.json Format
+
+ ```json
+ {
+   "experiment_name": "exp__generaldyg__btc_alpha__20251211_120000",
+   "result_type": "EDGE_STREAM_ANOMALY_SCORES",
+   "metrics": {
+     "auc_roc": 0.9234,
+     "auc_pr": 0.8765,
+     "precision_at_k": 0.8500,
+     "recall_at_k": 0.8500,
+     "f1_at_k": 0.8500,
+     "best_f1": 0.8723,
+     "best_f1_threshold": 0.5432,
+     "num_anomalies": 345,
+     "num_samples": 3783,
+     "anomaly_ratio": 0.0912
+   },
+   "metadata": {},
+   "plots": {
+     "roc_curve": "roc_curve.png",
+     "pr_curve": "pr_curve.png",
+     "score_distribution": "score_distribution.png",
+     "training_curves": "training_curves.png",
+     "validation_curves": "validation_curves.png"
+   },
+   "spot_files": ["training", "validation"]
+ }
+ ```
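+
+ A minimal sketch for reading the metrics back out (standard library only; the path assumes the layout above, and `evaluate()` also returns it directly):
+
+ ```python
+ import json
+ from pathlib import Path
+
+ evaluation = json.loads(Path("experiments/exp_name/eval/evaluation.json").read_text())
+ print(evaluation["result_type"], evaluation["metrics"]["auc_roc"])
+ ```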
+
+ ## Adding Custom Metrics
+
+ ```python
+ from graflag_evaluator.metrics import MetricCalculator
+
+ def compute_custom_metric(scores, ground_truth, **kwargs):
+     return {"custom_metric": 0.123}
+
+ MetricCalculator.register_metric(
+     "EDGE_STREAM_ANOMALY_SCORES",
+     compute_custom_metric
+ )
+ ```
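+
+ Registered functions run automatically on the next `evaluate()` call for that result type; an exception inside one metric function is logged and skipped rather than aborting the evaluation. To check what is registered for a type, the package's `get_metrics_for_type` helper can be used:
+
+ ```python
+ from graflag_evaluator import get_metrics_for_type
+
+ # Returns the names of the metric functions registered for this result type
+ print(get_metrics_for_type("EDGE_STREAM_ANOMALY_SCORES"))
+ ```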
+
+ ## Architecture
+
+ ```
+ graflag_evaluator/
+ +-- __init__.py          Package exports
+ +-- evaluator.py         Main orchestrator
+ +-- metrics.py           Metric calculators with registry
+ +-- plots.py             Plot generation utilities
+ +-- run_evaluation.py    Docker container entry point
+ ```
+
+ ## Troubleshooting
+
+ **"results.json not found"** -- The experiment has not completed yet, or it failed before writing results.
+
+ **"No ground_truth found"** -- results.json must include a ground_truth array for evaluation.
+
+ **"Only one class present"** -- The dataset contains no anomalies, or only anomalies. Check data preparation.
@@ -0,0 +1,8 @@
+ """GraFlag Evaluator - Modular evaluation system for graph anomaly detection."""
+
+ from .metrics import MetricCalculator, get_metrics_for_type
+ from .evaluator import Evaluator
+
+ __all__ = ["MetricCalculator", "get_metrics_for_type", "Evaluator"]
+
+ __version__ = "1.0.0"
@@ -0,0 +1,215 @@
+ """Main evaluator orchestrator."""
+
+ import json
+ import numpy as np
+ from pathlib import Path
+ from typing import Dict, Any, Optional
+ import logging
+
+ from .metrics import MetricCalculator
+ from .plots import PlotGenerator
+
+ logger = logging.getLogger(__name__)
+
+
+ class Evaluator:
+     """
+     Main evaluation orchestrator for GraFlag experiments.
+
+     Automatically:
+     1. Loads results.json from experiment directory
+     2. Detects result type and loads appropriate data
+     3. Computes all relevant metrics
+     4. Generates evaluation plots (ROC, PR, spot curves)
+     5. Saves evaluation.json with all metrics and metadata
+     """
+
+     def __init__(self, experiment_path: Path):
+         """
+         Initialize evaluator for an experiment.
+
+         Args:
+             experiment_path: Path to experiment directory
+         """
+         self.experiment_path = Path(experiment_path)
+         self.results_path = self.experiment_path / "results.json"
+         self.eval_dir = self.experiment_path / "eval"
+
+         if not self.results_path.exists():
+             raise FileNotFoundError(f"results.json not found in {self.experiment_path}")
+
+         # Create eval directory
+         self.eval_dir.mkdir(exist_ok=True)
+
+         # Load results
+         with open(self.results_path, 'r') as f:
+             self.results = json.load(f)
+
+         self.result_type = self.results.get("result_type")
+         if not self.result_type:
+             raise ValueError("result_type not found in results.json")
+
+         logger.info(f"[INFO] Evaluating experiment: {self.experiment_path.name}")
+         logger.info(f"   Result type: {self.result_type}")
+
+     def _load_scores_and_ground_truth(self) -> tuple:
+         """Load scores and ground truth from results."""
+         scores_raw = self.results.get("scores", [])
+         ground_truth_raw = self.results.get("ground_truth", [])
+
+         if len(scores_raw) == 0:
+             raise ValueError("No scores found in results.json")
+         if len(ground_truth_raw) == 0:
+             raise ValueError("No ground_truth found in results.json")
+
+         # Handle ragged arrays (e.g., TEMPORAL_EDGE_ANOMALY_SCORES where each
+         # snapshot has a different number of edges). Use dtype=object for ragged.
+         try:
+             scores = np.array(scores_raw)
+         except ValueError:
+             # Ragged array - use object dtype
+             scores = np.array(scores_raw, dtype=object)
+
+         try:
+             ground_truth = np.array(ground_truth_raw)
+         except ValueError:
+             # Ragged array - use object dtype
+             ground_truth = np.array(ground_truth_raw, dtype=object)
+
+         logger.info(f"   Scores shape: {scores.shape}, dtype: {scores.dtype}")
+         logger.info(f"   Ground truth shape: {ground_truth.shape}, dtype: {ground_truth.dtype}")
+
+         return scores, ground_truth
+
+     def _find_spot_files(self) -> Dict[str, Path]:
+         """Find all spot CSV files in experiment directory."""
+         spot_files = {}
+         for csv_file in self.experiment_path.glob("*.csv"):
+             metric_key = csv_file.stem  # filename without extension
+             spot_files[metric_key] = csv_file
+
+         if spot_files:
+             logger.info(f"   Found {len(spot_files)} spot files: {list(spot_files.keys())}")
+
+         return spot_files
+
+     def compute_metrics(self) -> Dict[str, Any]:
+         """
+         Compute all metrics for the experiment.
+
+         Returns:
+             Dictionary of computed metrics
+         """
+         scores, ground_truth = self._load_scores_and_ground_truth()
+
+         # Get additional data (timestamps, edges, etc.)
+         kwargs = {
+             "timestamps": self.results.get("timestamps"),
+             "edges": self.results.get("edges"),
+             "node_ids": self.results.get("node_ids"),
+             "graph_ids": self.results.get("graph_ids"),
+         }
+
+         # Compute metrics
+         logger.info("[INFO] Computing metrics...")
+         metrics = MetricCalculator.calculate_metrics(
+             self.result_type, scores, ground_truth, **kwargs
+         )
+
+         logger.info(f"[OK] Computed {len(metrics)} metrics")
+         return metrics
+
+     def generate_plots(self) -> list:
+         """Generate all evaluation plots.
+
+         Returns:
+             List of generated spot curve plot filenames
+         """
+         logger.info("[INFO] Generating plots...")
+
+         scores, ground_truth = self._load_scores_and_ground_truth()
+
+         # ROC curve
+         roc_path = self.eval_dir / "roc_curve.png"
+         PlotGenerator.plot_roc_curve(scores, ground_truth, roc_path,
+                                      title=f"ROC Curve - {self.experiment_path.name}")
+
+         # PR curve
+         pr_path = self.eval_dir / "pr_curve.png"
+         PlotGenerator.plot_pr_curve(scores, ground_truth, pr_path,
+                                     title=f"PR Curve - {self.experiment_path.name}")
+
+         # Score distribution
+         dist_path = self.eval_dir / "score_distribution.png"
+         PlotGenerator.plot_score_distribution(scores, ground_truth, dist_path,
+                                               title=f"Score Distribution - {self.experiment_path.name}")
+
+         # Spot curves from spot files (generates separate files)
+         spot_files = self._find_spot_files()
+         spot_plot_files = []
+         if spot_files:
+             spot_plot_files = PlotGenerator.plot_spot_curves(
+                 spot_files, self.eval_dir,
+                 title=f"Spot Curves - {self.experiment_path.name}"
+             )
+
+         logger.info(f"[OK] Plots saved to {self.eval_dir}")
+         return spot_plot_files
+
+     def evaluate(self) -> Path:
+         """
+         Run full evaluation: compute metrics and generate plots.
+
+         Returns:
+             Path to evaluation.json
+         """
+         # Compute metrics
+         computed_metrics = self.compute_metrics()
+
+         # Generate plots (returns list of spot curve plot filenames)
+         spot_plot_files = self.generate_plots()
+
+         # Build evaluation results
+         evaluation = {
+             "experiment_name": self.experiment_path.name,
+             "result_type": self.result_type,
+             "metrics": computed_metrics,
+             "metadata": self.results.get("metadata", {}),
+             "plots": {
+                 "roc_curve": "roc_curve.png",
+                 "pr_curve": "pr_curve.png",
+                 "score_distribution": "score_distribution.png",
+             },
+         }
+
+         # Add spot curve plots if available
+         spot_files = self._find_spot_files()
+         if spot_files:
+             evaluation["spot_files"] = list(spot_files.keys())
+             # Add each spot curve plot
+             for plot_file in spot_plot_files:
+                 plot_key = plot_file.replace('.png', '')
+                 evaluation["plots"][plot_key] = plot_file
+
+         # Save evaluation.json
+         eval_json_path = self.eval_dir / "evaluation.json"
+         with open(eval_json_path, 'w') as f:
+             json.dump(evaluation, f, indent=2)
+
+         logger.info("[OK] Evaluation complete!")
+         logger.info(f"   Results: {eval_json_path}")
+         logger.info(f"   Plots: {self.eval_dir}")
+
+         # Print summary
+         print("\n" + "=" * 60)
+         print(f"[INFO] Evaluation Summary: {self.experiment_path.name}")
+         print("=" * 60)
+         print(f"Result Type: {self.result_type}")
+         print("\nKey Metrics:")
+         for key, value in computed_metrics.items():
+             # None never passes the isinstance check, so no separate None test is needed
+             if isinstance(value, (int, float)):
+                 print(f"  {key}: {value}")
+         print(f"\nPlots saved to: {self.eval_dir}")
+         print("=" * 60 + "\n")
+
+         return eval_json_path
@@ -0,0 +1,14 @@
+ Metadata-Version: 2.4
+ Name: graflag_evaluator
+ Version: 1.0.0
+ Summary: Evaluation framework for graph anomaly detection methods in GraFlag
+ Author: GraFlag Team
+ Requires-Python: >=3.7
+ Requires-Dist: numpy>=1.21.0
+ Requires-Dist: scikit-learn>=1.0.0
+ Requires-Dist: matplotlib>=3.5.0
+ Requires-Dist: pandas>=1.3.0
+ Dynamic: author
+ Dynamic: requires-dist
+ Dynamic: requires-python
+ Dynamic: summary
@@ -0,0 +1,17 @@
+ README.md
+ __init__.py
+ evaluator.py
+ metrics.py
+ plots.py
+ run_evaluation.py
+ setup.py
+ ./__init__.py
+ ./evaluator.py
+ ./metrics.py
+ ./plots.py
+ ./run_evaluation.py
+ graflag_evaluator.egg-info/PKG-INFO
+ graflag_evaluator.egg-info/SOURCES.txt
+ graflag_evaluator.egg-info/dependency_links.txt
+ graflag_evaluator.egg-info/requires.txt
+ graflag_evaluator.egg-info/top_level.txt
@@ -0,0 +1,4 @@
+ numpy>=1.21.0
+ scikit-learn>=1.0.0
+ matplotlib>=3.5.0
+ pandas>=1.3.0
@@ -0,0 +1 @@
+ graflag_evaluator
@@ -0,0 +1,241 @@
+ """Metric calculators for different result types."""
+
+ import numpy as np
+ from sklearn import metrics
+ from typing import Dict, List, Any, Callable
+ import logging
+
+ logger = logging.getLogger(__name__)
+
+
+ class MetricCalculator:
+     """
+     Base class for metric calculation.
+
+     Supports plugin-based architecture for adding new metrics.
+     """
+
+     # Registry of metric functions by result type
+     _METRIC_REGISTRY: Dict[str, List[Callable]] = {}
+
+     @classmethod
+     def register_metric(cls, result_type: str, metric_func: Callable):
+         """
+         Register a new metric function for a result type.
+
+         Args:
+             result_type: Result type (e.g., "EDGE_STREAM_ANOMALY_SCORES")
+             metric_func: Function that takes (scores, ground_truth, **kwargs)
+                 and returns Dict[str, float]
+         """
+         if result_type not in cls._METRIC_REGISTRY:
+             cls._METRIC_REGISTRY[result_type] = []
+         cls._METRIC_REGISTRY[result_type].append(metric_func)
+         logger.debug(f"Registered metric {metric_func.__name__} for {result_type}")
+
+     @classmethod
+     def calculate_metrics(cls, result_type: str, scores: np.ndarray,
+                           ground_truth: np.ndarray, **kwargs) -> Dict[str, Any]:
+         """
+         Calculate all registered metrics for a result type.
+
+         Args:
+             result_type: Type of anomaly detection result
+             scores: Anomaly scores
+             ground_truth: Ground truth labels
+             **kwargs: Additional parameters (timestamps, edges, etc.)
+
+         Returns:
+             Dictionary of computed metrics
+         """
+         if result_type not in cls._METRIC_REGISTRY:
+             logger.warning(f"No metrics registered for {result_type}")
+             return {}
+
+         all_metrics = {}
+         for metric_func in cls._METRIC_REGISTRY[result_type]:
+             try:
+                 result = metric_func(scores, ground_truth, **kwargs)
+                 all_metrics.update(result)
+             except Exception as e:
+                 logger.error(f"Error in {metric_func.__name__}: {e}")
+
+         return all_metrics
+
+
+ # ============================================================================
+ # Standard Metrics for Binary Anomaly Detection
+ # ============================================================================
+
+ def compute_classification_metrics(scores: np.ndarray, ground_truth: np.ndarray,
+                                    **kwargs) -> Dict[str, float]:
+     """
+     Compute standard classification metrics (works for all types).
+
+     Metrics:
+     - AUC-ROC: Area under ROC curve
+     - AUC-PR: Area under Precision-Recall curve
+     - Precision@K: Precision in top K predictions
+     - Recall@K: Recall in top K predictions
+     - F1@K: F1 score in top K predictions
+     - Best F1: Best F1 score across all thresholds
+     """
+     # Handle nested lists (e.g., TEMPORAL_EDGE_ANOMALY_SCORES where each snapshot
+     # has a different number of edges). np.array() creates an object array for ragged lists.
+     if scores.dtype == object or (scores.ndim == 1 and isinstance(scores[0], (list, np.ndarray))):
+         # Flatten nested structure
+         scores_flat = np.concatenate([np.asarray(s).flatten() for s in scores])
+         gt_flat = np.concatenate([np.asarray(g).flatten() for g in ground_truth])
+     else:
+         scores_flat = scores.flatten()
+         gt_flat = ground_truth.flatten()
+
+     # Remove sentinel values marking invalid scores (-2, -1) if present
+     if np.max(scores_flat) <= 1:
+         valid_mask = (scores_flat >= 0) & (scores_flat <= 1)
+     else:
+         valid_mask = scores_flat > -2
+     scores_valid = scores_flat[valid_mask]
+     gt_valid = gt_flat[valid_mask]
+
+     if len(np.unique(gt_valid)) < 2:
+         logger.warning("Ground truth has only one class, skipping some metrics")
+         return {"auc_roc": None, "auc_pr": None}
+
+     # AUC-ROC
+     auc_roc = metrics.roc_auc_score(gt_valid, scores_valid)
+
+     # AUC-PR
+     precision, recall, thresholds = metrics.precision_recall_curve(gt_valid, scores_valid)
+     auc_pr = metrics.auc(recall, precision)
+
+     # Precision/Recall/F1 at K (K = number of anomalies)
+     k = int(np.sum(gt_valid))
+     top_k_indices = np.argsort(scores_valid)[-k:]
+     predictions_at_k = np.zeros_like(gt_valid)
+     predictions_at_k[top_k_indices] = 1
+
+     precision_at_k = metrics.precision_score(gt_valid, predictions_at_k, zero_division=0)
+     recall_at_k = metrics.recall_score(gt_valid, predictions_at_k, zero_division=0)
+     f1_at_k = metrics.f1_score(gt_valid, predictions_at_k, zero_division=0)
+
+     # Best F1 across all thresholds. precision/recall have one more entry than
+     # thresholds, so guard against the final (recall=0) point having no threshold.
+     f1_scores = 2 * (precision * recall) / (precision + recall + 1e-10)
+     best_idx = int(np.argmax(f1_scores))
+     best_f1 = f1_scores[best_idx]
+     best_f1_threshold = float(thresholds[best_idx]) if best_idx < len(thresholds) else None
+
+     return {
+         "auc_roc": round(float(auc_roc), 4),
+         "auc_pr": round(float(auc_pr), 4),
+         "precision_at_k": round(float(precision_at_k), 4),
+         "recall_at_k": round(float(recall_at_k), 4),
+         "f1_at_k": round(float(f1_at_k), 4),
+         "best_f1": round(float(best_f1), 4),
+         "best_f1_threshold": round(best_f1_threshold, 4) if best_f1_threshold is not None else None,
+         "num_anomalies": int(k),
+         "num_samples": int(len(gt_valid)),
+         "anomaly_ratio": round(float(k / len(gt_valid)), 4),
+     }
+
+
+ def compute_temporal_metrics(scores: np.ndarray, ground_truth: np.ndarray,
+                              timestamps: List[int] = None, **kwargs) -> Dict[str, float]:
+     """
+     Compute temporal-specific metrics.
+
+     Metrics:
+     - Early detection rate: How early anomalies are detected
+     - Temporal consistency: How consistent scores are over time
+     """
+     if timestamps is None:
+         return {}
+
+     # Early detection: average time between first high score and actual anomaly
+     # (This is a placeholder - implement based on your specific needs)
+
+     return {
+         "temporal_span": int(max(timestamps) - min(timestamps)) if timestamps else 0,
+         "num_timestamps": len(set(timestamps)) if timestamps else 0,
+     }
+
+
+ def compute_edge_metrics(scores: np.ndarray, ground_truth: np.ndarray,
+                          edges: List[List[int]] = None, **kwargs) -> Dict[str, float]:
+     """
+     Compute edge-specific metrics.
+
+     Metrics:
+     - Number of unique edges
+     - Edge degree distribution stats
+     """
+     if edges is None:
+         return {}
+
+     # Count unique edges
+     unique_edges = len(set(tuple(e) for e in edges))
+
+     # Node degree stats (how many times each node appears)
+     nodes = [n for edge in edges for n in edge]
+     unique_nodes = len(set(nodes))
+
+     return {
+         "num_unique_edges": int(unique_edges),
+         "num_unique_nodes": int(unique_nodes),
+         "total_edge_occurrences": int(len(edges)),
+     }
+
+
+ # ============================================================================
+ # Register Default Metrics
+ # ============================================================================
+
+ # Register classification metrics for all result types
+ for result_type in [
+     "NODE_ANOMALY_SCORES",
+     "EDGE_ANOMALY_SCORES",
+     "GRAPH_ANOMALY_SCORES",
+     "TEMPORAL_NODE_ANOMALY_SCORES",
+     "TEMPORAL_EDGE_ANOMALY_SCORES",
+     "TEMPORAL_GRAPH_ANOMALY_SCORES",
+     "NODE_STREAM_ANOMALY_SCORES",
+     "EDGE_STREAM_ANOMALY_SCORES",
+     "GRAPH_STREAM_ANOMALY_SCORES",
+ ]:
+     MetricCalculator.register_metric(result_type, compute_classification_metrics)
+
+ # Register temporal metrics for temporal and stream types
+ for result_type in [
+     "TEMPORAL_NODE_ANOMALY_SCORES",
+     "TEMPORAL_EDGE_ANOMALY_SCORES",
+     "TEMPORAL_GRAPH_ANOMALY_SCORES",
+     "NODE_STREAM_ANOMALY_SCORES",
+     "EDGE_STREAM_ANOMALY_SCORES",
+     "GRAPH_STREAM_ANOMALY_SCORES",
+ ]:
+     MetricCalculator.register_metric(result_type, compute_temporal_metrics)
+
+ # Register edge metrics for edge types
+ for result_type in [
+     "EDGE_ANOMALY_SCORES",
+     "TEMPORAL_EDGE_ANOMALY_SCORES",
+     "EDGE_STREAM_ANOMALY_SCORES",
+ ]:
+     MetricCalculator.register_metric(result_type, compute_edge_metrics)
+
+
+ def get_metrics_for_type(result_type: str) -> List[str]:
+     """
+     Get the names of the metric functions registered for a result type.
+
+     Args:
+         result_type: Result type string
+
+     Returns:
+         List of registered metric function names
+     """
+     if result_type not in MetricCalculator._METRIC_REGISTRY:
+         return []
+
+     # Extract function names from the registered metric functions
+     metric_names = []
+     for func in MetricCalculator._METRIC_REGISTRY[result_type]:
+         metric_names.append(func.__name__)
+
+     return metric_names
@@ -0,0 +1,198 @@
+ """Plot generation utilities for evaluation."""
+
+ import numpy as np
+ import matplotlib
+ matplotlib.use('Agg')  # Non-interactive backend
+ import matplotlib.pyplot as plt
+ from sklearn import metrics
+ from pathlib import Path
+ from typing import Dict, List, Optional
+ import pandas as pd
+ import logging
+
+ logger = logging.getLogger(__name__)
+
+
+ def _flatten_ragged(arr: np.ndarray) -> np.ndarray:
+     """Flatten array, handling ragged/object arrays properly."""
+     if arr.dtype == object or (arr.ndim == 1 and len(arr) > 0 and isinstance(arr[0], (list, np.ndarray))):
+         # Ragged array - concatenate all elements
+         return np.concatenate([np.asarray(x).flatten() for x in arr])
+     return arr.flatten()
+
+
+ class PlotGenerator:
+     """Generate evaluation plots."""
+
+     @staticmethod
+     def plot_roc_curve(scores: np.ndarray, ground_truth: np.ndarray,
+                        output_path: Path, title: str = "ROC Curve"):
+         """
+         Generate ROC curve plot.
+
+         Args:
+             scores: Anomaly scores
+             ground_truth: Ground truth labels
+             output_path: Path to save plot
+             title: Plot title
+         """
+         scores_flat = _flatten_ragged(scores)
+         gt_flat = _flatten_ragged(ground_truth)
+
+         # Remove invalid scores
+         valid_mask = scores_flat > -2
+         scores_valid = scores_flat[valid_mask]
+         gt_valid = gt_flat[valid_mask]
+
+         if len(np.unique(gt_valid)) < 2:
+             logger.warning("Cannot plot ROC: only one class present")
+             return
+
+         fpr, tpr, thresholds = metrics.roc_curve(gt_valid, scores_valid)
+         auc_score = metrics.auc(fpr, tpr)
+
+         plt.figure(figsize=(8, 6))
+         plt.plot(fpr, tpr, label=f'AUC = {auc_score:.4f}', linewidth=2)
+         plt.plot([0, 1], [0, 1], 'k--', label='Random', linewidth=1)
+         plt.xlabel('False Positive Rate', fontsize=12)
+         plt.ylabel('True Positive Rate', fontsize=12)
+         plt.title(title, fontsize=14)
+         plt.legend(fontsize=10)
+         plt.grid(alpha=0.3)
+         plt.tight_layout()
+         plt.savefig(output_path, dpi=150)
+         plt.close()
+
+         logger.info(f"[OK] ROC curve saved to {output_path}")
+
+     @staticmethod
+     def plot_pr_curve(scores: np.ndarray, ground_truth: np.ndarray,
+                       output_path: Path, title: str = "Precision-Recall Curve"):
+         """
+         Generate Precision-Recall curve plot.
+
+         Args:
+             scores: Anomaly scores
+             ground_truth: Ground truth labels
+             output_path: Path to save plot
+             title: Plot title
+         """
+         scores_flat = _flatten_ragged(scores)
+         gt_flat = _flatten_ragged(ground_truth)
+
+         valid_mask = scores_flat > -2
+         scores_valid = scores_flat[valid_mask]
+         gt_valid = gt_flat[valid_mask]
+
+         if len(np.unique(gt_valid)) < 2:
+             logger.warning("Cannot plot PR: only one class present")
+             return
+
+         precision, recall, thresholds = metrics.precision_recall_curve(gt_valid, scores_valid)
+         auc_score = metrics.auc(recall, precision)
+
+         plt.figure(figsize=(8, 6))
+         plt.plot(recall, precision, label=f'AUC-PR = {auc_score:.4f}', linewidth=2)
+         plt.xlabel('Recall', fontsize=12)
+         plt.ylabel('Precision', fontsize=12)
+         plt.title(title, fontsize=14)
+         plt.legend(fontsize=10)
+         plt.grid(alpha=0.3)
+         plt.tight_layout()
+         plt.savefig(output_path, dpi=150)
+         plt.close()
+
+         logger.info(f"[OK] PR curve saved to {output_path}")
+
+     @staticmethod
+     def plot_score_distribution(scores: np.ndarray, ground_truth: np.ndarray,
+                                 output_path: Path, title: str = "Score Distribution"):
+         """
+         Generate score distribution plot (histogram for anomalies vs normal).
+
+         Args:
+             scores: Anomaly scores
+             ground_truth: Ground truth labels
+             output_path: Path to save plot
+             title: Plot title
+         """
+         scores_flat = _flatten_ragged(scores)
+         gt_flat = _flatten_ragged(ground_truth)
+
+         valid_mask = scores_flat > -2
+         scores_valid = scores_flat[valid_mask]
+         gt_valid = gt_flat[valid_mask]
+
+         normal_scores = scores_valid[gt_valid == 0]
+         anomaly_scores = scores_valid[gt_valid == 1]
+
+         plt.figure(figsize=(8, 6))
+         plt.hist(normal_scores, bins=50, alpha=0.5, label='Normal', color='blue')
+         plt.hist(anomaly_scores, bins=50, alpha=0.5, label='Anomaly', color='red')
+         plt.xlabel('Anomaly Score', fontsize=12)
+         plt.ylabel('Frequency', fontsize=12)
+         plt.title(title, fontsize=14)
+         plt.legend(fontsize=10)
+         plt.grid(alpha=0.3)
+         plt.tight_layout()
+         plt.savefig(output_path, dpi=150)
+         plt.close()
+
+         logger.info(f"[OK] Score distribution saved to {output_path}")
+
+     @staticmethod
+     def plot_spot_curves(spot_files: Dict[str, Path], output_dir: Path,
+                          title: str = "Spot Curves") -> List[str]:
+         """
+         Generate separate spot curve plots from spot CSV files.
+
+         Args:
+             spot_files: Dictionary mapping metric_key to CSV path
+             output_dir: Directory to save plots
+             title: Plot title prefix
+
+         Returns:
+             List of generated plot filenames
+         """
+         if not spot_files:
+             logger.warning("No spot files to plot")
+             return []
+
+         generated_plots = []
+
+         for metric_key, csv_path in spot_files.items():
+             try:
+                 df = pd.read_csv(csv_path)
+
+                 # Get columns to plot (exclude timestamp and epoch)
+                 plot_cols = [col for col in df.columns if col not in ('timestamp', 'epoch')]
+
+                 if not plot_cols:
+                     continue
+
+                 # Create a separate plot for this spot file
+                 plt.figure(figsize=(10, 6))
+
+                 for col in plot_cols:
+                     plt.plot(df.index, df[col], label=col, marker='o', markersize=3, linewidth=1.5)
+
+                 plt.xlabel('Step/Epoch', fontsize=12)
+                 plt.ylabel('Value', fontsize=12)
+                 plt.title(f"{metric_key.replace('_', ' ').title()} Curves", fontsize=14)
+                 plt.legend(fontsize=10, loc='best')
+                 plt.grid(alpha=0.3)
+                 plt.tight_layout()
+
+                 # Save with metric_key name
+                 output_filename = f"{metric_key}_curves.png"
+                 output_path = output_dir / output_filename
+                 plt.savefig(output_path, dpi=150)
+                 plt.close()
+
+                 generated_plots.append(output_filename)
+                 logger.info(f"[OK] {metric_key} curves saved to {output_path}")
+
+             except Exception as e:
+                 logger.error(f"Error plotting {metric_key}: {e}")
+
+         return generated_plots
@@ -0,0 +1,36 @@
+ #!/usr/bin/env python3
+ """Standalone script to run evaluation on an experiment."""
+
+ import sys
+ from pathlib import Path
+
+ # Add graflag_evaluator to path
+ sys.path.insert(0, str(Path(__file__).parent.parent))
+
+ from graflag_evaluator import Evaluator
+
+ def main():
+     if len(sys.argv) < 2:
+         print("Usage: run_evaluation.py <experiment_directory>")
+         print("Example: run_evaluation.py /shared/experiments/exp_name")
+         sys.exit(1)
+
+     exp_dir = Path(sys.argv[1])
+
+     if not exp_dir.exists():
+         print(f"Error: Experiment directory not found: {exp_dir}")
+         sys.exit(1)
+
+     if not (exp_dir / "results.json").exists():
+         print(f"Error: results.json not found in {exp_dir}")
+         sys.exit(1)
+
+     # Run evaluation
+     print(f"[INFO] Loading experiment from: {exp_dir}")
+     evaluator = Evaluator(exp_dir)
+     eval_path = evaluator.evaluate()
+
+     print(f"\n[OK] Evaluation complete! Results saved to: {eval_path}")
+
+ if __name__ == "__main__":
+     main()
@@ -0,0 +1,4 @@
+ [egg_info]
+ tag_build =
+ tag_date = 0
+
@@ -0,0 +1,19 @@
+ """Setup script for graflag_evaluator package."""
+
+ from setuptools import setup
+
+ setup(
+     name="graflag_evaluator",
+     version="1.0.0",
+     description="Evaluation framework for graph anomaly detection methods in GraFlag",
+     author="GraFlag Team",
+     packages=["graflag_evaluator"],
+     package_dir={"graflag_evaluator": "."},
+     install_requires=[
+         "numpy>=1.21.0",
+         "scikit-learn>=1.0.0",
+         "matplotlib>=3.5.0",
+         "pandas>=1.3.0",
+     ],
+     python_requires=">=3.7",
+ )