PyPI - cordon - Versions diffs - 0.1.0__py3-none-any.whl - Mend

cordon 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

cordon/__init__.py +14 -0
cordon/analysis/__init__.py +4 -0
cordon/analysis/scorer.py +256 -0
cordon/analysis/thresholder.py +51 -0
cordon/cli.py +230 -0
cordon/core/__init__.py +19 -0
cordon/core/config.py +64 -0
cordon/core/types.py +141 -0
cordon/embedding/__init__.py +29 -0
cordon/embedding/llama_cpp.py +95 -0
cordon/embedding/transformer.py +135 -0
cordon/ingestion/__init__.py +3 -0
cordon/ingestion/reader.py +45 -0
cordon/pipeline.py +126 -0
cordon/postprocess/__init__.py +4 -0
cordon/postprocess/formatter.py +68 -0
cordon/postprocess/merger.py +77 -0
cordon/py.typed +2 -0
cordon/segmentation/__init__.py +3 -0
cordon/segmentation/windower.py +80 -0
cordon-0.1.0.dist-info/METADATA +287 -0
cordon-0.1.0.dist-info/RECORD +25 -0
cordon-0.1.0.dist-info/WHEEL +4 -0
cordon-0.1.0.dist-info/entry_points.txt +2 -0
cordon-0.1.0.dist-info/licenses/LICENSE +201 -0

cordon/__init__.py ADDED Viewed

@@ -0,0 +1,14 @@
+from cordon.core.config import AnalysisConfig
+from cordon.core.types import AnalysisResult, MergedBlock, ScoredWindow, TextWindow
+from cordon.pipeline import SemanticLogAnalyzer
+__version__ = "0.1.0"
+__all__ = [
+    "SemanticLogAnalyzer",
+    "AnalysisConfig",
+    "AnalysisResult",
+    "TextWindow",
+    "ScoredWindow",
+    "MergedBlock",
+]

cordon/analysis/__init__.py ADDED Viewed

@@ -0,0 +1,4 @@
+from cordon.analysis.scorer import DensityAnomalyScorer
+from cordon.analysis.thresholder import Thresholder
+__all__ = ["DensityAnomalyScorer", "Thresholder"]

cordon/analysis/scorer.py ADDED Viewed

@@ -0,0 +1,256 @@
+import tempfile
+import warnings
+from collections.abc import Sequence
+from pathlib import Path
+from typing import Any
+import numpy as np
+import numpy.typing as npt
+from sklearn.neighbors import NearestNeighbors
+from cordon.core.config import AnalysisConfig
+from cordon.core.types import ScoredWindow, TextWindow
+# optional FAISS support
+try:
+    import faiss
+    HAS_FAISS = True
+except ImportError:
+    HAS_FAISS = False
+class DensityAnomalyScorer:
+    """Calculate significance scores using k-NN cosine distance.
+    This scorer uses the average distance to k nearest neighbors as a measure
+    of how anomalous each window is. Higher distances
+    indicate more anomalous content.
+    For large datasets, automatically switches to memory-mapped storage to
+    reduce RAM usage.
+    """
+    def _calculate_n_neighbors(self, config: AnalysisConfig, n_samples: int) -> int:
+        """Calculate the number of neighbors to use for k-NN.
+        Args:
+            config: Analysis configuration with k_neighbors setting
+            n_samples: Total number of samples in the dataset
+        Returns:
+            Number of neighbors to use (k+1 for self, capped at n_samples)
+        """
+        num_neighbors = config.k_neighbors
+        return min(num_neighbors + 1, n_samples)
+    def score_windows(
+        self,
+        embedded_windows: Sequence[tuple[TextWindow, npt.NDArray[np.floating[Any]]]],
+        config: AnalysisConfig,
+    ) -> list[ScoredWindow]:
+        """Score windows based on k-NN density.
+        Args:
+            embedded_windows: Sequence of (window, embedding) pairs
+            config: Analysis configuration with k_neighbors setting
+        Returns:
+            List of scored windows with anomaly scores
+        """
+        if not embedded_windows:
+            return []
+        # single window
+        if len(embedded_windows) == 1:
+            window, embedding = embedded_windows[0]
+            return [ScoredWindow(window=window, score=0.0, embedding=embedding)]
+        n_windows = len(embedded_windows)
+        # choose strategy based on dataset size
+        use_faiss = (
+            HAS_FAISS
+            and config.use_faiss_threshold is not None
+            and n_windows >= config.use_faiss_threshold
+        )
+        use_mmap = (
+            config.use_mmap_threshold is not None
+            and n_windows >= config.use_mmap_threshold
+            and not use_faiss  # FAISS takes precedence
+        )
+        if use_faiss:
+            return self._score_windows_faiss(embedded_windows, config)
+        elif use_mmap:
+            return self._score_windows_mmap(embedded_windows, config)
+        else:
+            return self._score_windows_inmemory(embedded_windows, config)
+    def _score_windows_inmemory(
+        self,
+        embedded_windows: Sequence[tuple[TextWindow, npt.NDArray[np.floating[Any]]]],
+        config: AnalysisConfig,
+    ) -> list[ScoredWindow]:
+        """Score windows using in-memory arrays (fast, but uses more RAM).
+        Args:
+            embedded_windows: Sequence of (window, embedding) pairs
+            config: Analysis configuration
+        Returns:
+            List of scored windows
+        """
+        # extract embeddings into matrix
+        windows = [window for window, _ in embedded_windows]
+        embeddings = np.array([embedding for _, embedding in embedded_windows])
+        # build k-NN index
+        n_samples = len(embeddings)
+        n_neighbors = self._calculate_n_neighbors(config, n_samples)
+        knn = NearestNeighbors(n_neighbors=n_neighbors, metric="cosine")
+        knn.fit(embeddings)
+        # query all points
+        distances, _ = knn.kneighbors(embeddings)
+        # calculate scores (average distance to k nearest neighbors, excluding self)
+        scored_windows = []
+        for window_idx, (window, embedding) in enumerate(zip(windows, embeddings, strict=False)):
+            # skip first distance (self = 0) and take mean of remaining
+            neighbor_distances = distances[window_idx][1:]
+            score = float(np.mean(neighbor_distances))
+            scored_windows.append(ScoredWindow(window=window, score=score, embedding=embedding))
+        return scored_windows
+    def _score_windows_mmap(
+        self,
+        embedded_windows: Sequence[tuple[TextWindow, npt.NDArray[np.floating[Any]]]],
+        config: AnalysisConfig,
+    ) -> list[ScoredWindow]:
+        """Score windows using memory-mapped storage (lower RAM, slightly slower).
+        Args:
+            embedded_windows: Sequence of (window, embedding) pairs
+            config: Analysis configuration
+        Returns:
+            List of scored windows
+        """
+        windows = [window for window, _ in embedded_windows]
+        n_windows = len(windows)
+        # embedding dimension from first embedding
+        first_embedding = embedded_windows[0][1]
+        embedding_dim = len(first_embedding)
+        # create temporary memory-mapped file
+        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".dat")
+        temp_path = Path(temp_file.name)
+        temp_file.close()
+        try:
+            # memory-mapped array for embeddings
+            embeddings_mmap = np.memmap(
+                temp_path,
+                dtype="float32",
+                mode="w+",
+                shape=(n_windows, embedding_dim),
+            )
+            # copy embeddings to mmap and flush to disk
+            for window_idx, (_, embedding) in enumerate(embedded_windows):
+                embeddings_mmap[window_idx] = embedding
+            embeddings_mmap.flush()
+            # build k-NN index
+            n_neighbors = self._calculate_n_neighbors(config, n_windows)
+            knn = NearestNeighbors(n_neighbors=n_neighbors, metric="cosine")
+            knn.fit(embeddings_mmap)
+            # query all points and calculate scores
+            distances, _ = knn.kneighbors(embeddings_mmap)
+            scored_windows = []
+            for window_idx, window in enumerate(windows):
+                neighbor_distances = distances[window_idx][1:]
+                score = float(np.mean(neighbor_distances))
+                scored_windows.append(
+                    ScoredWindow(
+                        window=window,
+                        score=score,
+                        embedding=embeddings_mmap[window_idx].copy(),
+                    )
+                )
+            return scored_windows
+        finally:
+            # clean up temporary file
+            if temp_path.exists():
+                temp_path.unlink()
+    def _score_windows_faiss(
+        self,
+        embedded_windows: Sequence[tuple[TextWindow, npt.NDArray[np.floating[Any]]]],
+        config: AnalysisConfig,
+    ) -> list[ScoredWindow]:
+        """Score windows using FAISS for fast approximate k-NN (lowest RAM, fastest).
+        Args:
+            embedded_windows: Sequence of (window, embedding) pairs
+            config: Analysis configuration
+        Returns:
+            List of scored windows
+        """
+        if not HAS_FAISS:
+            warnings.warn(
+                "FAISS not available, falling back to memory-mapped approach. "
+                "Install faiss-cpu or faiss-gpu for better performance on large logs.",
+                UserWarning,
+                stacklevel=2,
+            )
+            return self._score_windows_mmap(embedded_windows, config)
+        windows = [window for window, _ in embedded_windows]
+        embeddings = np.array([embedding for _, embedding in embedded_windows], dtype=np.float32)
+        n_windows = len(embeddings)
+        embedding_dim = embeddings.shape[1]
+        n_neighbors = self._calculate_n_neighbors(config, n_windows)
+        # normalize embeddings so inner product = cosine similarity
+        faiss.normalize_L2(embeddings)
+        # create FAISS index
+        index = faiss.IndexFlatIP(embedding_dim)
+        index.add(embeddings)
+        # query k-nearest neighbors
+        distances, _ = index.search(embeddings, n_neighbors)
+        # convert inner product (cosine similarity) to cosine distance
+        # after normalization, inner product equals cosine similarity
+        distances = 1.0 - distances
+        # calculate scores
+        scored_windows = []
+        for window_idx, window in enumerate(windows):
+            # skip first distance (self) and take mean of remaining
+            neighbor_distances = distances[window_idx][1:]
+            score = float(np.mean(neighbor_distances))
+            # ensure non-negative scores (handle numerical precision issues)
+            score = max(0.0, score)
+            scored_windows.append(
+                ScoredWindow(window=window, score=score, embedding=embeddings[window_idx])
+            )
+        return scored_windows

cordon/analysis/thresholder.py ADDED Viewed

@@ -0,0 +1,51 @@
+from collections.abc import Sequence
+import numpy as np
+from cordon.core.config import AnalysisConfig
+from cordon.core.types import ScoredWindow
+class Thresholder:
+    """Select top windows based on anomaly percentile.
+    Determines which windows are significant based on the distribution
+    of scores in the current dataset.
+    """
+    def select_significant(
+        self, scored_windows: Sequence[ScoredWindow], config: AnalysisConfig
+    ) -> list[ScoredWindow]:
+        """Select significant windows based on threshold.
+        Args:
+            scored_windows: Sequence of scored windows
+            config: Analysis configuration with anomaly_percentile
+        Returns:
+            List of significant windows, sorted by score (descending)
+        """
+        # no scored windows
+        if not scored_windows:
+            return []
+        # all windows, sorted by score descending
+        if config.anomaly_percentile == 1.0:
+            return sorted(scored_windows, key=lambda window: window.score, reverse=True)
+        # no windows requested
+        if config.anomaly_percentile == 0.0:
+            return []
+        # calculate percentile threshold
+        scores = np.array([sw.score for sw in scored_windows])
+        percentile = (1 - config.anomaly_percentile) * 100
+        threshold = np.percentile(scores, percentile)
+        # filter windows at or above threshold
+        selected = [sw for sw in scored_windows if sw.score >= threshold]
+        # sort by score descending (highest anomalies first)
+        selected.sort(key=lambda window: window.score, reverse=True)
+        return selected

cordon/cli.py ADDED Viewed

@@ -0,0 +1,230 @@
+#!/usr/bin/env python3
+import argparse
+import sys
+from pathlib import Path
+from cordon import AnalysisConfig, SemanticLogAnalyzer
+def parse_args() -> argparse.Namespace:
+    """Parse command line arguments."""
+    parser = argparse.ArgumentParser(
+        prog="cordon",
+        description="Analyze log files for anomalous patterns using semantic similarity",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    # positional arguments
+    parser.add_argument(
+        "logfiles",
+        type=Path,
+        nargs="+",
+        help="Path(s) to log file(s) to analyze",
+    )
+    # embedding backend selection
+    backend_group = parser.add_argument_group("embedding backend")
+    backend_group.add_argument(
+        "--backend",
+        type=str,
+        choices=["sentence-transformers", "llama-cpp"],
+        default="sentence-transformers",
+        help="Embedding backend to use (default: sentence-transformers)",
+    )
+    backend_group.add_argument(
+        "--model-path",
+        type=Path,
+        default=None,
+        help="GGUF model path (auto-downloads default if omitted)",
+    )
+    backend_group.add_argument(
+        "--n-gpu-layers",
+        type=int,
+        default=0,
+        help="Number of layers to offload to GPU (llama-cpp only, default: 0)",
+    )
+    backend_group.add_argument(
+        "--n-threads",
+        type=int,
+        default=None,
+        help="Thread count for llama.cpp (default: auto-detect)",
+    )
+    backend_group.add_argument(
+        "--n-ctx",
+        type=int,
+        default=2048,
+        help="Context size for llama.cpp (default: 2048)",
+    )
+    # configuration options
+    config_group = parser.add_argument_group("analysis configuration")
+    config_group.add_argument(
+        "--window-size",
+        type=int,
+        default=5,
+        help="Number of lines per window (default: 5)",
+    )
+    config_group.add_argument(
+        "--stride",
+        type=int,
+        default=2,
+        help="Step size for sliding window in lines (default: 2)",
+    )
+    config_group.add_argument(
+        "--k-neighbors",
+        type=int,
+        default=5,
+        help="Number of neighbors for k-NN density calculation (default: 5)",
+    )
+    config_group.add_argument(
+        "--anomaly-percentile",
+        type=float,
+        default=0.1,
+        help="Percentile of windows to retain, e.g., 0.1 = top 10%% (default: 0.1)",
+    )
+    config_group.add_argument(
+        "--model-name",
+        type=str,
+        default="all-MiniLM-L6-v2",
+        help="HuggingFace model name for sentence-transformers (default: all-MiniLM-L6-v2)",
+    )
+    config_group.add_argument(
+        "--batch-size",
+        type=int,
+        default=32,
+        help="Batch size for embeddings (default: 32)",
+    )
+    config_group.add_argument(
+        "--device",
+        type=str,
+        choices=["cuda", "mps", "cpu"],
+        default=None,
+        help="Device for model inference (default: auto-detect)",
+    )
+    config_group.add_argument(
+        "--use-faiss",
+        action="store_true",
+        help="Use FAISS for faster k-NN search on large logs",
+    )
+    # output options
+    output_group = parser.add_argument_group("output options")
+    output_group.add_argument(
+        "--detailed",
+        action="store_true",
+        help="Show detailed statistics in addition to anomalous blocks",
+    )
+    return parser.parse_args()
+def analyze_file(log_path: Path, analyzer: SemanticLogAnalyzer, detailed: bool) -> None:
+    """Analyze a single log file and print results.
+    Args:
+        log_path: Path to the log file
+        analyzer: Configured SemanticLogAnalyzer instance
+        detailed: Whether to show detailed statistics
+    """
+    # verify file exists and is readable
+    if not log_path.exists():
+        print(f"Error: File not found: {log_path}", file=sys.stderr)
+        return
+    if not log_path.is_file():
+        print(f"Error: Not a file: {log_path}", file=sys.stderr)
+        return
+    # count lines in file
+    with open(log_path) as log_file:
+        line_count = sum(1 for _ in log_file)
+    print("=" * 80)
+    print(f"Analyzing: {log_path}")
+    print(f"Total lines: {line_count:,}")
+    print("=" * 80)
+    if detailed:
+        # run detailed analysis
+        result = analyzer.analyze_file_detailed(log_path)
+        print("\nAnalysis Statistics:")
+        print(f"  Total windows created: {result.total_windows:,}")
+        print(f"  Significant windows: {result.significant_windows:,}")
+        print(f"  Merged blocks: {result.merged_blocks}")
+        print(f"  Processing time: {result.processing_time:.2f}s")
+        print("\nScore Distribution:")
+        print(f"  Min:    {result.score_distribution['min']:.4f}")
+        print(f"  Mean:   {result.score_distribution['mean']:.4f}")
+        print(f"  Median: {result.score_distribution['median']:.4f}")
+        print(f"  P90:    {result.score_distribution['p90']:.4f}")
+        print(f"  Max:    {result.score_distribution['max']:.4f}")
+        print(f"\n{'Significant Blocks':^80}")
+        print("=" * 80)
+        print(result.output)
+    else:
+        # run simple analysis
+        output = analyzer.analyze_file(log_path)
+        print(output)
+    print()
+def main() -> None:
+    """Main entry point for the CLI."""
+    args = parse_args()
+    # create configuration from arguments
+    try:
+        config = AnalysisConfig(
+            window_size=args.window_size,
+            stride=args.stride,
+            k_neighbors=args.k_neighbors,
+            anomaly_percentile=args.anomaly_percentile,
+            model_name=args.model_name,
+            batch_size=args.batch_size,
+            device=args.device,
+            use_faiss_threshold=0 if args.use_faiss else None,
+            backend=args.backend,
+            model_path=str(args.model_path) if args.model_path else None,
+            n_gpu_layers=args.n_gpu_layers,
+            n_threads=args.n_threads,
+            n_ctx=args.n_ctx,
+        )
+    except ValueError as error:
+        print(f"Configuration error: {error}", file=sys.stderr)
+        sys.exit(1)
+    # create analyzer
+    print("Initializing analyzer...")
+    print(f"Backend: {config.backend}")
+    if config.backend == "sentence-transformers":
+        print(f"Model: {config.model_name}")
+        print(f"Device: {config.device or 'auto'}")
+    elif config.backend == "llama-cpp":
+        print(f"Model path: {config.model_path}")
+        print(f"GPU layers: {config.n_gpu_layers}")
+        if config.n_threads:
+            print(f"Threads: {config.n_threads}")
+    print()
+    try:
+        analyzer = SemanticLogAnalyzer(config)
+    except ImportError as error:
+        print(f"Import error: {error}", file=sys.stderr)
+        print("\nTo install llama.cpp support:", file=sys.stderr)
+        print("  uv pip install 'cordon[llama-cpp]'", file=sys.stderr)
+        print("  or: pip install llama-cpp-python", file=sys.stderr)
+        sys.exit(1)
+    except Exception as error:
+        print(f"Initialization error: {error}", file=sys.stderr)
+        sys.exit(1)
+    print()
+    # analyze each log file
+    for log_path in args.logfiles:
+        analyze_file(log_path, analyzer, args.detailed)
+if __name__ == "__main__":
+    main()

cordon/core/__init__.py ADDED Viewed

@@ -0,0 +1,19 @@
+from cordon.core.config import AnalysisConfig
+from cordon.core.types import (
+    AnalysisResult,
+    Embedder,
+    MergedBlock,
+    ScoredWindow,
+    Scorer,
+    TextWindow,
+)
+__all__ = [
+    "AnalysisConfig",
+    "TextWindow",
+    "ScoredWindow",
+    "MergedBlock",
+    "AnalysisResult",
+    "Embedder",
+    "Scorer",
+]

cordon/core/config.py ADDED Viewed

@@ -0,0 +1,64 @@
+from dataclasses import dataclass
+from pathlib import Path
+@dataclass
+class AnalysisConfig:
+    """Global configuration for the analysis pipeline."""
+    window_size: int = 5
+    stride: int = 2
+    k_neighbors: int = 5
+    anomaly_percentile: float = 0.1
+    model_name: str = "all-MiniLM-L6-v2"
+    batch_size: int = 32
+    device: str | None = None
+    use_mmap_threshold: int | None = 50000  # switch to mmap at 50k windows
+    use_faiss_threshold: int | None = None  # FAISS disabled by default
+    backend: str = "sentence-transformers"  # or "llama-cpp"
+    model_path: str | None = None  # GGUF model file path
+    n_ctx: int = 2048  # llama.cpp context size
+    n_threads: int | None = None  # llama.cpp threads (None=auto)
+    n_gpu_layers: int = 0  # llama.cpp GPU layer offloading
+    def __post_init__(self) -> None:
+        """Validate configuration parameters."""
+        if self.window_size < 1:
+            raise ValueError("window_size must be >= 1")
+        if self.stride < 1:
+            raise ValueError("stride must be >= 1")
+        if self.k_neighbors < 1:
+            raise ValueError("k_neighbors must be >= 1")
+        if not 0.0 <= self.anomaly_percentile <= 1.0:
+            raise ValueError("anomaly_percentile must be between 0.0 and 1.0")
+        if self.batch_size < 1:
+            raise ValueError("batch_size must be >= 1")
+        if self.device is not None and self.device not in ("cuda", "mps", "cpu"):
+            raise ValueError("device must be 'cuda', 'mps', 'cpu', or None")
+        # Backend validation
+        if self.backend not in ("sentence-transformers", "llama-cpp"):
+            raise ValueError(
+                f"backend must be 'sentence-transformers' or 'llama-cpp', got '{self.backend}'"
+            )
+        # llama-cpp specific validation
+        if self.backend == "llama-cpp" and self.model_path is not None:
+            # If model_path is provided, validate it exists and has correct extension
+            # If None, LlamaCppVectorizer will auto-download default model
+            model_file = Path(self.model_path)
+            if not model_file.exists():
+                raise ValueError(f"GGUF model file not found: {self.model_path}")
+            if model_file.suffix != ".gguf":
+                raise ValueError(f"model_path must be a .gguf file, got: {model_file.suffix}")
+        # llama.cpp parameter validation
+        if self.n_ctx < 1:
+            raise ValueError("n_ctx must be >= 1")
+        if self.n_gpu_layers < -1:
+            raise ValueError("n_gpu_layers must be >= -1 (-1 for all layers, 0 for CPU-only)")
+        if self.n_threads is not None and self.n_threads < 1:
+            raise ValueError("n_threads must be >= 1 or None for auto-detect")