cordon 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cordon/__init__.py ADDED
@@ -0,0 +1,14 @@
1
+ from cordon.core.config import AnalysisConfig
2
+ from cordon.core.types import AnalysisResult, MergedBlock, ScoredWindow, TextWindow
3
+ from cordon.pipeline import SemanticLogAnalyzer
4
+
5
# Version string of the cordon package.
__version__ = "0.1.0"

# Names re-exported from the package root as its public API.
__all__ = [
    "SemanticLogAnalyzer",
    "AnalysisConfig",
    "AnalysisResult",
    "TextWindow",
    "ScoredWindow",
    "MergedBlock",
]
@@ -0,0 +1,4 @@
1
+ from cordon.analysis.scorer import DensityAnomalyScorer
2
+ from cordon.analysis.thresholder import Thresholder
3
+
4
# Public API of the analysis subpackage: the scorer and the thresholder.
__all__ = ["DensityAnomalyScorer", "Thresholder"]
@@ -0,0 +1,256 @@
1
+ import tempfile
2
+ import warnings
3
+ from collections.abc import Sequence
4
+ from pathlib import Path
5
+ from typing import Any
6
+
7
+ import numpy as np
8
+ import numpy.typing as npt
9
+ from sklearn.neighbors import NearestNeighbors
10
+
11
+ from cordon.core.config import AnalysisConfig
12
+ from cordon.core.types import ScoredWindow, TextWindow
13
+
14
+ # optional FAISS support
15
+ try:
16
+ import faiss
17
+
18
+ HAS_FAISS = True
19
+ except ImportError:
20
+ HAS_FAISS = False
21
+
22
+
23
class DensityAnomalyScorer:
    """Calculate significance scores using k-NN cosine distance.

    A window's score is the mean cosine distance to its k nearest
    neighbors; higher scores indicate more anomalous content.

    ``score_windows`` selects one of three strategies from the thresholds in
    the configuration: an in-memory scikit-learn search, a scikit-learn search
    over a memory-mapped copy of the embeddings, or a FAISS inner-product
    index. Every strategy returns the caller's embedding values in the
    resulting ``ScoredWindow`` objects.
    """

    def _calculate_n_neighbors(self, config: AnalysisConfig, n_samples: int) -> int:
        """Calculate the number of neighbors to request from the k-NN search.

        Args:
            config: Analysis configuration with k_neighbors setting
            n_samples: Total number of samples in the dataset

        Returns:
            Number of neighbors to use (k+1 for self, capped at n_samples)
        """
        return min(config.k_neighbors + 1, n_samples)

    @staticmethod
    def _mean_neighbor_distance(row_distances: npt.NDArray[np.floating[Any]]) -> float:
        """Average one row of k-NN distances, ignoring the self match.

        Args:
            row_distances: Distances for one query point, nearest first; the
                first entry is the point's match with itself.

        Returns:
            Mean of the remaining distances, or 0.0 when there are none
            (a lone point has no neighbors to compare against).
        """
        neighbor_distances = row_distances[1:]
        if neighbor_distances.size == 0:
            # avoids the NaN (and RuntimeWarning) np.mean gives on an empty slice
            return 0.0
        return float(np.mean(neighbor_distances))

    def score_windows(
        self,
        embedded_windows: Sequence[tuple[TextWindow, npt.NDArray[np.floating[Any]]]],
        config: AnalysisConfig,
    ) -> list[ScoredWindow]:
        """Score windows based on k-NN density.

        Args:
            embedded_windows: Sequence of (window, embedding) pairs
            config: Analysis configuration with k_neighbors setting and the
                use_faiss_threshold / use_mmap_threshold strategy switches

        Returns:
            List of scored windows with anomaly scores, in input order
        """
        if not embedded_windows:
            return []

        # a single window has no neighbors, so it cannot be anomalous
        if len(embedded_windows) == 1:
            window, embedding = embedded_windows[0]
            return [ScoredWindow(window=window, score=0.0, embedding=embedding)]

        n_windows = len(embedded_windows)

        # choose strategy based on dataset size
        use_faiss = (
            HAS_FAISS
            and config.use_faiss_threshold is not None
            and n_windows >= config.use_faiss_threshold
        )
        use_mmap = (
            config.use_mmap_threshold is not None
            and n_windows >= config.use_mmap_threshold
            and not use_faiss  # FAISS takes precedence
        )

        if use_faiss:
            return self._score_windows_faiss(embedded_windows, config)
        if use_mmap:
            return self._score_windows_mmap(embedded_windows, config)
        return self._score_windows_inmemory(embedded_windows, config)

    def _score_windows_inmemory(
        self,
        embedded_windows: Sequence[tuple[TextWindow, npt.NDArray[np.floating[Any]]]],
        config: AnalysisConfig,
    ) -> list[ScoredWindow]:
        """Score windows with all embeddings held in one in-memory matrix.

        Args:
            embedded_windows: Sequence of (window, embedding) pairs
            config: Analysis configuration

        Returns:
            List of scored windows
        """
        # extract embeddings into matrix
        windows = [window for window, _ in embedded_windows]
        embeddings = np.array([embedding for _, embedding in embedded_windows])

        # build k-NN index and query every point against it
        n_neighbors = self._calculate_n_neighbors(config, len(embeddings))
        knn = NearestNeighbors(n_neighbors=n_neighbors, metric="cosine")
        knn.fit(embeddings)
        distances, _ = knn.kneighbors(embeddings)

        return [
            ScoredWindow(
                window=window,
                score=self._mean_neighbor_distance(row),
                embedding=embedding,
            )
            for window, embedding, row in zip(windows, embeddings, distances, strict=True)
        ]

    def _score_windows_mmap(
        self,
        embedded_windows: Sequence[tuple[TextWindow, npt.NDArray[np.floating[Any]]]],
        config: AnalysisConfig,
    ) -> list[ScoredWindow]:
        """Score windows using a float32 memory-mapped copy of the embeddings.

        The embeddings are written to a temporary file that is removed before
        returning. Failure to remove that file is reported as a warning
        instead of discarding the computed scores.

        Args:
            embedded_windows: Sequence of (window, embedding) pairs
            config: Analysis configuration

        Returns:
            List of scored windows
        """
        windows = [window for window, _ in embedded_windows]
        n_windows = len(windows)

        # embedding dimension from first embedding
        embedding_dim = len(embedded_windows[0][1])

        # reserve a file name for the memory map; numpy reopens it by path
        with tempfile.NamedTemporaryFile(delete=False, suffix=".dat") as temp_file:
            temp_path = Path(temp_file.name)

        embeddings_mmap = None
        knn = None
        try:
            embeddings_mmap = np.memmap(
                temp_path,
                dtype="float32",
                mode="w+",
                shape=(n_windows, embedding_dim),
            )

            # copy embeddings to mmap and flush to disk
            for window_idx, (_, embedding) in enumerate(embedded_windows):
                embeddings_mmap[window_idx] = embedding
            embeddings_mmap.flush()

            # build k-NN index and query every point against it
            n_neighbors = self._calculate_n_neighbors(config, n_windows)
            knn = NearestNeighbors(n_neighbors=n_neighbors, metric="cosine")
            knn.fit(embeddings_mmap)
            distances, _ = knn.kneighbors(embeddings_mmap)

            return [
                ScoredWindow(
                    window=window,
                    score=self._mean_neighbor_distance(distances[window_idx]),
                    # copy so the result does not keep the memory map alive
                    embedding=embeddings_mmap[window_idx].copy(),
                )
                for window_idx, window in enumerate(windows)
            ]
        finally:
            # Drop every reference to the memory map before deleting its file:
            # the mapping keeps the file open, which blocks removal on Windows.
            del knn, embeddings_mmap
            try:
                temp_path.unlink(missing_ok=True)
            except OSError as error:
                warnings.warn(
                    f"Could not remove temporary file {temp_path}: {error}",
                    ResourceWarning,
                    stacklevel=2,
                )

    def _score_windows_faiss(
        self,
        embedded_windows: Sequence[tuple[TextWindow, npt.NDArray[np.floating[Any]]]],
        config: AnalysisConfig,
    ) -> list[ScoredWindow]:
        """Score windows using a FAISS flat inner-product index.

        Embeddings are copied to float32 and L2-normalized so that the inner
        product equals cosine similarity. The normalized copy is used only for
        the search; the returned windows carry the caller's embeddings, as in
        the other strategies.

        Falls back to the memory-mapped strategy (with a warning) when FAISS
        is not installed.

        Args:
            embedded_windows: Sequence of (window, embedding) pairs
            config: Analysis configuration

        Returns:
            List of scored windows
        """
        if not HAS_FAISS:
            warnings.warn(
                "FAISS not available, falling back to memory-mapped approach. "
                "Install faiss-cpu or faiss-gpu for better performance on large logs.",
                UserWarning,
                stacklevel=2,
            )
            return self._score_windows_mmap(embedded_windows, config)

        # private float32 copy: normalize_L2 works in place
        normalized = np.array([embedding for _, embedding in embedded_windows], dtype=np.float32)

        n_windows, embedding_dim = normalized.shape
        n_neighbors = self._calculate_n_neighbors(config, n_windows)

        # normalize embeddings so inner product = cosine similarity
        faiss.normalize_L2(normalized)

        index = faiss.IndexFlatIP(embedding_dim)
        index.add(normalized)
        similarities, _ = index.search(normalized, n_neighbors)

        # Convert cosine similarity to cosine distance. Rounding can push a
        # similarity slightly above 1, so clamp each distance at zero.
        distances = np.clip(1.0 - similarities, 0.0, None)

        return [
            ScoredWindow(
                window=window,
                score=self._mean_neighbor_distance(row),
                embedding=embedding,
            )
            for (window, embedding), row in zip(embedded_windows, distances, strict=True)
        ]
@@ -0,0 +1,51 @@
1
+ from collections.abc import Sequence
2
+
3
+ import numpy as np
4
+
5
+ from cordon.core.config import AnalysisConfig
6
+ from cordon.core.types import ScoredWindow
7
+
8
+
9
class Thresholder:
    """Pick the most anomalous windows from a scored dataset.

    The cut-off is relative: it is derived from the distribution of scores in
    the dataset being processed, not from a fixed absolute value.
    """

    def select_significant(
        self, scored_windows: Sequence[ScoredWindow], config: AnalysisConfig
    ) -> list[ScoredWindow]:
        """Return the windows whose score reaches the configured percentile.

        Args:
            scored_windows: Sequence of scored windows
            config: Analysis configuration; anomaly_percentile is the fraction
                of windows to keep (1.0 keeps all, 0.0 keeps none)

        Returns:
            Selected windows ordered from highest to lowest score
        """

        def by_score(item: ScoredWindow) -> float:
            return item.score

        # nothing to choose from
        if not scored_windows:
            return []

        keep_fraction = config.anomaly_percentile

        # caller asked for nothing
        if keep_fraction == 0.0:
            return []

        # caller asked for everything: just rank
        if keep_fraction == 1.0:
            return sorted(scored_windows, key=by_score, reverse=True)

        # score value at the (1 - fraction) percentile is the cut-off
        all_scores = np.array([item.score for item in scored_windows])
        cutoff = np.percentile(all_scores, (1 - keep_fraction) * 100)

        # keep everything at or above the cut-off, highest anomalies first
        return sorted(
            (item for item in scored_windows if item.score >= cutoff),
            key=by_score,
            reverse=True,
        )
cordon/cli.py ADDED
@@ -0,0 +1,230 @@
1
+ #!/usr/bin/env python3
2
+ import argparse
3
+ import sys
4
+ from pathlib import Path
5
+
6
+ from cordon import AnalysisConfig, SemanticLogAnalyzer
7
+
8
+
9
+ def parse_args() -> argparse.Namespace:
10
+ """Parse command line arguments."""
11
+ parser = argparse.ArgumentParser(
12
+ prog="cordon",
13
+ description="Analyze log files for anomalous patterns using semantic similarity",
14
+ formatter_class=argparse.RawDescriptionHelpFormatter,
15
+ )
16
+
17
+ # positional arguments
18
+ parser.add_argument(
19
+ "logfiles",
20
+ type=Path,
21
+ nargs="+",
22
+ help="Path(s) to log file(s) to analyze",
23
+ )
24
+
25
+ # embedding backend selection
26
+ backend_group = parser.add_argument_group("embedding backend")
27
+ backend_group.add_argument(
28
+ "--backend",
29
+ type=str,
30
+ choices=["sentence-transformers", "llama-cpp"],
31
+ default="sentence-transformers",
32
+ help="Embedding backend to use (default: sentence-transformers)",
33
+ )
34
+ backend_group.add_argument(
35
+ "--model-path",
36
+ type=Path,
37
+ default=None,
38
+ help="GGUF model path (auto-downloads default if omitted)",
39
+ )
40
+ backend_group.add_argument(
41
+ "--n-gpu-layers",
42
+ type=int,
43
+ default=0,
44
+ help="Number of layers to offload to GPU (llama-cpp only, default: 0)",
45
+ )
46
+ backend_group.add_argument(
47
+ "--n-threads",
48
+ type=int,
49
+ default=None,
50
+ help="Thread count for llama.cpp (default: auto-detect)",
51
+ )
52
+ backend_group.add_argument(
53
+ "--n-ctx",
54
+ type=int,
55
+ default=2048,
56
+ help="Context size for llama.cpp (default: 2048)",
57
+ )
58
+
59
+ # configuration options
60
+ config_group = parser.add_argument_group("analysis configuration")
61
+ config_group.add_argument(
62
+ "--window-size",
63
+ type=int,
64
+ default=5,
65
+ help="Number of lines per window (default: 5)",
66
+ )
67
+ config_group.add_argument(
68
+ "--stride",
69
+ type=int,
70
+ default=2,
71
+ help="Step size for sliding window in lines (default: 2)",
72
+ )
73
+ config_group.add_argument(
74
+ "--k-neighbors",
75
+ type=int,
76
+ default=5,
77
+ help="Number of neighbors for k-NN density calculation (default: 5)",
78
+ )
79
+ config_group.add_argument(
80
+ "--anomaly-percentile",
81
+ type=float,
82
+ default=0.1,
83
+ help="Percentile of windows to retain, e.g., 0.1 = top 10%% (default: 0.1)",
84
+ )
85
+ config_group.add_argument(
86
+ "--model-name",
87
+ type=str,
88
+ default="all-MiniLM-L6-v2",
89
+ help="HuggingFace model name for sentence-transformers (default: all-MiniLM-L6-v2)",
90
+ )
91
+ config_group.add_argument(
92
+ "--batch-size",
93
+ type=int,
94
+ default=32,
95
+ help="Batch size for embeddings (default: 32)",
96
+ )
97
+ config_group.add_argument(
98
+ "--device",
99
+ type=str,
100
+ choices=["cuda", "mps", "cpu"],
101
+ default=None,
102
+ help="Device for model inference (default: auto-detect)",
103
+ )
104
+ config_group.add_argument(
105
+ "--use-faiss",
106
+ action="store_true",
107
+ help="Use FAISS for faster k-NN search on large logs",
108
+ )
109
+
110
+ # output options
111
+ output_group = parser.add_argument_group("output options")
112
+ output_group.add_argument(
113
+ "--detailed",
114
+ action="store_true",
115
+ help="Show detailed statistics in addition to anomalous blocks",
116
+ )
117
+
118
+ return parser.parse_args()
119
+
120
+
121
def analyze_file(log_path: Path, analyzer: SemanticLogAnalyzer, detailed: bool) -> None:
    """Analyze a single log file and print results.

    Problems with the file itself (missing, not a regular file, unreadable)
    are reported on stderr and the function returns without analyzing, so the
    caller can continue with the next file.

    Args:
        log_path: Path to the log file
        analyzer: Configured SemanticLogAnalyzer instance
        detailed: Whether to show detailed statistics
    """
    if not log_path.exists():
        print(f"Error: File not found: {log_path}", file=sys.stderr)
        return
    if not log_path.is_file():
        print(f"Error: Not a file: {log_path}", file=sys.stderr)
        return

    # Count lines for the banner. Undecodable bytes are replaced rather than
    # raised on, so stray binary data in a log cannot abort the run here.
    try:
        with open(log_path, encoding="utf-8", errors="replace") as log_file:
            line_count = sum(1 for _ in log_file)
    except OSError as error:
        print(f"Error: Cannot read file: {log_path} ({error})", file=sys.stderr)
        return

    print("=" * 80)
    print(f"Analyzing: {log_path}")
    print(f"Total lines: {line_count:,}")
    print("=" * 80)

    if detailed:
        # run detailed analysis
        result = analyzer.analyze_file_detailed(log_path)

        print("\nAnalysis Statistics:")
        print(f" Total windows created: {result.total_windows:,}")
        print(f" Significant windows: {result.significant_windows:,}")
        print(f" Merged blocks: {result.merged_blocks}")
        print(f" Processing time: {result.processing_time:.2f}s")
        print("\nScore Distribution:")
        print(f" Min: {result.score_distribution['min']:.4f}")
        print(f" Mean: {result.score_distribution['mean']:.4f}")
        print(f" Median: {result.score_distribution['median']:.4f}")
        print(f" P90: {result.score_distribution['p90']:.4f}")
        print(f" Max: {result.score_distribution['max']:.4f}")

        print(f"\n{'Significant Blocks':^80}")
        print("=" * 80)
        print(result.output)
    else:
        # run simple analysis
        output = analyzer.analyze_file(log_path)
        print(output)

    print()
171
+
172
+
173
def main() -> None:
    """Main entry point for the CLI.

    Builds an AnalysisConfig from the command line, creates the analyzer and
    analyzes each log file in turn. Exits with status 1 when the
    configuration is invalid or the analyzer cannot be created.
    """
    args = parse_args()

    # create configuration from arguments
    try:
        config = AnalysisConfig(
            window_size=args.window_size,
            stride=args.stride,
            k_neighbors=args.k_neighbors,
            anomaly_percentile=args.anomaly_percentile,
            model_name=args.model_name,
            batch_size=args.batch_size,
            device=args.device,
            # a threshold of 0 windows means "always use FAISS"
            use_faiss_threshold=0 if args.use_faiss else None,
            backend=args.backend,
            model_path=str(args.model_path) if args.model_path else None,
            n_gpu_layers=args.n_gpu_layers,
            n_threads=args.n_threads,
            n_ctx=args.n_ctx,
        )
    except ValueError as error:
        print(f"Configuration error: {error}", file=sys.stderr)
        sys.exit(1)

    # report the effective settings before the analyzer is created
    print("Initializing analyzer...")
    print(f"Backend: {config.backend}")
    if config.backend == "sentence-transformers":
        print(f"Model: {config.model_name}")
        print(f"Device: {config.device or 'auto'}")
    elif config.backend == "llama-cpp":
        print(f"Model path: {config.model_path}")
        print(f"GPU layers: {config.n_gpu_layers}")
        if config.n_threads:
            print(f"Threads: {config.n_threads}")
    print()

    try:
        analyzer = SemanticLogAnalyzer(config)
    except ImportError as error:
        print(f"Import error: {error}", file=sys.stderr)
        # the llama.cpp install hint only helps when that backend was requested
        if config.backend == "llama-cpp":
            print("\nTo install llama.cpp support:", file=sys.stderr)
            print(" uv pip install 'cordon[llama-cpp]'", file=sys.stderr)
            print(" or: pip install llama-cpp-python", file=sys.stderr)
        sys.exit(1)
    except Exception as error:
        # top-level boundary: report any other start-up failure and stop
        print(f"Initialization error: {error}", file=sys.stderr)
        sys.exit(1)
    print()

    # analyze each log file
    for log_path in args.logfiles:
        analyze_file(log_path, analyzer, args.detailed)


if __name__ == "__main__":
    main()
@@ -0,0 +1,19 @@
1
+ from cordon.core.config import AnalysisConfig
2
+ from cordon.core.types import (
3
+ AnalysisResult,
4
+ Embedder,
5
+ MergedBlock,
6
+ ScoredWindow,
7
+ Scorer,
8
+ TextWindow,
9
+ )
10
+
11
# Public API of the core subpackage: the configuration class and the types
# imported above from cordon.core.types.
__all__ = [
    "AnalysisConfig",
    "TextWindow",
    "ScoredWindow",
    "MergedBlock",
    "AnalysisResult",
    "Embedder",
    "Scorer",
]
cordon/core/config.py ADDED
@@ -0,0 +1,64 @@
1
+ from dataclasses import dataclass
2
+ from pathlib import Path
3
+
4
+
5
+ @dataclass
6
+ class AnalysisConfig:
7
+ """Global configuration for the analysis pipeline."""
8
+
9
+ window_size: int = 5
10
+ stride: int = 2
11
+ k_neighbors: int = 5
12
+ anomaly_percentile: float = 0.1
13
+ model_name: str = "all-MiniLM-L6-v2"
14
+ batch_size: int = 32
15
+ device: str | None = None
16
+ use_mmap_threshold: int | None = 50000 # switch to mmap at 50k windows
17
+ use_faiss_threshold: int | None = None # FAISS disabled by default
18
+ backend: str = "sentence-transformers" # or "llama-cpp"
19
+ model_path: str | None = None # GGUF model file path
20
+ n_ctx: int = 2048 # llama.cpp context size
21
+ n_threads: int | None = None # llama.cpp threads (None=auto)
22
+ n_gpu_layers: int = 0 # llama.cpp GPU layer offloading
23
+
24
+ def __post_init__(self) -> None:
25
+ """Validate configuration parameters."""
26
+ if self.window_size < 1:
27
+ raise ValueError("window_size must be >= 1")
28
+ if self.stride < 1:
29
+ raise ValueError("stride must be >= 1")
30
+ if self.k_neighbors < 1:
31
+ raise ValueError("k_neighbors must be >= 1")
32
+ if not 0.0 <= self.anomaly_percentile <= 1.0:
33
+ raise ValueError("anomaly_percentile must be between 0.0 and 1.0")
34
+ if self.batch_size < 1:
35
+ raise ValueError("batch_size must be >= 1")
36
+ if self.device is not None and self.device not in ("cuda", "mps", "cpu"):
37
+ raise ValueError("device must be 'cuda', 'mps', 'cpu', or None")
38
+
39
+ # Backend validation
40
+ if self.backend not in ("sentence-transformers", "llama-cpp"):
41
+ raise ValueError(
42
+ f"backend must be 'sentence-transformers' or 'llama-cpp', got '{self.backend}'"
43
+ )
44
+
45
+ # llama-cpp specific validation
46
+ if self.backend == "llama-cpp" and self.model_path is not None:
47
+ # If model_path is provided, validate it exists and has correct extension
48
+ # If None, LlamaCppVectorizer will auto-download default model
49
+ model_file = Path(self.model_path)
50
+ if not model_file.exists():
51
+ raise ValueError(f"GGUF model file not found: {self.model_path}")
52
+
53
+ if model_file.suffix != ".gguf":
54
+ raise ValueError(f"model_path must be a .gguf file, got: {model_file.suffix}")
55
+
56
+ # llama.cpp parameter validation
57
+ if self.n_ctx < 1:
58
+ raise ValueError("n_ctx must be >= 1")
59
+
60
+ if self.n_gpu_layers < -1:
61
+ raise ValueError("n_gpu_layers must be >= -1 (-1 for all layers, 0 for CPU-only)")
62
+
63
+ if self.n_threads is not None and self.n_threads < 1:
64
+ raise ValueError("n_threads must be >= 1 or None for auto-detect")