stele-context 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
stele/__init__.py ADDED
@@ -0,0 +1,36 @@
"""
Stele — Local context cache for LLM agents with semantic chunking
and vector search.

Smart context cache that avoids re-reading unchanged files by caching
chunk data with semantic search. Routes documents through modality-specific
chunkers, stores chunk content with HNSW vector indexing, and provides
fast retrieval via semantic search.

Key Features:
- Dynamic semantic chunking with modality-specific chunkers
- HNSW vector index for O(log n) similarity search
- Chunk content persistence for instant retrieval
- Change detection with hash + semantic comparison
- Session management with rollback support
- Built-in MCP server for agent integration

All operations are 100% offline and local-only. No internet access required.
"""

# NOTE(review): this constant previously said "0.9.0" while the released
# distribution is stele-context 0.7.0; aligned to the distribution version.
# Keep in sync with the build metadata (pyproject.toml) — confirm which one
# is authoritative before the next release.
__version__ = "0.7.0"
__author__ = "Stele Contributors"
__license__ = "MIT"

from stele.engine import Stele
from stele.storage import StorageBackend
from stele.session import SessionManager
from stele.mcp_server import MCPServer

__all__ = [
    "Stele",
    "StorageBackend",
    "SessionManager",
    "MCPServer",
    "__version__",
]
stele/bm25.py ADDED
@@ -0,0 +1,125 @@
"""
BM25 keyword index for Stele.

Provides term-frequency based scoring to complement HNSW vector search
for hybrid retrieval. Pure Python implementation with zero dependencies.
"""

import math
import re
from collections import Counter
from typing import Dict, List

# Identifier-like words: a leading letter/underscore followed by letters,
# digits, or underscores. Pure-number tokens are never matched.
_WORD_RE = re.compile(r"\b[a-zA-Z_][a-zA-Z0-9_]*\b")


class BM25Index:
    """
    Okapi BM25 keyword index for hybrid search.

    Tracks per-document term frequencies plus corpus-wide document
    frequencies so keyword relevance can be scored quickly. Intended to
    run next to HNSW vector search — HNSW surfaces semantic neighbours
    while BM25 rewards exact keyword overlap.
    """

    def __init__(self, k1: float = 1.5, b: float = 0.75):
        # Standard Okapi BM25 free parameters: k1 controls term-frequency
        # saturation, b controls document-length normalization.
        self.k1 = k1
        self.b = b
        self.doc_freqs: Counter = Counter()       # term -> number of docs containing it
        self.doc_lengths: Dict[str, int] = {}     # doc_id -> token count
        self.term_freqs: Dict[str, Counter] = {}  # doc_id -> per-term counts
        self.avg_dl: float = 0.0                  # mean document length
        self.n_docs: int = 0                      # number of indexed docs

    def add_document(self, doc_id: str, text: str) -> None:
        """Add or replace a document in the index."""
        # Re-adding an existing id replaces it wholesale.
        if doc_id in self.term_freqs:
            self.remove_document(doc_id)

        tokens = self._tokenize(text)
        counts = Counter(tokens)
        self.term_freqs[doc_id] = counts
        self.doc_lengths[doc_id] = len(tokens)
        for term in counts:
            self.doc_freqs[term] += 1
        self.n_docs += 1
        self._update_avg_dl()

    def remove_document(self, doc_id: str) -> None:
        """Remove a document from the index (no-op if absent)."""
        counts = self.term_freqs.pop(doc_id, None)
        if counts is None:
            return
        for term in counts:
            self.doc_freqs[term] -= 1
            # Drop terms whose last referencing document just left.
            if self.doc_freqs[term] <= 0:
                del self.doc_freqs[term]
        del self.doc_lengths[doc_id]
        self.n_docs -= 1
        self._update_avg_dl()

    def score(self, query: str, doc_id: str) -> float:
        """Compute the BM25 score of *query* against one document."""
        return self._score_terms(self._tokenize(query), doc_id)

    def _score_terms(self, query_terms: List[str], doc_id: str) -> float:
        """Score pre-tokenized query terms against a single document."""
        # Unknown document, or an empty index, scores zero.
        if doc_id not in self.term_freqs or self.avg_dl == 0:
            return 0.0

        doc_len = self.doc_lengths[doc_id]
        freqs = self.term_freqs[doc_id]
        result = 0.0

        for term in query_terms:
            n = self.doc_freqs.get(term, 0)
            if not n:
                continue
            # Non-negative IDF variant (the trailing "+ 1.0" keeps every
            # matching term's contribution positive).
            idf = math.log((self.n_docs - n + 0.5) / (n + 0.5) + 1.0)
            tf = freqs.get(term, 0)
            num = tf * (self.k1 + 1.0)
            den = tf + self.k1 * (1.0 - self.b + self.b * doc_len / self.avg_dl)
            result += idf * num / den

        return result

    def score_batch(self, query: str, doc_ids: List[str]) -> Dict[str, float]:
        """Score several documents against *query*, tokenizing it once."""
        query_terms = self._tokenize(query)
        scores: Dict[str, float] = {}
        for doc_id in doc_ids:
            scores[doc_id] = self._score_terms(query_terms, doc_id)
        return scores

    def _tokenize(self, text: str) -> List[str]:
        """Return lowercased word tokens of length > 1."""
        return [word.lower() for word in _WORD_RE.findall(text) if len(word) > 1]

    def _update_avg_dl(self) -> None:
        """Recompute the average document length."""
        self.avg_dl = (
            sum(self.doc_lengths.values()) / self.n_docs if self.n_docs > 0 else 0.0
        )

    def to_dict(self) -> Dict:
        """Serialize to a plain dict for persistence."""
        term_freqs = {doc_id: dict(tf) for doc_id, tf in self.term_freqs.items()}
        return {
            "k1": self.k1,
            "b": self.b,
            "doc_freqs": dict(self.doc_freqs),
            "doc_lengths": self.doc_lengths,
            "term_freqs": term_freqs,
            "avg_dl": self.avg_dl,
            "n_docs": self.n_docs,
        }

    @classmethod
    def from_dict(cls, data: Dict) -> "BM25Index":
        """Rebuild an index from its serialized dict form."""
        index = cls(k1=data["k1"], b=data["b"])
        index.doc_freqs = Counter(data["doc_freqs"])
        index.doc_lengths = data["doc_lengths"]
        index.term_freqs = {d: Counter(tf) for d, tf in data["term_freqs"].items()}
        index.avg_dl = data["avg_dl"]
        index.n_docs = data["n_docs"]
        return index
@@ -0,0 +1,67 @@
"""
Stele chunkers module.

Provides modality-specific chunkers for different file types:
- TextChunker: Plain text files (zero dependencies)
- CodeChunker: Code files with AST awareness (zero dependencies)
- ImageChunker: Image files (requires Pillow)
- PDFChunker: PDF files (requires pymupdf)
- AudioChunker: Audio files (requires librosa)
- VideoChunker: Video files (requires opencv)

All chunkers follow the same interface and can be registered with Stele.
"""

from stele.chunkers.base import BaseChunker, Chunk
from stele.chunkers.text import TextChunker
from stele.chunkers.code import CodeChunker

# Optional chunkers (require additional dependencies).
# Each chunker module is written to import cleanly even when its optional
# dependency is absent (the constructor raises ImportError instead), so the
# real availability signal is the inner module's HAS_* probe flag. The
# try/except here is a second line of defence: it also covers the submodule
# itself being missing or failing to import. In the fallback branch the
# chunker name is bound to None so `__all__` below stays importable.
try:
    from stele.chunkers.image import ImageChunker, HAS_PIL

    # True only when Pillow was actually importable inside the submodule.
    HAS_IMAGE_CHUNKER = HAS_PIL
except ImportError:
    HAS_IMAGE_CHUNKER = False
    ImageChunker = None  # type: ignore

try:
    from stele.chunkers.pdf import PDFChunker, HAS_PYMUPDF

    # True only when pymupdf was actually importable inside the submodule.
    HAS_PDF_CHUNKER = HAS_PYMUPDF
except ImportError:
    HAS_PDF_CHUNKER = False
    PDFChunker = None  # type: ignore

try:
    from stele.chunkers.audio import AudioChunker, HAS_LIBROSA

    # True only when librosa was actually importable inside the submodule.
    HAS_AUDIO_CHUNKER = HAS_LIBROSA
except ImportError:
    HAS_AUDIO_CHUNKER = False
    AudioChunker = None  # type: ignore

try:
    from stele.chunkers.video import VideoChunker, HAS_OPENCV

    # True only when opencv was actually importable inside the submodule.
    HAS_VIDEO_CHUNKER = HAS_OPENCV
except ImportError:
    HAS_VIDEO_CHUNKER = False
    VideoChunker = None  # type: ignore

# NOTE: the optional chunker names below may be bound to None at runtime;
# callers should check the corresponding HAS_*_CHUNKER flag first.
__all__ = [
    "BaseChunker",
    "Chunk",
    "TextChunker",
    "CodeChunker",
    "ImageChunker",
    "PDFChunker",
    "AudioChunker",
    "VideoChunker",
    "HAS_IMAGE_CHUNKER",
    "HAS_PDF_CHUNKER",
    "HAS_AUDIO_CHUNKER",
    "HAS_VIDEO_CHUNKER",
]
@@ -0,0 +1,198 @@
"""
Audio chunker for Stele.

Splits audio files into time-based segments with MFCC features.
Requires librosa for audio processing.

Install: pip install stele[audio]
"""

from typing import Any, Dict, List

from stele.chunkers.base import BaseChunker, Chunk

# Optional-dependency probe: librosa (and numpy, which it requires) may be
# absent. The module still imports either way; AudioChunker.__init__ raises
# ImportError when HAS_LIBROSA is False. The names are bound to None in the
# fallback so later references fail loudly rather than with NameError.
try:
    import librosa
    import numpy as np

    HAS_LIBROSA = True
except ImportError:
    HAS_LIBROSA = False
    librosa = None  # type: ignore
    # NOTE(review): np is not referenced elsewhere in this module —
    # presumably kept for API parity with sibling chunkers; confirm.
    np = None  # type: ignore
+ class AudioChunker(BaseChunker):
27
+ """
28
+ Chunker for audio files.
29
+
30
+ Supports:
31
+ - Time-based segmentation
32
+ - MFCC feature extraction
33
+ - Spectral features
34
+
35
+ Requires: librosa (pip install stele[audio])
36
+ """
37
+
38
+ def __init__(
39
+ self,
40
+ segment_duration: float = 30.0, # seconds
41
+ sample_rate: int = 22050,
42
+ n_mfcc: int = 13,
43
+ ):
44
+ """
45
+ Initialize audio chunker.
46
+
47
+ Args:
48
+ segment_duration: Duration of each segment in seconds
49
+ sample_rate: Sample rate for processing
50
+ n_mfcc: Number of MFCC coefficients
51
+ """
52
+ if not HAS_LIBROSA:
53
+ raise ImportError(
54
+ "librosa is required for audio support. "
55
+ "Install with: pip install stele[audio]"
56
+ )
57
+
58
+ self.segment_duration = segment_duration
59
+ self.sample_rate = sample_rate
60
+ self.n_mfcc = n_mfcc
61
+
62
+ def supported_extensions(self) -> List[str]:
63
+ """Return supported audio file extensions."""
64
+ return [
65
+ ".mp3",
66
+ ".wav",
67
+ ".ogg",
68
+ ".flac",
69
+ ".m4a",
70
+ ".aac",
71
+ ".wma",
72
+ ]
73
+
74
+ def chunk(
75
+ self,
76
+ content: Any,
77
+ document_path: str,
78
+ **kwargs: Any,
79
+ ) -> List[Chunk]:
80
+ """
81
+ Split audio into chunks.
82
+
83
+ Args:
84
+ content: Audio content (bytes or file path)
85
+ document_path: Path to source document
86
+ **kwargs: Additional options
87
+
88
+ Returns:
89
+ List of Chunk objects
90
+ """
91
+ # Load audio
92
+ if isinstance(content, bytes):
93
+ # Save to temp file for librosa
94
+ import tempfile
95
+
96
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
97
+ f.write(content)
98
+ temp_path = f.name
99
+
100
+ try:
101
+ y, sr = librosa.load(temp_path, sr=self.sample_rate)
102
+ finally:
103
+ import os
104
+
105
+ os.unlink(temp_path)
106
+ elif isinstance(content, str):
107
+ y, sr = librosa.load(content, sr=self.sample_rate)
108
+ else:
109
+ raise ValueError(f"Unsupported content type: {type(content)}")
110
+
111
+ # Compute segment samples
112
+ segment_samples = int(self.segment_duration * sr)
113
+
114
+ # Create chunks
115
+ chunks: List[Chunk] = []
116
+ chunk_index = 0
117
+
118
+ for start_sample in range(0, len(y), segment_samples):
119
+ end_sample = min(start_sample + segment_samples, len(y))
120
+ segment = y[start_sample:end_sample]
121
+
122
+ # Skip very short segments
123
+ if len(segment) < sr * 0.1: # Less than 0.1 seconds
124
+ continue
125
+
126
+ # Compute features
127
+ mfcc = self._compute_mfcc(segment, sr)
128
+ spectral_features = self._compute_spectral_features(segment, sr)
129
+
130
+ # Time range
131
+ start_time = start_sample / sr
132
+ end_time = end_sample / sr
133
+
134
+ chunk = Chunk(
135
+ content=segment.tobytes(),
136
+ modality="audio",
137
+ start_pos=int(start_time * 1000), # milliseconds
138
+ end_pos=int(end_time * 1000),
139
+ document_path=document_path,
140
+ chunk_index=chunk_index,
141
+ metadata={
142
+ "start_time": start_time,
143
+ "end_time": end_time,
144
+ "duration": end_time - start_time,
145
+ "sample_rate": sr,
146
+ "mfcc_mean": mfcc.tolist(),
147
+ "spectral_features": spectral_features,
148
+ },
149
+ )
150
+ chunks.append(chunk)
151
+ chunk_index += 1
152
+
153
+ # Handle empty audio
154
+ if not chunks:
155
+ chunks.append(
156
+ Chunk(
157
+ content=b"",
158
+ modality="audio",
159
+ start_pos=0,
160
+ end_pos=0,
161
+ document_path=document_path,
162
+ chunk_index=0,
163
+ metadata={"sample_rate": sr},
164
+ )
165
+ )
166
+
167
+ return chunks
168
+
169
+ def _compute_mfcc(self, y: Any, sr: int) -> Any:
170
+ """Compute MFCC features."""
171
+ mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=self.n_mfcc)
172
+ # Return mean across time
173
+ return mfcc.mean(axis=1)
174
+
175
+ def _compute_spectral_features(self, y: Any, sr: int) -> Dict[str, float]:
176
+ """Compute spectral features."""
177
+ # Spectral centroid
178
+ spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr).mean()
179
+
180
+ # Spectral bandwidth
181
+ spectral_bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr).mean()
182
+
183
+ # Spectral rolloff
184
+ spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr).mean()
185
+
186
+ # Zero crossing rate
187
+ zero_crossing_rate = librosa.feature.zero_crossing_rate(y).mean()
188
+
189
+ # RMS energy
190
+ rms = librosa.feature.rms(y=y).mean()
191
+
192
+ return {
193
+ "spectral_centroid": float(spectral_centroid),
194
+ "spectral_bandwidth": float(spectral_bandwidth),
195
+ "spectral_rolloff": float(spectral_rolloff),
196
+ "zero_crossing_rate": float(zero_crossing_rate),
197
+ "rms": float(rms),
198
+ }