stele-context 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- stele/__init__.py +36 -0
- stele/bm25.py +125 -0
- stele/chunkers/__init__.py +67 -0
- stele/chunkers/audio.py +198 -0
- stele/chunkers/base.py +307 -0
- stele/chunkers/code.py +613 -0
- stele/chunkers/image.py +277 -0
- stele/chunkers/numpy_compat.py +77 -0
- stele/chunkers/pdf.py +173 -0
- stele/chunkers/text.py +319 -0
- stele/chunkers/video.py +254 -0
- stele/cli.py +579 -0
- stele/cli_metadata.py +140 -0
- stele/config.py +195 -0
- stele/coordination.py +695 -0
- stele/core.py +18 -0
- stele/document_lock_storage.py +440 -0
- stele/engine.py +1735 -0
- stele/env_checks.py +153 -0
- stele/index.py +616 -0
- stele/index_store.py +175 -0
- stele/mcp_server.py +842 -0
- stele/mcp_stdio.py +969 -0
- stele/metadata_storage.py +233 -0
- stele/py.typed +0 -0
- stele/rwlock.py +52 -0
- stele/session.py +224 -0
- stele/session_storage.py +350 -0
- stele/storage.py +1040 -0
- stele/symbol_graph.py +327 -0
- stele/symbol_storage.py +256 -0
- stele/symbols.py +885 -0
- stele_context-0.7.0.dist-info/METADATA +554 -0
- stele_context-0.7.0.dist-info/RECORD +38 -0
- stele_context-0.7.0.dist-info/WHEEL +5 -0
- stele_context-0.7.0.dist-info/entry_points.txt +3 -0
- stele_context-0.7.0.dist-info/licenses/LICENSE +21 -0
- stele_context-0.7.0.dist-info/top_level.txt +1 -0
stele/__init__.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Stele — Local context cache for LLM agents with semantic chunking
|
|
3
|
+
and vector search.
|
|
4
|
+
|
|
5
|
+
Smart context cache that avoids re-reading unchanged files by caching
|
|
6
|
+
chunk data with semantic search. Routes documents through modality-specific
|
|
7
|
+
chunkers, stores chunk content with HNSW vector indexing, and provides
|
|
8
|
+
fast retrieval via semantic search.
|
|
9
|
+
|
|
10
|
+
Key Features:
|
|
11
|
+
- Dynamic semantic chunking with modality-specific chunkers
|
|
12
|
+
- HNSW vector index for O(log n) similarity search
|
|
13
|
+
- Chunk content persistence for instant retrieval
|
|
14
|
+
- Change detection with hash + semantic comparison
|
|
15
|
+
- Session management with rollback support
|
|
16
|
+
- Built-in MCP server for agent integration
|
|
17
|
+
|
|
18
|
+
All operations are 100% offline and local-only. No internet access required.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
__version__ = "0.9.0"
|
|
22
|
+
__author__ = "Stele Contributors"
|
|
23
|
+
__license__ = "MIT"
|
|
24
|
+
|
|
25
|
+
from stele.engine import Stele
|
|
26
|
+
from stele.storage import StorageBackend
|
|
27
|
+
from stele.session import SessionManager
|
|
28
|
+
from stele.mcp_server import MCPServer
|
|
29
|
+
|
|
30
|
+
__all__ = [
|
|
31
|
+
"Stele",
|
|
32
|
+
"StorageBackend",
|
|
33
|
+
"SessionManager",
|
|
34
|
+
"MCPServer",
|
|
35
|
+
"__version__",
|
|
36
|
+
]
|
stele/bm25.py
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
"""
|
|
2
|
+
BM25 keyword index for Stele.
|
|
3
|
+
|
|
4
|
+
Provides term-frequency based scoring to complement HNSW vector search
|
|
5
|
+
for hybrid retrieval. Pure Python implementation with zero dependencies.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import math
|
|
9
|
+
import re
|
|
10
|
+
from collections import Counter
|
|
11
|
+
from typing import Dict, List
|
|
12
|
+
|
|
13
|
+
_WORD_RE = re.compile(r"\b[a-zA-Z_][a-zA-Z0-9_]*\b")
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class BM25Index:
    """
    Okapi BM25 keyword index for hybrid search.

    Maintains term frequencies and inverse document frequencies
    for fast keyword scoring. Designed to run alongside HNSW
    vector search — HNSW finds semantic neighbours, BM25 boosts
    exact keyword matches.
    """

    def __init__(self, k1: float = 1.5, b: float = 0.75):
        """
        Initialize an empty index.

        Args:
            k1: Term-frequency saturation parameter (Okapi default 1.5).
            b: Document-length normalization strength (Okapi default 0.75).
        """
        self.k1 = k1
        self.b = b
        # Number of documents containing each term (document frequency).
        self.doc_freqs: Counter = Counter()
        # Token count per document id.
        self.doc_lengths: Dict[str, int] = {}
        # Per-document term -> occurrence count maps.
        self.term_freqs: Dict[str, Counter] = {}
        # Average document length; 0.0 while the index is empty.
        self.avg_dl: float = 0.0
        self.n_docs: int = 0

    def add_document(self, doc_id: str, text: str) -> None:
        """Add or replace a document in the index."""
        # Replacement semantics: remove the old statistics first so
        # doc_freqs and n_docs are not double-counted.
        if doc_id in self.term_freqs:
            self.remove_document(doc_id)

        terms = self._tokenize(text)
        self.term_freqs[doc_id] = Counter(terms)
        self.doc_lengths[doc_id] = len(terms)
        for term in set(terms):
            self.doc_freqs[term] += 1
        self.n_docs += 1
        self._update_avg_dl()

    def remove_document(self, doc_id: str) -> None:
        """Remove a document from the index (no-op if absent)."""
        if doc_id not in self.term_freqs:
            return
        for term in self.term_freqs[doc_id]:
            self.doc_freqs[term] -= 1
            # Drop terms no document contains so doc_freqs stays compact.
            if self.doc_freqs[term] <= 0:
                del self.doc_freqs[term]
        del self.term_freqs[doc_id]
        del self.doc_lengths[doc_id]
        self.n_docs -= 1
        self._update_avg_dl()

    def score(self, query: str, doc_id: str) -> float:
        """Compute BM25 score for a query against a single document."""
        return self._score_terms(self._tokenize(query), doc_id)

    def _score_terms(self, query_terms: List[str], doc_id: str) -> float:
        """Compute BM25 score from pre-tokenized query terms."""
        # Unknown document or empty index -> no evidence, score 0.
        if doc_id not in self.term_freqs or self.avg_dl == 0:
            return 0.0

        total = 0.0
        dl = self.doc_lengths[doc_id]
        tf_map = self.term_freqs[doc_id]

        for term in query_terms:
            n = self.doc_freqs.get(term, 0)
            if n == 0:
                continue
            # Smoothed IDF (Lucene variant): always non-negative thanks to +1.
            idf = math.log((self.n_docs - n + 0.5) / (n + 0.5) + 1.0)
            tf = tf_map.get(term, 0)
            numerator = tf * (self.k1 + 1.0)
            denominator = tf + self.k1 * (1.0 - self.b + self.b * dl / self.avg_dl)
            total += idf * numerator / denominator

        return total

    def score_batch(self, query: str, doc_ids: List[str]) -> Dict[str, float]:
        """Score multiple documents against a query (tokenizes once)."""
        terms = self._tokenize(query)
        return {doc_id: self._score_terms(terms, doc_id) for doc_id in doc_ids}

    def _tokenize(self, text: str) -> List[str]:
        """Tokenize text into lowercase word terms (len > 1)."""
        return [w.lower() for w in _WORD_RE.findall(text) if len(w) > 1]

    def _update_avg_dl(self) -> None:
        """Recompute average document length."""
        if self.n_docs > 0:
            self.avg_dl = sum(self.doc_lengths.values()) / self.n_docs
        else:
            self.avg_dl = 0.0

    def to_dict(self) -> Dict:
        """
        Serialize to a plain dict for persistence.

        Returns a snapshot: all mutable containers are copied, so later
        mutations of the index do not alter an already-produced dict
        (previously ``doc_lengths`` was shared by reference).
        """
        return {
            "k1": self.k1,
            "b": self.b,
            "doc_freqs": dict(self.doc_freqs),
            "doc_lengths": dict(self.doc_lengths),
            "term_freqs": {doc_id: dict(tf) for doc_id, tf in self.term_freqs.items()},
            "avg_dl": self.avg_dl,
            "n_docs": self.n_docs,
        }

    @classmethod
    def from_dict(cls, data: Dict) -> "BM25Index":
        """
        Reconstruct from serialized dict.

        Copies the incoming containers so the new index never aliases the
        caller's data structure.
        """
        idx = cls(k1=data["k1"], b=data["b"])
        idx.doc_freqs = Counter(data["doc_freqs"])
        idx.doc_lengths = dict(data["doc_lengths"])
        idx.term_freqs = {
            doc_id: Counter(tf) for doc_id, tf in data["term_freqs"].items()
        }
        idx.avg_dl = data["avg_dl"]
        idx.n_docs = data["n_docs"]
        return idx
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Stele chunkers module.
|
|
3
|
+
|
|
4
|
+
Provides modality-specific chunkers for different file types:
|
|
5
|
+
- TextChunker: Plain text files (zero dependencies)
|
|
6
|
+
- CodeChunker: Code files with AST awareness (zero dependencies)
|
|
7
|
+
- ImageChunker: Image files (requires Pillow)
|
|
8
|
+
- PDFChunker: PDF files (requires pymupdf)
|
|
9
|
+
- AudioChunker: Audio files (requires librosa)
|
|
10
|
+
- VideoChunker: Video files (requires opencv)
|
|
11
|
+
|
|
12
|
+
All chunkers follow the same interface and can be registered with Stele.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from stele.chunkers.base import BaseChunker, Chunk
|
|
16
|
+
from stele.chunkers.text import TextChunker
|
|
17
|
+
from stele.chunkers.code import CodeChunker
|
|
18
|
+
|
|
19
|
+
# Optional chunkers (require additional dependencies)
|
|
20
|
+
# Each chunker module imports successfully even without its optional dependency,
|
|
21
|
+
# but the constructor raises ImportError. Check the inner availability flag.
|
|
22
|
+
try:
|
|
23
|
+
from stele.chunkers.image import ImageChunker, HAS_PIL
|
|
24
|
+
|
|
25
|
+
HAS_IMAGE_CHUNKER = HAS_PIL
|
|
26
|
+
except ImportError:
|
|
27
|
+
HAS_IMAGE_CHUNKER = False
|
|
28
|
+
ImageChunker = None # type: ignore
|
|
29
|
+
|
|
30
|
+
try:
|
|
31
|
+
from stele.chunkers.pdf import PDFChunker, HAS_PYMUPDF
|
|
32
|
+
|
|
33
|
+
HAS_PDF_CHUNKER = HAS_PYMUPDF
|
|
34
|
+
except ImportError:
|
|
35
|
+
HAS_PDF_CHUNKER = False
|
|
36
|
+
PDFChunker = None # type: ignore
|
|
37
|
+
|
|
38
|
+
try:
|
|
39
|
+
from stele.chunkers.audio import AudioChunker, HAS_LIBROSA
|
|
40
|
+
|
|
41
|
+
HAS_AUDIO_CHUNKER = HAS_LIBROSA
|
|
42
|
+
except ImportError:
|
|
43
|
+
HAS_AUDIO_CHUNKER = False
|
|
44
|
+
AudioChunker = None # type: ignore
|
|
45
|
+
|
|
46
|
+
try:
|
|
47
|
+
from stele.chunkers.video import VideoChunker, HAS_OPENCV
|
|
48
|
+
|
|
49
|
+
HAS_VIDEO_CHUNKER = HAS_OPENCV
|
|
50
|
+
except ImportError:
|
|
51
|
+
HAS_VIDEO_CHUNKER = False
|
|
52
|
+
VideoChunker = None # type: ignore
|
|
53
|
+
|
|
54
|
+
__all__ = [
|
|
55
|
+
"BaseChunker",
|
|
56
|
+
"Chunk",
|
|
57
|
+
"TextChunker",
|
|
58
|
+
"CodeChunker",
|
|
59
|
+
"ImageChunker",
|
|
60
|
+
"PDFChunker",
|
|
61
|
+
"AudioChunker",
|
|
62
|
+
"VideoChunker",
|
|
63
|
+
"HAS_IMAGE_CHUNKER",
|
|
64
|
+
"HAS_PDF_CHUNKER",
|
|
65
|
+
"HAS_AUDIO_CHUNKER",
|
|
66
|
+
"HAS_VIDEO_CHUNKER",
|
|
67
|
+
]
|
stele/chunkers/audio.py
ADDED
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Audio chunker for Stele.
|
|
3
|
+
|
|
4
|
+
Splits audio files into time-based segments with MFCC features.
|
|
5
|
+
Requires librosa for audio processing.
|
|
6
|
+
|
|
7
|
+
Install: pip install stele[audio]
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from typing import Any, Dict, List
|
|
11
|
+
|
|
12
|
+
from stele.chunkers.base import BaseChunker, Chunk
|
|
13
|
+
|
|
14
|
+
# Check for librosa
|
|
15
|
+
try:
|
|
16
|
+
import librosa
|
|
17
|
+
import numpy as np
|
|
18
|
+
|
|
19
|
+
HAS_LIBROSA = True
|
|
20
|
+
except ImportError:
|
|
21
|
+
HAS_LIBROSA = False
|
|
22
|
+
librosa = None # type: ignore
|
|
23
|
+
np = None # type: ignore
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class AudioChunker(BaseChunker):
    """
    Chunker that splits audio files into fixed-duration segments.

    Each segment carries MFCC means and summary spectral features in its
    metadata. Positions are expressed in milliseconds.

    Requires: librosa (pip install stele[audio])
    """

    def __init__(
        self,
        segment_duration: float = 30.0,  # seconds
        sample_rate: int = 22050,
        n_mfcc: int = 13,
    ):
        """
        Initialize audio chunker.

        Args:
            segment_duration: Duration of each segment in seconds
            sample_rate: Sample rate for processing
            n_mfcc: Number of MFCC coefficients

        Raises:
            ImportError: If librosa is not installed.
        """
        if not HAS_LIBROSA:
            raise ImportError(
                "librosa is required for audio support. "
                "Install with: pip install stele[audio]"
            )

        self.segment_duration = segment_duration
        self.sample_rate = sample_rate
        self.n_mfcc = n_mfcc

    def supported_extensions(self) -> List[str]:
        """Return supported audio file extensions."""
        return [".mp3", ".wav", ".ogg", ".flac", ".m4a", ".aac", ".wma"]

    def chunk(
        self,
        content: Any,
        document_path: str,
        **kwargs: Any,
    ) -> List[Chunk]:
        """
        Split audio into time-based chunks with extracted features.

        Args:
            content: Audio content (bytes or file path)
            document_path: Path to source document
            **kwargs: Additional options

        Returns:
            List of Chunk objects (a single empty chunk if no usable audio)
        """
        y, sr = self._load_audio(content)

        samples_per_segment = int(self.segment_duration * sr)
        min_samples = sr * 0.1  # discard segments shorter than 0.1 s

        chunks: List[Chunk] = []
        for start in range(0, len(y), samples_per_segment):
            end = min(start + samples_per_segment, len(y))
            segment = y[start:end]

            if len(segment) < min_samples:
                continue

            t0 = start / sr
            t1 = end / sr

            chunks.append(
                Chunk(
                    content=segment.tobytes(),
                    modality="audio",
                    start_pos=int(t0 * 1000),  # milliseconds
                    end_pos=int(t1 * 1000),
                    document_path=document_path,
                    chunk_index=len(chunks),
                    metadata={
                        "start_time": t0,
                        "end_time": t1,
                        "duration": t1 - t0,
                        "sample_rate": sr,
                        "mfcc_mean": self._compute_mfcc(segment, sr).tolist(),
                        "spectral_features": self._compute_spectral_features(
                            segment, sr
                        ),
                    },
                )
            )

        # Degenerate input (empty / too-short audio): emit one empty chunk
        # so downstream code always receives at least one result.
        if not chunks:
            return [
                Chunk(
                    content=b"",
                    modality="audio",
                    start_pos=0,
                    end_pos=0,
                    document_path=document_path,
                    chunk_index=0,
                    metadata={"sample_rate": sr},
                )
            ]

        return chunks

    def _load_audio(self, content: Any) -> Any:
        """Decode content (bytes or path) into a (samples, sample_rate) pair."""
        if isinstance(content, bytes):
            import os
            import tempfile

            # librosa wants a path, so spill the bytes to a temp file.
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
                handle.write(content)
                tmp_path = handle.name
            try:
                return librosa.load(tmp_path, sr=self.sample_rate)
            finally:
                os.unlink(tmp_path)
        if isinstance(content, str):
            return librosa.load(content, sr=self.sample_rate)
        raise ValueError(f"Unsupported content type: {type(content)}")

    def _compute_mfcc(self, y: Any, sr: int) -> Any:
        """Compute the mean MFCC vector across time frames."""
        return librosa.feature.mfcc(y=y, sr=sr, n_mfcc=self.n_mfcc).mean(axis=1)

    def _compute_spectral_features(self, y: Any, sr: int) -> Dict[str, float]:
        """Compute summary spectral features as plain floats."""
        raw = {
            "spectral_centroid": librosa.feature.spectral_centroid(y=y, sr=sr).mean(),
            "spectral_bandwidth": librosa.feature.spectral_bandwidth(y=y, sr=sr).mean(),
            "spectral_rolloff": librosa.feature.spectral_rolloff(y=y, sr=sr).mean(),
            "zero_crossing_rate": librosa.feature.zero_crossing_rate(y).mean(),
            "rms": librosa.feature.rms(y=y).mean(),
        }
        return {name: float(value) for name, value in raw.items()}
|