brainlayer 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- brainlayer/__init__.py +3 -0
- brainlayer/cli/__init__.py +1545 -0
- brainlayer/cli/wizard.py +132 -0
- brainlayer/cli_new.py +151 -0
- brainlayer/client.py +164 -0
- brainlayer/clustering.py +736 -0
- brainlayer/daemon.py +1105 -0
- brainlayer/dashboard/README.md +129 -0
- brainlayer/dashboard/__init__.py +5 -0
- brainlayer/dashboard/app.py +151 -0
- brainlayer/dashboard/search.py +229 -0
- brainlayer/dashboard/views.py +230 -0
- brainlayer/embeddings.py +131 -0
- brainlayer/engine.py +550 -0
- brainlayer/index_new.py +87 -0
- brainlayer/mcp/__init__.py +1558 -0
- brainlayer/migrate.py +205 -0
- brainlayer/paths.py +43 -0
- brainlayer/pipeline/__init__.py +47 -0
- brainlayer/pipeline/analyze_communication.py +508 -0
- brainlayer/pipeline/brain_graph.py +567 -0
- brainlayer/pipeline/chat_tags.py +63 -0
- brainlayer/pipeline/chunk.py +422 -0
- brainlayer/pipeline/classify.py +472 -0
- brainlayer/pipeline/cluster_sampling.py +73 -0
- brainlayer/pipeline/enrichment.py +810 -0
- brainlayer/pipeline/extract.py +66 -0
- brainlayer/pipeline/extract_claude_desktop.py +149 -0
- brainlayer/pipeline/extract_corrections.py +231 -0
- brainlayer/pipeline/extract_markdown.py +195 -0
- brainlayer/pipeline/extract_whatsapp.py +227 -0
- brainlayer/pipeline/git_overlay.py +301 -0
- brainlayer/pipeline/longitudinal_analyzer.py +568 -0
- brainlayer/pipeline/obsidian_export.py +455 -0
- brainlayer/pipeline/operation_grouping.py +486 -0
- brainlayer/pipeline/plan_linking.py +313 -0
- brainlayer/pipeline/sanitize.py +549 -0
- brainlayer/pipeline/semantic_style.py +574 -0
- brainlayer/pipeline/session_enrichment.py +472 -0
- brainlayer/pipeline/style_embed.py +67 -0
- brainlayer/pipeline/style_index.py +139 -0
- brainlayer/pipeline/temporal_chains.py +203 -0
- brainlayer/pipeline/time_batcher.py +248 -0
- brainlayer/pipeline/unified_timeline.py +569 -0
- brainlayer/storage.py +66 -0
- brainlayer/store.py +155 -0
- brainlayer/taxonomy.json +80 -0
- brainlayer/vector_store.py +1891 -0
- brainlayer-1.0.0.dist-info/METADATA +313 -0
- brainlayer-1.0.0.dist-info/RECORD +53 -0
- brainlayer-1.0.0.dist-info/WHEEL +4 -0
- brainlayer-1.0.0.dist-info/entry_points.txt +4 -0
- brainlayer-1.0.0.dist-info/licenses/LICENSE +190 -0
brainlayer/migrate.py
ADDED
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
"""Migration script to convert ChromaDB data to sqlite-vec."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import os
|
|
5
|
+
import sys
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
# Disable ChromaDB telemetry
|
|
9
|
+
os.environ["ANONYMIZED_TELEMETRY"] = "false"
|
|
10
|
+
|
|
11
|
+
try:
|
|
12
|
+
import chromadb
|
|
13
|
+
from chromadb.config import Settings
|
|
14
|
+
|
|
15
|
+
CHROMADB_AVAILABLE = True
|
|
16
|
+
except ImportError:
|
|
17
|
+
CHROMADB_AVAILABLE = False
|
|
18
|
+
|
|
19
|
+
from .embeddings import get_embedding_model
|
|
20
|
+
from .vector_store import VectorStore
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
# Paths
|
|
25
|
+
CHROMADB_PATH = Path.home() / ".local" / "share" / "brainlayer" / "chromadb.backup"
|
|
26
|
+
SQLITE_PATH = Path.home() / ".local" / "share" / "brainlayer" / "brainlayer.db"
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# Embedding width that is accepted as-is; a vector of any other size (or a
# missing vector) is re-embedded before insertion.
_EMBEDDING_DIM = 1024
# Batch sizes: records fetched per ChromaDB call, rows per sqlite-vec upsert,
# and texts per re-embedding call.
_FETCH_BATCH = 5000
_INSERT_BATCH = 1000
_REEMBED_BATCH = 32
# Metadata keys that are promoted to top-level chunk fields instead of being
# kept inside the chunk's "metadata" dict.
_PROMOTED_METADATA_KEYS = frozenset({"source_file", "project", "content_type", "value_type", "char_count"})


def _build_chunk(chunk_id, document, metadata):
    """Build the chunk dict passed to ``upsert_chunks`` from one ChromaDB record."""
    return {
        "id": chunk_id,
        "content": document,
        "metadata": {k: v for k, v in metadata.items() if k not in _PROMOTED_METADATA_KEYS},
        "source_file": metadata.get("source_file", "unknown"),
        "project": metadata.get("project"),
        "content_type": metadata.get("content_type"),
        "value_type": metadata.get("value_type"),
        "char_count": metadata.get("char_count", len(document)),
    }


def _usable_embedding(embedding):
    """Return *embedding* as a plain list if it has the expected width, else ``None``.

    Uses ``is None`` / ``len`` rather than truthiness so that array-like
    embeddings (whose truth value is ambiguous) are handled as well as lists.
    """
    if embedding is None or len(embedding) != _EMBEDDING_DIM:
        return None
    return embedding.tolist() if hasattr(embedding, "tolist") else list(embedding)


def _reembed_missing(embedding_model, chunks, embeddings_batch, indices):
    """Fill ``embeddings_batch`` in place for the positions listed in *indices*.

    A failed encode call is logged and the affected positions receive an
    all-zero placeholder vector so the rest of the migration can continue.
    """
    for start in range(0, len(indices), _REEMBED_BATCH):
        batch_indices = indices[start : start + _REEMBED_BATCH]
        texts = [chunks[idx]["content"] for idx in batch_indices]
        try:
            vectors = embedding_model._load_model().encode(texts, convert_to_numpy=True, show_progress_bar=False)
            for j, idx in enumerate(batch_indices):
                embeddings_batch[idx] = vectors[j].tolist()
        except Exception as e:
            logger.error(f"Failed to re-embed batch: {e}")
            for idx in batch_indices:
                embeddings_batch[idx] = [0.0] * _EMBEDDING_DIM


def migrate_from_chromadb() -> bool:
    """Migrate data from ChromaDB to sqlite-vec.

    Copies every record of every collection found under ``CHROMADB_PATH`` into
    a ``VectorStore`` at ``SQLITE_PATH``, fetching and inserting in batches.
    Records whose stored embedding is missing or not ``_EMBEDDING_DIM`` wide
    are re-embedded; the embedding model is only loaded if that is needed.

    Returns:
        True when the migration ran to completion; False when it was skipped
        (ChromaDB not importable, no ChromaDB directory, no collections) or
        when any exception occurred. Exceptions are logged and printed, not
        raised.
    """
    if not CHROMADB_AVAILABLE:
        print("ChromaDB not available, skipping migration")
        return False

    if not CHROMADB_PATH.exists():
        print("No existing ChromaDB found, skipping migration")
        return False

    print(f"Migrating from ChromaDB at {CHROMADB_PATH}")

    # Tracks the currently open store so the ``finally`` clause can release it
    # if anything below raises.
    vector_store = None
    try:
        # Connect to ChromaDB
        client = chromadb.PersistentClient(path=str(CHROMADB_PATH), settings=Settings(anonymized_telemetry=False))

        # Get all collections. Some ChromaDB releases return bare collection
        # names here instead of collection objects; resolve those explicitly.
        collections = [client.get_collection(c) if isinstance(c, str) else c for c in client.list_collections()]
        if not collections:
            print("No collections found in ChromaDB")
            return False

        collection_names = [c.name for c in collections]
        total_all = sum(c.count() for c in collections)
        print(f"Found {len(collections)} collections with {total_all} total chunks: {collection_names}")

        # Create sqlite-vec store
        print(f"Creating sqlite-vec database at {SQLITE_PATH}")
        vector_store = VectorStore(SQLITE_PATH)

        # Lazy-load embedding model only if needed
        embedding_model = None

        for collection in collections:
            total_count = collection.count()
            print(f"\n--- Migrating collection '{collection.name}' ({total_count} chunks) ---")

            migrated = 0
            need_reembed_total = 0

            # Fetch and insert in batches to avoid OOM
            for offset in range(0, total_count, _FETCH_BATCH):
                batch_data = collection.get(
                    include=["documents", "metadatas", "embeddings"],
                    limit=_FETCH_BATCH,
                    offset=offset,
                )

                ids = batch_data["ids"]
                if not ids:
                    break

                documents = batch_data["documents"]
                metadatas = batch_data["metadatas"]
                raw_embeddings = batch_data["embeddings"]

                chunks = []
                embeddings_batch = []
                need_reembed = []

                for i, chunk_id in enumerate(ids):
                    # A record may lack a document or metadata; fall back to
                    # empty values rather than aborting the whole migration.
                    document = (documents[i] if documents is not None else None) or ""
                    metadata = (metadatas[i] if metadatas is not None else None) or {}
                    embedding = raw_embeddings[i] if raw_embeddings is not None else None

                    chunks.append(_build_chunk(chunk_id, document, metadata))

                    usable = _usable_embedding(embedding)
                    embeddings_batch.append(usable)
                    if usable is None:
                        need_reembed.append(i)

                # Re-embed chunks with wrong/missing dimensions
                if need_reembed:
                    need_reembed_total += len(need_reembed)
                    if embedding_model is None:
                        print("Loading bge-large-en-v1.5 for re-embedding...")
                        embedding_model = get_embedding_model()
                    _reembed_missing(embedding_model, chunks, embeddings_batch, need_reembed)

                # Insert this batch into sqlite-vec
                for ins_start in range(0, len(chunks), _INSERT_BATCH):
                    ins_end = ins_start + _INSERT_BATCH
                    vector_store.upsert_chunks(chunks[ins_start:ins_end], embeddings_batch[ins_start:ins_end])

                migrated += len(chunks)
                reembed_note = f" ({len(need_reembed)} re-embedded)" if need_reembed else ""
                print(f"  {migrated}/{total_count} chunks{reembed_note}")

            if need_reembed_total:
                print(f"  ({need_reembed_total} total re-embedded in this collection)")

        vector_store.close()
        vector_store = None

        # Verify migration by re-opening the database and counting rows
        vector_store = VectorStore(SQLITE_PATH)
        final_count = vector_store.count()
        vector_store.close()
        vector_store = None

        print(f"\nMigration complete: {final_count} chunks in sqlite-vec (from {total_all} in ChromaDB)")

        return True

    except Exception as e:
        logger.exception("Migration failed: %s", e)
        print(f"Migration failed: {e}")
        return False

    finally:
        # Release the store if a failure left it open.
        if vector_store is not None:
            try:
                vector_store.close()
            except Exception:
                logger.warning("Failed to close sqlite-vec store cleanly", exc_info=True)
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def main():
    """Main migration entry point.

    Configures logging, asks for confirmation before replacing an existing
    sqlite-vec database, runs ``migrate_from_chromadb`` and exits with status 1
    if the migration failed or was skipped.

    The existing database (and its ``-shm`` / ``-wal`` side files) is only
    removed when a migration can actually start, i.e. ChromaDB is importable
    and the ChromaDB directory exists. Otherwise the database is left alone
    and ``migrate_from_chromadb`` reports why it skipped.
    """
    logging.basicConfig(level=logging.INFO)

    print("זיכרון - Migration Tool")
    print("=" * 50)

    # Same preconditions migrate_from_chromadb() checks before doing any work;
    # evaluating them first avoids deleting data for a migration that will
    # immediately be skipped.
    source_ready = CHROMADB_AVAILABLE and CHROMADB_PATH.exists()

    if source_ready and SQLITE_PATH.exists():
        try:
            response = input(f"sqlite-vec database already exists at {SQLITE_PATH}. Overwrite? (y/N): ")
        except EOFError:
            # No interactive stdin: fall back to the prompt's default ("N").
            response = ""
        if response.strip().lower() != "y":
            print("Migration cancelled")
            return

        # Remove existing database and WAL/SHM files
        SQLITE_PATH.unlink()
        for suffix in ["-shm", "-wal"]:
            side_file = SQLITE_PATH.with_name(SQLITE_PATH.name + suffix)
            if side_file.exists():
                side_file.unlink()

    success = migrate_from_chromadb()

    if success:
        print("\nMigration completed successfully!")
        print("You can now use the new fast daemon service:")
        print("  brainlayer search 'your query'")
    else:
        print("\nMigration failed or skipped")
        sys.exit(1)


if __name__ == "__main__":
    main()
|
brainlayer/paths.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""Centralized data paths for BrainLayer.
|
|
2
|
+
|
|
3
|
+
The database currently lives at ~/.local/share/zikaron/ (legacy naming from
|
|
4
|
+
before the BrainLayer extraction). An env var override is supported for custom
|
|
5
|
+
installations.
|
|
6
|
+
|
|
7
|
+
Resolution order:
|
|
8
|
+
1. BRAINLAYER_DB env var (full path to .db file)
|
|
9
|
+
2. ~/.local/share/zikaron/zikaron.db (current production location)
|
|
10
|
+
3. ~/.local/share/brainlayer/brainlayer.db (future default)
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import os
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
|
|
16
|
+
# Legacy path (where data actually lives today)
|
|
17
|
+
_LEGACY_DB_PATH = Path.home() / ".local" / "share" / "zikaron" / "zikaron.db"
|
|
18
|
+
|
|
19
|
+
# New canonical path (for fresh installs)
|
|
20
|
+
_CANONICAL_DB_PATH = Path.home() / ".local" / "share" / "brainlayer" / "brainlayer.db"
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def get_db_path() -> Path:
    """Resolve the BrainLayer database path.

    Resolution order:
        1. The ``BRAINLAYER_DB`` environment variable, if set and non-empty.
           A leading ``~`` is expanded to the user's home directory.
        2. The legacy zikaron path, if that file exists.
        3. The canonical brainlayer path; its parent directory is created
           if it does not exist yet.

    Returns:
        The path to the database file. The file itself is not created here
        and, for cases 1 and 3, may not exist.
    """
    env = os.environ.get("BRAINLAYER_DB")
    if env:
        # Values coming from config files or service definitions are not
        # shell-expanded, so resolve "~" here instead of returning a path
        # under a literal "~" directory.
        return Path(env).expanduser()

    if _LEGACY_DB_PATH.exists():
        return _LEGACY_DB_PATH

    # Ensure parent dir exists for fresh installs
    _CANONICAL_DB_PATH.parent.mkdir(parents=True, exist_ok=True)
    return _CANONICAL_DB_PATH
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
# Convenience: pre-resolved default for import
|
|
43
|
+
DEFAULT_DB_PATH = get_db_path()
|
brainlayer/pipeline/__init__.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""Pipeline stages for processing Claude Code conversations."""
|
|
2
|
+
|
|
3
|
+
from .chunk import chunk_content
|
|
4
|
+
from .classify import classify_content
|
|
5
|
+
from .enrichment import build_external_prompt
|
|
6
|
+
from .extract import extract_system_prompts
|
|
7
|
+
from .extract_markdown import (
|
|
8
|
+
classify_by_path,
|
|
9
|
+
extract_markdown_content,
|
|
10
|
+
find_markdown_files,
|
|
11
|
+
parse_markdown,
|
|
12
|
+
)
|
|
13
|
+
from .sanitize import (
|
|
14
|
+
Replacement,
|
|
15
|
+
SanitizeConfig,
|
|
16
|
+
Sanitizer,
|
|
17
|
+
SanitizeResult,
|
|
18
|
+
)
|
|
19
|
+
from .semantic_style import (
|
|
20
|
+
SemanticStyleAnalysis,
|
|
21
|
+
SemanticStyleAnalyzer,
|
|
22
|
+
TopicCluster,
|
|
23
|
+
analyze_semantic_style,
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
__all__ = [
|
|
27
|
+
"extract_system_prompts",
|
|
28
|
+
"classify_content",
|
|
29
|
+
"chunk_content",
|
|
30
|
+
# Markdown extraction
|
|
31
|
+
"find_markdown_files",
|
|
32
|
+
"parse_markdown",
|
|
33
|
+
"classify_by_path",
|
|
34
|
+
"extract_markdown_content",
|
|
35
|
+
# Semantic style analysis
|
|
36
|
+
"SemanticStyleAnalyzer",
|
|
37
|
+
"SemanticStyleAnalysis",
|
|
38
|
+
"TopicCluster",
|
|
39
|
+
"analyze_semantic_style",
|
|
40
|
+
# PII sanitization
|
|
41
|
+
"Sanitizer",
|
|
42
|
+
"SanitizeConfig",
|
|
43
|
+
"SanitizeResult",
|
|
44
|
+
"Replacement",
|
|
45
|
+
# External enrichment (sanitized)
|
|
46
|
+
"build_external_prompt",
|
|
47
|
+
]
|