memstrata 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- memstrata/__init__.py +2 -0
- memstrata/cli/__init__.py +0 -0
- memstrata/cli/cd_hook.py +148 -0
- memstrata/cli/ingest.py +432 -0
- memstrata/cli/main.py +340 -0
- memstrata/config/__init__.py +0 -0
- memstrata/config/keychain.py +47 -0
- memstrata/layer3/__init__.py +0 -0
- memstrata/layer3/_db.py +638 -0
- memstrata/layer3/api_server.py +2298 -0
- memstrata/layer3/ingestion/__init__.py +115 -0
- memstrata/layer3/ingestion/branch_switch.py +230 -0
- memstrata/layer3/ingestion/chunker.py +351 -0
- memstrata/layer3/ingestion/denylist.py +307 -0
- memstrata/layer3/ingestion/lifecycle.py +312 -0
- memstrata/layer3/ingestion/orchestrator.py +664 -0
- memstrata/layer3/ingestion/progress.py +209 -0
- memstrata/layer3/ingestion/resource_policy.py +297 -0
- memstrata/layer3/ingestion/watcher.py +523 -0
- memstrata/layer3/mcp_app.py +361 -0
- memstrata/layer3/mcp_server.py +196 -0
- memstrata/layer3/ollama_health.py +181 -0
- memstrata/layer3/pricing/__init__.py +0 -0
- memstrata/layer3/pricing/fx.py +147 -0
- memstrata/layer3/pricing/lookup.py +166 -0
- memstrata/layer3/pricing/openrouter_sync.py +174 -0
- memstrata/layer3/pricing/pricing_matrix.json +78 -0
- memstrata/layer3/retrieval.py +132 -0
- memstrata/workers/__init__.py +0 -0
- memstrata/workers/embedding_worker.py +301 -0
- memstrata-0.6.0.dist-info/METADATA +182 -0
- memstrata-0.6.0.dist-info/RECORD +36 -0
- memstrata-0.6.0.dist-info/WHEEL +5 -0
- memstrata-0.6.0.dist-info/entry_points.txt +2 -0
- memstrata-0.6.0.dist-info/licenses/LICENSE +21 -0
- memstrata-0.6.0.dist-info/top_level.txt +1 -0
memstrata/__init__.py
ADDED
|
File without changes
|
memstrata/cli/cd_hook.py
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Shell cd-hook generation and idempotent installation.
|
|
3
|
+
|
|
4
|
+
Hook text and write/remove patterns taken verbatim from
|
|
5
|
+
v5_1_reference/critical_snippets.py §2. The idempotent marker pair
|
|
6
|
+
ensures repeated writes replace rather than duplicate the block.
|
|
7
|
+
|
|
8
|
+
Hard Rule 54: hooks only check for .git/ — no process scanning.
|
|
9
|
+
"""
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import os
|
|
13
|
+
import sys
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import Optional
|
|
16
|
+
|
|
17
|
+
# Sentinel comment lines that bracket the hook block inside a shell config
# file. hook_for_shell emits them around the hook body; write_hook and
# remove_hook locate the block by searching for these exact strings.
_HOOK_MARKER_BEGIN = "# >>> memstrata cd-hook >>>"
_HOOK_MARKER_END = "# <<< memstrata cd-hook <<<"
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def hook_for_shell(shell: str) -> str:
    """
    Build the marker-delimited hook snippet for *shell*.

    Supported values are "zsh", "bash", "fish" and "powershell"; anything
    else raises ValueError. The result starts with a newline followed by
    _HOOK_MARKER_BEGIN and ends with _HOOK_MARKER_END plus a newline, so
    write_hook can find and replace a previously written block.
    """
    # One complete snippet per shell. Every variant only tests for a .git
    # directory and, when the memstrata command is available, runs
    # `memstrata register` for the current directory in the background.
    snippets = {
        "zsh": """
ml_cd_hook() {
    if [ -d ".git" ] && command -v memstrata >/dev/null 2>&1; then
        (memstrata register "$PWD" --quiet >/dev/null 2>&1 &)
    fi
}
typeset -gaU chpwd_functions
chpwd_functions+=(ml_cd_hook)
""",
        "bash": """
ml_cd_hook() {
    if [ -d ".git" ] && command -v memstrata >/dev/null 2>&1; then
        (memstrata register "$PWD" --quiet >/dev/null 2>&1 &)
    fi
}
PROMPT_COMMAND="ml_cd_hook;${PROMPT_COMMAND:-:}"
""",
        "fish": """
function ml_cd_hook --on-variable PWD
    if test -d .git
        if command -v memstrata >/dev/null 2>&1
            memstrata register "$PWD" --quiet >/dev/null 2>&1 &
        end
    end
end
""",
        "powershell": """
$global:__MlOriginalPrompt = if (Test-Path Function:prompt) { Get-Item Function:prompt } else { $null }
function global:prompt {
    if (Test-Path -PathType Container ".git") {
        if (Get-Command memstrata -ErrorAction SilentlyContinue) {
            Start-Job -ScriptBlock {
                param($p) memstrata register $p --quiet
            } -ArgumentList $PWD.Path | Out-Null
        }
    }
    if ($global:__MlOriginalPrompt) { & $global:__MlOriginalPrompt }
    else { "PS $($executionContext.SessionState.Path.CurrentLocation)$('>' * ($nestedPromptLevel + 1)) " }
}
""",
    }

    body = snippets.get(shell)
    if body is None:
        raise ValueError(f"unsupported shell: {shell!r}")

    # The surrounding blank lines of the literal are dropped; the markers
    # supply the framing.
    return "\n" + _HOOK_MARKER_BEGIN + "\n" + body.strip() + "\n" + _HOOK_MARKER_END + "\n"
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def write_hook(shell: str, config_path: Path) -> None:
    """
    Idempotently install the hook for *shell* into *config_path*.

    If the marker block is already present it is replaced in-place;
    otherwise the block is appended. A sibling ".ml-backup" copy of the
    config file is created once, on the first write to an existing file,
    and is never overwritten afterwards. Missing parent directories are
    created.

    A begin marker without a matching end marker (a hand-edited or
    truncated block) is treated as a stray line: only the marker is
    replaced and every line that followed it is kept, so user
    configuration is never discarded.

    Raises ValueError (from hook_for_shell) for an unsupported shell,
    before anything is written to disk.
    """
    # Build the block first so an unsupported shell fails without leaving a
    # backup file behind.
    new_block = hook_for_shell(shell)

    if config_path.exists():
        backup = config_path.with_suffix(config_path.suffix + ".ml-backup")
        if not backup.exists():
            # Byte-for-byte copy: the backup must match the original even if
            # it is not valid UTF-8 or uses a different newline convention.
            backup.write_bytes(config_path.read_bytes())
        existing = config_path.read_text(encoding="utf-8")
    else:
        existing = ""

    before, begin_found, rest = existing.partition(_HOOK_MARKER_BEGIN)
    if begin_found:
        stale, end_found, after = rest.partition(_HOOK_MARKER_END)
        if not end_found:
            # No end marker: `stale` is really the remainder of the user's
            # file, not an old hook body. Keep it.
            after = stale
        after = after.lstrip("\n")
        result = before.rstrip() + new_block + ("\n" + after if after else "")
    else:
        # new_block already starts with "\n", so rstrip() + new_block gives
        # exactly one separator.
        result = existing.rstrip() + new_block

    config_path.parent.mkdir(parents=True, exist_ok=True)
    config_path.write_text(result, encoding="utf-8")
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def remove_hook(config_path: Path) -> None:
    """
    Reverse write_hook by stripping the marker block from *config_path*.

    No-op if the file is missing or contains no begin marker. When the
    begin marker has no matching end marker, only the stray marker is
    removed and the text after it is preserved, rather than deleting
    everything up to the end of the file.
    """
    if not config_path.exists():
        return

    text = config_path.read_text(encoding="utf-8")
    before, begin_found, rest = text.partition(_HOOK_MARKER_BEGIN)
    if not begin_found:
        return

    stale, end_found, after = rest.partition(_HOOK_MARKER_END)
    if not end_found:
        # Unterminated block: what follows the marker is user content.
        after = stale

    head = before.rstrip()
    tail = after.lstrip("\n")
    # Only emit the separating newline when there is text before the block,
    # so removing a block from the top of the file leaves no blank first line.
    config_path.write_text((head + "\n" if head else "") + tail, encoding="utf-8")
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def detect_shell() -> str | None:
|
|
122
|
+
"""Best-effort shell detection from the environment."""
|
|
123
|
+
shell_env = os.environ.get("SHELL", "")
|
|
124
|
+
if "zsh" in shell_env:
|
|
125
|
+
return "zsh"
|
|
126
|
+
if "bash" in shell_env:
|
|
127
|
+
return "bash"
|
|
128
|
+
if "fish" in shell_env:
|
|
129
|
+
return "fish"
|
|
130
|
+
if os.environ.get("PSModulePath") and not shell_env:
|
|
131
|
+
return "powershell"
|
|
132
|
+
return None
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def config_path_for_shell(shell: str) -> Path:
    """
    Map a shell name to the config file the hook is written into.

    All paths are under the current user's home directory. The PowerShell
    profile location depends on whether the platform is win32. Raises
    ValueError for a shell name that is not recognised.
    """
    home = Path.home()
    if sys.platform == "win32":
        powershell_dir = home / "Documents" / "PowerShell"
    else:
        powershell_dir = home / ".config" / "powershell"

    locations = {
        "zsh": home / ".zshrc",
        "bash": home / ".bashrc",
        "fish": home.joinpath(".config", "fish", "config.fish"),
        "powershell": powershell_dir / "Microsoft.PowerShell_profile.ps1",
    }
    target = locations.get(shell)
    if target is None:
        raise ValueError(f"unsupported shell: {shell!r}")
    return target
|
memstrata/cli/ingest.py
ADDED
|
@@ -0,0 +1,432 @@
|
|
|
1
|
+
"""Phase 36 - Codebase ingestion (CLI + library).
|
|
2
|
+
|
|
3
|
+
Walks a project directory, reads source files, splits them into ~500-token
|
|
4
|
+
chunks, embeds each chunk via Ollama's nomic-embed-text, and stores the
|
|
5
|
+
results in the `codebase_chunks` + `codebase_chunks_vec` tables. The dashboard
|
|
6
|
+
server's /context/injection endpoint reads from these tables to build a real
|
|
7
|
+
project-context block instead of the V5.1 stub that always returned "".
|
|
8
|
+
|
|
9
|
+
Design choices (kept deliberately small):
|
|
10
|
+
- No watch mode; user re-runs the CLI when they want to re-index.
|
|
11
|
+
- File walker uses .gitignore-like skip patterns (vendored, no extra dep).
|
|
12
|
+
- Chunking is fixed-size by character count (TOKENS_PER_CHUNK * 4); good
|
|
13
|
+
enough as a first pass and matches how chat-turn embedding is sized.
|
|
14
|
+
- Re-ingestion is incremental: a file whose SHA-1 hasn't changed is
|
|
15
|
+
skipped; changed files have their old chunks deleted + replaced.
|
|
16
|
+
- Embeddings are best-effort. If Ollama is unreachable the metadata rows
|
|
17
|
+
are still written; the embedding column is just empty until the next
|
|
18
|
+
successful run.
|
|
19
|
+
"""
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import argparse
|
|
23
|
+
import hashlib
|
|
24
|
+
import json
|
|
25
|
+
import logging
|
|
26
|
+
import sqlite3
|
|
27
|
+
import sys
|
|
28
|
+
import time
|
|
29
|
+
from collections.abc import Iterable
|
|
30
|
+
from dataclasses import dataclass
|
|
31
|
+
from pathlib import Path
|
|
32
|
+
|
|
33
|
+
import requests
|
|
34
|
+
|
|
35
|
+
from memstrata.layer3._db import _load_vec_extension, get_db_path, init_db
|
|
36
|
+
|
|
37
|
+
_logger = logging.getLogger(__name__)

# nomic-embed-text outputs 768-dim vectors; max input ~8192 tokens. Chunk
# to ~500 tokens (2000 chars) with no overlap - simple and fast.
TOKENS_PER_CHUNK = 500
CHARS_PER_TOKEN = 4  # rough chars-per-token ratio; also used to estimate stored token counts
CHUNK_CHARS = TOKENS_PER_CHUNK * CHARS_PER_TOKEN
EMBED_BATCH = 8  # number of chunks sent per embed request
OLLAMA_EMBED_URL = "http://localhost:11434/api/embed"
EMBED_MODEL = "nomic-embed-text"
EMBED_DIM = 768  # vectors of any other length are not stored

# What we consider "source we'd want context from". Add to taste; the list is
# intentionally narrow so we don't index minified JS, lockfiles, or images.
# Compared against the lower-cased file suffix.
_INCLUDE_SUFFIXES = {
    ".py", ".pyi",
    ".ts", ".tsx", ".js", ".jsx", ".mjs",
    ".md", ".mdx", ".rst", ".txt",
    ".rs", ".go", ".java", ".kt",
    ".rb", ".php", ".cs", ".swift",
    ".c", ".h", ".cc", ".cpp", ".hpp",
    ".html", ".css", ".scss",
    ".toml", ".yaml", ".yml",
    ".json", ".sql", ".sh", ".ps1",
}

# Directories we never descend into. Kept as a set of names (not full paths)
# so the walker can prune cheaply.
_SKIP_DIRS = {
    ".git", ".hg", ".svn",
    "node_modules", ".venv", "venv", "env",
    "__pycache__", ".mypy_cache", ".pytest_cache", ".ruff_cache",
    "dist", "build", "out", "target", ".next",
    ".tox", ".cache", "coverage",
    ".idea", ".vscode",
    ".memstrata", ".memstrata-pro",
}

# Files we never read even if they have an included suffix.
# NOTE(review): the walker compares these against the full file name, so
# ".vsix" only matches a file literally named ".vsix", not "*.vsix" files -
# confirm whether a suffix match was intended.
_SKIP_FILE_PATTERNS = (
    "package-lock.json", "yarn.lock", "pnpm-lock.yaml", "poetry.lock",
    "uv.lock", "Cargo.lock", "Gemfile.lock", "composer.lock",
    ".vsix",
)

MAX_FILE_BYTES = 1_000_000  # skip files over 1 MB (binary heuristic)
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
# ---------------------------------------------------------------------------
|
|
86
|
+
# Walker
|
|
87
|
+
# ---------------------------------------------------------------------------
|
|
88
|
+
|
|
89
|
+
@dataclass(frozen=True)
class _FileRef:
    """Immutable reference to one source file yielded by iter_source_files."""

    path: Path  # absolute path on disk
    rel: str  # path relative to project root, POSIX-style
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def iter_source_files(root: Path) -> Iterable[_FileRef]:
    """Yield every source file under *root*, pruning skip-dirs as we go.

    A file is yielded when its name is not in _SKIP_FILE_PATTERNS, its
    lower-cased suffix is in _INCLUDE_SUFFIXES, it is a regular file and it
    is no larger than MAX_FILE_BYTES. Files that cannot be stat'ed are
    skipped silently.

    Only directories *below* root are tested against _SKIP_DIRS, so a
    project that itself lives under a directory called e.g. "build" or
    "env" is still indexed. Directories and files are visited in sorted
    order so the output is deterministic.
    """
    import os  # local import: this module does not import os at file level

    root = root.resolve()
    for dirpath, dirnames, filenames in os.walk(root):
        # Editing dirnames in place tells os.walk not to descend into the
        # removed entries, so node_modules/.git/... are never traversed.
        dirnames[:] = sorted(d for d in dirnames if d not in _SKIP_DIRS)
        for name in sorted(filenames):
            if name in _SKIP_FILE_PATTERNS:
                continue
            sub = Path(dirpath, name)
            if sub.suffix.lower() not in _INCLUDE_SUFFIXES:
                continue
            try:
                # is_file() filters out broken symlinks and special files.
                if not sub.is_file() or sub.stat().st_size > MAX_FILE_BYTES:
                    continue
            except OSError:
                continue
            yield _FileRef(path=sub, rel=sub.relative_to(root).as_posix())
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
# ---------------------------------------------------------------------------
|
|
118
|
+
# Reading + chunking
|
|
119
|
+
# ---------------------------------------------------------------------------
|
|
120
|
+
|
|
121
|
+
def _read_text(p: Path) -> str | None:
|
|
122
|
+
"""Read a file as UTF-8; return None for binary / encoding errors."""
|
|
123
|
+
try:
|
|
124
|
+
return p.read_text(encoding="utf-8", errors="strict")
|
|
125
|
+
except (UnicodeDecodeError, OSError):
|
|
126
|
+
return None
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def chunk_text(text: str, chunk_chars: int = CHUNK_CHARS) -> list[str]:
    """Split text into roughly chunk_chars-sized slices on whitespace boundaries.

    The text is stripped first; empty input gives an empty list. Each slice
    is at most chunk_chars characters. When a slice would end mid-text, the
    cut is moved back to the nearest whitespace within the last 100
    characters; if there is none (e.g., a single very long line of minified
    code) the slice is hard-cut at chunk_chars. Every returned chunk is
    stripped and non-empty.

    Raises ValueError if chunk_chars is less than 1 and there is text to
    split (a non-positive window can never make progress).
    """
    text = text.strip()
    if not text:
        return []
    if chunk_chars < 1:
        raise ValueError(f"chunk_chars must be >= 1, got {chunk_chars!r}")

    out: list[str] = []
    i = 0
    n = len(text)
    while i < n:
        end = min(i + chunk_chars, n)
        if end < n:
            # Search backwards from the window end for whitespace. The lower
            # bound is exclusive and never below i, so end stays > i and the
            # loop always advances.
            floor = max(end - 100, i)
            end = next(
                (j for j in range(end, floor, -1) if text[j].isspace()),
                end,
            )
        chunk = text[i:end].strip()
        if chunk:
            out.append(chunk)
        i = end
    return out
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def _sha1_hex(s: bytes) -> str:
    """Return the SHA-1 digest of *s* as a lowercase hex string."""
    digest = hashlib.sha1()
    digest.update(s)
    return digest.hexdigest()
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
# ---------------------------------------------------------------------------
|
|
164
|
+
# Embedding (Ollama nomic-embed-text)
|
|
165
|
+
# ---------------------------------------------------------------------------
|
|
166
|
+
|
|
167
|
+
def _embed_batch(texts: list[str], *, timeout: float = 60.0) -> list[list[float]] | None:
    """Request embeddings for *texts* from the Ollama embed endpoint.

    Returns the "embeddings" list from the JSON response when it holds
    exactly one entry per input text. Any failure - transport error, HTTP
    error status, unparsable body, missing key or length mismatch - is
    logged as a warning and reported as None; nothing is raised.
    """
    try:
        response = requests.post(
            OLLAMA_EMBED_URL,
            json={"model": EMBED_MODEL, "input": texts},
            timeout=timeout,
        )
        response.raise_for_status()
        data = response.json()
        vectors = data.get("embeddings")
        if vectors is not None and len(vectors) == len(texts):
            return vectors
        _logger.warning("ollama embed returned unexpected shape: %r", data)
        return None
    except Exception as exc:
        # Deliberately broad: embedding is best-effort and must not abort ingest.
        _logger.warning("ollama embed failed: %s", exc)
        return None
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
# ---------------------------------------------------------------------------
|
|
188
|
+
# Database I/O
|
|
189
|
+
# ---------------------------------------------------------------------------
|
|
190
|
+
|
|
191
|
+
def _open_conn(db_path: Path | None = None) -> sqlite3.Connection:
    """Open the SQLite database and prepare it for ingestion.

    Uses *db_path* when given, otherwise the path from get_db_path(). The
    connection gets a 10 second busy timeout and sqlite3.Row as row factory,
    then _load_vec_extension and init_db are run on it. If either of those
    raises, the connection is closed before the exception propagates so a
    failed setup does not leak an open database handle.
    """
    path = db_path if db_path else get_db_path()
    conn = sqlite3.connect(str(path), timeout=10.0)
    try:
        conn.row_factory = sqlite3.Row
        _load_vec_extension(conn)
        init_db(conn)
    except BaseException:
        conn.close()
        raise
    return conn
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def _existing_sha(conn: sqlite3.Connection, project_id: str, rel: str) -> str | None:
|
|
201
|
+
row = conn.execute(
|
|
202
|
+
"SELECT sha1 FROM codebase_files WHERE project_id = ? AND path = ?",
|
|
203
|
+
(project_id, rel),
|
|
204
|
+
).fetchone()
|
|
205
|
+
return row["sha1"] if row else None
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def _drop_old_chunks(conn: sqlite3.Connection, project_id: str, rel: str) -> None:
    """Delete every chunk row, and its vector row, stored for (project_id, rel).

    Does nothing when the path has no chunks. An OperationalError from the
    vector-table delete is ignored so the chunk rows are still removed when
    that table cannot be used. Does not commit.
    """
    rows = conn.execute(
        "SELECT id FROM codebase_chunks WHERE project_id = ? AND path = ?",
        (project_id, rel),
    ).fetchall()
    chunk_ids = [row["id"] for row in rows]
    if len(chunk_ids) == 0:
        return

    # One "?" per id, shared by both DELETE statements.
    marks = ",".join(["?"] * len(chunk_ids))
    vec_sql = f"DELETE FROM codebase_chunks_vec WHERE chunk_id IN ({marks})"
    chunk_sql = f"DELETE FROM codebase_chunks WHERE id IN ({marks})"

    try:
        conn.execute(vec_sql, chunk_ids)
    except sqlite3.OperationalError:
        pass  # vec0 unavailable; nothing to clean
    conn.execute(chunk_sql, chunk_ids)
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def _store_chunks(
|
|
232
|
+
conn: sqlite3.Connection,
|
|
233
|
+
project_id: str,
|
|
234
|
+
rel: str,
|
|
235
|
+
chunks: list[str],
|
|
236
|
+
embeddings: list[list[float]] | None,
|
|
237
|
+
) -> int:
|
|
238
|
+
"""Insert one row per chunk (+ optional embedding). Returns total tokens."""
|
|
239
|
+
total_tokens = 0
|
|
240
|
+
for idx, chunk in enumerate(chunks):
|
|
241
|
+
tokens = max(1, len(chunk) // CHARS_PER_TOKEN)
|
|
242
|
+
total_tokens += tokens
|
|
243
|
+
cur = conn.execute(
|
|
244
|
+
"""
|
|
245
|
+
INSERT INTO codebase_chunks (project_id, path, chunk_idx, text, token_count)
|
|
246
|
+
VALUES (?, ?, ?, ?, ?)
|
|
247
|
+
""",
|
|
248
|
+
(project_id, rel, idx, chunk, tokens),
|
|
249
|
+
)
|
|
250
|
+
chunk_id = cur.lastrowid
|
|
251
|
+
if embeddings is not None and idx < len(embeddings):
|
|
252
|
+
vec = embeddings[idx]
|
|
253
|
+
if len(vec) == EMBED_DIM:
|
|
254
|
+
try:
|
|
255
|
+
conn.execute(
|
|
256
|
+
"INSERT OR REPLACE INTO codebase_chunks_vec (chunk_id, embedding) VALUES (?, ?)",
|
|
257
|
+
(chunk_id, json.dumps(vec)),
|
|
258
|
+
)
|
|
259
|
+
except sqlite3.OperationalError as exc:
|
|
260
|
+
_logger.warning("vec0 insert failed (chunk_id=%d): %s", chunk_id, exc)
|
|
261
|
+
return total_tokens
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
def _upsert_file(
    conn: sqlite3.Connection,
    project_id: str,
    rel: str,
    sha1: str,
    size_bytes: int,
    token_count: int,
) -> None:
    """Insert or refresh the codebase_files row for (project_id, rel).

    On a (project_id, path) conflict the existing row's sha1, size_bytes,
    token_count and last_indexed are overwritten. last_indexed is set by
    SQLite's datetime('now'). Does not commit.
    """
    statement = """
        INSERT INTO codebase_files (project_id, path, sha1, size_bytes, token_count, last_indexed)
        VALUES (?, ?, ?, ?, ?, datetime('now'))
        ON CONFLICT (project_id, path) DO UPDATE SET
            sha1 = excluded.sha1,
            size_bytes = excluded.size_bytes,
            token_count = excluded.token_count,
            last_indexed = excluded.last_indexed
        """
    values = (project_id, rel, sha1, size_bytes, token_count)
    conn.execute(statement, values)
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
# ---------------------------------------------------------------------------
|
|
287
|
+
# Top-level ingest
|
|
288
|
+
# ---------------------------------------------------------------------------
|
|
289
|
+
|
|
290
|
+
@dataclass
class IngestSummary:
    """Counters describing one ingest_project run."""

    project_id: str  # identifier the rows were stored under
    root: Path  # resolved project directory that was walked
    files_seen: int  # files yielded by the walker
    files_indexed: int  # files whose rows were (re)written, including empty files
    files_unchanged: int  # files skipped because the stored SHA-1 matched
    files_failed: int  # files that could not be read or decoded
    chunks_written: int  # chunk rows inserted
    chunks_embedded: int  # vectors returned by the embed endpoint for written chunks
    tokens_total: int  # sum of estimated token counts of written chunks
    duration_s: float  # elapsed seconds, rounded to two decimals
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
def _has_unembedded_chunks(conn: sqlite3.Connection, project_id: str, rel: str) -> bool:
    """Return True if any stored chunk of (project_id, rel) has no vector row."""
    try:
        row = conn.execute(
            """
            SELECT 1
            FROM codebase_chunks AS c
            WHERE c.project_id = ? AND c.path = ?
              AND NOT EXISTS (
                  SELECT 1 FROM codebase_chunks_vec AS v WHERE v.chunk_id = c.id
              )
            LIMIT 1
            """,
            (project_id, rel),
        ).fetchone()
    except sqlite3.OperationalError:
        # Vector table unusable: vectors could not be stored anyway, so there
        # is nothing a re-index would backfill.
        return False
    return row is not None


def ingest_project(
    root: Path,
    *,
    project_id: str | None = None,
    db_path: Path | None = None,
    embed: bool = True,
) -> IngestSummary:
    """Walk *root*, ingest changed source files into the codebase tables.

    project_id defaults to the basename of *root* (so memstrata-pro/
    becomes "memstrata-pro"). Pass an explicit value when the harness or
    extension uses a different identifier.

    A file is skipped as unchanged when its SHA-1 matches the stored one.
    When *embed* is true, a file with a matching SHA-1 is still re-indexed
    if any of its stored chunks lacks a vector, so a run made while the
    embed endpoint was down is repaired by the next successful run. Files
    that cannot be read or decoded are counted as failed and do not stop
    the run. Each file is committed separately.

    Raises FileNotFoundError if *root* is not a directory.
    """
    # Monotonic clock: the duration must not be affected by wall-clock changes.
    start = time.monotonic()
    root = root.resolve()
    if not root.is_dir():
        raise FileNotFoundError(f"not a directory: {root}")

    pid = project_id or root.name
    conn = _open_conn(db_path)

    seen = indexed = unchanged = failed = 0
    chunks_written = chunks_embedded = 0
    tokens_total = 0

    try:
        for ref in iter_source_files(root):
            seen += 1
            try:
                raw = ref.path.read_bytes()
            except OSError:
                # Vanished, unreadable, or no longer a regular file.
                failed += 1
                continue
            sha = _sha1_hex(raw)
            if _existing_sha(conn, pid, ref.rel) == sha and not (
                embed and _has_unembedded_chunks(conn, pid, ref.rel)
            ):
                unchanged += 1
                continue

            text = _read_text(ref.path)
            if text is None:
                failed += 1
                continue

            chunks = chunk_text(text)
            if not chunks:
                # Empty file - still record it as seen so we don't keep retrying.
                _drop_old_chunks(conn, pid, ref.rel)
                _upsert_file(conn, pid, ref.rel, sha, len(raw), 0)
                conn.commit()
                indexed += 1
                continue

            embeddings: list[list[float]] | None = None
            if embed:
                embeddings = []
                for batch_start in range(0, len(chunks), EMBED_BATCH):
                    batch = chunks[batch_start: batch_start + EMBED_BATCH]
                    got = _embed_batch(batch)
                    if got is None:
                        embeddings = None  # bail; store text without vectors
                        break
                    embeddings.extend(got)
                if embeddings is not None:
                    # NOTE(review): counts vectors returned by the endpoint;
                    # _store_chunks may still drop some (wrong dimension or
                    # failed insert) and does not report how many it stored.
                    chunks_embedded += len(embeddings)

            _drop_old_chunks(conn, pid, ref.rel)
            written_tokens = _store_chunks(conn, pid, ref.rel, chunks, embeddings)
            _upsert_file(conn, pid, ref.rel, sha, len(raw), written_tokens)
            conn.commit()
            indexed += 1
            chunks_written += len(chunks)
            tokens_total += written_tokens
    finally:
        conn.close()

    duration = round(time.monotonic() - start, 2)
    return IngestSummary(
        project_id=pid,
        root=root,
        files_seen=seen,
        files_indexed=indexed,
        files_unchanged=unchanged,
        files_failed=failed,
        chunks_written=chunks_written,
        chunks_embedded=chunks_embedded,
        tokens_total=tokens_total,
        duration_s=duration,
    )
|
|
391
|
+
|
|
392
|
+
|
|
393
|
+
# ---------------------------------------------------------------------------
|
|
394
|
+
# CLI
|
|
395
|
+
# ---------------------------------------------------------------------------
|
|
396
|
+
|
|
397
|
+
def cmd_ingest(args: argparse.Namespace) -> None:
    """Entry point for `memstrata ingest <path>`.

    Reads args.path, args.project_id and args.no_embed. Exits with status 1
    (message on stderr) when the path does not exist or is not a directory;
    otherwise runs ingest_project and prints its summary to stdout.
    """
    root = Path(args.path).expanduser().resolve()

    problem = None
    if not root.exists():
        problem = f"ingest: path does not exist: {root}"
    elif not root.is_dir():
        problem = f"ingest: not a directory: {root}"
    if problem is not None:
        print(problem, file=sys.stderr)
        sys.exit(1)

    want_embeddings = not args.no_embed

    logging.basicConfig(level=logging.WARNING, format="%(levelname)s: %(message)s")
    header = (
        f"[memstrata ingest] root: {root}",
        f"[memstrata ingest] project_id: {args.project_id or root.name}",
        f"[memstrata ingest] embed: {want_embeddings}",
        "",
    )
    for line in header:
        print(line)

    summary = ingest_project(
        root,
        project_id=args.project_id,
        embed=want_embeddings,
    )

    report = (
        f" files seen: {summary.files_seen}",
        f" files indexed: {summary.files_indexed}",
        f" files unchanged: {summary.files_unchanged}",
        f" files failed: {summary.files_failed}",
        f" chunks written: {summary.chunks_written}",
        f" chunks embedded: {summary.chunks_embedded}",
        f" tokens total: {summary.tokens_total:,}",
        f" duration: {summary.duration_s}s",
    )
    for line in report:
        print(line)

    # Chunks were written but none got a vector: most likely the embed
    # endpoint was unreachable, so tell the user how to recover.
    if summary.chunks_written > 0 and summary.chunks_embedded == 0:
        print(
            "\n ! No embeddings were stored. Ollama at http://localhost:11434 "
            "may be offline. Re-run with the same command after starting it; "
            "unchanged files will be skipped automatically."
        )
|