memstrata 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. memstrata/__init__.py +2 -0
  2. memstrata/cli/__init__.py +0 -0
  3. memstrata/cli/cd_hook.py +148 -0
  4. memstrata/cli/ingest.py +432 -0
  5. memstrata/cli/main.py +340 -0
  6. memstrata/config/__init__.py +0 -0
  7. memstrata/config/keychain.py +47 -0
  8. memstrata/layer3/__init__.py +0 -0
  9. memstrata/layer3/_db.py +638 -0
  10. memstrata/layer3/api_server.py +2298 -0
  11. memstrata/layer3/ingestion/__init__.py +115 -0
  12. memstrata/layer3/ingestion/branch_switch.py +230 -0
  13. memstrata/layer3/ingestion/chunker.py +351 -0
  14. memstrata/layer3/ingestion/denylist.py +307 -0
  15. memstrata/layer3/ingestion/lifecycle.py +312 -0
  16. memstrata/layer3/ingestion/orchestrator.py +664 -0
  17. memstrata/layer3/ingestion/progress.py +209 -0
  18. memstrata/layer3/ingestion/resource_policy.py +297 -0
  19. memstrata/layer3/ingestion/watcher.py +523 -0
  20. memstrata/layer3/mcp_app.py +361 -0
  21. memstrata/layer3/mcp_server.py +196 -0
  22. memstrata/layer3/ollama_health.py +181 -0
  23. memstrata/layer3/pricing/__init__.py +0 -0
  24. memstrata/layer3/pricing/fx.py +147 -0
  25. memstrata/layer3/pricing/lookup.py +166 -0
  26. memstrata/layer3/pricing/openrouter_sync.py +174 -0
  27. memstrata/layer3/pricing/pricing_matrix.json +78 -0
  28. memstrata/layer3/retrieval.py +132 -0
  29. memstrata/workers/__init__.py +0 -0
  30. memstrata/workers/embedding_worker.py +301 -0
  31. memstrata-0.6.0.dist-info/METADATA +182 -0
  32. memstrata-0.6.0.dist-info/RECORD +36 -0
  33. memstrata-0.6.0.dist-info/WHEEL +5 -0
  34. memstrata-0.6.0.dist-info/entry_points.txt +2 -0
  35. memstrata-0.6.0.dist-info/licenses/LICENSE +21 -0
  36. memstrata-0.6.0.dist-info/top_level.txt +1 -0
memstrata/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ """MemStrata — MIT open-source context server for LLM-assisted coding."""
2
+ __version__ = "0.6.0"
File without changes
@@ -0,0 +1,148 @@
1
+ """
2
+ Shell cd-hook generation and idempotent installation.
3
+
4
+ Hook text and write/remove patterns taken verbatim from
5
+ v5_1_reference/critical_snippets.py §2. The idempotent marker pair
6
+ ensures repeated writes replace rather than duplicate the block.
7
+
8
+ Hard Rule 54: hooks only check for .git/ — no process scanning.
9
+ """
10
+ from __future__ import annotations
11
+
12
+ import os
13
+ import sys
14
+ from pathlib import Path
15
+ from typing import Optional
16
+
17
# Sentinel comment lines that bracket the generated hook block inside a shell
# config file. write_hook and remove_hook locate the block by searching the
# file text for these exact strings.
+ _HOOK_MARKER_BEGIN = "# >>> memstrata cd-hook >>>"
18
+ _HOOK_MARKER_END = "# <<< memstrata cd-hook <<<"
19
+
20
+
21
# Builds the shell snippet that write_hook installs. Accepted values for
# `shell` are "zsh", "bash", "fish" and "powershell"; anything else raises
# ValueError. Every variant checks for a `.git` directory in the current
# directory and for a `memstrata` command, then runs
# `memstrata register <cwd> --quiet` in the background.
+ def hook_for_shell(shell: str) -> str:
22
+ """
23
+ Generate the hook block for the given shell.
24
+
25
+ The returned string is delimited by _HOOK_MARKER_BEGIN / _HOOK_MARKER_END
26
+ so write_hook can replace it idempotently.
27
+ """
28
# zsh: the function is appended to chpwd_functions; `typeset -gaU` declares
# that array as global and unique-valued.
+ if shell == "zsh":
29
+ body = """
30
+ ml_cd_hook() {
31
+ if [ -d ".git" ] && command -v memstrata >/dev/null 2>&1; then
32
+ (memstrata register "$PWD" --quiet >/dev/null 2>&1 &)
33
+ fi
34
+ }
35
+ typeset -gaU chpwd_functions
36
+ chpwd_functions+=(ml_cd_hook)
37
+ """
38
# bash: the function is prepended to PROMPT_COMMAND, which bash evaluates
# before each prompt, so the check runs per prompt rather than per `cd`.
+ elif shell == "bash":
39
+ body = """
40
+ ml_cd_hook() {
41
+ if [ -d ".git" ] && command -v memstrata >/dev/null 2>&1; then
42
+ (memstrata register "$PWD" --quiet >/dev/null 2>&1 &)
43
+ fi
44
+ }
45
+ PROMPT_COMMAND="ml_cd_hook;${PROMPT_COMMAND:-:}"
46
+ """
47
# fish: an event function triggered whenever the PWD variable changes.
+ elif shell == "fish":
48
+ body = """
49
+ function ml_cd_hook --on-variable PWD
50
+ if test -d .git
51
+ if command -v memstrata >/dev/null 2>&1
52
+ memstrata register "$PWD" --quiet >/dev/null 2>&1 &
53
+ end
54
+ end
55
+ end
56
+ """
57
# PowerShell: the global `prompt` function is replaced by a wrapper that
# starts the registration as a background job and then calls the previously
# captured prompt, or emits a default "PS <path>>" prompt if none was captured.
+ elif shell == "powershell":
58
+ body = """
59
+ $global:__MlOriginalPrompt = if (Test-Path Function:prompt) { Get-Item Function:prompt } else { $null }
60
+ function global:prompt {
61
+ if (Test-Path -PathType Container ".git") {
62
+ if (Get-Command memstrata -ErrorAction SilentlyContinue) {
63
+ Start-Job -ScriptBlock {
64
+ param($p) memstrata register $p --quiet
65
+ } -ArgumentList $PWD.Path | Out-Null
66
+ }
67
+ }
68
+ if ($global:__MlOriginalPrompt) { & $global:__MlOriginalPrompt }
69
+ else { "PS $($executionContext.SessionState.Path.CurrentLocation)$('>' * ($nestedPromptLevel + 1)) " }
70
+ }
71
+ """
72
+ else:
73
+ raise ValueError(f"unsupported shell: {shell!r}")
74
+
75
# The snippet is stripped of surrounding whitespace and wrapped in the marker
# lines; the result starts and ends with a newline.
+ return f"\n{_HOOK_MARKER_BEGIN}\n{body.strip()}\n{_HOOK_MARKER_END}\n"
76
+
77
+
78
def write_hook(shell: str, config_path: Path) -> None:
    """
    Idempotently install the cd-hook for *shell* into *config_path*.

    If a complete marker block is already present it is replaced in place;
    otherwise the block is appended. A sibling ``<name>.ml-backup`` copy of
    the file is created on the first write only and is never overwritten by
    later writes. Missing parent directories are created.

    A begin marker without a matching end marker (a hand-edited or truncated
    block) is treated as a dangling line: only that marker is replaced by the
    new block and everything that followed it is kept, so user configuration
    is never dropped.

    Raises:
        ValueError: if *shell* is not supported. The block is generated
            before anything is written, so an invalid shell leaves the
            disk untouched.
    """
    # Generate first: hook_for_shell raises for unknown shells and a failed
    # call should not leave a backup file behind.
    new_block = hook_for_shell(shell)

    had_file = config_path.exists()
    existing = config_path.read_text(encoding="utf-8") if had_file else ""

    backup = config_path.with_suffix(config_path.suffix + ".ml-backup")
    if had_file and not backup.exists():
        backup.write_text(existing, encoding="utf-8")

    before, begin, rest = existing.partition(_HOOK_MARKER_BEGIN)
    if begin:
        old_body, end, after = rest.partition(_HOOK_MARKER_END)
        if not end:
            # No closing marker: str.partition returned the whole remainder
            # as the first element. Keep it instead of discarding the rest
            # of the user's file.
            after = old_body
        after = after.lstrip("\n")
        result = before.rstrip() + new_block + ("\n" + after if after else "")
    else:
        # new_block already starts with "\n", so rstrip() + new_block gives
        # exactly one separator.
        result = existing.rstrip() + new_block

    config_path.parent.mkdir(parents=True, exist_ok=True)
    config_path.write_text(result, encoding="utf-8")
104
+
105
+
106
def remove_hook(config_path: Path) -> None:
    """
    Reverse write_hook: strip the marker block from *config_path* in place.

    No-op if the file is missing, if the begin marker is absent, or if the
    begin marker has no matching end marker. In the last case the extent of
    the block is unknown, and removing "everything after the begin marker"
    would delete unrelated user configuration, so the file is left untouched.
    """
    if not config_path.exists():
        return
    text = config_path.read_text(encoding="utf-8")

    before, begin, rest = text.partition(_HOOK_MARKER_BEGIN)
    if not begin:
        return
    _, end, after = rest.partition(_HOOK_MARKER_END)
    if not end:
        # Incomplete block: str.partition found no end marker, so `after`
        # is empty and writing would truncate the file at the begin marker.
        return

    config_path.write_text(before.rstrip() + "\n" + after.lstrip("\n"), encoding="utf-8")
119
+
120
+
121
def detect_shell() -> str | None:
    """
    Best-effort shell detection from the environment.

    Looks at the executable name in ``$SHELL`` (the last path component,
    with either ``/`` or ``\\`` as separator) and returns "zsh", "bash" or
    "fish" when the name contains that word. Falls back to "powershell" when
    ``$SHELL`` is unset or empty and ``PSModulePath`` is set. Returns None
    when nothing matches.
    """
    shell_env = os.environ.get("SHELL", "")

    # Match on the executable name only. Testing the whole path would let a
    # directory component decide the result, e.g. "/home/bashir/bin/fish"
    # would be reported as bash.
    exe_name = shell_env.replace("\\", "/").rstrip("/").rpartition("/")[2]
    for candidate in ("zsh", "bash", "fish"):
        if candidate in exe_name:
            return candidate

    if os.environ.get("PSModulePath") and not shell_env:
        return "powershell"
    return None
133
+
134
+
135
def config_path_for_shell(shell: str) -> Path:
    """
    Map a supported shell name to the startup file the hook is written to.

    All paths are rooted at the current user's home directory. Raises
    ValueError for any shell other than zsh, bash, fish or powershell.
    """
    home_dir = Path.home()

    # Shells whose config location does not depend on the platform.
    fixed_locations = (
        ("zsh", (".zshrc",)),
        ("bash", (".bashrc",)),
        ("fish", (".config", "fish", "config.fish")),
    )
    for known_shell, parts in fixed_locations:
        if shell == known_shell:
            return home_dir.joinpath(*parts)

    if shell != "powershell":
        raise ValueError(f"unsupported shell: {shell!r}")

    # PowerShell: the profile lives under Documents on Windows and under
    # ~/.config elsewhere.
    if sys.platform == "win32":
        profile_dir = home_dir / "Documents" / "PowerShell"
    else:
        profile_dir = home_dir / ".config" / "powershell"
    return profile_dir / "Microsoft.PowerShell_profile.ps1"
@@ -0,0 +1,432 @@
1
+ """Phase 36 - Codebase ingestion (CLI + library).
2
+
3
+ Walks a project directory, reads source files, splits them into ~500-token
4
+ chunks, embeds each chunk via Ollama's nomic-embed-text, and stores the
5
+ results in the `codebase_chunks` + `codebase_chunks_vec` tables. The dashboard
6
+ server's /context/injection endpoint reads from these tables to build a real
7
+ project-context block instead of the V5.1 stub that always returned "".
8
+
9
+ Design choices (kept deliberately small):
10
+ - No watch mode; user re-runs the CLI when they want to re-index.
11
+ - File walker uses .gitignore-like skip patterns (vendored, no extra dep).
12
+ - Chunking is fixed-size by character count (TOKENS_PER_CHUNK * 4); good
13
+ enough as a first pass and matches how chat-turn embedding is sized.
14
+ - Re-ingestion is incremental: a file whose SHA-1 hasn't changed is
15
+ skipped; changed files have their old chunks deleted + replaced.
16
+ - Embeddings are best-effort. If Ollama is unreachable the metadata rows
17
+ are still written; the embedding column is just empty until the next
18
+ successful run.
19
+ """
20
+ from __future__ import annotations
21
+
22
+ import argparse
23
+ import hashlib
24
+ import json
25
+ import logging
26
+ import sqlite3
27
+ import sys
28
+ import time
29
+ from collections.abc import Iterable
30
+ from dataclasses import dataclass
31
+ from pathlib import Path
32
+
33
+ import requests
34
+
35
+ from memstrata.layer3._db import _load_vec_extension, get_db_path, init_db
36
+
37
+ _logger = logging.getLogger(__name__)
38
+
39
+ # nomic-embed-text outputs 768-dim vectors; max input ~8192 tokens. Chunk
40
+ # to ~500 tokens (2000 chars) with no overlap - simple and fast.
41
# CHUNK_CHARS (TOKENS_PER_CHUNK * CHARS_PER_TOKEN) is the default window size
# of chunk_text; CHARS_PER_TOKEN is also the divisor _store_chunks uses to
# estimate a chunk's token count from its character length.
+ TOKENS_PER_CHUNK = 500
42
+ CHARS_PER_TOKEN = 4
43
+ CHUNK_CHARS = TOKENS_PER_CHUNK * CHARS_PER_TOKEN
44
# Number of chunks sent to the embedding endpoint per request.
+ EMBED_BATCH = 8
45
+ OLLAMA_EMBED_URL = "http://localhost:11434/api/embed"
46
+ EMBED_MODEL = "nomic-embed-text"
47
# Returned vectors whose length differs from EMBED_DIM are not written to the
# vector table.
+ EMBED_DIM = 768
48
+
49
+ # What we consider "source we'd want context from". Add to taste; the list is
50
+ # intentionally narrow so we don't index minified JS, lockfiles, or images.
51
# Compared against the lower-cased final suffix of each file name, so the
# match is case-insensitive on the file side; entries must stay lower-case.
+ _INCLUDE_SUFFIXES = {
52
+ ".py", ".pyi",
53
+ ".ts", ".tsx", ".js", ".jsx", ".mjs",
54
+ ".md", ".mdx", ".rst", ".txt",
55
+ ".rs", ".go", ".java", ".kt",
56
+ ".rb", ".php", ".cs", ".swift",
57
+ ".c", ".h", ".cc", ".cpp", ".hpp",
58
+ ".html", ".css", ".scss",
59
+ ".toml", ".yaml", ".yml",
60
+ ".json", ".sql", ".sh", ".ps1",
61
+ }
62
+
63
+ # Directories we never descend into. Kept as a set of names (not full paths)
64
+ # so the walker can prune cheaply.
65
# Entries are matched by exact directory name (case-sensitive); they are not
# paths or glob patterns.
+ _SKIP_DIRS = {
66
+ ".git", ".hg", ".svn",
67
+ "node_modules", ".venv", "venv", "env",
68
+ "__pycache__", ".mypy_cache", ".pytest_cache", ".ruff_cache",
69
+ "dist", "build", "out", "target", ".next",
70
+ ".tox", ".cache", "coverage",
71
+ ".idea", ".vscode",
72
+ ".memstrata", ".memstrata-pro",
73
+ }
74
+
75
+ # Files we never read even if they have an included suffix.
76
# Despite the name, entries are compared for exact equality with the file's
# name; no globbing or suffix matching is applied.
# NOTE(review): ".vsix" therefore only matches a file literally named ".vsix",
# not "*.vsix" -- confirm that is the intent.
+ _SKIP_FILE_PATTERNS = (
77
+ "package-lock.json", "yarn.lock", "pnpm-lock.yaml", "poetry.lock",
78
+ "uv.lock", "Cargo.lock", "Gemfile.lock", "composer.lock",
79
+ ".vsix",
80
+ )
81
+
82
# Size limit in bytes; files strictly larger than this are not yielded by the
# walker.
+ MAX_FILE_BYTES = 1_000_000 # skip files over 1 MB (binary heuristic)
83
+
84
+
85
+ # ---------------------------------------------------------------------------
86
+ # Walker
87
+ # ---------------------------------------------------------------------------
88
+
89
# Immutable pair describing one candidate source file, as produced by
# iter_source_files: where it is on disk and how it is keyed in the database.
+ @dataclass(frozen=True)
90
+ class _FileRef:
91
+ path: Path # absolute path on disk
92
+ rel: str # path relative to project root, POSIX-style
93
+
94
+
95
def iter_source_files(root: Path) -> Iterable[_FileRef]:
    """Yield every source file under *root*, pruning skip-dirs as we go.

    A file is yielded when it sits outside every directory named in
    _SKIP_DIRS (only directories *below* root are considered, so a project
    that itself lives under e.g. ``~/build/`` is still indexed), its name is
    not listed in _SKIP_FILE_PATTERNS, its lower-cased suffix is in
    _INCLUDE_SUFFIXES, and it is a regular file of at most MAX_FILE_BYTES.
    Files that cannot be stat'ed are skipped silently.
    """
    # Local import: this module's import block does not bring in `os`.
    import os

    root = root.resolve()
    for dirpath, dirnames, filenames in os.walk(root):
        # Prune in place so os.walk never descends into skipped directories
        # (node_modules, .git, ...) instead of listing and then discarding
        # their contents.
        dirnames[:] = [d for d in dirnames if d not in _SKIP_DIRS]
        base = Path(dirpath)
        for name in filenames:
            if name in _SKIP_FILE_PATTERNS:
                continue
            candidate = base / name
            if candidate.suffix.lower() not in _INCLUDE_SUFFIXES:
                continue
            try:
                if not candidate.is_file():
                    continue
                if candidate.stat().st_size > MAX_FILE_BYTES:
                    continue
            except OSError:
                continue
            yield _FileRef(path=candidate, rel=candidate.relative_to(root).as_posix())
115
+
116
+
117
+ # ---------------------------------------------------------------------------
118
+ # Reading + chunking
119
+ # ---------------------------------------------------------------------------
120
+
121
+ def _read_text(p: Path) -> str | None:
122
+ """Read a file as UTF-8; return None for binary / encoding errors."""
123
+ try:
124
+ return p.read_text(encoding="utf-8", errors="strict")
125
+ except (UnicodeDecodeError, OSError):
126
+ return None
127
+
128
+
129
def chunk_text(text: str, chunk_chars: int = CHUNK_CHARS) -> list[str]:
    """Split text into roughly chunk_chars-sized slices on whitespace boundaries.

    The text is stripped first; each returned chunk is stripped too and empty
    chunks are dropped. When a window would end mid-text, the cut is moved
    back to the nearest whitespace character within the last 100 characters
    of the window; if there is none (e.g., a single very long line of
    minified code) the window is hard-cut at chunk_chars.

    Raises:
        ValueError: if there is text to split and chunk_chars is not
            positive. A non-positive window would never advance and the
            loop would not terminate.
    """
    text = text.strip()
    if not text:
        return []
    if chunk_chars < 1:
        raise ValueError(f"chunk_chars must be a positive integer, got {chunk_chars!r}")

    out: list[str] = []
    i = 0
    n = len(text)
    while i < n:
        end = min(i + chunk_chars, n)
        if end < n:
            # Walk back to the nearest whitespace boundary so a chunk doesn't
            # split a word/identifier. The range stops above i, so the cut
            # always lies past the chunk start and the loop makes progress.
            for j in range(end, max(end - 100, i), -1):
                if text[j].isspace():
                    end = j
                    break
        chunk = text[i:end].strip()
        if chunk:
            out.append(chunk)
        i = end
    return out
157
+
158
+
159
+ def _sha1_hex(s: bytes) -> str:
160
+ return hashlib.sha1(s).hexdigest()
161
+
162
+
163
+ # ---------------------------------------------------------------------------
164
+ # Embedding (Ollama nomic-embed-text)
165
+ # ---------------------------------------------------------------------------
166
+
167
def _embed_batch(texts: list[str], *, timeout: float = 60.0) -> list[list[float]] | None:
    """Request embeddings for *texts* from the Ollama /api/embed endpoint.

    Returns the "embeddings" list from the JSON response when it contains
    exactly one entry per input text. Any failure (transport error, HTTP
    error status, malformed body, wrong count) is logged as a warning and
    reported as None; nothing is raised.
    """
    payload = {"model": EMBED_MODEL, "input": texts}
    try:
        response = requests.post(OLLAMA_EMBED_URL, json=payload, timeout=timeout)
        response.raise_for_status()
        body = response.json()
        vectors = body.get("embeddings")
        if vectors is not None and len(vectors) == len(texts):
            return vectors
        _logger.warning("ollama embed returned unexpected shape: %r", body)
        return None
    except Exception as exc:
        # Embedding is best-effort: the caller stores text without vectors.
        _logger.warning("ollama embed failed: %s", exc)
        return None
185
+
186
+
187
+ # ---------------------------------------------------------------------------
188
+ # Database I/O
189
+ # ---------------------------------------------------------------------------
190
+
191
def _open_conn(db_path: Path | None = None) -> sqlite3.Connection:
    """Open the SQLite database and prepare it for ingestion.

    Uses *db_path* when given, otherwise the path returned by get_db_path().
    Rows are returned as sqlite3.Row, then _load_vec_extension and init_db
    are applied to the connection. If either setup step raises, the
    connection is closed before the exception propagates so the handle is
    not leaked.
    """
    path = db_path if db_path else get_db_path()
    conn = sqlite3.connect(str(path), timeout=10.0)
    try:
        conn.row_factory = sqlite3.Row
        _load_vec_extension(conn)
        init_db(conn)
    except BaseException:
        # Cleanup-and-reraise only; the original error is what callers see.
        conn.close()
        raise
    return conn
198
+
199
+
200
+ def _existing_sha(conn: sqlite3.Connection, project_id: str, rel: str) -> str | None:
201
+ row = conn.execute(
202
+ "SELECT sha1 FROM codebase_files WHERE project_id = ? AND path = ?",
203
+ (project_id, rel),
204
+ ).fetchone()
205
+ return row["sha1"] if row else None
206
+
207
+
208
+ def _drop_old_chunks(conn: sqlite3.Connection, project_id: str, rel: str) -> None:
209
+ """Remove all existing chunk + vector rows for a (project, path)."""
210
+ ids = [
211
+ r["id"] for r in conn.execute(
212
+ "SELECT id FROM codebase_chunks WHERE project_id = ? AND path = ?",
213
+ (project_id, rel),
214
+ ).fetchall()
215
+ ]
216
+ if not ids:
217
+ return
218
+ placeholders = ",".join("?" * len(ids))
219
+ try:
220
+ conn.execute(
221
+ f"DELETE FROM codebase_chunks_vec WHERE chunk_id IN ({placeholders})",
222
+ ids,
223
+ )
224
+ except sqlite3.OperationalError:
225
+ pass # vec0 unavailable; nothing to clean
226
+ conn.execute(
227
+ f"DELETE FROM codebase_chunks WHERE id IN ({placeholders})", ids
228
+ )
229
+
230
+
231
# Writes the chunks of one file: one codebase_chunks row per chunk, plus one
# codebase_chunks_vec row for each chunk that has a usable embedding.
# `embeddings` is positional (embeddings[i] belongs to chunks[i]) and may be
# None, in which case only text rows are written. Returns the sum of the
# per-chunk token estimates. No commit is issued here.
+ def _store_chunks(
232
+ conn: sqlite3.Connection,
233
+ project_id: str,
234
+ rel: str,
235
+ chunks: list[str],
236
+ embeddings: list[list[float]] | None,
237
+ ) -> int:
238
+ """Insert one row per chunk (+ optional embedding). Returns total tokens."""
239
+ total_tokens = 0
240
+ for idx, chunk in enumerate(chunks):
241
# Token count is an estimate from character length, never less than 1.
+ tokens = max(1, len(chunk) // CHARS_PER_TOKEN)
242
+ total_tokens += tokens
243
+ cur = conn.execute(
244
+ """
245
+ INSERT INTO codebase_chunks (project_id, path, chunk_idx, text, token_count)
246
+ VALUES (?, ?, ?, ?, ?)
247
+ """,
248
+ (project_id, rel, idx, chunk, tokens),
249
+ )
250
+ chunk_id = cur.lastrowid
251
# A shorter embeddings list is tolerated: chunks beyond it get no vector.
+ if embeddings is not None and idx < len(embeddings):
252
+ vec = embeddings[idx]
253
# Vectors of any other length are skipped without a log message.
+ if len(vec) == EMBED_DIM:
254
+ try:
255
+ conn.execute(
256
+ "INSERT OR REPLACE INTO codebase_chunks_vec (chunk_id, embedding) VALUES (?, ?)",
257
+ (chunk_id, json.dumps(vec)),
258
+ )
259
# A failed vector insert is logged and skipped; the text row is kept.
+ except sqlite3.OperationalError as exc:
260
+ _logger.warning("vec0 insert failed (chunk_id=%d): %s", chunk_id, exc)
261
+ return total_tokens
262
+
263
+
264
# Inserts the codebase_files row for (project_id, rel) or, when a row with
# that key already exists, overwrites its sha1, size_bytes, token_count and
# last_indexed. last_indexed is set by SQLite via datetime('now').
# No commit is issued here.
+ def _upsert_file(
265
+ conn: sqlite3.Connection,
266
+ project_id: str,
267
+ rel: str,
268
+ sha1: str,
269
+ size_bytes: int,
270
+ token_count: int,
271
+ ) -> None:
272
+ conn.execute(
273
+ """
274
+ INSERT INTO codebase_files (project_id, path, sha1, size_bytes, token_count, last_indexed)
275
+ VALUES (?, ?, ?, ?, ?, datetime('now'))
276
+ ON CONFLICT (project_id, path) DO UPDATE SET
277
+ sha1 = excluded.sha1,
278
+ size_bytes = excluded.size_bytes,
279
+ token_count = excluded.token_count,
280
+ last_indexed = excluded.last_indexed
281
+ """,
282
+ (project_id, rel, sha1, size_bytes, token_count),
283
+ )
284
+
285
+
286
+ # ---------------------------------------------------------------------------
287
+ # Top-level ingest
288
+ # ---------------------------------------------------------------------------
289
+
290
# Result record returned by ingest_project and printed by cmd_ingest.
+ @dataclass
291
+ class IngestSummary:
292
# Identifier the rows were stored under.
+ project_id: str
293
# Resolved project directory that was walked.
+ root: Path
294
# Every file yielded by the walker.
+ files_seen: int
295
# Files whose rows were (re)written in this run, including empty files.
+ files_indexed: int
296
# Files skipped because they were already indexed with the same SHA-1.
+ files_unchanged: int
297
# Files that could not be read or were not valid UTF-8.
+ files_failed: int
298
+ chunks_written: int
299
# Number of vectors returned by the embedder for the files written.
+ chunks_embedded: int
300
# Sum of the per-chunk token estimates of the files written.
+ tokens_total: int
301
# Wall-clock duration in seconds, rounded to two decimals.
+ duration_s: float
302
+
303
+
304
def _vectors_missing(conn: sqlite3.Connection, project_id: str, rel: str) -> bool:
    """Return True when a stored chunk of (project_id, rel) has no vector row.

    Returns False when the file has no chunks, or when the vector table
    cannot be queried (re-ingesting could not store vectors either).
    """
    ids = [
        r["id"] for r in conn.execute(
            "SELECT id FROM codebase_chunks WHERE project_id = ? AND path = ?",
            (project_id, rel),
        ).fetchall()
    ]
    if not ids:
        return False
    placeholders = ",".join("?" * len(ids))
    try:
        row = conn.execute(
            f"SELECT COUNT(*) FROM codebase_chunks_vec WHERE chunk_id IN ({placeholders})",
            ids,
        ).fetchone()
    except sqlite3.OperationalError:
        return False
    return row[0] < len(ids)


def ingest_project(
    root: Path,
    *,
    project_id: str | None = None,
    db_path: Path | None = None,
    embed: bool = True,
) -> IngestSummary:
    """Walk *root*, ingest changed source files into the codebase tables.

    project_id defaults to the basename of *root* (so memstrata-pro/
    becomes "memstrata-pro"). Pass an explicit value when the harness or
    extension uses a different identifier.

    A file is skipped as unchanged when its SHA-1 matches the recorded one.
    With embed=True the file must also have a vector for every stored chunk;
    files that were written while the embedder was unreachable are therefore
    picked up again on the next run instead of staying without vectors.
    Each file is committed on its own. Files that cannot be read or decoded
    are counted in files_failed and do not abort the run.

    Raises:
        FileNotFoundError: if *root* is not a directory.
    """
    start = time.monotonic()
    root = root.resolve()
    if not root.is_dir():
        raise FileNotFoundError(f"not a directory: {root}")

    pid = project_id or root.name
    conn = _open_conn(db_path)

    seen = indexed = unchanged = failed = 0
    chunks_written = chunks_embedded = 0
    tokens_total = 0

    try:
        for ref in iter_source_files(root):
            seen += 1
            try:
                raw = ref.path.read_bytes()
            except OSError:
                # Vanished, unreadable, or not a regular file any more.
                failed += 1
                continue
            sha = _sha1_hex(raw)
            if _existing_sha(conn, pid, ref.rel) == sha:
                # Same content: only redo the file when embeddings were
                # requested and an earlier run left chunks without vectors.
                if not (embed and _vectors_missing(conn, pid, ref.rel)):
                    unchanged += 1
                    continue

            text = _read_text(ref.path)
            if text is None:
                failed += 1
                continue

            chunks = chunk_text(text)
            if not chunks:
                # Empty file - still record it as seen so we don't keep retrying.
                _drop_old_chunks(conn, pid, ref.rel)
                _upsert_file(conn, pid, ref.rel, sha, len(raw), 0)
                conn.commit()
                indexed += 1
                continue

            embeddings: list[list[float]] | None = None
            if embed:
                embeddings = []
                for batch_start in range(0, len(chunks), EMBED_BATCH):
                    batch = chunks[batch_start: batch_start + EMBED_BATCH]
                    got = _embed_batch(batch)
                    if got is None:
                        embeddings = None  # bail; store text without vectors
                        break
                    embeddings.extend(got)
                if embeddings is not None:
                    chunks_embedded += len(embeddings)

            _drop_old_chunks(conn, pid, ref.rel)
            written_tokens = _store_chunks(conn, pid, ref.rel, chunks, embeddings)
            _upsert_file(conn, pid, ref.rel, sha, len(raw), written_tokens)
            conn.commit()
            indexed += 1
            chunks_written += len(chunks)
            tokens_total += written_tokens
    finally:
        conn.close()

    duration = round(time.monotonic() - start, 2)
    return IngestSummary(
        project_id=pid,
        root=root,
        files_seen=seen,
        files_indexed=indexed,
        files_unchanged=unchanged,
        files_failed=failed,
        chunks_written=chunks_written,
        chunks_embedded=chunks_embedded,
        tokens_total=tokens_total,
        duration_s=duration,
    )
391
+
392
+
393
+ # ---------------------------------------------------------------------------
394
+ # CLI
395
+ # ---------------------------------------------------------------------------
396
+
397
def cmd_ingest(args: argparse.Namespace) -> None:
    """Entry point for `memstrata ingest <path>`.

    Reads ``args.path``, ``args.project_id`` and ``args.no_embed``. Exits
    with status 1 (message on stderr) when the path does not exist or is not
    a directory; otherwise runs ingest_project and prints its summary.

    The "Ollama may be offline" hint is printed only when embedding was
    requested and produced nothing; with --no-embed an empty embedding count
    is expected and not worth a warning.
    """
    root = Path(args.path).expanduser().resolve()
    if not root.exists():
        print(f"ingest: path does not exist: {root}", file=sys.stderr)
        sys.exit(1)
    if not root.is_dir():
        print(f"ingest: not a directory: {root}", file=sys.stderr)
        sys.exit(1)

    embed = not args.no_embed

    logging.basicConfig(level=logging.WARNING, format="%(levelname)s: %(message)s")
    print(f"[memstrata ingest] root: {root}")
    print(f"[memstrata ingest] project_id: {args.project_id or root.name}")
    print(f"[memstrata ingest] embed: {embed}")
    print()

    summary = ingest_project(
        root,
        project_id=args.project_id,
        embed=embed,
    )

    print(f" files seen: {summary.files_seen}")
    print(f" files indexed: {summary.files_indexed}")
    print(f" files unchanged: {summary.files_unchanged}")
    print(f" files failed: {summary.files_failed}")
    print(f" chunks written: {summary.chunks_written}")
    print(f" chunks embedded: {summary.chunks_embedded}")
    print(f" tokens total: {summary.tokens_total:,}")
    print(f" duration: {summary.duration_s}s")
    if embed and summary.chunks_embedded == 0 and summary.chunks_written > 0:
        print(
            "\n ! No embeddings were stored. Ollama at http://localhost:11434 "
            "may be offline. Re-run with the same command after starting it; "
            "unchanged files will be skipped automatically."
        )