mikoshi-0.1.9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mikoshi/config.py ADDED
@@ -0,0 +1,125 @@
+ from __future__ import annotations
+
+ import os
+ from dataclasses import dataclass
+ from pathlib import Path
+
+ DEFAULT_MAX_BYTES = 1_000_000
+ DEFAULT_CHUNK_LINES = 120
+ DEFAULT_CHUNK_OVERLAP = 20
+ DEFAULT_EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
+ DEFAULT_OPENAI_EMBED_MODEL = "text-embedding-3-small"
+
+
+ @dataclass(frozen=True)
+ class EmbeddingConfig:
+     provider: str
+     model: str
+     openai_api_key: str | None
+     openai_base_url: str | None
+
+
+ @dataclass(frozen=True)
+ class Config:
+     max_bytes: int
+     chunk_lines: int
+     chunk_overlap: int
+     embeddings: EmbeddingConfig
+     index_root: Path
+     quiet_external_libs: bool
+
+
+ class ConfigError(RuntimeError):
+     pass
+
+
+ def _env_bool(name: str, default: bool) -> bool:
+     raw = os.getenv(name)
+     if raw is None:
+         return default
+     value = raw.strip().lower()
+     if value in {"1", "true", "yes", "on"}:
+         return True
+     if value in {"0", "false", "no", "off"}:
+         return False
+     return default
+
+
+ _EXTERNAL_LIBS_CONFIGURED = False
+
+
+ def configure_external_libs(quiet: bool) -> None:
+     global _EXTERNAL_LIBS_CONFIGURED
+     if not quiet or _EXTERNAL_LIBS_CONFIGURED:
+         return
+
+     os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
+     os.environ["TRANSFORMERS_VERBOSITY"] = "error"
+     os.environ["TOKENIZERS_PARALLELISM"] = "false"
+     os.environ["PYTHONWARNINGS"] = "ignore"
+     os.environ.setdefault("HF_HUB_DISABLE_TELEMETRY", "1")
+     os.environ.setdefault("OMP_NUM_THREADS", "1")
+     os.environ.setdefault("MKL_NUM_THREADS", "1")
+     os.environ.setdefault("VECLIB_MAXIMUM_THREADS", "1")
+     os.environ.setdefault("NUMEXPR_NUM_THREADS", "1")
+
+     import logging
+
+     logging.getLogger("transformers").setLevel(logging.ERROR)
+     logging.getLogger("sentence_transformers").setLevel(logging.ERROR)
+     logging.getLogger("huggingface_hub").setLevel(logging.ERROR)
+
+     try:
+         from transformers.utils import logging as transformers_logging
+
+         transformers_logging.set_verbosity_error()
+         if hasattr(transformers_logging, "disable_progress_bar"):
+             transformers_logging.disable_progress_bar()
+     except Exception:
+         pass
+
+     _EXTERNAL_LIBS_CONFIGURED = True
+
+
+ def load_config() -> Config:
+     provider = os.getenv("MIKOSHI_EMBEDDINGS_PROVIDER", "local").strip().lower()
+     if provider not in {"local", "openai"}:
+         raise ConfigError(
+             "MIKOSHI_EMBEDDINGS_PROVIDER must be 'local' or 'openai'."
+         )
+
+     model = os.getenv("MIKOSHI_EMBEDDINGS_MODEL", DEFAULT_EMBEDDING_MODEL)
+     openai_model = os.getenv("MIKOSHI_OPENAI_EMBED_MODEL", DEFAULT_OPENAI_EMBED_MODEL)
+
+     openai_api_key = os.getenv("MIKOSHI_OPENAI_API_KEY") or os.getenv("OPENAI_API_KEY")
+     openai_base_url = os.getenv("MIKOSHI_OPENAI_BASE_URL") or os.getenv("OPENAI_BASE_URL")
+
+     if provider == "openai":
+         if not openai_api_key:
+             raise ConfigError(
+                 "OpenAI embeddings selected but no API key found. "
+                 "Set MIKOSHI_OPENAI_API_KEY or OPENAI_API_KEY."
+             )
+         model = openai_model
+
+     index_root = Path(os.getenv("MIKOSHI_INDEX_ROOT", "~/.mikoshi")).expanduser()
+     max_bytes = int(os.getenv("MIKOSHI_MAX_BYTES", str(DEFAULT_MAX_BYTES)))
+     chunk_lines = int(os.getenv("MIKOSHI_CHUNK_LINES", str(DEFAULT_CHUNK_LINES)))
+     chunk_overlap = int(os.getenv("MIKOSHI_CHUNK_OVERLAP", str(DEFAULT_CHUNK_OVERLAP)))
+     quiet_external_libs = _env_bool("MIKOSHI_QUIET_EXTERNAL_LIBS", True)
+
+     embeddings = EmbeddingConfig(
+         provider=provider,
+         model=model,
+         openai_api_key=openai_api_key,
+         openai_base_url=openai_base_url,
+     )
+
+     return Config(
+         max_bytes=max_bytes,
+         chunk_lines=chunk_lines,
+         chunk_overlap=chunk_overlap,
+         embeddings=embeddings,
+         index_root=index_root,
+         quiet_external_libs=quiet_external_libs,
+     )
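
For context, a minimal sketch of how this configuration module might be consumed, based only on the names defined in the hunk above; the environment-variable values shown are illustrative overrides, not requirements:

    import os
    from mikoshi.config import ConfigError, configure_external_libs, load_config

    # Illustrative overrides; every MIKOSHI_* variable is optional and
    # falls back to the defaults defined above.
    os.environ["MIKOSHI_EMBEDDINGS_PROVIDER"] = "local"
    os.environ["MIKOSHI_CHUNK_LINES"] = "80"

    try:
        config = load_config()
    except ConfigError as exc:
        raise SystemExit(str(exc))

    # Silences HuggingFace/transformers logging and progress bars when
    # quiet mode is enabled (the default).
    configure_external_libs(config.quiet_external_libs)
    print(config.embeddings.provider, config.chunk_lines, config.index_root)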
@@ -0,0 +1,32 @@
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+
+
+ DEFAULT_PLAN = "free"
+ DEFAULT_FEATURES = ("local_index", "mcp")
+
+
+ class UpgradeRequired(RuntimeError):
+     pass
+
+
+ @dataclass(frozen=True)
+ class Entitlements:
+     plan: str
+     features: frozenset[str]
+
+     def has(self, feature: str) -> bool:
+         return feature in self.features
+
+     def require(self, feature: str) -> None:
+         if not self.has(feature):
+             raise UpgradeRequired(f"Available with Mikoshi Pro: {feature}")
+
+
+ def from_plan_features(plan: str | None, features: list[str] | None) -> Entitlements:
+     normalized_plan = (plan or DEFAULT_PLAN).strip().lower()
+     normalized_features = frozenset(
+         {item.strip() for item in (features or list(DEFAULT_FEATURES)) if item}
+     )
+     return Entitlements(plan=normalized_plan, features=normalized_features)
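
A short sketch of how these entitlements might be checked. The import path and the "cloud_sync" feature name are assumptions for illustration (this hunk does not show the module's file name); the behavior shown follows directly from the code above:

    # Hypothetical import path -- the hunk above omits the file name.
    from mikoshi.entitlements import UpgradeRequired, from_plan_features

    ent = from_plan_features("free", None)  # falls back to DEFAULT_FEATURES
    print(ent.has("local_index"))  # True
    print(ent.has("cloud_sync"))   # False: not in the default feature set

    try:
        ent.require("cloud_sync")
    except UpgradeRequired as exc:
        print(exc)  # "Available with Mikoshi Pro: cloud_sync"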
mikoshi/hashing.py ADDED
@@ -0,0 +1,11 @@
+ from __future__ import annotations
+
+ import hashlib
+
+
+ def sha256_bytes(data: bytes) -> str:
+     return hashlib.sha256(data).hexdigest()
+
+
+ def sha256_text(text: str) -> str:
+     return sha256_bytes(text.encode("utf-8"))
mikoshi/ignore.py ADDED
@@ -0,0 +1,139 @@
+ from __future__ import annotations
+
+ import os
+ from pathlib import Path
+ from typing import Iterable
+
+ from pathspec import PathSpec
+
+ COMMON_IGNORE_PATTERNS = [
+     "**/.git/",
+     "**/.hg/",
+     "**/.svn/",
+     "**/.DS_Store",
+     "**/__pycache__/",
+     "**/*.pyc",
+     "**/.venv/",
+     "**/venv/",
+     "**/node_modules/",
+     "**/dist/",
+     "**/build/",
+     "**/.idea/",
+     "**/.vscode/",
+     "**/.mypy_cache/",
+     "**/.pytest_cache/",
+     "**/coverage/",
+     "**/.coverage",
+     "**/*.log",
+ ]
+
+ COMMON_IGNORE_DIRS = {
+     ".git",
+     ".hg",
+     ".svn",
+     "node_modules",
+     "dist",
+     "build",
+     ".venv",
+     "venv",
+     "__pycache__",
+     ".idea",
+     ".vscode",
+     ".mypy_cache",
+     ".pytest_cache",
+ }
+
+
+ def _convert_gitignore_pattern(
+     pattern: str,
+     base_dir: Path,
+     repo_root: Path,
+ ) -> str | None:
+     stripped = pattern.strip()
+     if not stripped or stripped.startswith("#"):
+         return None
+
+     negated = stripped.startswith("!")
+     if negated:
+         stripped = stripped[1:]
+
+     if not stripped:
+         return None
+
+     rel_base = base_dir.relative_to(repo_root).as_posix()
+     prefix = "" if rel_base == "." else f"{rel_base}/"
+
+     anchored = stripped.startswith("/")
+     if anchored:
+         body = stripped.lstrip("/")
+         converted = f"{prefix}{body}"
+     else:
+         if "/" in stripped:
+             converted = f"{prefix}{stripped}"
+         else:
+             converted = f"{prefix}**/{stripped}"
+
+     return f"!{converted}" if negated else converted
+
+
+ def _load_gitignore_files(repo_root: Path) -> list[Path]:
+     gitignores: list[Path] = []
+     for root, dirs, files in os.walk(repo_root):
+         dirs[:] = [d for d in dirs if d not in COMMON_IGNORE_DIRS]
+         if ".gitignore" in files:
+             gitignores.append(Path(root) / ".gitignore")
+     return sorted(gitignores, key=lambda p: len(p.relative_to(repo_root).parts))
+
+
+ def _load_patterns(repo_root: Path) -> list[str]:
+     patterns: list[str] = list(COMMON_IGNORE_PATTERNS)
+     for gitignore in _load_gitignore_files(repo_root):
+         try:
+             content = gitignore.read_text(encoding="utf-8")
+         except OSError:
+             continue
+         for line in content.splitlines():
+             converted = _convert_gitignore_pattern(line, gitignore.parent, repo_root)
+             if converted:
+                 patterns.append(converted)
+     return patterns
+
+
+ def build_ignore_spec(repo_root: Path) -> PathSpec:
+     patterns = _load_patterns(repo_root)
+     return PathSpec.from_lines("gitwildmatch", patterns)
+
+
+ class IgnoreMatcher:
+     def __init__(self, repo_root: Path) -> None:
+         self.repo_root = repo_root
+         self.spec = build_ignore_spec(repo_root)
+
+     def is_ignored(self, path: Path) -> bool:
+         try:
+             relpath = path.relative_to(self.repo_root).as_posix()
+         except ValueError:
+             relpath = path.as_posix()
+         if path.is_dir() and not relpath.endswith("/"):
+             relpath = f"{relpath}/"
+         if self.spec.match_file(relpath):
+             return True
+         if not path.is_dir():
+             parent = path.parent
+             while parent != self.repo_root and parent != parent.parent:
+                 parent_rel = parent.relative_to(self.repo_root).as_posix()
+                 if not parent_rel.endswith("/"):
+                     parent_rel = f"{parent_rel}/"
+                 if self.spec.match_file(parent_rel):
+                     return True
+                 parent = parent.parent
+         return False
+
+     def is_ignored_rel(self, relpath: str) -> bool:
+         return self.spec.match_file(relpath)
+
+
+ def iter_ignored(paths: Iterable[Path], matcher: IgnoreMatcher) -> Iterable[Path]:
+     for path in paths:
+         if matcher.is_ignored(path):
+             yield path
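
A brief sketch of how the matcher might be used against a checkout; the repository path is illustrative, and the comments describe what the calls do given the rules above:

    from pathlib import Path
    from mikoshi.ignore import IgnoreMatcher

    repo = Path("~/projects/example-repo").expanduser()  # illustrative path
    # Combines the built-in COMMON_IGNORE_PATTERNS with every .gitignore
    # discovered under the repository root.
    matcher = IgnoreMatcher(repo)

    # True: falls under the built-in "**/node_modules/" directory pattern.
    print(matcher.is_ignored(repo / "node_modules" / "left-pad" / "index.js"))
    # False unless some .gitignore rule in the repo matches it.
    print(matcher.is_ignored(repo / "src" / "app.py"))
    # Matches a pre-computed relative path against the spec directly,
    # without the parent-directory walk used by is_ignored().
    print(matcher.is_ignored_rel("dist/bundle.js"))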
@@ -0,0 +1,9 @@
+ from .indexer import index_repo
+ from .index_store import IndexStore
+ from .file_scanner import scan_repo
+
+ __all__ = [
+     "index_repo",
+     "IndexStore",
+     "scan_repo",
+ ]
@@ -0,0 +1,60 @@
+ from __future__ import annotations
+
+ import os
+ from pathlib import Path
+
+ from mikoshi.ignore import IgnoreMatcher
+
+
+ class ScanError(RuntimeError):
+     pass
+
+
+ def _is_binary(path: Path) -> bool:
+     try:
+         with path.open("rb") as handle:
+             chunk = handle.read(4096)
+     except OSError:
+         return True
+     if b"\x00" in chunk:
+         return True
+     try:
+         chunk.decode("utf-8")
+     except UnicodeDecodeError:
+         return True
+     return False
+
+
+ def scan_repo(
+     repo_root: Path,
+     matcher: IgnoreMatcher,
+     max_bytes: int,
+ ) -> list[Path]:
+     if not repo_root.exists() or not repo_root.is_dir():
+         raise ScanError(f"Repository path does not exist: {repo_root}")
+
+     files: list[Path] = []
+     for root, dirs, filenames in os.walk(repo_root):
+         root_path = Path(root)
+         dirs[:] = [
+             d
+             for d in dirs
+             if not matcher.is_ignored(root_path / d)
+         ]
+         for name in filenames:
+             file_path = root_path / name
+             if matcher.is_ignored(file_path):
+                 continue
+             if file_path.is_symlink():
+                 continue
+             try:
+                 size = file_path.stat().st_size
+             except OSError:
+                 continue
+             if size > max_bytes:
+                 continue
+             if _is_binary(file_path):
+                 continue
+             files.append(file_path)
+
+     return sorted(files)
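
A minimal sketch of wiring the scanner to the ignore matcher and the configured size limit; only names defined elsewhere in this diff are used, and the repository path is illustrative:

    from pathlib import Path
    from mikoshi.config import load_config
    from mikoshi.ignore import IgnoreMatcher
    from mikoshi.indexing.file_scanner import scan_repo

    config = load_config()
    repo = Path("~/projects/example-repo").expanduser()  # illustrative path

    # Returns a sorted list of indexable text files; ignored, oversized,
    # binary, and symlinked entries are skipped.
    files = scan_repo(repo, IgnoreMatcher(repo), config.max_bytes)
    print(len(files))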
@@ -0,0 +1,87 @@
+ from __future__ import annotations
+
+ import json
+ import shutil
+ from pathlib import Path
+ from typing import Iterable
+
+ import numpy as np
+ import faiss
+
+ from mikoshi.hashing import sha256_text
+ from mikoshi.utils.types import Chunk, IndexMeta
+
+
+ class IndexStore:
+     def __init__(self, repo_root: Path, index_root: Path) -> None:
+         self.repo_root = repo_root
+         self.repo_id = repo_id_for_path(repo_root)
+         self.store_dir = index_root / self.repo_id
+         self.meta_path = self.store_dir / "meta.json"
+         self.chunks_path = self.store_dir / "chunks.jsonl"
+         self.embeddings_path = self.store_dir / "embeddings.npy"
+         self.faiss_path = self.store_dir / "index.faiss"
+
+     def exists(self) -> bool:
+         return self.meta_path.exists() and self.chunks_path.exists()
+
+     def ensure_dir(self) -> None:
+         self.store_dir.mkdir(parents=True, exist_ok=True)
+
+     def clear(self) -> None:
+         if self.store_dir.exists():
+             shutil.rmtree(self.store_dir)
+
+     def load_meta(self) -> IndexMeta | None:
+         if not self.meta_path.exists():
+             return None
+         data = json.loads(self.meta_path.read_text(encoding="utf-8"))
+         return IndexMeta.from_dict(data)
+
+     def save_meta(self, meta: IndexMeta) -> None:
+         self.ensure_dir()
+         self.meta_path.write_text(
+             json.dumps(meta.to_dict(), indent=2, sort_keys=True),
+             encoding="utf-8",
+         )
+
+     def load_chunks(self) -> list[Chunk]:
+         if not self.chunks_path.exists():
+             return []
+         chunks: list[Chunk] = []
+         with self.chunks_path.open("r", encoding="utf-8") as handle:
+             for line in handle:
+                 line = line.strip()
+                 if not line:
+                     continue
+                 chunks.append(Chunk.from_dict(json.loads(line)))
+         return chunks
+
+     def save_chunks(self, chunks: Iterable[Chunk]) -> None:
+         self.ensure_dir()
+         with self.chunks_path.open("w", encoding="utf-8") as handle:
+             for chunk in chunks:
+                 handle.write(json.dumps(chunk.to_dict(), ensure_ascii=False))
+                 handle.write("\n")
+
+     def load_embeddings(self) -> np.ndarray | None:
+         if not self.embeddings_path.exists():
+             return None
+         return np.load(self.embeddings_path)
+
+     def save_embeddings(self, embeddings: np.ndarray) -> None:
+         self.ensure_dir()
+         np.save(self.embeddings_path, embeddings)
+
+     def load_faiss(self) -> faiss.Index | None:
+         if not self.faiss_path.exists():
+             return None
+         return faiss.read_index(str(self.faiss_path))
+
+     def save_faiss(self, index: faiss.Index) -> None:
+         self.ensure_dir()
+         faiss.write_index(index, str(self.faiss_path))
+
+
+ def repo_id_for_path(repo_root: Path) -> str:
+     return sha256_text(str(repo_root.resolve()))[:16]
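
A short sketch of inspecting an existing on-disk index through the store API above; the repository path is illustrative:

    from pathlib import Path
    from mikoshi.config import load_config
    from mikoshi.indexing.index_store import IndexStore

    config = load_config()
    repo = Path("~/projects/example-repo").expanduser()  # illustrative path

    # Index artifacts live under <index_root>/<repo_id>/, where repo_id is
    # the first 16 hex chars of the SHA-256 of the resolved repo path.
    store = IndexStore(repo, config.index_root)
    if store.exists():
        meta = store.load_meta()              # parsed meta.json
        chunks = store.load_chunks()          # entries from chunks.jsonl
        embeddings = store.load_embeddings()  # embeddings.npy, or None
        index = store.load_faiss()            # index.faiss, or None
        print(store.repo_id, len(chunks))
    else:
        print("no index yet for", store.store_dir)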
@@ -0,0 +1,237 @@
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ from datetime import datetime, timezone
+ from pathlib import Path
+ from typing import Iterable
+
+ import faiss
+ import numpy as np
+
+ from mikoshi.chunking import chunk_text
+ from mikoshi.config import Config, load_config
+ from mikoshi.hashing import sha256_bytes, sha256_text
+ from mikoshi.ignore import IgnoreMatcher
+ from mikoshi.indexing.file_scanner import scan_repo
+ from mikoshi.indexing.index_store import IndexStore
+ from mikoshi.retrieval.semantic import get_embeddings_provider, normalize_embeddings
+ from mikoshi.utils.timer import Timer
+ from mikoshi.utils.types import Chunk, IndexMeta, IndexResult
+
+
+ class IndexerError(RuntimeError):
+     pass
+
+
+ @dataclass(frozen=True)
+ class IndexedFile:
+     path: Path
+     relpath: str
+     file_hash: str
+     text: str
+
+
+ def _read_file(path: Path) -> bytes:
+     return path.read_bytes()
+
+
+ def _prepare_file(path: Path, repo_root: Path) -> IndexedFile:
+     data = _read_file(path)
+     file_hash = sha256_bytes(data)
+     text = data.decode("utf-8", errors="ignore")
+     relpath = path.relative_to(repo_root).as_posix()
+     return IndexedFile(path=path, relpath=relpath, file_hash=file_hash, text=text)
+
+
+ def _chunks_from_text(
+     relpath: str,
+     file_hash: str,
+     text: str,
+     max_lines: int,
+     overlap: int,
+ ) -> list[Chunk]:
+     spans = chunk_text(text, max_lines, overlap)
+     chunks: list[Chunk] = []
+     for span in spans:
+         chunk_id = sha256_text(
+             f"{relpath}:{span.start_line}:{span.end_line}:{span.text}"
+         )
+         chunks.append(
+             Chunk(
+                 id=chunk_id,
+                 relpath=relpath,
+                 start_line=span.start_line,
+                 end_line=span.end_line,
+                 text=span.text,
+                 file_hash=file_hash,
+                 vector_idx=None,
+             )
+         )
+     return chunks
+
+
+ def _with_vector_idx(chunks: Iterable[Chunk]) -> list[Chunk]:
+     return [
+         Chunk(
+             id=chunk.id,
+             relpath=chunk.relpath,
+             start_line=chunk.start_line,
+             end_line=chunk.end_line,
+             text=chunk.text,
+             file_hash=chunk.file_hash,
+             vector_idx=idx,
+         )
+         for idx, chunk in enumerate(chunks)
+     ]
+
+
+ def _reuse_chunks(
+     prev_chunks: list[Chunk],
+     prev_embeddings: np.ndarray | None,
+ ) -> tuple[dict[str, list[Chunk]], dict[str, np.ndarray]]:
+     chunks_by_file: dict[str, list[Chunk]] = {}
+     embedding_by_id: dict[str, np.ndarray] = {}
+
+     if not prev_chunks:
+         return chunks_by_file, embedding_by_id
+
+     if prev_embeddings is not None:
+         for chunk in prev_chunks:
+             if chunk.vector_idx is None:
+                 continue
+             if 0 <= chunk.vector_idx < len(prev_embeddings):
+                 embedding_by_id[chunk.id] = prev_embeddings[chunk.vector_idx]
+
+     for chunk in prev_chunks:
+         chunks_by_file.setdefault(chunk.relpath, []).append(chunk)
+
+     return chunks_by_file, embedding_by_id
+
+
+ def index_repo(repo_path: str, config: Config | None = None) -> IndexResult:
+     config = config or load_config()
+     repo_root = Path(repo_path).expanduser().resolve()
+
+     matcher = IgnoreMatcher(repo_root)
+     store = IndexStore(repo_root, config.index_root)
+
+     prev_meta = store.load_meta()
+     prev_chunks = store.load_chunks()
+     prev_embeddings = store.load_embeddings()
+     prev_files = prev_meta.files if prev_meta else {}
+
+     if prev_meta:
+         if (
+             prev_meta.chunk_lines != config.chunk_lines
+             or prev_meta.chunk_overlap != config.chunk_overlap
+             or prev_meta.max_bytes != config.max_bytes
+             or prev_meta.embedding_provider != config.embeddings.provider
+             or prev_meta.model != config.embeddings.model
+         ):
+             prev_meta = None
+             prev_chunks = []
+             prev_embeddings = None
+             prev_files = {}
+
+     prev_chunks_by_file, prev_embedding_by_id = _reuse_chunks(
+         prev_chunks,
+         prev_embeddings,
+     )
+
+     with Timer() as timer:
+         files = scan_repo(repo_root, matcher, config.max_bytes)
+         indexed_files: list[IndexedFile] = []
+         for path in files:
+             try:
+                 indexed_files.append(_prepare_file(path, repo_root))
+             except OSError:
+                 continue
+
+         file_hashes = {item.relpath: item.file_hash for item in indexed_files}
+
+         new_chunks: list[Chunk] = []
+         embedding_slots: list[np.ndarray | None] = []
+         pending_texts: list[str] = []
+         pending_indices: list[int] = []
+
+         for item in indexed_files:
+             relpath = item.relpath
+             unchanged = prev_files.get(relpath) == item.file_hash
+             if unchanged and relpath in prev_chunks_by_file and prev_embedding_by_id:
+                 existing_chunks = prev_chunks_by_file[relpath]
+                 if all(chunk.id in prev_embedding_by_id for chunk in existing_chunks):
+                     for chunk in existing_chunks:
+                         new_chunks.append(chunk)
+                         embedding_slots.append(prev_embedding_by_id[chunk.id])
+                     continue
+
+             chunks = _chunks_from_text(
+                 relpath,
+                 item.file_hash,
+                 item.text,
+                 config.chunk_lines,
+                 config.chunk_overlap,
+             )
+             for chunk in chunks:
+                 pending_indices.append(len(new_chunks))
+                 new_chunks.append(chunk)
+                 embedding_slots.append(None)
+                 pending_texts.append(chunk.text)
+
+         provider = None
+         if pending_texts:
+             provider = get_embeddings_provider(config)
+             pending_embeddings = provider.embed_texts(pending_texts)
+         else:
+             pending_embeddings = np.zeros((0, 0), dtype=np.float32)
+
+         if provider is not None and provider.dimension:
+             dimension = provider.dimension
+         elif prev_embeddings is not None and prev_embeddings.size:
+             dimension = int(prev_embeddings.shape[1])
+         elif prev_meta and prev_meta.embedding_dim:
+             dimension = prev_meta.embedding_dim
+         else:
+             dimension = int(pending_embeddings.shape[1]) if pending_embeddings.size else 1
+
+         for slot_idx, embedding in zip(pending_indices, pending_embeddings):
+             embedding_slots[slot_idx] = embedding
+
+         if embedding_slots:
+             all_embeddings = np.stack(embedding_slots).astype(np.float32)
+         else:
+             all_embeddings = np.zeros((0, dimension), dtype=np.float32)
+         all_embeddings = normalize_embeddings(all_embeddings)
+
+         final_chunks = _with_vector_idx(new_chunks)
+
+         index = faiss.IndexFlatIP(dimension)
+         if all_embeddings.size:
+             index.add(all_embeddings)
+
+         now = datetime.now(timezone.utc).isoformat()
+         meta = IndexMeta(
+             repo_id=store.repo_id,
+             repo_path=str(repo_root),
+             created_at=prev_meta.created_at if prev_meta else now,
+             updated_at=now,
+             embedding_provider=config.embeddings.provider,
+             model=config.embeddings.model,
+             embedding_dim=dimension,
+             chunk_lines=config.chunk_lines,
+             chunk_overlap=config.chunk_overlap,
+             max_bytes=config.max_bytes,
+             files=file_hashes,
+             chunks=len(final_chunks),
+         )
+
+         store.save_chunks(final_chunks)
+         store.save_embeddings(all_embeddings)
+         store.save_faiss(index)
+         store.save_meta(meta)
+
+     return IndexResult(
+         repo_id=store.repo_id,
+         chunks_indexed=len(final_chunks),
+         took_ms=timer.ms,
+     )
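
Finally, a sketch of driving a full (re)build through the top-level entry point; unchanged files whose chunk embeddings are already on disk are reused rather than re-embedded, and the whole index is rebuilt from scratch if the chunking or embedding settings in the stored metadata differ from the current configuration. The repository path is illustrative:

    from mikoshi.indexing import index_repo

    # Builds (or incrementally refreshes) the index stored under the
    # configured index root (MIKOSHI_INDEX_ROOT, default ~/.mikoshi).
    result = index_repo("~/projects/example-repo")  # illustrative path
    print(result.repo_id, result.chunks_indexed, result.took_ms)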