mikoshi-0.1.9-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mikoshi/__init__.py +3 -0
- mikoshi/auth.py +265 -0
- mikoshi/chunking.py +44 -0
- mikoshi/cli.py +295 -0
- mikoshi/config.py +125 -0
- mikoshi/entitlements.py +32 -0
- mikoshi/hashing.py +11 -0
- mikoshi/ignore.py +139 -0
- mikoshi/indexing/__init__.py +9 -0
- mikoshi/indexing/file_scanner.py +60 -0
- mikoshi/indexing/index_store.py +87 -0
- mikoshi/indexing/indexer.py +237 -0
- mikoshi/mcp_server/__init__.py +3 -0
- mikoshi/mcp_server/server.py +135 -0
- mikoshi/retrieval/__init__.py +17 -0
- mikoshi/retrieval/hybrid.py +109 -0
- mikoshi/retrieval/lexical.py +68 -0
- mikoshi/retrieval/rerank.py +27 -0
- mikoshi/retrieval/semantic.py +175 -0
- mikoshi/utils/__init__.py +11 -0
- mikoshi/utils/timer.py +18 -0
- mikoshi/utils/types.py +111 -0
- mikoshi-0.1.9.dist-info/METADATA +52 -0
- mikoshi-0.1.9.dist-info/RECORD +26 -0
- mikoshi-0.1.9.dist-info/WHEEL +5 -0
- mikoshi-0.1.9.dist-info/top_level.txt +1 -0
mikoshi/config.py
ADDED
@@ -0,0 +1,125 @@
from __future__ import annotations

import os
from dataclasses import dataclass
from pathlib import Path

DEFAULT_MAX_BYTES = 1_000_000
DEFAULT_CHUNK_LINES = 120
DEFAULT_CHUNK_OVERLAP = 20
DEFAULT_EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
DEFAULT_OPENAI_EMBED_MODEL = "text-embedding-3-small"


@dataclass(frozen=True)
class EmbeddingConfig:
    provider: str
    model: str
    openai_api_key: str | None
    openai_base_url: str | None


@dataclass(frozen=True)
class Config:
    max_bytes: int
    chunk_lines: int
    chunk_overlap: int
    embeddings: EmbeddingConfig
    index_root: Path
    quiet_external_libs: bool


class ConfigError(RuntimeError):
    pass


def _env_bool(name: str, default: bool) -> bool:
    raw = os.getenv(name)
    if raw is None:
        return default
    value = raw.strip().lower()
    if value in {"1", "true", "yes", "on"}:
        return True
    if value in {"0", "false", "no", "off"}:
        return False
    return default


_EXTERNAL_LIBS_CONFIGURED = False


def configure_external_libs(quiet: bool) -> None:
    global _EXTERNAL_LIBS_CONFIGURED
    if not quiet or _EXTERNAL_LIBS_CONFIGURED:
        return

    os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
    os.environ["TRANSFORMERS_VERBOSITY"] = "error"
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    os.environ["PYTHONWARNINGS"] = "ignore"
    os.environ.setdefault("HF_HUB_DISABLE_TELEMETRY", "1")
    os.environ.setdefault("OMP_NUM_THREADS", "1")
    os.environ.setdefault("MKL_NUM_THREADS", "1")
    os.environ.setdefault("VECLIB_MAXIMUM_THREADS", "1")
    os.environ.setdefault("NUMEXPR_NUM_THREADS", "1")

    import logging

    logging.getLogger("transformers").setLevel(logging.ERROR)
    logging.getLogger("sentence_transformers").setLevel(logging.ERROR)
    logging.getLogger("huggingface_hub").setLevel(logging.ERROR)

    try:
        from transformers.utils import logging as transformers_logging

        transformers_logging.set_verbosity_error()
        if hasattr(transformers_logging, "disable_progress_bar"):
            transformers_logging.disable_progress_bar()
    except Exception:
        pass

    _EXTERNAL_LIBS_CONFIGURED = True


def load_config() -> Config:
    provider = os.getenv("MIKOSHI_EMBEDDINGS_PROVIDER", "local").strip().lower()
    if provider not in {"local", "openai"}:
        raise ConfigError(
            "MIKOSHI_EMBEDDINGS_PROVIDER must be 'local' or 'openai'."
        )

    model = os.getenv("MIKOSHI_EMBEDDINGS_MODEL", DEFAULT_EMBEDDING_MODEL)
    openai_model = os.getenv("MIKOSHI_OPENAI_EMBED_MODEL", DEFAULT_OPENAI_EMBED_MODEL)

    openai_api_key = os.getenv("MIKOSHI_OPENAI_API_KEY") or os.getenv("OPENAI_API_KEY")
    openai_base_url = os.getenv("MIKOSHI_OPENAI_BASE_URL") or os.getenv("OPENAI_BASE_URL")

    if provider == "openai":
        if not openai_api_key:
            raise ConfigError(
                "OpenAI embeddings selected but no API key found. "
                "Set MIKOSHI_OPENAI_API_KEY or OPENAI_API_KEY."
            )
        model = openai_model

    index_root = Path(os.getenv("MIKOSHI_INDEX_ROOT", "~/.mikoshi")).expanduser()
    max_bytes = int(os.getenv("MIKOSHI_MAX_BYTES", str(DEFAULT_MAX_BYTES)))
    chunk_lines = int(os.getenv("MIKOSHI_CHUNK_LINES", str(DEFAULT_CHUNK_LINES)))
    chunk_overlap = int(os.getenv("MIKOSHI_CHUNK_OVERLAP", str(DEFAULT_CHUNK_OVERLAP)))
    quiet_external_libs = _env_bool("MIKOSHI_QUIET_EXTERNAL_LIBS", True)

    embeddings = EmbeddingConfig(
        provider=provider,
        model=model,
        openai_api_key=openai_api_key,
        openai_base_url=openai_base_url,
    )

    return Config(
        max_bytes=max_bytes,
        chunk_lines=chunk_lines,
        chunk_overlap=chunk_overlap,
        embeddings=embeddings,
        index_root=index_root,
        quiet_external_libs=quiet_external_libs,
    )
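For orientation only (not part of the published package), a minimal sketch of how the environment-driven loader above would be exercised; the environment values are illustrative:

# Illustrative only: exercise load_config() with example environment values.
import os

from mikoshi.config import ConfigError, load_config

os.environ["MIKOSHI_EMBEDDINGS_PROVIDER"] = "local"   # or "openai" (requires an API key)
os.environ["MIKOSHI_CHUNK_LINES"] = "80"              # overrides DEFAULT_CHUNK_LINES

try:
    config = load_config()
    print(config.embeddings.model, config.chunk_lines, config.index_root)
except ConfigError as exc:
    print(f"invalid configuration: {exc}")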
mikoshi/entitlements.py
ADDED
@@ -0,0 +1,32 @@
from __future__ import annotations

from dataclasses import dataclass


DEFAULT_PLAN = "free"
DEFAULT_FEATURES = ("local_index", "mcp")


class UpgradeRequired(RuntimeError):
    pass


@dataclass(frozen=True)
class Entitlements:
    plan: str
    features: frozenset[str]

    def has(self, feature: str) -> bool:
        return feature in self.features

    def require(self, feature: str) -> None:
        if not self.has(feature):
            raise UpgradeRequired(f"Available with Mikoshi Pro: {feature}")


def from_plan_features(plan: str | None, features: list[str] | None) -> Entitlements:
    normalized_plan = (plan or DEFAULT_PLAN).strip().lower()
    normalized_features = frozenset(
        {item.strip() for item in (features or list(DEFAULT_FEATURES)) if item}
    )
    return Entitlements(plan=normalized_plan, features=normalized_features)
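A brief usage sketch of the gating API above (illustrative, not part of the diff); the feature name "cloud_sync" is a made-up example, not a feature the package defines:

# Illustrative only: build entitlements and gate a hypothetical feature.
from mikoshi.entitlements import UpgradeRequired, from_plan_features

ent = from_plan_features(None, None)   # defaults: plan "free", features {"local_index", "mcp"}
print(ent.has("mcp"))                  # True

try:
    ent.require("cloud_sync")          # hypothetical feature name, used only for illustration
except UpgradeRequired as exc:
    print(exc)                         # "Available with Mikoshi Pro: cloud_sync"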
mikoshi/hashing.py
ADDED
mikoshi/ignore.py
ADDED
@@ -0,0 +1,139 @@
from __future__ import annotations

import os
from pathlib import Path
from typing import Iterable

from pathspec import PathSpec

COMMON_IGNORE_PATTERNS = [
    "**/.git/",
    "**/.hg/",
    "**/.svn/",
    "**/.DS_Store",
    "**/__pycache__/",
    "**/*.pyc",
    "**/.venv/",
    "**/venv/",
    "**/node_modules/",
    "**/dist/",
    "**/build/",
    "**/.idea/",
    "**/.vscode/",
    "**/.mypy_cache/",
    "**/.pytest_cache/",
    "**/coverage/",
    "**/.coverage",
    "**/*.log",
]

COMMON_IGNORE_DIRS = {
    ".git",
    ".hg",
    ".svn",
    "node_modules",
    "dist",
    "build",
    ".venv",
    "venv",
    "__pycache__",
    ".idea",
    ".vscode",
    ".mypy_cache",
    ".pytest_cache",
}


def _convert_gitignore_pattern(
    pattern: str,
    base_dir: Path,
    repo_root: Path,
) -> str | None:
    stripped = pattern.strip()
    if not stripped or stripped.startswith("#"):
        return None

    negated = stripped.startswith("!")
    if negated:
        stripped = stripped[1:]

    if not stripped:
        return None

    rel_base = base_dir.relative_to(repo_root).as_posix()
    prefix = "" if rel_base == "." else f"{rel_base}/"

    anchored = stripped.startswith("/")
    if anchored:
        body = stripped.lstrip("/")
        converted = f"{prefix}{body}"
    else:
        if "/" in stripped:
            converted = f"{prefix}{stripped}"
        else:
            converted = f"{prefix}**/{stripped}"

    return f"!{converted}" if negated else converted


def _load_gitignore_files(repo_root: Path) -> list[Path]:
    gitignores: list[Path] = []
    for root, dirs, files in os.walk(repo_root):
        dirs[:] = [d for d in dirs if d not in COMMON_IGNORE_DIRS]
        if ".gitignore" in files:
            gitignores.append(Path(root) / ".gitignore")
    return sorted(gitignores, key=lambda p: len(p.relative_to(repo_root).parts))


def _load_patterns(repo_root: Path) -> list[str]:
    patterns: list[str] = list(COMMON_IGNORE_PATTERNS)
    for gitignore in _load_gitignore_files(repo_root):
        try:
            content = gitignore.read_text(encoding="utf-8")
        except OSError:
            continue
        for line in content.splitlines():
            converted = _convert_gitignore_pattern(line, gitignore.parent, repo_root)
            if converted:
                patterns.append(converted)
    return patterns


def build_ignore_spec(repo_root: Path) -> PathSpec:
    patterns = _load_patterns(repo_root)
    return PathSpec.from_lines("gitwildmatch", patterns)


class IgnoreMatcher:
    def __init__(self, repo_root: Path) -> None:
        self.repo_root = repo_root
        self.spec = build_ignore_spec(repo_root)

    def is_ignored(self, path: Path) -> bool:
        try:
            relpath = path.relative_to(self.repo_root).as_posix()
        except ValueError:
            relpath = path.as_posix()
        if path.is_dir() and not relpath.endswith("/"):
            relpath = f"{relpath}/"
        if self.spec.match_file(relpath):
            return True
        if not path.is_dir():
            parent = path.parent
            while parent != self.repo_root and parent != parent.parent:
                parent_rel = parent.relative_to(self.repo_root).as_posix()
                if not parent_rel.endswith("/"):
                    parent_rel = f"{parent_rel}/"
                if self.spec.match_file(parent_rel):
                    return True
                parent = parent.parent
        return False

    def is_ignored_rel(self, relpath: str) -> bool:
        return self.spec.match_file(relpath)


def iter_ignored(paths: Iterable[Path], matcher: IgnoreMatcher) -> Iterable[Path]:
    for path in paths:
        if matcher.is_ignored(path):
            yield path
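As a rough usage sketch (assuming pathspec is installed; the repository path below is illustrative), the matcher above combines the built-in patterns with any .gitignore files found under the root:

# Illustrative only: check a few paths against the combined ignore rules.
from pathlib import Path

from mikoshi.ignore import IgnoreMatcher, iter_ignored

repo = Path("~/projects/example-repo").expanduser()   # illustrative path
matcher = IgnoreMatcher(repo)

print(matcher.is_ignored(repo / "node_modules"))      # True via COMMON_IGNORE_PATTERNS
print(matcher.is_ignored_rel("src/app.py"))           # False unless a .gitignore says otherwise

for ignored in iter_ignored([repo / ".venv", repo / "README.md"], matcher):
    print("ignored:", ignored)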
mikoshi/indexing/file_scanner.py
ADDED
@@ -0,0 +1,60 @@
from __future__ import annotations

import os
from pathlib import Path

from mikoshi.ignore import IgnoreMatcher


class ScanError(RuntimeError):
    pass


def _is_binary(path: Path) -> bool:
    try:
        with path.open("rb") as handle:
            chunk = handle.read(4096)
    except OSError:
        return True
    if b"\x00" in chunk:
        return True
    try:
        chunk.decode("utf-8")
    except UnicodeDecodeError:
        return True
    return False


def scan_repo(
    repo_root: Path,
    matcher: IgnoreMatcher,
    max_bytes: int,
) -> list[Path]:
    if not repo_root.exists() or not repo_root.is_dir():
        raise ScanError(f"Repository path does not exist: {repo_root}")

    files: list[Path] = []
    for root, dirs, filenames in os.walk(repo_root):
        root_path = Path(root)
        dirs[:] = [
            d
            for d in dirs
            if not matcher.is_ignored(root_path / d)
        ]
        for name in filenames:
            file_path = root_path / name
            if matcher.is_ignored(file_path):
                continue
            if file_path.is_symlink():
                continue
            try:
                size = file_path.stat().st_size
            except OSError:
                continue
            if size > max_bytes:
                continue
            if _is_binary(file_path):
                continue
            files.append(file_path)

    return sorted(files)
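A short sketch of calling the scanner above (illustrative path and size limit, not part of the diff):

# Illustrative only: list indexable files under a repository root.
from pathlib import Path

from mikoshi.ignore import IgnoreMatcher
from mikoshi.indexing.file_scanner import ScanError, scan_repo

repo = Path("~/projects/example-repo").expanduser()   # illustrative path
try:
    files = scan_repo(repo, IgnoreMatcher(repo), max_bytes=1_000_000)
    print(f"{len(files)} text files under the size limit")
except ScanError as exc:
    print(exc)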
mikoshi/indexing/index_store.py
ADDED
@@ -0,0 +1,87 @@
from __future__ import annotations

import json
import shutil
from pathlib import Path
from typing import Iterable

import numpy as np
import faiss

from mikoshi.hashing import sha256_text
from mikoshi.utils.types import Chunk, IndexMeta


class IndexStore:
    def __init__(self, repo_root: Path, index_root: Path) -> None:
        self.repo_root = repo_root
        self.repo_id = repo_id_for_path(repo_root)
        self.store_dir = index_root / self.repo_id
        self.meta_path = self.store_dir / "meta.json"
        self.chunks_path = self.store_dir / "chunks.jsonl"
        self.embeddings_path = self.store_dir / "embeddings.npy"
        self.faiss_path = self.store_dir / "index.faiss"

    def exists(self) -> bool:
        return self.meta_path.exists() and self.chunks_path.exists()

    def ensure_dir(self) -> None:
        self.store_dir.mkdir(parents=True, exist_ok=True)

    def clear(self) -> None:
        if self.store_dir.exists():
            shutil.rmtree(self.store_dir)

    def load_meta(self) -> IndexMeta | None:
        if not self.meta_path.exists():
            return None
        data = json.loads(self.meta_path.read_text(encoding="utf-8"))
        return IndexMeta.from_dict(data)

    def save_meta(self, meta: IndexMeta) -> None:
        self.ensure_dir()
        self.meta_path.write_text(
            json.dumps(meta.to_dict(), indent=2, sort_keys=True),
            encoding="utf-8",
        )

    def load_chunks(self) -> list[Chunk]:
        if not self.chunks_path.exists():
            return []
        chunks: list[Chunk] = []
        with self.chunks_path.open("r", encoding="utf-8") as handle:
            for line in handle:
                line = line.strip()
                if not line:
                    continue
                chunks.append(Chunk.from_dict(json.loads(line)))
        return chunks

    def save_chunks(self, chunks: Iterable[Chunk]) -> None:
        self.ensure_dir()
        with self.chunks_path.open("w", encoding="utf-8") as handle:
            for chunk in chunks:
                handle.write(json.dumps(chunk.to_dict(), ensure_ascii=False))
                handle.write("\n")

    def load_embeddings(self) -> np.ndarray | None:
        if not self.embeddings_path.exists():
            return None
        return np.load(self.embeddings_path)

    def save_embeddings(self, embeddings: np.ndarray) -> None:
        self.ensure_dir()
        np.save(self.embeddings_path, embeddings)

    def load_faiss(self) -> faiss.Index | None:
        if not self.faiss_path.exists():
            return None
        return faiss.read_index(str(self.faiss_path))

    def save_faiss(self, index: faiss.Index) -> None:
        self.ensure_dir()
        faiss.write_index(index, str(self.faiss_path))


def repo_id_for_path(repo_root: Path) -> str:
    return sha256_text(str(repo_root.resolve()))[:16]
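For orientation, a sketch of reading an existing on-disk index through the store above (illustrative repository path; ~/.mikoshi is the default index root from config.py; nothing here is part of the published code):

# Illustrative only: open a previously built index and inspect it.
from pathlib import Path

from mikoshi.indexing.index_store import IndexStore

store = IndexStore(
    Path("~/projects/example-repo").expanduser(),   # illustrative repo path
    Path("~/.mikoshi").expanduser(),                # default index root
)

if store.exists():
    meta = store.load_meta()
    chunks = store.load_chunks()
    embeddings = store.load_embeddings()
    print(store.repo_id, len(chunks), None if embeddings is None else embeddings.shape)
else:
    print("no index yet under", store.store_dir)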
mikoshi/indexing/indexer.py
ADDED
@@ -0,0 +1,237 @@
from __future__ import annotations

from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Iterable

import faiss
import numpy as np

from mikoshi.chunking import chunk_text
from mikoshi.config import Config, load_config
from mikoshi.hashing import sha256_bytes, sha256_text
from mikoshi.ignore import IgnoreMatcher
from mikoshi.indexing.file_scanner import scan_repo
from mikoshi.indexing.index_store import IndexStore
from mikoshi.retrieval.semantic import get_embeddings_provider, normalize_embeddings
from mikoshi.utils.timer import Timer
from mikoshi.utils.types import Chunk, IndexMeta, IndexResult


class IndexerError(RuntimeError):
    pass


@dataclass(frozen=True)
class IndexedFile:
    path: Path
    relpath: str
    file_hash: str
    text: str


def _read_file(path: Path) -> bytes:
    return path.read_bytes()


def _prepare_file(path: Path, repo_root: Path) -> IndexedFile:
    data = _read_file(path)
    file_hash = sha256_bytes(data)
    text = data.decode("utf-8", errors="ignore")
    relpath = path.relative_to(repo_root).as_posix()
    return IndexedFile(path=path, relpath=relpath, file_hash=file_hash, text=text)


def _chunks_from_text(
    relpath: str,
    file_hash: str,
    text: str,
    max_lines: int,
    overlap: int,
) -> list[Chunk]:
    spans = chunk_text(text, max_lines, overlap)
    chunks: list[Chunk] = []
    for span in spans:
        chunk_id = sha256_text(
            f"{relpath}:{span.start_line}:{span.end_line}:{span.text}"
        )
        chunks.append(
            Chunk(
                id=chunk_id,
                relpath=relpath,
                start_line=span.start_line,
                end_line=span.end_line,
                text=span.text,
                file_hash=file_hash,
                vector_idx=None,
            )
        )
    return chunks


def _with_vector_idx(chunks: Iterable[Chunk]) -> list[Chunk]:
    return [
        Chunk(
            id=chunk.id,
            relpath=chunk.relpath,
            start_line=chunk.start_line,
            end_line=chunk.end_line,
            text=chunk.text,
            file_hash=chunk.file_hash,
            vector_idx=idx,
        )
        for idx, chunk in enumerate(chunks)
    ]


def _reuse_chunks(
    prev_chunks: list[Chunk],
    prev_embeddings: np.ndarray | None,
) -> tuple[dict[str, list[Chunk]], dict[str, np.ndarray]]:
    chunks_by_file: dict[str, list[Chunk]] = {}
    embedding_by_id: dict[str, np.ndarray] = {}

    if not prev_chunks:
        return chunks_by_file, embedding_by_id

    if prev_embeddings is not None:
        for chunk in prev_chunks:
            if chunk.vector_idx is None:
                continue
            if 0 <= chunk.vector_idx < len(prev_embeddings):
                embedding_by_id[chunk.id] = prev_embeddings[chunk.vector_idx]

    for chunk in prev_chunks:
        chunks_by_file.setdefault(chunk.relpath, []).append(chunk)

    return chunks_by_file, embedding_by_id


def index_repo(repo_path: str, config: Config | None = None) -> IndexResult:
    config = config or load_config()
    repo_root = Path(repo_path).expanduser().resolve()

    matcher = IgnoreMatcher(repo_root)
    store = IndexStore(repo_root, config.index_root)

    prev_meta = store.load_meta()
    prev_chunks = store.load_chunks()
    prev_embeddings = store.load_embeddings()
    prev_files = prev_meta.files if prev_meta else {}

    if prev_meta:
        if (
            prev_meta.chunk_lines != config.chunk_lines
            or prev_meta.chunk_overlap != config.chunk_overlap
            or prev_meta.max_bytes != config.max_bytes
            or prev_meta.embedding_provider != config.embeddings.provider
            or prev_meta.model != config.embeddings.model
        ):
            prev_meta = None
            prev_chunks = []
            prev_embeddings = None
            prev_files = {}

    prev_chunks_by_file, prev_embedding_by_id = _reuse_chunks(
        prev_chunks,
        prev_embeddings,
    )

    with Timer() as timer:
        files = scan_repo(repo_root, matcher, config.max_bytes)
        indexed_files: list[IndexedFile] = []
        for path in files:
            try:
                indexed_files.append(_prepare_file(path, repo_root))
            except OSError:
                continue

        file_hashes = {item.relpath: item.file_hash for item in indexed_files}

        new_chunks: list[Chunk] = []
        embedding_slots: list[np.ndarray | None] = []
        pending_texts: list[str] = []
        pending_indices: list[int] = []

        for item in indexed_files:
            relpath = item.relpath
            unchanged = prev_files.get(relpath) == item.file_hash
            if unchanged and relpath in prev_chunks_by_file and prev_embedding_by_id:
                existing_chunks = prev_chunks_by_file[relpath]
                if all(chunk.id in prev_embedding_by_id for chunk in existing_chunks):
                    for chunk in existing_chunks:
                        new_chunks.append(chunk)
                        embedding_slots.append(prev_embedding_by_id[chunk.id])
                    continue

            chunks = _chunks_from_text(
                relpath,
                item.file_hash,
                item.text,
                config.chunk_lines,
                config.chunk_overlap,
            )
            for chunk in chunks:
                pending_indices.append(len(new_chunks))
                new_chunks.append(chunk)
                embedding_slots.append(None)
                pending_texts.append(chunk.text)

        provider = None
        if pending_texts:
            provider = get_embeddings_provider(config)
            pending_embeddings = provider.embed_texts(pending_texts)
        else:
            pending_embeddings = np.zeros((0, 0), dtype=np.float32)

        if provider is not None and provider.dimension:
            dimension = provider.dimension
        elif prev_embeddings is not None and prev_embeddings.size:
            dimension = int(prev_embeddings.shape[1])
        elif prev_meta and prev_meta.embedding_dim:
            dimension = prev_meta.embedding_dim
        else:
            dimension = int(pending_embeddings.shape[1]) if pending_embeddings.size else 1

        for slot_idx, embedding in zip(pending_indices, pending_embeddings):
            embedding_slots[slot_idx] = embedding

        if embedding_slots:
            all_embeddings = np.stack(embedding_slots).astype(np.float32)
        else:
            all_embeddings = np.zeros((0, dimension), dtype=np.float32)
        all_embeddings = normalize_embeddings(all_embeddings)

        final_chunks = _with_vector_idx(new_chunks)

        index = faiss.IndexFlatIP(dimension)
        if all_embeddings.size:
            index.add(all_embeddings)

        now = datetime.now(timezone.utc).isoformat()
        meta = IndexMeta(
            repo_id=store.repo_id,
            repo_path=str(repo_root),
            created_at=prev_meta.created_at if prev_meta else now,
            updated_at=now,
            embedding_provider=config.embeddings.provider,
            model=config.embeddings.model,
            embedding_dim=dimension,
            chunk_lines=config.chunk_lines,
            chunk_overlap=config.chunk_overlap,
            max_bytes=config.max_bytes,
            files=file_hashes,
            chunks=len(final_chunks),
        )

        store.save_chunks(final_chunks)
        store.save_embeddings(all_embeddings)
        store.save_faiss(index)
        store.save_meta(meta)

    return IndexResult(
        repo_id=store.repo_id,
        chunks_indexed=len(final_chunks),
        took_ms=timer.ms,
    )
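Finally, a sketch of driving the full pipeline through index_repo as defined above (illustrative repository path; assumes the configured embedding provider and its dependencies are available):

# Illustrative only: build or refresh the index for one repository.
from mikoshi.indexing.indexer import index_repo

result = index_repo("~/projects/example-repo")   # illustrative path
print(result.repo_id, result.chunks_indexed, f"{result.took_ms} ms")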