haiku.rag-slim 0.16.0__py3-none-any.whl → 0.24.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of haiku.rag-slim might be problematic. Click here for more details.
- haiku/rag/app.py +430 -72
- haiku/rag/chunkers/__init__.py +31 -0
- haiku/rag/chunkers/base.py +31 -0
- haiku/rag/chunkers/docling_local.py +164 -0
- haiku/rag/chunkers/docling_serve.py +179 -0
- haiku/rag/cli.py +207 -24
- haiku/rag/cli_chat.py +489 -0
- haiku/rag/client.py +1251 -266
- haiku/rag/config/__init__.py +16 -10
- haiku/rag/config/loader.py +5 -44
- haiku/rag/config/models.py +126 -17
- haiku/rag/converters/__init__.py +31 -0
- haiku/rag/converters/base.py +63 -0
- haiku/rag/converters/docling_local.py +193 -0
- haiku/rag/converters/docling_serve.py +229 -0
- haiku/rag/converters/text_utils.py +237 -0
- haiku/rag/embeddings/__init__.py +123 -24
- haiku/rag/embeddings/voyageai.py +175 -20
- haiku/rag/graph/__init__.py +0 -11
- haiku/rag/graph/agui/__init__.py +8 -2
- haiku/rag/graph/agui/cli_renderer.py +1 -1
- haiku/rag/graph/agui/emitter.py +219 -31
- haiku/rag/graph/agui/server.py +20 -62
- haiku/rag/graph/agui/stream.py +1 -2
- haiku/rag/graph/research/__init__.py +5 -2
- haiku/rag/graph/research/dependencies.py +12 -126
- haiku/rag/graph/research/graph.py +390 -135
- haiku/rag/graph/research/models.py +91 -112
- haiku/rag/graph/research/prompts.py +99 -91
- haiku/rag/graph/research/state.py +35 -27
- haiku/rag/inspector/__init__.py +8 -0
- haiku/rag/inspector/app.py +259 -0
- haiku/rag/inspector/widgets/__init__.py +6 -0
- haiku/rag/inspector/widgets/chunk_list.py +100 -0
- haiku/rag/inspector/widgets/context_modal.py +89 -0
- haiku/rag/inspector/widgets/detail_view.py +130 -0
- haiku/rag/inspector/widgets/document_list.py +75 -0
- haiku/rag/inspector/widgets/info_modal.py +209 -0
- haiku/rag/inspector/widgets/search_modal.py +183 -0
- haiku/rag/inspector/widgets/visual_modal.py +126 -0
- haiku/rag/mcp.py +106 -102
- haiku/rag/monitor.py +33 -9
- haiku/rag/providers/__init__.py +5 -0
- haiku/rag/providers/docling_serve.py +108 -0
- haiku/rag/qa/__init__.py +12 -10
- haiku/rag/qa/agent.py +43 -61
- haiku/rag/qa/prompts.py +35 -57
- haiku/rag/reranking/__init__.py +9 -6
- haiku/rag/reranking/base.py +1 -1
- haiku/rag/reranking/cohere.py +5 -4
- haiku/rag/reranking/mxbai.py +5 -2
- haiku/rag/reranking/vllm.py +3 -4
- haiku/rag/reranking/zeroentropy.py +6 -5
- haiku/rag/store/__init__.py +2 -1
- haiku/rag/store/engine.py +242 -42
- haiku/rag/store/exceptions.py +4 -0
- haiku/rag/store/models/__init__.py +8 -2
- haiku/rag/store/models/chunk.py +190 -0
- haiku/rag/store/models/document.py +46 -0
- haiku/rag/store/repositories/chunk.py +141 -121
- haiku/rag/store/repositories/document.py +25 -84
- haiku/rag/store/repositories/settings.py +11 -14
- haiku/rag/store/upgrades/__init__.py +19 -3
- haiku/rag/store/upgrades/v0_10_1.py +1 -1
- haiku/rag/store/upgrades/v0_19_6.py +65 -0
- haiku/rag/store/upgrades/v0_20_0.py +68 -0
- haiku/rag/store/upgrades/v0_23_1.py +100 -0
- haiku/rag/store/upgrades/v0_9_3.py +3 -3
- haiku/rag/utils.py +371 -146
- {haiku_rag_slim-0.16.0.dist-info → haiku_rag_slim-0.24.0.dist-info}/METADATA +15 -12
- haiku_rag_slim-0.24.0.dist-info/RECORD +78 -0
- {haiku_rag_slim-0.16.0.dist-info → haiku_rag_slim-0.24.0.dist-info}/WHEEL +1 -1
- haiku/rag/chunker.py +0 -65
- haiku/rag/embeddings/base.py +0 -25
- haiku/rag/embeddings/ollama.py +0 -28
- haiku/rag/embeddings/openai.py +0 -26
- haiku/rag/embeddings/vllm.py +0 -29
- haiku/rag/graph/agui/events.py +0 -254
- haiku/rag/graph/common/__init__.py +0 -5
- haiku/rag/graph/common/models.py +0 -42
- haiku/rag/graph/common/nodes.py +0 -265
- haiku/rag/graph/common/prompts.py +0 -46
- haiku/rag/graph/common/utils.py +0 -44
- haiku/rag/graph/deep_qa/__init__.py +0 -1
- haiku/rag/graph/deep_qa/dependencies.py +0 -27
- haiku/rag/graph/deep_qa/graph.py +0 -243
- haiku/rag/graph/deep_qa/models.py +0 -20
- haiku/rag/graph/deep_qa/prompts.py +0 -59
- haiku/rag/graph/deep_qa/state.py +0 -56
- haiku/rag/graph/research/common.py +0 -87
- haiku/rag/reader.py +0 -135
- haiku_rag_slim-0.16.0.dist-info/RECORD +0 -71
- {haiku_rag_slim-0.16.0.dist-info → haiku_rag_slim-0.24.0.dist-info}/entry_points.txt +0 -0
- {haiku_rag_slim-0.16.0.dist-info → haiku_rag_slim-0.24.0.dist-info}/licenses/LICENSE +0 -0
haiku/rag/qa/prompts.py
CHANGED
|
@@ -1,60 +1,38 @@
|
|
|
1
|
-
QA_SYSTEM_PROMPT = """
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
1. IMMEDIATELY call the search_documents tool with relevant keywords from the user's question
|
|
31
|
-
2. Review the search results and their relevance scores
|
|
32
|
-
3. If you need additional context, perform follow-up searches with different keywords
|
|
33
|
-
4. Provide a short and to the point comprehensive answer based only on the retrieved documents
|
|
34
|
-
5. Always include citations for the sources used in your answer
|
|
1
|
+
QA_SYSTEM_PROMPT = """You are a knowledgeable assistant that answers questions using a document knowledge base.
|
|
2
|
+
|
|
3
|
+
Process:
|
|
4
|
+
1. Call search_documents with relevant keywords from the question
|
|
5
|
+
2. Review the results and their relevance scores
|
|
6
|
+
3. If needed, perform follow-up searches with different keywords (max 3 total)
|
|
7
|
+
4. Provide a concise answer based strictly on the retrieved content
|
|
8
|
+
|
|
9
|
+
The search tool returns results like:
|
|
10
|
+
[chunk_abc123] (score: 0.85)
|
|
11
|
+
Source: "Document Title" > Section > Subsection
|
|
12
|
+
Type: paragraph
|
|
13
|
+
Content:
|
|
14
|
+
The actual text content here...
|
|
15
|
+
|
|
16
|
+
[chunk_def456] (score: 0.72)
|
|
17
|
+
Source: "Another Document"
|
|
18
|
+
Type: table
|
|
19
|
+
Content:
|
|
20
|
+
| Column 1 | Column 2 |
|
|
21
|
+
...
|
|
22
|
+
|
|
23
|
+
Each result includes:
|
|
24
|
+
- chunk_id in brackets and relevance score
|
|
25
|
+
- Source: document title and section hierarchy (when available)
|
|
26
|
+
- Type: content type like paragraph, table, code, list_item (when available)
|
|
27
|
+
- Content: the actual text
|
|
28
|
+
|
|
29
|
+
In your response, include the chunk IDs you used in cited_chunks.
|
|
35
30
|
|
|
36
31
|
Guidelines:
|
|
37
|
-
- Base
|
|
38
|
-
-
|
|
39
|
-
-
|
|
40
|
-
- If
|
|
41
|
-
-
|
|
42
|
-
-
|
|
43
|
-
- ALWAYS include citations at the end of your response using the format below
|
|
44
|
-
|
|
45
|
-
Citation Format:
|
|
46
|
-
After your answer, include a "Citations:" section that lists:
|
|
47
|
-
- The document title (if available) or URI from each search result used
|
|
48
|
-
- A brief excerpt (first 50-100 characters) of the content that supported your answer
|
|
49
|
-
- Format: "Citations:\n- [document title or URI]: [content_excerpt]..."
|
|
50
|
-
|
|
51
|
-
Example response format:
|
|
52
|
-
[Your answer here]
|
|
53
|
-
|
|
54
|
-
Citations:
|
|
55
|
-
- /path/to/document1.pdf: "This document explains that AFMAN stands for Air Force Manual..."
|
|
56
|
-
- /path/to/document2.pdf: "The manual provides guidance on military procedures and..."
|
|
57
|
-
|
|
58
|
-
Be concise, and always maintain accuracy over completeness. Prefer short, direct answers that are well-supported by the documents.
|
|
59
|
-
/no_think
|
|
32
|
+
- Base answers strictly on retrieved content - do not use external knowledge
|
|
33
|
+
- Use the Source and Type metadata to understand context
|
|
34
|
+
- If multiple results are relevant, synthesize them coherently
|
|
35
|
+
- If information is insufficient, say: "I cannot find enough information in the knowledge base to answer this question."
|
|
36
|
+
- Be concise and direct - avoid elaboration unless asked
|
|
37
|
+
- Higher scores indicate more relevant results
|
|
60
38
|
"""
|
haiku/rag/reranking/__init__.py
CHANGED
|
@@ -24,7 +24,7 @@ def get_reranker(config: AppConfig = Config) -> RerankerBase | None:
|
|
|
24
24
|
|
|
25
25
|
reranker: RerankerBase | None = None
|
|
26
26
|
|
|
27
|
-
if config.reranking.provider == "mxbai":
|
|
27
|
+
if config.reranking.model and config.reranking.model.provider == "mxbai":
|
|
28
28
|
try:
|
|
29
29
|
from haiku.rag.reranking.mxbai import MxBAIReranker
|
|
30
30
|
|
|
@@ -33,7 +33,7 @@ def get_reranker(config: AppConfig = Config) -> RerankerBase | None:
|
|
|
33
33
|
except ImportError:
|
|
34
34
|
reranker = None
|
|
35
35
|
|
|
36
|
-
elif config.reranking.provider == "cohere":
|
|
36
|
+
elif config.reranking.model and config.reranking.model.provider == "cohere":
|
|
37
37
|
try:
|
|
38
38
|
from haiku.rag.reranking.cohere import CohereReranker
|
|
39
39
|
|
|
@@ -41,20 +41,23 @@ def get_reranker(config: AppConfig = Config) -> RerankerBase | None:
|
|
|
41
41
|
except ImportError:
|
|
42
42
|
reranker = None
|
|
43
43
|
|
|
44
|
-
elif config.reranking.provider == "vllm":
|
|
44
|
+
elif config.reranking.model and config.reranking.model.provider == "vllm":
|
|
45
45
|
try:
|
|
46
46
|
from haiku.rag.reranking.vllm import VLLMReranker
|
|
47
47
|
|
|
48
|
-
|
|
48
|
+
base_url = config.reranking.model.base_url
|
|
49
|
+
if not base_url:
|
|
50
|
+
raise ValueError("vLLM reranker requires base_url in reranking.model")
|
|
51
|
+
reranker = VLLMReranker(config.reranking.model.name, base_url)
|
|
49
52
|
except ImportError:
|
|
50
53
|
reranker = None
|
|
51
54
|
|
|
52
|
-
elif config.reranking.provider == "zeroentropy":
|
|
55
|
+
elif config.reranking.model and config.reranking.model.provider == "zeroentropy":
|
|
53
56
|
try:
|
|
54
57
|
from haiku.rag.reranking.zeroentropy import ZeroEntropyReranker
|
|
55
58
|
|
|
56
59
|
# Use configured model or default to zerank-1
|
|
57
|
-
model = config.reranking.model or "zerank-1"
|
|
60
|
+
model = config.reranking.model.name or "zerank-1"
|
|
58
61
|
reranker = ZeroEntropyReranker(model)
|
|
59
62
|
except ImportError:
|
|
60
63
|
reranker = None
|
haiku/rag/reranking/base.py
CHANGED
|
@@ -3,7 +3,7 @@ from haiku.rag.store.models.chunk import Chunk
|
|
|
3
3
|
|
|
4
4
|
|
|
5
5
|
class RerankerBase:
|
|
6
|
-
_model: str = Config.reranking.model
|
|
6
|
+
_model: str | None = Config.reranking.model.name if Config.reranking.model else None
|
|
7
7
|
|
|
8
8
|
async def rerank(
|
|
9
9
|
self, query: str, chunks: list[Chunk], top_n: int = 10
|
haiku/rag/reranking/cohere.py
CHANGED
|
@@ -9,10 +9,10 @@ except ImportError as e:
|
|
|
9
9
|
) from e
|
|
10
10
|
|
|
11
11
|
|
|
12
|
-
class CohereReranker(RerankerBase):
|
|
12
|
+
class CohereReranker(RerankerBase): # pragma: no cover
|
|
13
13
|
def __init__(self):
|
|
14
14
|
# Cohere SDK reads CO_API_KEY from environment by default
|
|
15
|
-
self._client = cohere.
|
|
15
|
+
self._client = cohere.AsyncClientV2()
|
|
16
16
|
|
|
17
17
|
async def rerank(
|
|
18
18
|
self, query: str, chunks: list[Chunk], top_n: int = 10
|
|
@@ -22,8 +22,9 @@ class CohereReranker(RerankerBase):
|
|
|
22
22
|
|
|
23
23
|
documents = [chunk.content for chunk in chunks]
|
|
24
24
|
|
|
25
|
-
|
|
26
|
-
|
|
25
|
+
model_name = self._model or "rerank-v3.5"
|
|
26
|
+
response = await self._client.rerank(
|
|
27
|
+
model=model_name, query=query, documents=documents, top_n=top_n
|
|
27
28
|
)
|
|
28
29
|
|
|
29
30
|
reranked_chunks = []
|
haiku/rag/reranking/mxbai.py
CHANGED
|
@@ -7,9 +7,12 @@ from haiku.rag.store.models.chunk import Chunk
|
|
|
7
7
|
|
|
8
8
|
class MxBAIReranker(RerankerBase):
|
|
9
9
|
def __init__(self):
|
|
10
|
-
|
|
11
|
-
Config.reranking.model
|
|
10
|
+
model_name = (
|
|
11
|
+
Config.reranking.model.name
|
|
12
|
+
if Config.reranking.model
|
|
13
|
+
else "mixedbread-ai/mxbai-rerank-base-v2"
|
|
12
14
|
)
|
|
15
|
+
self._client = MxbaiRerankV2(model_name, disable_transformers_warnings=True)
|
|
13
16
|
|
|
14
17
|
async def rerank(
|
|
15
18
|
self, query: str, chunks: list[Chunk], top_n: int = 10
|
haiku/rag/reranking/vllm.py
CHANGED
|
@@ -1,14 +1,13 @@
|
|
|
1
1
|
import httpx
|
|
2
2
|
|
|
3
|
-
from haiku.rag.config import Config
|
|
4
3
|
from haiku.rag.reranking.base import RerankerBase
|
|
5
4
|
from haiku.rag.store.models.chunk import Chunk
|
|
6
5
|
|
|
7
6
|
|
|
8
|
-
class VLLMReranker(RerankerBase):
|
|
9
|
-
def __init__(self, model: str):
|
|
7
|
+
class VLLMReranker(RerankerBase): # pragma: no cover
|
|
8
|
+
def __init__(self, model: str, base_url: str):
|
|
10
9
|
self._model = model
|
|
11
|
-
self._base_url =
|
|
10
|
+
self._base_url = base_url
|
|
12
11
|
|
|
13
12
|
async def rerank(
|
|
14
13
|
self, query: str, chunks: list[Chunk], top_n: int = 10
|
|
@@ -1,10 +1,10 @@
|
|
|
1
|
-
from zeroentropy import
|
|
1
|
+
from zeroentropy import AsyncZeroEntropy
|
|
2
2
|
|
|
3
3
|
from haiku.rag.reranking.base import RerankerBase
|
|
4
4
|
from haiku.rag.store.models.chunk import Chunk
|
|
5
5
|
|
|
6
6
|
|
|
7
|
-
class ZeroEntropyReranker(RerankerBase):
|
|
7
|
+
class ZeroEntropyReranker(RerankerBase): # pragma: no cover
|
|
8
8
|
"""Zero Entropy reranker implementation using the zerank-1 model."""
|
|
9
9
|
|
|
10
10
|
def __init__(self, model: str = "zerank-1"):
|
|
@@ -15,7 +15,7 @@ class ZeroEntropyReranker(RerankerBase):
|
|
|
15
15
|
"""
|
|
16
16
|
self._model = model
|
|
17
17
|
# Zero Entropy SDK reads ZEROENTROPY_API_KEY from environment by default
|
|
18
|
-
self._client =
|
|
18
|
+
self._client = AsyncZeroEntropy()
|
|
19
19
|
|
|
20
20
|
async def rerank(
|
|
21
21
|
self, query: str, chunks: list[Chunk], top_n: int = 10
|
|
@@ -37,8 +37,9 @@ class ZeroEntropyReranker(RerankerBase):
|
|
|
37
37
|
documents = [chunk.content for chunk in chunks]
|
|
38
38
|
|
|
39
39
|
# Call Zero Entropy reranking API
|
|
40
|
-
|
|
41
|
-
|
|
40
|
+
model_name = self._model or "zerank-1"
|
|
41
|
+
response = await self._client.models.rerank(
|
|
42
|
+
model=model_name,
|
|
42
43
|
query=query,
|
|
43
44
|
documents=documents,
|
|
44
45
|
)
|
haiku/rag/store/__init__.py
CHANGED
haiku/rag/store/engine.py
CHANGED
|
@@ -1,9 +1,10 @@
|
|
|
1
1
|
import asyncio
|
|
2
2
|
import json
|
|
3
3
|
import logging
|
|
4
|
-
from datetime import timedelta
|
|
4
|
+
from datetime import datetime, timedelta
|
|
5
5
|
from importlib import metadata
|
|
6
6
|
from pathlib import Path
|
|
7
|
+
from typing import Any
|
|
7
8
|
from uuid import uuid4
|
|
8
9
|
|
|
9
10
|
import lancedb
|
|
@@ -12,6 +13,7 @@ from pydantic import Field
|
|
|
12
13
|
|
|
13
14
|
from haiku.rag.config import AppConfig, Config
|
|
14
15
|
from haiku.rag.embeddings import get_embedder
|
|
16
|
+
from haiku.rag.store.exceptions import ReadOnlyError
|
|
15
17
|
|
|
16
18
|
logger = logging.getLogger(__name__)
|
|
17
19
|
|
|
@@ -22,6 +24,8 @@ class DocumentRecord(LanceModel):
|
|
|
22
24
|
uri: str | None = None
|
|
23
25
|
title: str | None = None
|
|
24
26
|
metadata: str = Field(default="{}")
|
|
27
|
+
docling_document_json: str | None = None
|
|
28
|
+
docling_version: str | None = None
|
|
25
29
|
created_at: str = Field(default_factory=lambda: "")
|
|
26
30
|
updated_at: str = Field(default_factory=lambda: "")
|
|
27
31
|
|
|
@@ -36,6 +40,7 @@ def create_chunk_model(vector_dim: int):
|
|
|
36
40
|
id: str = Field(default_factory=lambda: str(uuid4()))
|
|
37
41
|
document_id: str
|
|
38
42
|
content: str
|
|
43
|
+
content_fts: str = Field(default="")
|
|
39
44
|
metadata: str = Field(default="{}")
|
|
40
45
|
order: int = Field(default=0)
|
|
41
46
|
vector: Vector(vector_dim) = Field(default_factory=lambda: [0.0] * vector_dim) # type: ignore
|
|
@@ -54,39 +59,67 @@ class Store:
|
|
|
54
59
|
db_path: Path,
|
|
55
60
|
config: AppConfig = Config,
|
|
56
61
|
skip_validation: bool = False,
|
|
57
|
-
|
|
62
|
+
create: bool = False,
|
|
63
|
+
read_only: bool = False,
|
|
64
|
+
before: datetime | None = None,
|
|
58
65
|
):
|
|
59
66
|
self.db_path: Path = db_path
|
|
60
67
|
self._config = config
|
|
68
|
+
self._before = before
|
|
69
|
+
# Time-travel mode is always read-only
|
|
70
|
+
self._read_only = read_only or (before is not None)
|
|
61
71
|
self.embedder = get_embedder(config=self._config)
|
|
62
72
|
self._vacuum_lock = asyncio.Lock()
|
|
63
73
|
|
|
64
74
|
# Create the ChunkRecord model with the correct vector dimension
|
|
65
75
|
self.ChunkRecord = create_chunk_model(self.embedder._vector_dim)
|
|
66
76
|
|
|
67
|
-
#
|
|
77
|
+
# Check if database exists (for local filesystem only)
|
|
78
|
+
is_new_db = False
|
|
68
79
|
if not self._has_cloud_config():
|
|
69
|
-
if not
|
|
70
|
-
|
|
71
|
-
if not db_path.exists():
|
|
80
|
+
if not db_path.exists():
|
|
81
|
+
if not create:
|
|
72
82
|
raise FileNotFoundError(
|
|
73
|
-
f"Database does not exist
|
|
83
|
+
f"Database does not exist at {self.db_path.absolute()}. "
|
|
84
|
+
"Use 'haiku-rag init' to create a new database."
|
|
74
85
|
)
|
|
75
|
-
|
|
76
|
-
#
|
|
86
|
+
is_new_db = True
|
|
87
|
+
# Ensure parent directories exist for new databases
|
|
77
88
|
if not db_path.parent.exists():
|
|
78
89
|
Path.mkdir(db_path.parent, parents=True)
|
|
79
90
|
|
|
80
91
|
# Connect to LanceDB
|
|
81
92
|
self.db = self._connect_to_lancedb(db_path)
|
|
82
93
|
|
|
83
|
-
# Initialize tables
|
|
84
|
-
self.
|
|
94
|
+
# Initialize tables (creates them if they don't exist)
|
|
95
|
+
self._init_tables()
|
|
96
|
+
|
|
97
|
+
# Checkout tables to historical state if before is specified
|
|
98
|
+
if before is not None:
|
|
99
|
+
self._checkout_tables_before(before)
|
|
100
|
+
|
|
101
|
+
# Run upgrades only on existing databases, set version for new ones
|
|
102
|
+
# Skip upgrades in read-only mode (they would fail anyway)
|
|
103
|
+
if not self._read_only:
|
|
104
|
+
if is_new_db:
|
|
105
|
+
self._set_initial_version()
|
|
106
|
+
else:
|
|
107
|
+
self._run_upgrades()
|
|
85
108
|
|
|
86
109
|
# Validate config compatibility after connection is established
|
|
87
110
|
if not skip_validation:
|
|
88
111
|
self._validate_configuration()
|
|
89
112
|
|
|
113
|
+
@property
|
|
114
|
+
def is_read_only(self) -> bool:
|
|
115
|
+
"""Whether the store is in read-only mode."""
|
|
116
|
+
return self._read_only
|
|
117
|
+
|
|
118
|
+
def _assert_writable(self) -> None:
|
|
119
|
+
"""Raise ReadOnlyError if the store is in read-only mode."""
|
|
120
|
+
if self._read_only:
|
|
121
|
+
raise ReadOnlyError("Cannot modify database in read-only mode")
|
|
122
|
+
|
|
90
123
|
async def vacuum(self, retention_seconds: int | None = None) -> None:
|
|
91
124
|
"""Optimize and clean up old versions across all tables to reduce disk usage.
|
|
92
125
|
|
|
@@ -97,7 +130,12 @@ class Store:
|
|
|
97
130
|
Note:
|
|
98
131
|
If vacuum is already running, this method returns immediately without blocking.
|
|
99
132
|
Use asyncio.create_task(store.vacuum()) for non-blocking background execution.
|
|
133
|
+
|
|
134
|
+
Raises:
|
|
135
|
+
ReadOnlyError: If the store is in read-only mode.
|
|
100
136
|
"""
|
|
137
|
+
self._assert_writable()
|
|
138
|
+
|
|
101
139
|
if self._has_cloud_config() and str(self._config.lancedb.uri).startswith(
|
|
102
140
|
"db://"
|
|
103
141
|
):
|
|
@@ -145,6 +183,87 @@ class Store:
|
|
|
145
183
|
and self._config.lancedb.region
|
|
146
184
|
)
|
|
147
185
|
|
|
186
|
+
def get_stats(self) -> dict:
|
|
187
|
+
"""Get comprehensive table statistics.
|
|
188
|
+
|
|
189
|
+
Returns:
|
|
190
|
+
Dictionary with statistics for documents and chunks tables including:
|
|
191
|
+
- Row counts
|
|
192
|
+
- Storage sizes
|
|
193
|
+
- Vector index status and statistics
|
|
194
|
+
"""
|
|
195
|
+
stats_dict: dict = {
|
|
196
|
+
"documents": {"exists": False},
|
|
197
|
+
"chunks": {"exists": False},
|
|
198
|
+
}
|
|
199
|
+
|
|
200
|
+
# Documents table stats
|
|
201
|
+
doc_stats: dict = self.documents_table.stats() # type: ignore[assignment]
|
|
202
|
+
stats_dict["documents"] = {
|
|
203
|
+
"exists": True,
|
|
204
|
+
"num_rows": doc_stats.get("num_rows", 0),
|
|
205
|
+
"total_bytes": doc_stats.get("total_bytes", 0),
|
|
206
|
+
}
|
|
207
|
+
|
|
208
|
+
# Chunks table stats
|
|
209
|
+
chunk_stats: dict = self.chunks_table.stats() # type: ignore[assignment]
|
|
210
|
+
stats_dict["chunks"] = {
|
|
211
|
+
"exists": True,
|
|
212
|
+
"num_rows": chunk_stats.get("num_rows", 0),
|
|
213
|
+
"total_bytes": chunk_stats.get("total_bytes", 0),
|
|
214
|
+
}
|
|
215
|
+
|
|
216
|
+
# Vector index stats
|
|
217
|
+
indices = self.chunks_table.list_indices()
|
|
218
|
+
has_vector_index = any("vector" in str(idx).lower() for idx in indices)
|
|
219
|
+
stats_dict["chunks"]["has_vector_index"] = has_vector_index
|
|
220
|
+
|
|
221
|
+
if has_vector_index:
|
|
222
|
+
index_stats = self.chunks_table.index_stats("vector_idx")
|
|
223
|
+
if index_stats is not None:
|
|
224
|
+
stats_dict["chunks"]["num_indexed_rows"] = index_stats.num_indexed_rows
|
|
225
|
+
stats_dict["chunks"]["num_unindexed_rows"] = (
|
|
226
|
+
index_stats.num_unindexed_rows
|
|
227
|
+
)
|
|
228
|
+
|
|
229
|
+
return stats_dict
|
|
230
|
+
|
|
231
|
+
def _ensure_vector_index(self) -> None:
|
|
232
|
+
"""Create or rebuild vector index on chunks table.
|
|
233
|
+
|
|
234
|
+
Cloud deployments auto-create indexes, so we skip for those.
|
|
235
|
+
For self-hosted, creates an IVF_PQ index. If an index exists,
|
|
236
|
+
it will be replaced (using replace=True parameter).
|
|
237
|
+
Note: Index creation requires sufficient training data.
|
|
238
|
+
"""
|
|
239
|
+
if self._has_cloud_config():
|
|
240
|
+
return
|
|
241
|
+
|
|
242
|
+
try:
|
|
243
|
+
# Check if table has enough data (indexes require training data)
|
|
244
|
+
row_count = self.chunks_table.count_rows()
|
|
245
|
+
if row_count < 256:
|
|
246
|
+
logger.debug(
|
|
247
|
+
f"Skipping vector index creation: need at least 256 rows, have {row_count}"
|
|
248
|
+
)
|
|
249
|
+
return
|
|
250
|
+
|
|
251
|
+
# Create or replace index (replace=True is the default)
|
|
252
|
+
logger.info("Creating vector index on chunks table...")
|
|
253
|
+
self.chunks_table.create_index(
|
|
254
|
+
metric=self._config.search.vector_index_metric,
|
|
255
|
+
index_type="IVF_PQ",
|
|
256
|
+
replace=True, # Explicit: replace existing index
|
|
257
|
+
)
|
|
258
|
+
|
|
259
|
+
# Wait for index creation to complete
|
|
260
|
+
# Index name is column_name + "_idx"
|
|
261
|
+
self.chunks_table.wait_for_index(["vector_idx"], timeout=timedelta(hours=1))
|
|
262
|
+
|
|
263
|
+
logger.info("Vector index created successfully")
|
|
264
|
+
except Exception as e:
|
|
265
|
+
logger.warning(f"Could not create vector index: {e}")
|
|
266
|
+
|
|
148
267
|
def _validate_configuration(self) -> None:
|
|
149
268
|
"""Validate that the configuration is compatible with the database."""
|
|
150
269
|
from haiku.rag.store.repositories.settings import SettingsRepository
|
|
@@ -152,9 +271,8 @@ class Store:
|
|
|
152
271
|
settings_repo = SettingsRepository(self)
|
|
153
272
|
settings_repo.validate_config_compatibility()
|
|
154
273
|
|
|
155
|
-
def
|
|
156
|
-
"""
|
|
157
|
-
|
|
274
|
+
def _init_tables(self):
|
|
275
|
+
"""Initialize database tables (create if they don't exist)."""
|
|
158
276
|
# Get list of existing tables
|
|
159
277
|
existing_tables = self.db.table_names()
|
|
160
278
|
|
|
@@ -171,9 +289,9 @@ class Store:
|
|
|
171
289
|
self.chunks_table = self.db.open_table("chunks")
|
|
172
290
|
else:
|
|
173
291
|
self.chunks_table = self.db.create_table("chunks", schema=self.ChunkRecord)
|
|
174
|
-
# Create FTS index on
|
|
292
|
+
# Create FTS index on content_fts (contextualized content) for better search
|
|
175
293
|
self.chunks_table.create_fts_index(
|
|
176
|
-
"
|
|
294
|
+
"content_fts", replace=True, with_position=True, remove_stop_words=False
|
|
177
295
|
)
|
|
178
296
|
|
|
179
297
|
# Create or get settings table
|
|
@@ -189,34 +307,21 @@ class Store:
|
|
|
189
307
|
[SettingsRecord(id="settings", settings=json.dumps(settings_data))]
|
|
190
308
|
)
|
|
191
309
|
|
|
192
|
-
|
|
310
|
+
def _set_initial_version(self):
|
|
311
|
+
"""Set the initial version for a new database."""
|
|
312
|
+
self.set_haiku_version(metadata.version("haiku.rag-slim"))
|
|
313
|
+
|
|
314
|
+
def _run_upgrades(self):
|
|
315
|
+
"""Run pending database upgrades."""
|
|
193
316
|
try:
|
|
194
317
|
from haiku.rag.store.upgrades import run_pending_upgrades
|
|
195
318
|
|
|
196
319
|
current_version = metadata.version("haiku.rag-slim")
|
|
197
320
|
db_version = self.get_haiku_version()
|
|
198
321
|
|
|
199
|
-
|
|
200
|
-
run_pending_upgrades(self, db_version, current_version)
|
|
201
|
-
|
|
202
|
-
# After upgrades complete (or if none), set stored version
|
|
203
|
-
# to the greater of the installed package version and the
|
|
204
|
-
# highest available upgrade step version in code.
|
|
205
|
-
try:
|
|
206
|
-
from packaging.version import parse as _v
|
|
207
|
-
|
|
208
|
-
from haiku.rag.store.upgrades import upgrades as _steps
|
|
209
|
-
|
|
210
|
-
highest_step = max((_v(u.version) for u in _steps), default=None)
|
|
211
|
-
effective_version = (
|
|
212
|
-
str(max(_v(current_version), highest_step))
|
|
213
|
-
if highest_step is not None
|
|
214
|
-
else current_version
|
|
215
|
-
)
|
|
216
|
-
except Exception:
|
|
217
|
-
effective_version = current_version
|
|
322
|
+
run_pending_upgrades(self, db_version, current_version)
|
|
218
323
|
|
|
219
|
-
self.set_haiku_version(
|
|
324
|
+
self.set_haiku_version(current_version)
|
|
220
325
|
except Exception as e:
|
|
221
326
|
# Avoid hard failure on initial connection; log and continue so CLI remains usable.
|
|
222
327
|
logger.warning(
|
|
@@ -241,7 +346,12 @@ class Store:
|
|
|
241
346
|
return "0.0.0"
|
|
242
347
|
|
|
243
348
|
def set_haiku_version(self, version: str) -> None:
|
|
244
|
-
"""Updates the user version in settings.
|
|
349
|
+
"""Updates the user version in settings.
|
|
350
|
+
|
|
351
|
+
Raises:
|
|
352
|
+
ReadOnlyError: If the store is in read-only mode.
|
|
353
|
+
"""
|
|
354
|
+
self._assert_writable()
|
|
245
355
|
settings_records = list(
|
|
246
356
|
self.settings_table.search().limit(1).to_pydantic(SettingsRecord)
|
|
247
357
|
)
|
|
@@ -267,7 +377,12 @@ class Store:
|
|
|
267
377
|
)
|
|
268
378
|
|
|
269
379
|
def recreate_embeddings_table(self) -> None:
|
|
270
|
-
"""Recreate the chunks table with current vector dimensions.
|
|
380
|
+
"""Recreate the chunks table with current vector dimensions.
|
|
381
|
+
|
|
382
|
+
Raises:
|
|
383
|
+
ReadOnlyError: If the store is in read-only mode.
|
|
384
|
+
"""
|
|
385
|
+
self._assert_writable()
|
|
271
386
|
# Drop and recreate chunks table
|
|
272
387
|
try:
|
|
273
388
|
self.db.drop_table("chunks")
|
|
@@ -278,9 +393,9 @@ class Store:
|
|
|
278
393
|
self.ChunkRecord = create_chunk_model(self.embedder._vector_dim)
|
|
279
394
|
self.chunks_table = self.db.create_table("chunks", schema=self.ChunkRecord)
|
|
280
395
|
|
|
281
|
-
# Create FTS index on
|
|
396
|
+
# Create FTS index on content_fts (contextualized content) for better search
|
|
282
397
|
self.chunks_table.create_fts_index(
|
|
283
|
-
"
|
|
398
|
+
"content_fts", replace=True, with_position=True, remove_stop_words=False
|
|
284
399
|
)
|
|
285
400
|
|
|
286
401
|
def close(self):
|
|
@@ -297,7 +412,12 @@ class Store:
|
|
|
297
412
|
}
|
|
298
413
|
|
|
299
414
|
def restore_table_versions(self, versions: dict[str, int]) -> bool:
|
|
300
|
-
"""Restore tables to the provided versions using LanceDB's API.
|
|
415
|
+
"""Restore tables to the provided versions using LanceDB's API.
|
|
416
|
+
|
|
417
|
+
Raises:
|
|
418
|
+
ReadOnlyError: If the store is in read-only mode.
|
|
419
|
+
"""
|
|
420
|
+
self._assert_writable()
|
|
301
421
|
self.documents_table.restore(int(versions["documents"]))
|
|
302
422
|
self.chunks_table.restore(int(versions["chunks"]))
|
|
303
423
|
self.settings_table.restore(int(versions["settings"]))
|
|
@@ -307,3 +427,83 @@ class Store:
|
|
|
307
427
|
def _connection(self):
|
|
308
428
|
"""Compatibility property for repositories expecting _connection."""
|
|
309
429
|
return self
|
|
430
|
+
|
|
431
|
+
def _checkout_tables_before(self, before: datetime) -> None:
|
|
432
|
+
"""Checkout all tables to their state at or before the given datetime.
|
|
433
|
+
|
|
434
|
+
Args:
|
|
435
|
+
before: The datetime to checkout to
|
|
436
|
+
|
|
437
|
+
Raises:
|
|
438
|
+
ValueError: If no version exists before the given datetime
|
|
439
|
+
"""
|
|
440
|
+
# LanceDB stores timestamps as naive datetimes in local time.
|
|
441
|
+
# Convert 'before' to naive local time for comparison.
|
|
442
|
+
if before.tzinfo is not None:
|
|
443
|
+
# Convert to local time and make naive
|
|
444
|
+
before_local = before.astimezone().replace(tzinfo=None)
|
|
445
|
+
else:
|
|
446
|
+
# Already naive, assume local time
|
|
447
|
+
before_local = before
|
|
448
|
+
|
|
449
|
+
tables = [
|
|
450
|
+
("documents", self.documents_table),
|
|
451
|
+
("chunks", self.chunks_table),
|
|
452
|
+
("settings", self.settings_table),
|
|
453
|
+
]
|
|
454
|
+
|
|
455
|
+
for table_name, table in tables:
|
|
456
|
+
versions = table.list_versions()
|
|
457
|
+
# Find the latest version at or before the target datetime
|
|
458
|
+
# Versions are sorted by version number, not timestamp, so we need to check all
|
|
459
|
+
best_version = None
|
|
460
|
+
best_timestamp = None
|
|
461
|
+
|
|
462
|
+
for v in versions:
|
|
463
|
+
# LanceDB version timestamps are naive datetime objects in local time
|
|
464
|
+
v_timestamp = v["timestamp"]
|
|
465
|
+
# Make sure it's naive for comparison
|
|
466
|
+
if v_timestamp.tzinfo is not None:
|
|
467
|
+
v_timestamp = v_timestamp.replace(tzinfo=None)
|
|
468
|
+
|
|
469
|
+
if v_timestamp <= before_local:
|
|
470
|
+
if best_timestamp is None or v_timestamp > best_timestamp:
|
|
471
|
+
best_version = v["version"]
|
|
472
|
+
best_timestamp = v_timestamp
|
|
473
|
+
|
|
474
|
+
if best_version is None:
|
|
475
|
+
# Find the earliest version to report in error message
|
|
476
|
+
if versions:
|
|
477
|
+
earliest = min(versions, key=lambda v: v["timestamp"])
|
|
478
|
+
earliest_ts = earliest["timestamp"]
|
|
479
|
+
raise ValueError(
|
|
480
|
+
f"No data exists before {before}. "
|
|
481
|
+
f"Database was created on {earliest_ts}"
|
|
482
|
+
)
|
|
483
|
+
else:
|
|
484
|
+
raise ValueError(
|
|
485
|
+
f"No data exists before {before}. Table has no versions."
|
|
486
|
+
)
|
|
487
|
+
|
|
488
|
+
# Checkout to the found version
|
|
489
|
+
table.checkout(best_version)
|
|
490
|
+
|
|
491
|
+
def list_table_versions(self, table_name: str) -> list[dict[str, Any]]:
|
|
492
|
+
"""List version history for a table.
|
|
493
|
+
|
|
494
|
+
Args:
|
|
495
|
+
table_name: Name of the table ("documents", "chunks", or "settings")
|
|
496
|
+
|
|
497
|
+
Returns:
|
|
498
|
+
List of version info dicts with "version" and "timestamp" keys
|
|
499
|
+
"""
|
|
500
|
+
table_map = {
|
|
501
|
+
"documents": self.documents_table,
|
|
502
|
+
"chunks": self.chunks_table,
|
|
503
|
+
"settings": self.settings_table,
|
|
504
|
+
}
|
|
505
|
+
table = table_map.get(table_name)
|
|
506
|
+
if table is None:
|
|
507
|
+
raise ValueError(f"Unknown table: {table_name}")
|
|
508
|
+
|
|
509
|
+
return list(table.list_versions())
|