haiku.rag-slim 0.16.0__py3-none-any.whl → 0.24.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of haiku.rag-slim might be problematic. Click here for more details.

Files changed (94)
  1. haiku/rag/app.py +430 -72
  2. haiku/rag/chunkers/__init__.py +31 -0
  3. haiku/rag/chunkers/base.py +31 -0
  4. haiku/rag/chunkers/docling_local.py +164 -0
  5. haiku/rag/chunkers/docling_serve.py +179 -0
  6. haiku/rag/cli.py +207 -24
  7. haiku/rag/cli_chat.py +489 -0
  8. haiku/rag/client.py +1251 -266
  9. haiku/rag/config/__init__.py +16 -10
  10. haiku/rag/config/loader.py +5 -44
  11. haiku/rag/config/models.py +126 -17
  12. haiku/rag/converters/__init__.py +31 -0
  13. haiku/rag/converters/base.py +63 -0
  14. haiku/rag/converters/docling_local.py +193 -0
  15. haiku/rag/converters/docling_serve.py +229 -0
  16. haiku/rag/converters/text_utils.py +237 -0
  17. haiku/rag/embeddings/__init__.py +123 -24
  18. haiku/rag/embeddings/voyageai.py +175 -20
  19. haiku/rag/graph/__init__.py +0 -11
  20. haiku/rag/graph/agui/__init__.py +8 -2
  21. haiku/rag/graph/agui/cli_renderer.py +1 -1
  22. haiku/rag/graph/agui/emitter.py +219 -31
  23. haiku/rag/graph/agui/server.py +20 -62
  24. haiku/rag/graph/agui/stream.py +1 -2
  25. haiku/rag/graph/research/__init__.py +5 -2
  26. haiku/rag/graph/research/dependencies.py +12 -126
  27. haiku/rag/graph/research/graph.py +390 -135
  28. haiku/rag/graph/research/models.py +91 -112
  29. haiku/rag/graph/research/prompts.py +99 -91
  30. haiku/rag/graph/research/state.py +35 -27
  31. haiku/rag/inspector/__init__.py +8 -0
  32. haiku/rag/inspector/app.py +259 -0
  33. haiku/rag/inspector/widgets/__init__.py +6 -0
  34. haiku/rag/inspector/widgets/chunk_list.py +100 -0
  35. haiku/rag/inspector/widgets/context_modal.py +89 -0
  36. haiku/rag/inspector/widgets/detail_view.py +130 -0
  37. haiku/rag/inspector/widgets/document_list.py +75 -0
  38. haiku/rag/inspector/widgets/info_modal.py +209 -0
  39. haiku/rag/inspector/widgets/search_modal.py +183 -0
  40. haiku/rag/inspector/widgets/visual_modal.py +126 -0
  41. haiku/rag/mcp.py +106 -102
  42. haiku/rag/monitor.py +33 -9
  43. haiku/rag/providers/__init__.py +5 -0
  44. haiku/rag/providers/docling_serve.py +108 -0
  45. haiku/rag/qa/__init__.py +12 -10
  46. haiku/rag/qa/agent.py +43 -61
  47. haiku/rag/qa/prompts.py +35 -57
  48. haiku/rag/reranking/__init__.py +9 -6
  49. haiku/rag/reranking/base.py +1 -1
  50. haiku/rag/reranking/cohere.py +5 -4
  51. haiku/rag/reranking/mxbai.py +5 -2
  52. haiku/rag/reranking/vllm.py +3 -4
  53. haiku/rag/reranking/zeroentropy.py +6 -5
  54. haiku/rag/store/__init__.py +2 -1
  55. haiku/rag/store/engine.py +242 -42
  56. haiku/rag/store/exceptions.py +4 -0
  57. haiku/rag/store/models/__init__.py +8 -2
  58. haiku/rag/store/models/chunk.py +190 -0
  59. haiku/rag/store/models/document.py +46 -0
  60. haiku/rag/store/repositories/chunk.py +141 -121
  61. haiku/rag/store/repositories/document.py +25 -84
  62. haiku/rag/store/repositories/settings.py +11 -14
  63. haiku/rag/store/upgrades/__init__.py +19 -3
  64. haiku/rag/store/upgrades/v0_10_1.py +1 -1
  65. haiku/rag/store/upgrades/v0_19_6.py +65 -0
  66. haiku/rag/store/upgrades/v0_20_0.py +68 -0
  67. haiku/rag/store/upgrades/v0_23_1.py +100 -0
  68. haiku/rag/store/upgrades/v0_9_3.py +3 -3
  69. haiku/rag/utils.py +371 -146
  70. {haiku_rag_slim-0.16.0.dist-info → haiku_rag_slim-0.24.0.dist-info}/METADATA +15 -12
  71. haiku_rag_slim-0.24.0.dist-info/RECORD +78 -0
  72. {haiku_rag_slim-0.16.0.dist-info → haiku_rag_slim-0.24.0.dist-info}/WHEEL +1 -1
  73. haiku/rag/chunker.py +0 -65
  74. haiku/rag/embeddings/base.py +0 -25
  75. haiku/rag/embeddings/ollama.py +0 -28
  76. haiku/rag/embeddings/openai.py +0 -26
  77. haiku/rag/embeddings/vllm.py +0 -29
  78. haiku/rag/graph/agui/events.py +0 -254
  79. haiku/rag/graph/common/__init__.py +0 -5
  80. haiku/rag/graph/common/models.py +0 -42
  81. haiku/rag/graph/common/nodes.py +0 -265
  82. haiku/rag/graph/common/prompts.py +0 -46
  83. haiku/rag/graph/common/utils.py +0 -44
  84. haiku/rag/graph/deep_qa/__init__.py +0 -1
  85. haiku/rag/graph/deep_qa/dependencies.py +0 -27
  86. haiku/rag/graph/deep_qa/graph.py +0 -243
  87. haiku/rag/graph/deep_qa/models.py +0 -20
  88. haiku/rag/graph/deep_qa/prompts.py +0 -59
  89. haiku/rag/graph/deep_qa/state.py +0 -56
  90. haiku/rag/graph/research/common.py +0 -87
  91. haiku/rag/reader.py +0 -135
  92. haiku_rag_slim-0.16.0.dist-info/RECORD +0 -71
  93. {haiku_rag_slim-0.16.0.dist-info → haiku_rag_slim-0.24.0.dist-info}/entry_points.txt +0 -0
  94. {haiku_rag_slim-0.16.0.dist-info → haiku_rag_slim-0.24.0.dist-info}/licenses/LICENSE +0 -0
haiku/rag/qa/prompts.py CHANGED
@@ -1,60 +1,38 @@
1
- QA_SYSTEM_PROMPT = """
2
- You are a knowledgeable assistant that helps users find information from a document knowledge base.
3
-
4
- Your process:
5
- 1. When a user asks a question, use the search_documents tool to find relevant information
6
- 2. Search with specific keywords and phrases from the user's question
7
- 3. Review the search results and their relevance scores
8
- 4. If you need additional context, perform follow-up searches with different keywords
9
- 5. Provide a short and to the point comprehensive answer based only on the retrieved documents
10
-
11
- Guidelines:
12
- - Base your answers strictly on the provided document content
13
- - Quote or reference specific information when possible
14
- - If multiple documents contain relevant information, synthesize them coherently
15
- - Indicate when information is incomplete or when you need to search for additional context
16
- - If the retrieved documents don't contain sufficient information, clearly state: "I cannot find enough information in the knowledge base to answer this question."
17
- - For complex questions, consider breaking them down and performing multiple searches
18
- - Stick to the answer, do not ellaborate or provide context unless explicitly asked for it.
19
-
20
- Be concise, and always maintain accuracy over completeness. Prefer short, direct answers that are well-supported by the documents.
21
- /no_think
22
- """
23
-
24
- QA_SYSTEM_PROMPT_WITH_CITATIONS = """
25
- You are a knowledgeable assistant that helps users find information from a document knowledge base.
26
-
27
- IMPORTANT: You MUST use the search_documents tool for every question. Do not answer any question without first searching the knowledge base.
28
-
29
- Your process:
30
- 1. IMMEDIATELY call the search_documents tool with relevant keywords from the user's question
31
- 2. Review the search results and their relevance scores
32
- 3. If you need additional context, perform follow-up searches with different keywords
33
- 4. Provide a short and to the point comprehensive answer based only on the retrieved documents
34
- 5. Always include citations for the sources used in your answer
1
+ QA_SYSTEM_PROMPT = """You are a knowledgeable assistant that answers questions using a document knowledge base.
2
+
3
+ Process:
4
+ 1. Call search_documents with relevant keywords from the question
5
+ 2. Review the results and their relevance scores
6
+ 3. If needed, perform follow-up searches with different keywords (max 3 total)
7
+ 4. Provide a concise answer based strictly on the retrieved content
8
+
9
+ The search tool returns results like:
10
+ [chunk_abc123] (score: 0.85)
11
+ Source: "Document Title" > Section > Subsection
12
+ Type: paragraph
13
+ Content:
14
+ The actual text content here...
15
+
16
+ [chunk_def456] (score: 0.72)
17
+ Source: "Another Document"
18
+ Type: table
19
+ Content:
20
+ | Column 1 | Column 2 |
21
+ ...
22
+
23
+ Each result includes:
24
+ - chunk_id in brackets and relevance score
25
+ - Source: document title and section hierarchy (when available)
26
+ - Type: content type like paragraph, table, code, list_item (when available)
27
+ - Content: the actual text
28
+
29
+ In your response, include the chunk IDs you used in cited_chunks.
35
30
 
36
31
  Guidelines:
37
- - Base your answers strictly on the provided document content
38
- - If multiple documents contain relevant information, synthesize them coherently
39
- - Indicate when information is incomplete or when you need to search for additional context
40
- - If the retrieved documents don't contain sufficient information, clearly state: "I cannot find enough information in the knowledge base to answer this question."
41
- - For complex questions, consider breaking them down and performing multiple searches
42
- - Stick to the answer, do not ellaborate or provide context unless explicitly asked for it.
43
- - ALWAYS include citations at the end of your response using the format below
44
-
45
- Citation Format:
46
- After your answer, include a "Citations:" section that lists:
47
- - The document title (if available) or URI from each search result used
48
- - A brief excerpt (first 50-100 characters) of the content that supported your answer
49
- - Format: "Citations:\n- [document title or URI]: [content_excerpt]..."
50
-
51
- Example response format:
52
- [Your answer here]
53
-
54
- Citations:
55
- - /path/to/document1.pdf: "This document explains that AFMAN stands for Air Force Manual..."
56
- - /path/to/document2.pdf: "The manual provides guidance on military procedures and..."
57
-
58
- Be concise, and always maintain accuracy over completeness. Prefer short, direct answers that are well-supported by the documents.
59
- /no_think
32
+ - Base answers strictly on retrieved content - do not use external knowledge
33
+ - Use the Source and Type metadata to understand context
34
+ - If multiple results are relevant, synthesize them coherently
35
+ - If information is insufficient, say: "I cannot find enough information in the knowledge base to answer this question."
36
+ - Be concise and direct - avoid elaboration unless asked
37
+ - Higher scores indicate more relevant results
60
38
  """
haiku/rag/reranking/__init__.py CHANGED
@@ -24,7 +24,7 @@ def get_reranker(config: AppConfig = Config) -> RerankerBase | None:
24
24
 
25
25
  reranker: RerankerBase | None = None
26
26
 
27
- if config.reranking.provider == "mxbai":
27
+ if config.reranking.model and config.reranking.model.provider == "mxbai":
28
28
  try:
29
29
  from haiku.rag.reranking.mxbai import MxBAIReranker
30
30
 
@@ -33,7 +33,7 @@ def get_reranker(config: AppConfig = Config) -> RerankerBase | None:
33
33
  except ImportError:
34
34
  reranker = None
35
35
 
36
- elif config.reranking.provider == "cohere":
36
+ elif config.reranking.model and config.reranking.model.provider == "cohere":
37
37
  try:
38
38
  from haiku.rag.reranking.cohere import CohereReranker
39
39
 
@@ -41,20 +41,23 @@ def get_reranker(config: AppConfig = Config) -> RerankerBase | None:
41
41
  except ImportError:
42
42
  reranker = None
43
43
 
44
- elif config.reranking.provider == "vllm":
44
+ elif config.reranking.model and config.reranking.model.provider == "vllm":
45
45
  try:
46
46
  from haiku.rag.reranking.vllm import VLLMReranker
47
47
 
48
- reranker = VLLMReranker(config.reranking.model)
48
+ base_url = config.reranking.model.base_url
49
+ if not base_url:
50
+ raise ValueError("vLLM reranker requires base_url in reranking.model")
51
+ reranker = VLLMReranker(config.reranking.model.name, base_url)
49
52
  except ImportError:
50
53
  reranker = None
51
54
 
52
- elif config.reranking.provider == "zeroentropy":
55
+ elif config.reranking.model and config.reranking.model.provider == "zeroentropy":
53
56
  try:
54
57
  from haiku.rag.reranking.zeroentropy import ZeroEntropyReranker
55
58
 
56
59
  # Use configured model or default to zerank-1
57
- model = config.reranking.model or "zerank-1"
60
+ model = config.reranking.model.name or "zerank-1"
58
61
  reranker = ZeroEntropyReranker(model)
59
62
  except ImportError:
60
63
  reranker = None
haiku/rag/reranking/base.py CHANGED
@@ -3,7 +3,7 @@ from haiku.rag.store.models.chunk import Chunk
3
3
 
4
4
 
5
5
  class RerankerBase:
6
- _model: str = Config.reranking.model
6
+ _model: str | None = Config.reranking.model.name if Config.reranking.model else None
7
7
 
8
8
  async def rerank(
9
9
  self, query: str, chunks: list[Chunk], top_n: int = 10
haiku/rag/reranking/cohere.py CHANGED
@@ -9,10 +9,10 @@ except ImportError as e:
9
9
  ) from e
10
10
 
11
11
 
12
- class CohereReranker(RerankerBase):
12
+ class CohereReranker(RerankerBase): # pragma: no cover
13
13
  def __init__(self):
14
14
  # Cohere SDK reads CO_API_KEY from environment by default
15
- self._client = cohere.ClientV2()
15
+ self._client = cohere.AsyncClientV2()
16
16
 
17
17
  async def rerank(
18
18
  self, query: str, chunks: list[Chunk], top_n: int = 10
@@ -22,8 +22,9 @@ class CohereReranker(RerankerBase):
22
22
 
23
23
  documents = [chunk.content for chunk in chunks]
24
24
 
25
- response = self._client.rerank(
26
- model=self._model, query=query, documents=documents, top_n=top_n
25
+ model_name = self._model or "rerank-v3.5"
26
+ response = await self._client.rerank(
27
+ model=model_name, query=query, documents=documents, top_n=top_n
27
28
  )
28
29
 
29
30
  reranked_chunks = []
haiku/rag/reranking/mxbai.py CHANGED
@@ -7,9 +7,12 @@ from haiku.rag.store.models.chunk import Chunk
7
7
 
8
8
  class MxBAIReranker(RerankerBase):
9
9
  def __init__(self):
10
- self._client = MxbaiRerankV2(
11
- Config.reranking.model, disable_transformers_warnings=True
10
+ model_name = (
11
+ Config.reranking.model.name
12
+ if Config.reranking.model
13
+ else "mixedbread-ai/mxbai-rerank-base-v2"
12
14
  )
15
+ self._client = MxbaiRerankV2(model_name, disable_transformers_warnings=True)
13
16
 
14
17
  async def rerank(
15
18
  self, query: str, chunks: list[Chunk], top_n: int = 10
haiku/rag/reranking/vllm.py CHANGED
@@ -1,14 +1,13 @@
1
1
  import httpx
2
2
 
3
- from haiku.rag.config import Config
4
3
  from haiku.rag.reranking.base import RerankerBase
5
4
  from haiku.rag.store.models.chunk import Chunk
6
5
 
7
6
 
8
- class VLLMReranker(RerankerBase):
9
- def __init__(self, model: str):
7
+ class VLLMReranker(RerankerBase): # pragma: no cover
8
+ def __init__(self, model: str, base_url: str):
10
9
  self._model = model
11
- self._base_url = Config.providers.vllm.rerank_base_url
10
+ self._base_url = base_url
12
11
 
13
12
  async def rerank(
14
13
  self, query: str, chunks: list[Chunk], top_n: int = 10
haiku/rag/reranking/zeroentropy.py CHANGED
@@ -1,10 +1,10 @@
1
- from zeroentropy import ZeroEntropy
1
+ from zeroentropy import AsyncZeroEntropy
2
2
 
3
3
  from haiku.rag.reranking.base import RerankerBase
4
4
  from haiku.rag.store.models.chunk import Chunk
5
5
 
6
6
 
7
- class ZeroEntropyReranker(RerankerBase):
7
+ class ZeroEntropyReranker(RerankerBase): # pragma: no cover
8
8
  """Zero Entropy reranker implementation using the zerank-1 model."""
9
9
 
10
10
  def __init__(self, model: str = "zerank-1"):
@@ -15,7 +15,7 @@ class ZeroEntropyReranker(RerankerBase):
15
15
  """
16
16
  self._model = model
17
17
  # Zero Entropy SDK reads ZEROENTROPY_API_KEY from environment by default
18
- self._client = ZeroEntropy()
18
+ self._client = AsyncZeroEntropy()
19
19
 
20
20
  async def rerank(
21
21
  self, query: str, chunks: list[Chunk], top_n: int = 10
@@ -37,8 +37,9 @@ class ZeroEntropyReranker(RerankerBase):
37
37
  documents = [chunk.content for chunk in chunks]
38
38
 
39
39
  # Call Zero Entropy reranking API
40
- response = self._client.models.rerank(
41
- model=self._model,
40
+ model_name = self._model or "zerank-1"
41
+ response = await self._client.models.rerank(
42
+ model=model_name,
42
43
  query=query,
43
44
  documents=documents,
44
45
  )
haiku/rag/store/__init__.py CHANGED
@@ -1,4 +1,5 @@
1
1
  from .engine import Store
2
+ from .exceptions import ReadOnlyError
2
3
  from .models import Chunk, Document
3
4
 
4
- __all__ = ["Store", "Chunk", "Document"]
5
+ __all__ = ["Store", "Chunk", "Document", "ReadOnlyError"]
haiku/rag/store/engine.py CHANGED
@@ -1,9 +1,10 @@
1
1
  import asyncio
2
2
  import json
3
3
  import logging
4
- from datetime import timedelta
4
+ from datetime import datetime, timedelta
5
5
  from importlib import metadata
6
6
  from pathlib import Path
7
+ from typing import Any
7
8
  from uuid import uuid4
8
9
 
9
10
  import lancedb
@@ -12,6 +13,7 @@ from pydantic import Field
12
13
 
13
14
  from haiku.rag.config import AppConfig, Config
14
15
  from haiku.rag.embeddings import get_embedder
16
+ from haiku.rag.store.exceptions import ReadOnlyError
15
17
 
16
18
  logger = logging.getLogger(__name__)
17
19
 
@@ -22,6 +24,8 @@ class DocumentRecord(LanceModel):
22
24
  uri: str | None = None
23
25
  title: str | None = None
24
26
  metadata: str = Field(default="{}")
27
+ docling_document_json: str | None = None
28
+ docling_version: str | None = None
25
29
  created_at: str = Field(default_factory=lambda: "")
26
30
  updated_at: str = Field(default_factory=lambda: "")
27
31
 
@@ -36,6 +40,7 @@ def create_chunk_model(vector_dim: int):
36
40
  id: str = Field(default_factory=lambda: str(uuid4()))
37
41
  document_id: str
38
42
  content: str
43
+ content_fts: str = Field(default="")
39
44
  metadata: str = Field(default="{}")
40
45
  order: int = Field(default=0)
41
46
  vector: Vector(vector_dim) = Field(default_factory=lambda: [0.0] * vector_dim) # type: ignore
@@ -54,39 +59,67 @@ class Store:
54
59
  db_path: Path,
55
60
  config: AppConfig = Config,
56
61
  skip_validation: bool = False,
57
- allow_create: bool = True,
62
+ create: bool = False,
63
+ read_only: bool = False,
64
+ before: datetime | None = None,
58
65
  ):
59
66
  self.db_path: Path = db_path
60
67
  self._config = config
68
+ self._before = before
69
+ # Time-travel mode is always read-only
70
+ self._read_only = read_only or (before is not None)
61
71
  self.embedder = get_embedder(config=self._config)
62
72
  self._vacuum_lock = asyncio.Lock()
63
73
 
64
74
  # Create the ChunkRecord model with the correct vector dimension
65
75
  self.ChunkRecord = create_chunk_model(self.embedder._vector_dim)
66
76
 
67
- # Local filesystem handling for DB directory
77
+ # Check if database exists (for local filesystem only)
78
+ is_new_db = False
68
79
  if not self._has_cloud_config():
69
- if not allow_create:
70
- # Read operations should not create the database
71
- if not db_path.exists():
80
+ if not db_path.exists():
81
+ if not create:
72
82
  raise FileNotFoundError(
73
- f"Database does not exist: {db_path}. Use a write operation (add, add-src) to create it."
83
+ f"Database does not exist at {self.db_path.absolute()}. "
84
+ "Use 'haiku-rag init' to create a new database."
74
85
  )
75
- else:
76
- # Write operations - ensure parent directories exist
86
+ is_new_db = True
87
+ # Ensure parent directories exist for new databases
77
88
  if not db_path.parent.exists():
78
89
  Path.mkdir(db_path.parent, parents=True)
79
90
 
80
91
  # Connect to LanceDB
81
92
  self.db = self._connect_to_lancedb(db_path)
82
93
 
83
- # Initialize tables
84
- self.create_or_update_db()
94
+ # Initialize tables (creates them if they don't exist)
95
+ self._init_tables()
96
+
97
+ # Checkout tables to historical state if before is specified
98
+ if before is not None:
99
+ self._checkout_tables_before(before)
100
+
101
+ # Run upgrades only on existing databases, set version for new ones
102
+ # Skip upgrades in read-only mode (they would fail anyway)
103
+ if not self._read_only:
104
+ if is_new_db:
105
+ self._set_initial_version()
106
+ else:
107
+ self._run_upgrades()
85
108
 
86
109
  # Validate config compatibility after connection is established
87
110
  if not skip_validation:
88
111
  self._validate_configuration()
89
112
 
113
+ @property
114
+ def is_read_only(self) -> bool:
115
+ """Whether the store is in read-only mode."""
116
+ return self._read_only
117
+
118
+ def _assert_writable(self) -> None:
119
+ """Raise ReadOnlyError if the store is in read-only mode."""
120
+ if self._read_only:
121
+ raise ReadOnlyError("Cannot modify database in read-only mode")
122
+
90
123
  async def vacuum(self, retention_seconds: int | None = None) -> None:
91
124
  """Optimize and clean up old versions across all tables to reduce disk usage.
92
125
 
@@ -97,7 +130,12 @@ class Store:
97
130
  Note:
98
131
  If vacuum is already running, this method returns immediately without blocking.
99
132
  Use asyncio.create_task(store.vacuum()) for non-blocking background execution.
133
+
134
+ Raises:
135
+ ReadOnlyError: If the store is in read-only mode.
100
136
  """
137
+ self._assert_writable()
138
+
101
139
  if self._has_cloud_config() and str(self._config.lancedb.uri).startswith(
102
140
  "db://"
103
141
  ):
@@ -145,6 +183,87 @@ class Store:
145
183
  and self._config.lancedb.region
146
184
  )
147
185
 
186
+ def get_stats(self) -> dict:
187
+ """Get comprehensive table statistics.
188
+
189
+ Returns:
190
+ Dictionary with statistics for documents and chunks tables including:
191
+ - Row counts
192
+ - Storage sizes
193
+ - Vector index status and statistics
194
+ """
195
+ stats_dict: dict = {
196
+ "documents": {"exists": False},
197
+ "chunks": {"exists": False},
198
+ }
199
+
200
+ # Documents table stats
201
+ doc_stats: dict = self.documents_table.stats() # type: ignore[assignment]
202
+ stats_dict["documents"] = {
203
+ "exists": True,
204
+ "num_rows": doc_stats.get("num_rows", 0),
205
+ "total_bytes": doc_stats.get("total_bytes", 0),
206
+ }
207
+
208
+ # Chunks table stats
209
+ chunk_stats: dict = self.chunks_table.stats() # type: ignore[assignment]
210
+ stats_dict["chunks"] = {
211
+ "exists": True,
212
+ "num_rows": chunk_stats.get("num_rows", 0),
213
+ "total_bytes": chunk_stats.get("total_bytes", 0),
214
+ }
215
+
216
+ # Vector index stats
217
+ indices = self.chunks_table.list_indices()
218
+ has_vector_index = any("vector" in str(idx).lower() for idx in indices)
219
+ stats_dict["chunks"]["has_vector_index"] = has_vector_index
220
+
221
+ if has_vector_index:
222
+ index_stats = self.chunks_table.index_stats("vector_idx")
223
+ if index_stats is not None:
224
+ stats_dict["chunks"]["num_indexed_rows"] = index_stats.num_indexed_rows
225
+ stats_dict["chunks"]["num_unindexed_rows"] = (
226
+ index_stats.num_unindexed_rows
227
+ )
228
+
229
+ return stats_dict
230
+
231
+ def _ensure_vector_index(self) -> None:
232
+ """Create or rebuild vector index on chunks table.
233
+
234
+ Cloud deployments auto-create indexes, so we skip for those.
235
+ For self-hosted, creates an IVF_PQ index. If an index exists,
236
+ it will be replaced (using replace=True parameter).
237
+ Note: Index creation requires sufficient training data.
238
+ """
239
+ if self._has_cloud_config():
240
+ return
241
+
242
+ try:
243
+ # Check if table has enough data (indexes require training data)
244
+ row_count = self.chunks_table.count_rows()
245
+ if row_count < 256:
246
+ logger.debug(
247
+ f"Skipping vector index creation: need at least 256 rows, have {row_count}"
248
+ )
249
+ return
250
+
251
+ # Create or replace index (replace=True is the default)
252
+ logger.info("Creating vector index on chunks table...")
253
+ self.chunks_table.create_index(
254
+ metric=self._config.search.vector_index_metric,
255
+ index_type="IVF_PQ",
256
+ replace=True, # Explicit: replace existing index
257
+ )
258
+
259
+ # Wait for index creation to complete
260
+ # Index name is column_name + "_idx"
261
+ self.chunks_table.wait_for_index(["vector_idx"], timeout=timedelta(hours=1))
262
+
263
+ logger.info("Vector index created successfully")
264
+ except Exception as e:
265
+ logger.warning(f"Could not create vector index: {e}")
266
+
148
267
  def _validate_configuration(self) -> None:
149
268
  """Validate that the configuration is compatible with the database."""
150
269
  from haiku.rag.store.repositories.settings import SettingsRepository
@@ -152,9 +271,8 @@ class Store:
152
271
  settings_repo = SettingsRepository(self)
153
272
  settings_repo.validate_config_compatibility()
154
273
 
155
- def create_or_update_db(self):
156
- """Create the database tables."""
157
-
274
+ def _init_tables(self):
275
+ """Initialize database tables (create if they don't exist)."""
158
276
  # Get list of existing tables
159
277
  existing_tables = self.db.table_names()
160
278
 
@@ -171,9 +289,9 @@ class Store:
171
289
  self.chunks_table = self.db.open_table("chunks")
172
290
  else:
173
291
  self.chunks_table = self.db.create_table("chunks", schema=self.ChunkRecord)
174
- # Create FTS index on the new table with phrase query support
292
+ # Create FTS index on content_fts (contextualized content) for better search
175
293
  self.chunks_table.create_fts_index(
176
- "content", replace=True, with_position=True, remove_stop_words=False
294
+ "content_fts", replace=True, with_position=True, remove_stop_words=False
177
295
  )
178
296
 
179
297
  # Create or get settings table
@@ -189,34 +307,21 @@ class Store:
189
307
  [SettingsRecord(id="settings", settings=json.dumps(settings_data))]
190
308
  )
191
309
 
192
- # Run pending upgrades based on stored version and package version
310
+ def _set_initial_version(self):
311
+ """Set the initial version for a new database."""
312
+ self.set_haiku_version(metadata.version("haiku.rag-slim"))
313
+
314
+ def _run_upgrades(self):
315
+ """Run pending database upgrades."""
193
316
  try:
194
317
  from haiku.rag.store.upgrades import run_pending_upgrades
195
318
 
196
319
  current_version = metadata.version("haiku.rag-slim")
197
320
  db_version = self.get_haiku_version()
198
321
 
199
- if db_version != "0.0.0":
200
- run_pending_upgrades(self, db_version, current_version)
201
-
202
- # After upgrades complete (or if none), set stored version
203
- # to the greater of the installed package version and the
204
- # highest available upgrade step version in code.
205
- try:
206
- from packaging.version import parse as _v
207
-
208
- from haiku.rag.store.upgrades import upgrades as _steps
209
-
210
- highest_step = max((_v(u.version) for u in _steps), default=None)
211
- effective_version = (
212
- str(max(_v(current_version), highest_step))
213
- if highest_step is not None
214
- else current_version
215
- )
216
- except Exception:
217
- effective_version = current_version
322
+ run_pending_upgrades(self, db_version, current_version)
218
323
 
219
- self.set_haiku_version(effective_version)
324
+ self.set_haiku_version(current_version)
220
325
  except Exception as e:
221
326
  # Avoid hard failure on initial connection; log and continue so CLI remains usable.
222
327
  logger.warning(
@@ -241,7 +346,12 @@ class Store:
241
346
  return "0.0.0"
242
347
 
243
348
  def set_haiku_version(self, version: str) -> None:
244
- """Updates the user version in settings."""
349
+ """Updates the user version in settings.
350
+
351
+ Raises:
352
+ ReadOnlyError: If the store is in read-only mode.
353
+ """
354
+ self._assert_writable()
245
355
  settings_records = list(
246
356
  self.settings_table.search().limit(1).to_pydantic(SettingsRecord)
247
357
  )
@@ -267,7 +377,12 @@ class Store:
267
377
  )
268
378
 
269
379
  def recreate_embeddings_table(self) -> None:
270
- """Recreate the chunks table with current vector dimensions."""
380
+ """Recreate the chunks table with current vector dimensions.
381
+
382
+ Raises:
383
+ ReadOnlyError: If the store is in read-only mode.
384
+ """
385
+ self._assert_writable()
271
386
  # Drop and recreate chunks table
272
387
  try:
273
388
  self.db.drop_table("chunks")
@@ -278,9 +393,9 @@ class Store:
278
393
  self.ChunkRecord = create_chunk_model(self.embedder._vector_dim)
279
394
  self.chunks_table = self.db.create_table("chunks", schema=self.ChunkRecord)
280
395
 
281
- # Create FTS index on the new table with phrase query support
396
+ # Create FTS index on content_fts (contextualized content) for better search
282
397
  self.chunks_table.create_fts_index(
283
- "content", replace=True, with_position=True, remove_stop_words=False
398
+ "content_fts", replace=True, with_position=True, remove_stop_words=False
284
399
  )
285
400
 
286
401
  def close(self):
@@ -297,7 +412,12 @@ class Store:
297
412
  }
298
413
 
299
414
  def restore_table_versions(self, versions: dict[str, int]) -> bool:
300
- """Restore tables to the provided versions using LanceDB's API."""
415
+ """Restore tables to the provided versions using LanceDB's API.
416
+
417
+ Raises:
418
+ ReadOnlyError: If the store is in read-only mode.
419
+ """
420
+ self._assert_writable()
301
421
  self.documents_table.restore(int(versions["documents"]))
302
422
  self.chunks_table.restore(int(versions["chunks"]))
303
423
  self.settings_table.restore(int(versions["settings"]))
@@ -307,3 +427,83 @@ class Store:
307
427
  def _connection(self):
308
428
  """Compatibility property for repositories expecting _connection."""
309
429
  return self
430
+
431
+ def _checkout_tables_before(self, before: datetime) -> None:
432
+ """Checkout all tables to their state at or before the given datetime.
433
+
434
+ Args:
435
+ before: The datetime to checkout to
436
+
437
+ Raises:
438
+ ValueError: If no version exists before the given datetime
439
+ """
440
+ # LanceDB stores timestamps as naive datetimes in local time.
441
+ # Convert 'before' to naive local time for comparison.
442
+ if before.tzinfo is not None:
443
+ # Convert to local time and make naive
444
+ before_local = before.astimezone().replace(tzinfo=None)
445
+ else:
446
+ # Already naive, assume local time
447
+ before_local = before
448
+
449
+ tables = [
450
+ ("documents", self.documents_table),
451
+ ("chunks", self.chunks_table),
452
+ ("settings", self.settings_table),
453
+ ]
454
+
455
+ for table_name, table in tables:
456
+ versions = table.list_versions()
457
+ # Find the latest version at or before the target datetime
458
+ # Versions are sorted by version number, not timestamp, so we need to check all
459
+ best_version = None
460
+ best_timestamp = None
461
+
462
+ for v in versions:
463
+ # LanceDB version timestamps are naive datetime objects in local time
464
+ v_timestamp = v["timestamp"]
465
+ # Make sure it's naive for comparison
466
+ if v_timestamp.tzinfo is not None:
467
+ v_timestamp = v_timestamp.replace(tzinfo=None)
468
+
469
+ if v_timestamp <= before_local:
470
+ if best_timestamp is None or v_timestamp > best_timestamp:
471
+ best_version = v["version"]
472
+ best_timestamp = v_timestamp
473
+
474
+ if best_version is None:
475
+ # Find the earliest version to report in error message
476
+ if versions:
477
+ earliest = min(versions, key=lambda v: v["timestamp"])
478
+ earliest_ts = earliest["timestamp"]
479
+ raise ValueError(
480
+ f"No data exists before {before}. "
481
+ f"Database was created on {earliest_ts}"
482
+ )
483
+ else:
484
+ raise ValueError(
485
+ f"No data exists before {before}. Table has no versions."
486
+ )
487
+
488
+ # Checkout to the found version
489
+ table.checkout(best_version)
490
+
491
+ def list_table_versions(self, table_name: str) -> list[dict[str, Any]]:
492
+ """List version history for a table.
493
+
494
+ Args:
495
+ table_name: Name of the table ("documents", "chunks", or "settings")
496
+
497
+ Returns:
498
+ List of version info dicts with "version" and "timestamp" keys
499
+ """
500
+ table_map = {
501
+ "documents": self.documents_table,
502
+ "chunks": self.chunks_table,
503
+ "settings": self.settings_table,
504
+ }
505
+ table = table_map.get(table_name)
506
+ if table is None:
507
+ raise ValueError(f"Unknown table: {table_name}")
508
+
509
+ return list(table.list_versions())