haiku.rag 0.8.0__tar.gz → 0.8.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (78)
  1. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/PKG-INFO +1 -1
  2. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/docs/benchmarks.md +2 -2
  3. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/docs/cli.md +4 -2
  4. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/docs/configuration.md +32 -0
  5. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/docs/python.md +8 -0
  6. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/pyproject.toml +2 -1
  7. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/config.py +4 -0
  8. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/store/engine.py +15 -0
  9. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/store/repositories/chunk.py +24 -1
  10. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/store/repositories/document.py +48 -28
  11. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/utils.py +54 -0
  12. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/tests/generate_benchmark_db.py +3 -1
  13. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/tests/test_client.py +91 -95
  14. haiku_rag-0.8.1/tests/test_preprocessor.py +71 -0
  15. haiku_rag-0.8.1/tests/test_versioning.py +94 -0
  16. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/uv.lock +175 -1
  17. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/.github/FUNDING.yml +0 -0
  18. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/.github/workflows/build-docs.yml +0 -0
  19. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/.github/workflows/build-publish.yml +0 -0
  20. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/.gitignore +0 -0
  21. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/.pre-commit-config.yaml +0 -0
  22. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/.python-version +0 -0
  23. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/LICENSE +0 -0
  24. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/README.md +0 -0
  25. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/docs/index.md +0 -0
  26. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/docs/installation.md +0 -0
  27. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/docs/mcp.md +0 -0
  28. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/docs/server.md +0 -0
  29. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/mkdocs.yml +0 -0
  30. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/__init__.py +0 -0
  31. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/app.py +0 -0
  32. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/chunker.py +0 -0
  33. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/cli.py +0 -0
  34. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/client.py +0 -0
  35. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/embeddings/__init__.py +0 -0
  36. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/embeddings/base.py +0 -0
  37. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/embeddings/ollama.py +0 -0
  38. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/embeddings/openai.py +0 -0
  39. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/embeddings/vllm.py +0 -0
  40. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/embeddings/voyageai.py +0 -0
  41. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/logging.py +0 -0
  42. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/mcp.py +0 -0
  43. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/migration.py +0 -0
  44. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/monitor.py +0 -0
  45. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/qa/__init__.py +0 -0
  46. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/qa/agent.py +0 -0
  47. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/qa/prompts.py +0 -0
  48. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/reader.py +0 -0
  49. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/reranking/__init__.py +0 -0
  50. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/reranking/base.py +0 -0
  51. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/reranking/cohere.py +0 -0
  52. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/reranking/mxbai.py +0 -0
  53. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/reranking/vllm.py +0 -0
  54. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/store/__init__.py +0 -0
  55. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/store/models/__init__.py +0 -0
  56. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/store/models/chunk.py +0 -0
  57. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/store/models/document.py +0 -0
  58. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/store/repositories/__init__.py +0 -0
  59. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/store/repositories/settings.py +0 -0
  60. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/src/haiku/rag/store/upgrades/__init__.py +0 -0
  61. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/tests/__init__.py +0 -0
  62. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/tests/conftest.py +0 -0
  63. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/tests/llm_judge.py +0 -0
  64. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/tests/test_app.py +0 -0
  65. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/tests/test_chunk.py +0 -0
  66. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/tests/test_chunker.py +0 -0
  67. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/tests/test_cli.py +0 -0
  68. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/tests/test_document.py +0 -0
  69. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/tests/test_embedder.py +0 -0
  70. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/tests/test_lancedb_connection.py +0 -0
  71. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/tests/test_monitor.py +0 -0
  72. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/tests/test_qa.py +0 -0
  73. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/tests/test_reader.py +0 -0
  74. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/tests/test_rebuild.py +0 -0
  75. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/tests/test_reranker.py +0 -0
  76. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/tests/test_search.py +0 -0
  77. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/tests/test_settings.py +0 -0
  78. {haiku_rag-0.8.0 → haiku_rag-0.8.1}/tests/test_utils.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: haiku.rag
- Version: 0.8.0
+ Version: 0.8.1
  Summary: Retrieval Augmented Generation (RAG) with LanceDB
  Author-email: Yiorgis Gozadinos <ggozadinos@gmail.com>
  License: MIT
@@ -16,8 +16,8 @@ The recall obtained is ~0.79 for matching in the top result, raising to ~0.91 fo
  |---------------------------------------|-------------------|-------------------|------------------------|
  | Ollama / `mxbai-embed-large` | 0.79 | 0.91 | None |
  | Ollama / `mxbai-embed-large` | 0.90 | 0.95 | `mxbai-rerank-base-v2` |
- <!-- | Ollama / `nomic-embed-text` | 0.74 | 0.88 | None |
- | OpenAI / `text-embeddings-3-small` | 0.75 | 0.88 | None |
+ | Ollama / `nomic-embed-text-v1.5` | 0.74 | 0.90 | None |
+ <!-- | OpenAI / `text-embeddings-3-small` | 0.75 | 0.88 | None |
  | OpenAI / `text-embeddings-3-small` | 0.75 | 0.88 | None |
  | OpenAI / `text-embeddings-3-small` | 0.83 | 0.90 | Cohere / `rerank-v3.5` | -->
@@ -36,8 +36,10 @@ haiku-rag add-src https://example.com/article.html
  ```

  !!! note
- As you add documents to `haiku.rag` the database keeps growing. By default, `lanceDB` supports versioning
- of your data. You can optimize and compact the database by running the [vaccum](#vacuum-optimize-and-cleanup) command.
+ As you add documents to `haiku.rag` the database keeps growing. By default, LanceDB supports versioning
+ of your data. Create/update operations are atomic‑feeling: if anything fails during chunking or embedding,
+ the database rolls back to the pre‑operation snapshot using LanceDB table versioning. You can optimize and
+ compact the database by running the [vacuum](#vacuum-optimize-and-cleanup) command.

  ### Get Document
@@ -223,3 +223,35 @@ CHUNK_SIZE=256
  # into single chunks with continuous content to eliminate duplication
  CONTEXT_CHUNK_RADIUS=0
  ```
+
+ #### Markdown Preprocessor
+
+ Optionally preprocess Markdown before chunking by pointing to a callable that receives and returns Markdown text. This is useful for normalizing content, stripping boilerplate, or applying custom transformations before chunk boundaries are computed.
+
+ ```bash
+ # A callable path in one of these formats:
+ # - package.module:func
+ # - package.module.func
+ # - /abs/or/relative/path/to/file.py:func
+ MARKDOWN_PREPROCESSOR="my_pkg.preprocess:clean_md"
+ ```
+
+ !!! note
+ - The function signature should be `def clean_md(text: str) -> str` or `async def clean_md(text: str) -> str`.
+ - If the function raises or returns a non-string, haiku.rag logs a warning and proceeds without preprocessing.
+ - The preprocessor affects only the chunking pipeline. The stored document content remains unchanged.
+
+ Example implementation:
+
+ ```python
+ # my_pkg/preprocess.py
+ def clean_md(text: str) -> str:
+     # strip HTML comments and collapse multiple blank lines
+     lines = [line for line in text.splitlines() if not line.strip().startswith("<!--")]
+     out = []
+     for line in lines:
+         if line.strip() == "" and (out and out[-1] == ""):
+             continue
+         out.append(line)
+     return "\n".join(out)
+ ```
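The note in the documentation added above also allows an async callable. A minimal sketch of that variant, assuming a hypothetical module `my_pkg/preprocess_async.py` and an illustrative "References" section being stripped (neither ships with the package):

```python
# my_pkg/preprocess_async.py -- hypothetical module, not part of haiku.rag
async def clean_md(text: str) -> str:
    # drop everything from a trailing "## References" heading onward
    lines = text.splitlines()
    if "## References" in lines:
        lines = lines[: lines.index("## References")]
    return "\n".join(lines)
```

Configured as `MARKDOWN_PREPROCESSOR="my_pkg.preprocess_async:clean_md"`, this would behave like the synchronous example: awaitable results are awaited before chunking, as the `chunk.py` change later in this diff shows.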
@@ -109,6 +109,14 @@ await client.vacuum()
  This compacts tables and removes historical versions to keep disk usage in check. It’s safe to run anytime, for example after bulk imports or periodically in long‑running apps.

+ ### Atomic Writes and Rollback
+
+ Document create and update operations take a snapshot of table versions before any write and automatically roll back to that snapshot if something fails (for example, during chunking or embedding). This restores both the `documents` and `chunks` tables to their pre‑operation state using LanceDB’s table versioning.
+
+ - Applies to: `create_document(...)`, `create_document_from_source(...)`, `update_document(...)`, and internal rebuild/update flows.
+ - Scope: Both document rows and all associated chunks are rolled back together.
+ - Vacuum: Running `vacuum()` later prunes old versions for disk efficiency; rollbacks occur immediately during the failing operation and are not impacted.
+

  ## Searching Documents
  The search method performs native hybrid search (vector + full-text) using LanceDB with optional reranking for improved relevance:
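A minimal sketch of what the rollback guarantee documented above means for callers; the database path and failure scenario are illustrative, and the exact `update_document(...)` call shape is assumed from the method names listed in the docs:

```python
import asyncio

from haiku.rag.client import HaikuRAG


async def main() -> None:
    async with HaikuRAG("demo.lancedb") as client:  # illustrative local path
        doc = await client.create_document(content="original content", uri="demo.txt")
        try:
            doc.content = "updated content"
            # if chunking or embedding fails inside update_document, the
            # documents and chunks tables are restored to their prior versions
            await client.update_document(doc)
        except Exception:
            # nothing to clean up manually; the pre-update snapshot is back in place
            pass


asyncio.run(main())
```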
@@ -1,6 +1,6 @@
  [project]
  name = "haiku.rag"
- version = "0.8.0"
+ version = "0.8.1"
  description = "Retrieval Augmented Generation (RAG) with LanceDB"
  authors = [{ name = "Yiorgis Gozadinos", email = "ggozadinos@gmail.com" }]
  license = { text = "MIT" }
@@ -53,6 +53,7 @@ packages = ["src/haiku"]
  [dependency-groups]
  dev = [
      "datasets>=3.6.0",
+     "logfire>=4.6.0",
      "mkdocs>=1.6.1",
      "mkdocs-material>=9.6.14",
      "pre-commit>=4.2.0",
@@ -32,6 +32,10 @@ class AppConfig(BaseModel):
      CHUNK_SIZE: int = 256
      CONTEXT_CHUNK_RADIUS: int = 0

+     # Optional dotted path or file path to a callable that preprocesses
+     # markdown content before chunking. Examples:
+     MARKDOWN_PREPROCESSOR: str = ""
+
      OLLAMA_BASE_URL: str = "http://localhost:11434"
      VLLM_EMBEDDINGS_BASE_URL: str = ""
      VLLM_RERANK_BASE_URL: str = ""
@@ -209,6 +209,21 @@ class Store:
          # LanceDB connections are automatically managed
          pass

+     def current_table_versions(self) -> dict[str, int]:
+         """Capture current versions of key tables for rollback using LanceDB's API."""
+         return {
+             "documents": int(self.documents_table.version),
+             "chunks": int(self.chunks_table.version),
+             "settings": int(self.settings_table.version),
+         }
+
+     def restore_table_versions(self, versions: dict[str, int]) -> bool:
+         """Restore tables to the provided versions using LanceDB's API."""
+         self.documents_table.restore(int(versions["documents"]))
+         self.chunks_table.restore(int(versions["chunks"]))
+         self.settings_table.restore(int(versions["settings"]))
+         return True
+
      @property
      def _connection(self):
          """Compatibility property for repositories expecting _connection."""
@@ -1,4 +1,5 @@
  import asyncio
+ import inspect
  import json
  import logging
  from uuid import uuid4
@@ -11,6 +12,7 @@ from haiku.rag.config import Config
  from haiku.rag.embeddings import get_embedder
  from haiku.rag.store.engine import DocumentRecord, Store
  from haiku.rag.store.models.chunk import Chunk
+ from haiku.rag.utils import load_callable, text_to_docling_document

  logger = logging.getLogger(__name__)
@@ -152,7 +154,28 @@ class ChunkRepository:
          self, document_id: str, document: DoclingDocument
      ) -> list[Chunk]:
          """Create chunks and embeddings for a document from DoclingDocument."""
-         chunk_texts = await chunker.chunk(document)
+         # Optionally preprocess markdown before chunking
+         processed_document = document
+         preprocessor_path = Config.MARKDOWN_PREPROCESSOR
+         if preprocessor_path:
+             try:
+                 pre_fn = load_callable(preprocessor_path)
+                 markdown = document.export_to_markdown()
+                 result = pre_fn(markdown)
+                 if inspect.isawaitable(result):
+                     result = await result  # type: ignore[assignment]
+                 processed_markdown = result
+                 if not isinstance(processed_markdown, str):
+                     raise ValueError("Preprocessor must return a markdown string")
+                 processed_document = text_to_docling_document(
+                     processed_markdown, name="content.md"
+                 )
+             except Exception as e:
+                 logger.warning(
+                     f"Failed to apply MARKDOWN_PREPROCESSOR '{preprocessor_path}': {e}. Proceeding without preprocessing."
+                 )
+
+         chunk_texts = await chunker.chunk(processed_document)

          embeddings = await self.embedder.embed(chunk_texts)
@@ -171,44 +171,64 @@ class DocumentRepository:
          chunks: list["Chunk"] | None = None,
      ) -> Document:
          """Create a document with its chunks and embeddings."""
+         # Snapshot table versions for versioned rollback (if supported)
+         versions = self.store.current_table_versions()
+
          # Create the document
          created_doc = await self.create(entity)

-         # Create chunks if not provided
-         if chunks is None:
-             assert created_doc.id is not None, (
-                 "Document ID should not be None after creation"
-             )
-             await self.chunk_repository.create_chunks_for_document(
-                 created_doc.id, docling_document
-             )
-         else:
-             # Use provided chunks, set order from list position
-             assert created_doc.id is not None, (
-                 "Document ID should not be None after creation"
-             )
-             for order, chunk in enumerate(chunks):
-                 chunk.document_id = created_doc.id
-                 chunk.metadata["order"] = order
-                 await self.chunk_repository.create(chunk)
-
-         return created_doc
+         # Attempt to create chunks; on failure, prefer version rollback
+         try:
+             # Create chunks if not provided
+             if chunks is None:
+                 assert created_doc.id is not None, (
+                     "Document ID should not be None after creation"
+                 )
+                 await self.chunk_repository.create_chunks_for_document(
+                     created_doc.id, docling_document
+                 )
+             else:
+                 # Use provided chunks, set order from list position
+                 assert created_doc.id is not None, (
+                     "Document ID should not be None after creation"
+                 )
+                 for order, chunk in enumerate(chunks):
+                     chunk.document_id = created_doc.id
+                     chunk.metadata["order"] = order
+                     await self.chunk_repository.create(chunk)
+
+             return created_doc
+         except Exception:
+             # Roll back to the captured versions and re-raise
+             self.store.restore_table_versions(versions)
+             raise

      async def _update_with_docling(
          self, entity: Document, docling_document: DoclingDocument
      ) -> Document:
          """Update a document and regenerate its chunks."""
-         # Delete existing chunks
          assert entity.id is not None, "Document ID is required for update"
+
+         # Snapshot table versions for versioned rollback
+         versions = self.store.current_table_versions()
+
+         # Delete existing chunks before writing new ones
          await self.chunk_repository.delete_by_document_id(entity.id)

-         # Update the document
-         updated_doc = await self.update(entity)
+         try:
+             # Update the document
+             updated_doc = await self.update(entity)

-         # Create new chunks
-         assert updated_doc.id is not None, "Document ID should not be None after update"
-         await self.chunk_repository.create_chunks_for_document(
-             updated_doc.id, docling_document
-         )
+             # Create new chunks
+             assert updated_doc.id is not None, (
+                 "Document ID should not be None after update"
+             )
+             await self.chunk_repository.create_chunks_for_document(
+                 updated_doc.id, docling_document
+             )

-         return updated_doc
+             return updated_doc
+         except Exception:
+             # Roll back to the captured versions and re-raise
+             self.store.restore_table_versions(versions)
+             raise
@@ -1,10 +1,13 @@
  import asyncio
+ import importlib
+ import importlib.util
  import sys
  from collections.abc import Callable
  from functools import wraps
  from importlib import metadata
  from io import BytesIO
  from pathlib import Path
+ from types import ModuleType

  import httpx
  from docling.document_converter import DocumentConverter
@@ -106,3 +109,54 @@ def text_to_docling_document(text: str, name: str = "content.md") -> DoclingDocu
      converter = DocumentConverter()
      result = converter.convert(doc_stream)
      return result.document
+
+
+ def load_callable(path: str):
+     """Load a callable from a dotted path or file path.
+
+     Supported formats:
+     - "package.module:func" or "package.module.func"
+     - "path/to/file.py:func"
+
+     Returns the loaded callable. Raises ValueError on failure.
+     """
+     if not path:
+         raise ValueError("Empty callable path provided")
+
+     module_part = None
+     func_name = None
+
+     if ":" in path:
+         module_part, func_name = path.split(":", 1)
+     else:
+         # split by last dot for module.attr
+         if "." in path:
+             module_part, func_name = path.rsplit(".", 1)
+         else:
+             raise ValueError(
+                 "Invalid callable path format. Use 'module:func' or 'module.func' or 'file.py:func'."
+             )
+
+     # Try file path first
+     mod: ModuleType | None = None
+     module_path = Path(module_part)
+     if module_path.suffix == ".py" and module_path.exists():
+         spec = importlib.util.spec_from_file_location(module_path.stem, module_path)
+         if spec and spec.loader:
+             mod = importlib.util.module_from_spec(spec)
+             spec.loader.exec_module(mod)
+     else:
+         # Import as a module path
+         try:
+             mod = importlib.import_module(module_part)
+         except Exception as e:
+             raise ValueError(f"Failed to import module '{module_part}': {e}")
+
+     if not hasattr(mod, func_name):
+         raise ValueError(f"Callable '{func_name}' not found in module '{module_part}'")
+     func = getattr(mod, func_name)
+     if not callable(func):
+         raise ValueError(
+             f"Attribute '{func_name}' in module '{module_part}' is not callable"
+         )
+     return func
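For reference, the accepted path formats map to calls like the following; `my_pkg.preprocess` and the local script path are placeholders for illustration, not modules shipped with the package:

```python
from haiku.rag.utils import load_callable

# installed module, "module:func" form
clean_md = load_callable("my_pkg.preprocess:clean_md")

# same target via the "module.func" form
clean_md_again = load_callable("my_pkg.preprocess.clean_md")

# a plain .py file on disk, "file.py:func" form
clean_md_local = load_callable("./scripts/preprocess.py:clean_md")

print(clean_md("# Title\n\n<!-- draft note -->\n\n\nBody text"))
```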
@@ -1,6 +1,7 @@
  import asyncio
  from pathlib import Path

+ import logfire
  from datasets import Dataset, load_dataset
  from llm_judge import LLMJudge
  from rich.console import Console
@@ -11,6 +12,8 @@ from haiku.rag.client import HaikuRAG
  from haiku.rag.logging import configure_cli_logging
  from haiku.rag.qa import get_qa_agent

+ logfire.configure()
+ logfire.instrument_pydantic_ai()
  configure_cli_logging()
  console = Console()
@@ -119,7 +122,6 @@ async def run_qa_benchmark(k: int | None = None):
      async with HaikuRAG(db_path) as rag:
          qa = get_qa_agent(rag)
-
          for doc in corpus:
              question = doc["question"]  # type: ignore
              expected_answer = doc["answer"]  # type: ignore
@@ -526,121 +526,117 @@ async def test_client_ask_with_cite(temp_db_path):
  @pytest.mark.asyncio
  async def test_client_expand_context(temp_db_path):
      """Test expanding search results with adjacent chunks."""
-     # Mock Config to have CONTEXT_CHUNK_RADIUS = 2
-     with patch("haiku.rag.client.Config.CONTEXT_CHUNK_RADIUS", 2):
-         async with HaikuRAG(temp_db_path) as client:
-             # Create chunks manually
-             manual_chunks = [
-                 Chunk(content="Chunk 0 content", metadata={"order": 0}),
-                 Chunk(content="Chunk 1 content", metadata={"order": 1}),
-                 Chunk(content="Chunk 2 content", metadata={"order": 2}),
-                 Chunk(content="Chunk 3 content", metadata={"order": 3}),
-                 Chunk(content="Chunk 4 content", metadata={"order": 4}),
-             ]
-
-             doc = await client.create_document(
-                 content="Full document content",
-                 uri="test_doc.txt",
-                 chunks=manual_chunks,
-             )
+     async with HaikuRAG(temp_db_path) as client:
+         # Create chunks manually
+         manual_chunks = [
+             Chunk(content="Chunk 0 content", metadata={"order": 0}),
+             Chunk(content="Chunk 1 content", metadata={"order": 1}),
+             Chunk(content="Chunk 2 content", metadata={"order": 2}),
+             Chunk(content="Chunk 3 content", metadata={"order": 3}),
+             Chunk(content="Chunk 4 content", metadata={"order": 4}),
+         ]

-             # Get all chunks for the document
-             assert doc.id is not None
-             chunks = await client.chunk_repository.get_by_document_id(doc.id)
-             assert len(chunks) == 5
+         doc = await client.create_document(
+             content="Full document content",
+             uri="test_doc.txt",
+             chunks=manual_chunks,
+         )

-             # Find the middle chunk (order=2)
-             middle_chunk = next(c for c in chunks if c.metadata.get("order") == 2)
-             search_results = [(middle_chunk, 0.8)]
+         # Get all chunks for the document
+         assert doc.id is not None
+         chunks = await client.chunk_repository.get_by_document_id(doc.id)
+         assert len(chunks) == 5

-             # Test expand_context
-             expanded_results = await client.expand_context(search_results)
+         # Find the middle chunk (order=2)
+         middle_chunk = next(c for c in chunks if c.metadata.get("order") == 2)
+         search_results = [(middle_chunk, 0.8)]

-             assert len(expanded_results) == 1
-             expanded_chunk, score = expanded_results[0]
+         # Test expand_context with radius=2
+         expanded_results = await client.expand_context(search_results, radius=2)

-             # Check that the expanded chunk has combined content
-             assert expanded_chunk.id == middle_chunk.id
-             assert score == 0.8
-             assert "Chunk 2 content" in expanded_chunk.content
+         assert len(expanded_results) == 1
+         expanded_chunk, score = expanded_results[0]

-             # Should include all chunks (radius=2 from chunk 2 = chunks 0,1,2,3,4)
-             assert "Chunk 0 content" in expanded_chunk.content
-             assert "Chunk 1 content" in expanded_chunk.content
-             assert "Chunk 2 content" in expanded_chunk.content
-             assert "Chunk 3 content" in expanded_chunk.content
-             assert "Chunk 4 content" in expanded_chunk.content
+         # Check that the expanded chunk has combined content
+         assert expanded_chunk.id == middle_chunk.id
+         assert score == 0.8
+         assert "Chunk 2 content" in expanded_chunk.content
+
+         # Should include all chunks (radius=2 from chunk 2 = chunks 0,1,2,3,4)
+         assert "Chunk 0 content" in expanded_chunk.content
+         assert "Chunk 1 content" in expanded_chunk.content
+         assert "Chunk 2 content" in expanded_chunk.content
+         assert "Chunk 3 content" in expanded_chunk.content
+         assert "Chunk 4 content" in expanded_chunk.content


  @pytest.mark.asyncio
  async def test_client_expand_context_radius_zero(temp_db_path):
      """Test expand_context with radius 0 returns original results."""
-     with patch("haiku.rag.client.Config.CONTEXT_CHUNK_RADIUS", 0):
-         async with HaikuRAG(temp_db_path) as client:
-             # Create a simple document
-             doc = await client.create_document(content="Simple test content")
-             assert doc.id is not None
-             chunks = await client.chunk_repository.get_by_document_id(doc.id)
+     async with HaikuRAG(temp_db_path) as client:
+         # Create a simple document
+         doc = await client.create_document(content="Simple test content")
+         assert doc.id is not None
+         chunks = await client.chunk_repository.get_by_document_id(doc.id)

-             search_results = [(chunks[0], 0.9)]
-             expanded_results = await client.expand_context(search_results)
+         search_results = [(chunks[0], 0.9)]
+         expanded_results = await client.expand_context(search_results, radius=0)

-             # Should return exactly the same results
-             assert expanded_results == search_results
+         # Should return exactly the same results
+         assert expanded_results == search_results


  @pytest.mark.asyncio
  async def test_client_expand_context_multiple_chunks(temp_db_path):
      """Test expand_context with multiple search results."""
-     with patch("haiku.rag.client.Config.CONTEXT_CHUNK_RADIUS", 1):
-         async with HaikuRAG(temp_db_path) as client:
-             # Create first document with manual chunks
-             doc1_chunks = [
-                 Chunk(content="Doc1 Part A", metadata={"order": 0}),
-                 Chunk(content="Doc1 Part B", metadata={"order": 1}),
-                 Chunk(content="Doc1 Part C", metadata={"order": 2}),
-             ]
-             doc1 = await client.create_document(
-                 content="Doc1 content", uri="doc1.txt", chunks=doc1_chunks
-             )
+     async with HaikuRAG(temp_db_path) as client:
+         # Create first document with manual chunks
+         doc1_chunks = [
+             Chunk(content="Doc1 Part A", metadata={"order": 0}),
+             Chunk(content="Doc1 Part B", metadata={"order": 1}),
+             Chunk(content="Doc1 Part C", metadata={"order": 2}),
+         ]
+         doc1 = await client.create_document(
+             content="Doc1 content", uri="doc1.txt", chunks=doc1_chunks
+         )

-             # Create second document with manual chunks
-             doc2_chunks = [
-                 Chunk(content="Doc2 Section X", metadata={"order": 0}),
-                 Chunk(content="Doc2 Section Y", metadata={"order": 1}),
-             ]
-             doc2 = await client.create_document(
-                 content="Doc2 content", uri="doc2.txt", chunks=doc2_chunks
-             )
+         # Create second document with manual chunks
+         doc2_chunks = [
+             Chunk(content="Doc2 Section X", metadata={"order": 0}),
+             Chunk(content="Doc2 Section Y", metadata={"order": 1}),
+         ]
+         doc2 = await client.create_document(
+             content="Doc2 content", uri="doc2.txt", chunks=doc2_chunks
+         )

-             assert doc1.id is not None
-             assert doc2.id is not None
-             chunks1 = await client.chunk_repository.get_by_document_id(doc1.id)
-             chunks2 = await client.chunk_repository.get_by_document_id(doc2.id)
-
-             # Get middle chunk from doc1 (order=1) and first chunk from doc2 (order=0)
-             chunk1 = next(c for c in chunks1 if c.metadata.get("order") == 1)
-             chunk2 = next(c for c in chunks2 if c.metadata.get("order") == 0)
-
-             search_results = [(chunk1, 0.8), (chunk2, 0.7)]
-             expanded_results = await client.expand_context(search_results)
-
-             assert len(expanded_results) == 2
-
-             # Check first expanded result (should include chunks 0,1,2 from doc1)
-             expanded1, score1 = expanded_results[0]
-             assert expanded1.id == chunk1.id
-             assert score1 == 0.8
-             assert "Doc1 Part A" in expanded1.content
-             assert "Doc1 Part B" in expanded1.content
-             assert "Doc1 Part C" in expanded1.content
-
-             # Check second expanded result (should include chunks 0,1 from doc2)
-             expanded2, score2 = expanded_results[1]
-             assert expanded2.id == chunk2.id
-             assert score2 == 0.7
-             assert "Doc2 Section X" in expanded2.content
-             assert "Doc2 Section Y" in expanded2.content
+         assert doc1.id is not None
+         assert doc2.id is not None
+         chunks1 = await client.chunk_repository.get_by_document_id(doc1.id)
+         chunks2 = await client.chunk_repository.get_by_document_id(doc2.id)
+
+         # Get middle chunk from doc1 (order=1) and first chunk from doc2 (order=0)
+         chunk1 = next(c for c in chunks1 if c.metadata.get("order") == 1)
+         chunk2 = next(c for c in chunks2 if c.metadata.get("order") == 0)
+
+         search_results = [(chunk1, 0.8), (chunk2, 0.7)]
+         expanded_results = await client.expand_context(search_results, radius=1)
+
+         assert len(expanded_results) == 2
+
+         # Check first expanded result (should include chunks 0,1,2 from doc1)
+         expanded1, score1 = expanded_results[0]
+         assert expanded1.id == chunk1.id
+         assert score1 == 0.8
+         assert "Doc1 Part A" in expanded1.content
+         assert "Doc1 Part B" in expanded1.content
+         assert "Doc1 Part C" in expanded1.content
+
+         # Check second expanded result (should include chunks 0,1 from doc2)
+         expanded2, score2 = expanded_results[1]
+         assert expanded2.id == chunk2.id
+         assert score2 == 0.7
+         assert "Doc2 Section X" in expanded2.content
+         assert "Doc2 Section Y" in expanded2.content


  @pytest.mark.asyncio