haiku.rag 0.5.0__tar.gz → 0.5.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of haiku.rag might be problematic.
- haiku_rag-0.5.2/.python-version +1 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/PKG-INFO +5 -4
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/docs/configuration.md +26 -4
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/pyproject.toml +4 -4
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/src/haiku/rag/app.py +2 -2
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/src/haiku/rag/chunker.py +6 -15
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/src/haiku/rag/cli.py +15 -12
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/src/haiku/rag/client.py +93 -22
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/src/haiku/rag/config.py +3 -4
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/src/haiku/rag/reader.py +11 -6
- haiku_rag-0.5.2/src/haiku/rag/reranking/__init__.py +40 -0
- haiku_rag-0.5.2/src/haiku/rag/reranking/ollama.py +84 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/src/haiku/rag/store/repositories/chunk.py +5 -3
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/src/haiku/rag/store/repositories/document.py +29 -7
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/src/haiku/rag/utils.py +21 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/tests/test_chunk.py +5 -1
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/tests/test_chunker.py +8 -3
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/tests/test_reader.py +3 -2
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/tests/test_reranker.py +25 -8
- haiku_rag-0.5.2/tests/test_utils.py +133 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/uv.lock +40 -679
- haiku_rag-0.5.0/.python-version +0 -1
- haiku_rag-0.5.0/src/haiku/rag/reranking/__init__.py +0 -37
- haiku_rag-0.5.0/tests/test_utils.py +0 -15
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/.github/FUNDING.yml +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/.github/workflows/build-docs.yml +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/.github/workflows/build-publish.yml +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/.gitignore +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/.pre-commit-config.yaml +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/LICENSE +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/README.md +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/docs/benchmarks.md +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/docs/cli.md +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/docs/index.md +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/docs/installation.md +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/docs/mcp.md +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/docs/python.md +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/docs/server.md +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/mkdocs.yml +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/src/haiku/rag/__init__.py +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/src/haiku/rag/embeddings/__init__.py +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/src/haiku/rag/embeddings/base.py +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/src/haiku/rag/embeddings/ollama.py +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/src/haiku/rag/embeddings/openai.py +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/src/haiku/rag/embeddings/voyageai.py +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/src/haiku/rag/logging.py +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/src/haiku/rag/mcp.py +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/src/haiku/rag/monitor.py +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/src/haiku/rag/qa/__init__.py +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/src/haiku/rag/qa/anthropic.py +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/src/haiku/rag/qa/base.py +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/src/haiku/rag/qa/ollama.py +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/src/haiku/rag/qa/openai.py +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/src/haiku/rag/qa/prompts.py +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/src/haiku/rag/reranking/base.py +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/src/haiku/rag/reranking/cohere.py +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/src/haiku/rag/reranking/mxbai.py +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/src/haiku/rag/store/__init__.py +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/src/haiku/rag/store/engine.py +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/src/haiku/rag/store/models/__init__.py +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/src/haiku/rag/store/models/chunk.py +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/src/haiku/rag/store/models/document.py +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/src/haiku/rag/store/repositories/__init__.py +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/src/haiku/rag/store/repositories/base.py +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/src/haiku/rag/store/repositories/settings.py +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/src/haiku/rag/store/upgrades/__init__.py +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/src/haiku/rag/store/upgrades/v0_3_4.py +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/tests/__init__.py +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/tests/conftest.py +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/tests/generate_benchmark_db.py +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/tests/llm_judge.py +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/tests/test_app.py +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/tests/test_cli.py +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/tests/test_client.py +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/tests/test_document.py +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/tests/test_embedder.py +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/tests/test_monitor.py +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/tests/test_qa.py +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/tests/test_rebuild.py +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/tests/test_search.py +0 -0
- {haiku_rag-0.5.0 → haiku_rag-0.5.2}/tests/test_settings.py +0 -0
haiku_rag-0.5.2/.python-version (new file)
@@ -0,0 +1 @@
+3.11
{haiku_rag-0.5.0 → haiku_rag-0.5.2}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: haiku.rag
-Version: 0.5.0
+Version: 0.5.2
 Summary: Retrieval Augmented Generation (RAG) with SQLite
 Author-email: Yiorgis Gozadinos <ggozadinos@gmail.com>
 License: MIT
@@ -17,12 +17,11 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Typing :: Typed
-Requires-Python: >=3.
+Requires-Python: >=3.11
 Requires-Dist: docling>=2.15.0
 Requires-Dist: fastmcp>=2.8.1
 Requires-Dist: httpx>=0.28.1
-Requires-Dist:
-Requires-Dist: ollama>=0.5.1
+Requires-Dist: ollama>=0.5.3
 Requires-Dist: pydantic>=2.11.7
 Requires-Dist: python-dotenv>=1.1.0
 Requires-Dist: rich>=14.0.0
@@ -34,6 +33,8 @@ Provides-Extra: anthropic
 Requires-Dist: anthropic>=0.56.0; extra == 'anthropic'
 Provides-Extra: cohere
 Requires-Dist: cohere>=5.16.1; extra == 'cohere'
+Provides-Extra: mxbai
+Requires-Dist: mxbai-rerank>=0.1.6; extra == 'mxbai'
 Provides-Extra: openai
 Requires-Dist: openai>=1.0.0; extra == 'openai'
 Provides-Extra: voyageai
{haiku_rag-0.5.0 → haiku_rag-0.5.2}/docs/configuration.md
@@ -105,15 +105,37 @@ ANTHROPIC_API_KEY="your-api-key"
 
 ## Reranking
 
-Reranking
+Reranking improves search quality by re-ordering the initial search results using specialized models. When enabled, the system retrieves more candidates (3x the requested limit) and then reranks them to return the most relevant results.
 
-
+Reranking is **automatically enabled** by default using Ollama, or if you install the appropriate reranking provider package.
+
+### Disabling Reranking
+
+To disable reranking completely for faster searches:
 
 ```bash
-
+RERANK_PROVIDER=""
 ```
 
-###
+### Ollama (Default)
+
+Ollama reranking uses LLMs with structured output to rank documents by relevance:
+
+```bash
+RERANK_PROVIDER="ollama"
+RERANK_MODEL="qwen3:1.7b" # or any model that supports structured output
+OLLAMA_BASE_URL="http://localhost:11434"
+```
+
+### MixedBread AI
+
+For MxBAI reranking, install with mxbai extras:
+
+```bash
+uv pip install haiku.rag[mxbai]
+```
+
+Then configure:
 
 ```bash
 RERANK_PROVIDER="mxbai"
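The new docs describe how reranking changes search: with a reranker configured, the client pulls three times the requested limit of candidates and reranks them down to `limit`. Below is a minimal sketch of driving that from Python, based only on what is visible in this diff (`HaikuRAG(db_path=...)`, `search(query, limit, k)` returning `(chunk, score)` tuples); the database path, the query text, and the `chunk.content` attribute access are illustrative assumptions.

```python
import asyncio

from haiku.rag.client import HaikuRAG


async def main() -> None:
    # db_path and the query are placeholders; search() matches the signature in this diff.
    async with HaikuRAG(db_path="haiku.sqlite") as client:
        # With RERANK_PROVIDER="ollama" (the new default) the client retrieves
        # limit * 3 candidates and reranks them down to `limit`.
        results = await client.search("how do I disable reranking?", limit=5, k=60)
        for chunk, score in results:
            # chunk.content is assumed here; the (chunk, score) tuple shape comes from the diff.
            print(f"{score:.3f}  {str(chunk.content)[:80]}")


asyncio.run(main())
```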
{haiku_rag-0.5.0 → haiku_rag-0.5.2}/pyproject.toml
@@ -1,11 +1,11 @@
 [project]
 name = "haiku.rag"
-version = "0.5.0"
+version = "0.5.2"
 description = "Retrieval Augmented Generation (RAG) with SQLite"
 authors = [{ name = "Yiorgis Gozadinos", email = "ggozadinos@gmail.com" }]
 license = { text = "MIT" }
 readme = { file = "README.md", content-type = "text/markdown" }
-requires-python = ">=3.
+requires-python = ">=3.11"
 keywords = ["RAG", "sqlite", "sqlite-vec", "ml", "mcp"]
 classifiers = [
     "Development Status :: 4 - Beta",
@@ -25,8 +25,7 @@ dependencies = [
     "docling>=2.15.0",
     "fastmcp>=2.8.1",
     "httpx>=0.28.1",
-    "
-    "ollama>=0.5.1",
+    "ollama>=0.5.3",
     "pydantic>=2.11.7",
     "python-dotenv>=1.1.0",
     "rich>=14.0.0",
@@ -41,6 +40,7 @@ voyageai = ["voyageai>=0.3.2"]
 openai = ["openai>=1.0.0"]
 anthropic = ["anthropic>=0.56.0"]
 cohere = ["cohere>=5.16.1"]
+mxbai = ["mxbai-rerank>=0.1.6"]
 
 [project.scripts]
 haiku-rag = "haiku.rag.cli:cli"

{haiku_rag-0.5.0 → haiku_rag-0.5.2}/src/haiku/rag/app.py
@@ -32,9 +32,9 @@ class HaikuRAGApp:
                 f"[b]Document with id [cyan]{doc.id}[/cyan] added successfully.[/b]"
             )
 
-    async def add_document_from_source(self,
+    async def add_document_from_source(self, source: str):
         async with HaikuRAG(db_path=self.db_path) as self.client:
-            doc = await self.client.create_document_from_source(
+            doc = await self.client.create_document_from_source(source)
             self._rich_print_document(doc, truncate=True)
             self.console.print(
                 f"[b]Document with id [cyan]{doc.id}[/cyan] added successfully.[/b]"
{haiku_rag-0.5.0 → haiku_rag-0.5.2}/src/haiku/rag/chunker.py
@@ -1,11 +1,9 @@
-from io import BytesIO
 from typing import ClassVar
 
 import tiktoken
 from docling.chunking import HybridChunker  # type: ignore
-from docling.document_converter import DocumentConverter
 from docling_core.transforms.chunker.tokenizer.openai import OpenAITokenizer
-from docling_core.types.
+from docling_core.types.doc.document import DoclingDocument
 
 from haiku.rag.config import Config
 
@@ -33,27 +31,20 @@ class Chunker:
 
         self.chunker = HybridChunker(tokenizer=tokenizer)  # type: ignore
 
-    async def chunk(self,
-        """Split the
+    async def chunk(self, document: DoclingDocument) -> list[str]:
+        """Split the document into chunks using docling's structure-aware chunking.
 
         Args:
-
+            document: The DoclingDocument to be split into chunks.
 
         Returns:
            A list of text chunks with semantic boundaries.
        """
-        if
+        if document is None:
            return []
 
-        # Convert to docling document
-        bytes_io = BytesIO(text.encode("utf-8"))
-        doc_stream = DocumentStream(name="text.md", stream=bytes_io)
-        converter = DocumentConverter()
-        result = converter.convert(doc_stream)
-        doc = result.document
-
        # Chunk using docling's hybrid chunker
-        chunks = list(self.chunker.chunk(
+        chunks = list(self.chunker.chunk(document))
        return [self.chunker.contextualize(chunk) for chunk in chunks]
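The chunker no longer converts raw text itself; callers now hand it a DoclingDocument (the client does this through the new `text_to_docling_document` helper in utils.py). A small sketch of the new call pattern, assuming `Chunker()` can be constructed with its defaults; the sample text is made up.

```python
import asyncio

from haiku.rag.chunker import Chunker
from haiku.rag.utils import text_to_docling_document


async def main() -> None:
    # In 0.5.0, chunk() took text and converted it internally; in 0.5.2 the caller converts first.
    document = text_to_docling_document("# Notes\n\nConversion now happens outside the chunker.")
    chunker = Chunker()  # assumed default construction
    chunks = await chunker.chunk(document)  # takes a DoclingDocument, returns list[str]
    print(f"{len(chunks)} chunk(s)")


asyncio.run(main())
```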
{haiku_rag-0.5.0 → haiku_rag-0.5.2}/src/haiku/rag/cli.py
@@ -1,4 +1,5 @@
 import asyncio
+import warnings
 from importlib.metadata import version
 from pathlib import Path
 
@@ -9,12 +10,14 @@ from haiku.rag.app import HaikuRAGApp
 from haiku.rag.config import Config
 from haiku.rag.utils import is_up_to_date
 
+if not Config.ENV == "development":
+    warnings.filterwarnings("ignore")
+
 cli = typer.Typer(
     context_settings={"help_option_names": ["-h", "--help"]}, no_args_is_help=True
 )
 
 console = Console()
-event_loop = asyncio.get_event_loop()
 
 
 async def check_version():
@@ -46,7 +49,7 @@ def main(
 ):
     """haiku.rag CLI - SQLite-based RAG system"""
     # Run version check before any command
-
+    asyncio.run(check_version())
 
 
 @cli.command("list", help="List all stored documents")
@@ -58,7 +61,7 @@ def list_documents(
     ),
 ):
     app = HaikuRAGApp(db_path=db)
-
+    asyncio.run(app.list_documents())
 
 
 @cli.command("add", help="Add a document from text input")
@@ -73,12 +76,12 @@ def add_document_text(
     ),
 ):
     app = HaikuRAGApp(db_path=db)
-
+    asyncio.run(app.add_document_from_text(text=text))
 
 
 @cli.command("add-src", help="Add a document from a file path or URL")
 def add_document_src(
-
+    source: str = typer.Argument(
         help="The file path or URL of the document to add",
     ),
     db: Path = typer.Option(
@@ -88,7 +91,7 @@ def add_document_src(
     ),
 ):
     app = HaikuRAGApp(db_path=db)
-
+    asyncio.run(app.add_document_from_source(source=source))
 
 
 @cli.command("get", help="Get and display a document by its ID")
@@ -103,7 +106,7 @@ def get_document(
     ),
 ):
     app = HaikuRAGApp(db_path=db)
-
+    asyncio.run(app.get_document(doc_id=doc_id))
 
 
 @cli.command("delete", help="Delete a document by its ID")
@@ -118,7 +121,7 @@ def delete_document(
     ),
 ):
     app = HaikuRAGApp(db_path=db)
-
+    asyncio.run(app.delete_document(doc_id=doc_id))
 
 
 @cli.command("search", help="Search for documents by a query")
@@ -144,7 +147,7 @@ def search(
     ),
 ):
     app = HaikuRAGApp(db_path=db)
-
+    asyncio.run(app.search(query=query, limit=limit, k=k))
 
 
 @cli.command("ask", help="Ask a question using the QA agent")
@@ -159,7 +162,7 @@ def ask(
     ),
 ):
     app = HaikuRAGApp(db_path=db)
-
+    asyncio.run(app.ask(question=question))
 
 
 @cli.command("settings", help="Display current configuration settings")
@@ -180,7 +183,7 @@ def rebuild(
     ),
 ):
     app = HaikuRAGApp(db_path=db)
-
+    asyncio.run(app.rebuild())
 
 
 @cli.command(
@@ -216,7 +219,7 @@ def serve(
     elif sse:
         transport = "sse"
 
-
+    asyncio.run(app.serve(transport=transport))
 
 
 if __name__ == "__main__":
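The CLI drops the module-level `asyncio.get_event_loop()` and runs each command with `asyncio.run()`, which avoids the event-loop deprecation warnings on newer Python versions. A stripped-down illustration of the same pattern with typer follows; the command and coroutine here are made up, only the structure mirrors the diff.

```python
import asyncio

import typer

cli = typer.Typer()


async def fetch_greeting(name: str) -> str:
    await asyncio.sleep(0)  # stand-in for real async work
    return f"hello, {name}"


@cli.command()
def greet(name: str = typer.Argument(help="Who to greet")) -> None:
    # Each command owns its event loop for the duration of the call,
    # like the asyncio.run(...) calls added throughout cli.py.
    print(asyncio.run(fetch_greeting(name)))


if __name__ == "__main__":
    cli()
```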
{haiku_rag-0.5.0 → haiku_rag-0.5.2}/src/haiku/rag/client.py
@@ -16,6 +16,7 @@ from haiku.rag.store.models.chunk import Chunk
 from haiku.rag.store.models.document import Document
 from haiku.rag.store.repositories.chunk import ChunkRepository
 from haiku.rag.store.repositories.document import DocumentRepository
+from haiku.rag.utils import text_to_docling_document
 
 
 class HaikuRAG:
@@ -49,6 +50,24 @@ class HaikuRAG:
         self.close()
         return False
 
+    async def _create_document_with_docling(
+        self,
+        docling_document,
+        uri: str | None = None,
+        metadata: dict | None = None,
+        chunks: list[Chunk] | None = None,
+    ) -> Document:
+        """Create a new document from DoclingDocument."""
+        content = docling_document.export_to_markdown()
+        document = Document(
+            content=content,
+            uri=uri,
+            metadata=metadata or {},
+        )
+        return await self.document_repository._create_with_docling(
+            document, docling_document, chunks
+        )
+
     async def create_document(
         self,
         content: str,
@@ -67,12 +86,17 @@ class HaikuRAG:
         Returns:
             The created Document instance.
         """
+        # Convert content to DoclingDocument for processing
+        docling_document = text_to_docling_document(content)
+
         document = Document(
             content=content,
             uri=uri,
             metadata=metadata or {},
         )
-        return await self.document_repository.
+        return await self.document_repository._create_with_docling(
+            document, docling_document, chunks
+        )
 
     async def create_document_from_source(
         self, source: str | Path, metadata: dict = {}
@@ -101,16 +125,19 @@ class HaikuRAG:
         parsed_url = urlparse(source_str)
         if parsed_url.scheme in ("http", "https"):
             return await self._create_or_update_document_from_url(source_str, metadata)
-
-
-
+        elif parsed_url.scheme == "file":
+            # Handle file:// URI by converting to path
+            source_path = Path(parsed_url.path)
+        else:
+            # Handle as regular file path
+            source_path = Path(source) if isinstance(source, str) else source
         if source_path.suffix.lower() not in FileReader.extensions:
             raise ValueError(f"Unsupported file extension: {source_path.suffix}")
 
         if not source_path.exists():
             raise ValueError(f"File does not exist: {source_path}")
 
-        uri = source_path.as_uri()
+        uri = source_path.absolute().as_uri()
         md5_hash = hashlib.md5(source_path.read_bytes()).hexdigest()
 
         # Check if document already exists
@@ -119,7 +146,7 @@ class HaikuRAG:
             # MD5 unchanged, return existing document
             return existing_doc
 
-
+        docling_document = FileReader.parse_file(source_path)
 
         # Get content type from file extension
         content_type, _ = mimetypes.guess_type(str(source_path))
@@ -131,13 +158,15 @@ class HaikuRAG:
 
         if existing_doc:
             # Update existing document
-            existing_doc.content =
+            existing_doc.content = docling_document.export_to_markdown()
             existing_doc.metadata = metadata
-            return await self.
+            return await self.document_repository._update_with_docling(
+                existing_doc, docling_document
+            )
         else:
-            # Create new document
-            return await self.
-
+            # Create new document using DoclingDocument
+            return await self._create_document_with_docling(
+                docling_document=docling_document, uri=uri, metadata=metadata
             )
 
     async def _create_or_update_document_from_url(
@@ -193,18 +222,20 @@ class HaikuRAG:
             temp_path = Path(temp_file.name)
 
             # Parse the content using FileReader
-
+            docling_document = FileReader.parse_file(temp_path)
 
             # Merge metadata with contentType and md5
             metadata.update({"contentType": content_type, "md5": md5_hash})
 
             if existing_doc:
-                existing_doc.content =
+                existing_doc.content = docling_document.export_to_markdown()
                 existing_doc.metadata = metadata
-                return await self.
+                return await self.document_repository._update_with_docling(
+                    existing_doc, docling_document
+                )
             else:
-                return await self.
-
+                return await self._create_document_with_docling(
+                    docling_document=docling_document, uri=url, metadata=metadata
                 )
 
     def _get_extension_from_content_type_or_url(
@@ -262,7 +293,12 @@ class HaikuRAG:
 
     async def update_document(self, document: Document) -> Document:
         """Update an existing document."""
-
+        # Convert content to DoclingDocument
+        docling_document = text_to_docling_document(document.content)
+
+        return await self.document_repository._update_with_docling(
+            document, docling_document
+        )
 
     async def delete_document(self, document_id: int) -> bool:
         """Delete a document by its ID."""
@@ -283,7 +319,7 @@ class HaikuRAG:
         return await self.document_repository.list_all(limit=limit, offset=offset)
 
     async def search(
-        self, query: str, limit: int = 5, k: int = 60
+        self, query: str, limit: int = 5, k: int = 60
     ) -> list[tuple[Chunk, float]]:
         """Search for relevant chunks using hybrid search (vector similarity + full-text search) with reranking.
 
@@ -295,8 +331,10 @@ class HaikuRAG:
         Returns:
             List of (chunk, score) tuples ordered by relevance.
         """
+        # Get reranker if available
+        reranker = get_reranker()
 
-        if
+        if reranker is None:
             return await self.chunk_repository.search_chunks_hybrid(query, limit, k)
 
         # Get more initial results (3X) for reranking
@@ -304,7 +342,6 @@ class HaikuRAG:
             query, limit * 3, k
         )
         # Apply reranking
-        reranker = get_reranker()
         chunks = [chunk for chunk, _ in search_results]
         reranked_results = await reranker.rerank(query, chunks, top_n=limit)
 
@@ -328,6 +365,13 @@ class HaikuRAG:
     async def rebuild_database(self) -> AsyncGenerator[int, None]:
         """Rebuild the database by deleting all chunks and re-indexing all documents.
 
+        For documents with URIs:
+        - Deletes the document and re-adds it from source if source exists
+        - Skips documents where source no longer exists
+
+        For documents without URIs:
+        - Re-creates chunks from existing content
+
         Yields:
             int: The ID of the document currently being processed
         """
@@ -343,9 +387,36 @@ class HaikuRAG:
         documents = await self.list_documents()
 
         for doc in documents:
-
+            assert doc.id is not None, "Document ID should not be None"
+            if doc.uri:
+                # Document has a URI - delete and try to re-add from source
+                try:
+                    # Delete the old document first
+                    await self.delete_document(doc.id)
+
+                    # Try to re-create from source (this creates the document with chunks)
+                    new_doc = await self.create_document_from_source(
+                        doc.uri, doc.metadata or {}
+                    )
+
+                    assert new_doc.id is not None, "New document ID should not be None"
+                    yield new_doc.id
+
+                except (FileNotFoundError, ValueError, OSError) as e:
+                    # Source doesn't exist or can't be accessed - document already deleted, skip
+                    print(f"Skipping document with URI {doc.uri}: {e}")
+                    continue
+                except Exception as e:
+                    # Unexpected error - log it and skip
+                    print(
+                        f"Unexpected error processing document with URI {doc.uri}: {e}"
+                    )
+                    continue
+            else:
+                # Document without URI - re-create chunks from existing content
+                docling_document = text_to_docling_document(doc.content)
                 await self.chunk_repository.create_chunks_for_document(
-                    doc.id,
+                    doc.id, docling_document, commit=False
                 )
                 yield doc.id
 
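Taken together, the client changes route every document through a DoclingDocument before chunking, and `rebuild_database()` now re-ingests URI-backed documents from their source. A short usage sketch against the public methods visible in this diff; the database path, file name and metadata are placeholders.

```python
import asyncio
from pathlib import Path

from haiku.rag.client import HaikuRAG


async def main() -> None:
    async with HaikuRAG(db_path="haiku.sqlite") as client:
        # Plain paths, file:// URIs and http(s) URLs are accepted; an unchanged MD5
        # returns the existing document instead of re-indexing it.
        doc = await client.create_document_from_source(Path("README.md"), {"source": "local"})
        print(doc.id, doc.uri)

        # Documents with a URI are deleted and re-added from source (skipped if the
        # source is gone); documents without a URI are just re-chunked.
        async for doc_id in client.rebuild_database():
            print("reindexed document", doc_id)


asyncio.run(main())
```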
{haiku_rag-0.5.0 → haiku_rag-0.5.2}/src/haiku/rag/config.py
@@ -10,7 +10,7 @@ load_dotenv()
 
 
 class AppConfig(BaseModel):
-    ENV: str = "
+    ENV: str = "production"
 
     DEFAULT_DATA_DIR: Path = get_default_data_dir()
     MONITOR_DIRECTORIES: list[Path] = []
@@ -19,9 +19,8 @@ class AppConfig(BaseModel):
     EMBEDDINGS_MODEL: str = "mxbai-embed-large"
     EMBEDDINGS_VECTOR_DIM: int = 1024
 
-
-
-    RERANK_MODEL: str = "mixedbread-ai/mxbai-rerank-base-v2"
+    RERANK_PROVIDER: str = "ollama"
+    RERANK_MODEL: str = "qwen3"
 
     QA_PROVIDER: str = "ollama"
     QA_MODEL: str = "qwen3"
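The new defaults make reranking an Ollama concern out of the box (ENV="production", RERANK_PROVIDER="ollama", RERANK_MODEL="qwen3"). A tiny sketch of inspecting and overriding them follows; the variable names come from docs/configuration.md, while the assumption that the environment (or a .env file, via the load_dotenv() call above) must be set before haiku.rag is imported is mine.

```python
import os

# Assumed: Config picks these up at import time (config.py calls load_dotenv()).
os.environ.setdefault("RERANK_PROVIDER", "ollama")
os.environ.setdefault("RERANK_MODEL", "qwen3")

from haiku.rag.config import Config  # noqa: E402

print(Config.ENV)              # "production" unless overridden
print(Config.RERANK_PROVIDER)  # "ollama" by default in 0.5.2
print(Config.RERANK_MODEL)     # "qwen3" by default in 0.5.2
```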
{haiku_rag-0.5.0 → haiku_rag-0.5.2}/src/haiku/rag/reader.py
@@ -2,6 +2,9 @@ from pathlib import Path
 from typing import ClassVar
 
 from docling.document_converter import DocumentConverter
+from docling_core.types.doc.document import DoclingDocument
+
+from haiku.rag.utils import text_to_docling_document
 
 
 class FileReader:
@@ -84,7 +87,7 @@ class FileReader:
     extensions: ClassVar[list[str]] = docling_extensions + text_extensions
 
     @staticmethod
-    def parse_file(path: Path) ->
+    def parse_file(path: Path) -> DoclingDocument:
         try:
             file_extension = path.suffix.lower()
 
@@ -92,7 +95,7 @@ class FileReader:
                 # Use docling for complex document formats
                 converter = DocumentConverter()
                 result = converter.convert(path)
-                return result.document
+                return result.document
             elif file_extension in FileReader.text_extensions:
                 # Read plain text files directly
                 content = path.read_text(encoding="utf-8")
@@ -100,11 +103,13 @@ class FileReader:
                 # Wrap code files (but not plain txt) in markdown code blocks for better presentation
                 if file_extension in FileReader.code_markdown_identifier:
                     language = FileReader.code_markdown_identifier[file_extension]
-
+                    content = f"```{language}\n{content}\n```"
 
-
+                # Convert text to DoclingDocument by wrapping as markdown
+                return text_to_docling_document(content, name=f"{path.stem}.md")
             else:
-                # Fallback: try to read as text
-
+                # Fallback: try to read as text and convert to DoclingDocument
+                content = path.read_text(encoding="utf-8")
+                return text_to_docling_document(content, name=f"{path.stem}.md")
         except Exception:
             raise ValueError(f"Failed to parse file: {path}")
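FileReader.parse_file() now returns a DoclingDocument for every supported extension instead of plain text, and callers export markdown themselves (as client.py does with `export_to_markdown()`). A short sketch; the file path is a placeholder.

```python
from pathlib import Path

from haiku.rag.reader import FileReader

path = Path("notes.md")  # placeholder; any suffix in FileReader.extensions works
if path.suffix.lower() in FileReader.extensions:
    document = FileReader.parse_file(path)    # DoclingDocument in 0.5.2 (was text before)
    markdown = document.export_to_markdown()  # the same call client.py uses to store content
    print(markdown[:200])
```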
haiku_rag-0.5.2/src/haiku/rag/reranking/__init__.py (new file)
@@ -0,0 +1,40 @@
+from haiku.rag.config import Config
+from haiku.rag.reranking.base import RerankerBase
+
+_reranker: RerankerBase | None = None
+
+
+def get_reranker() -> RerankerBase | None:
+    """
+    Factory function to get the appropriate reranker based on the configuration.
+    Returns None if if reranking is disabled.
+    """
+    global _reranker
+    if _reranker is not None:
+        return _reranker
+
+    if Config.RERANK_PROVIDER == "mxbai":
+        try:
+            from haiku.rag.reranking.mxbai import MxBAIReranker
+
+            _reranker = MxBAIReranker()
+            return _reranker
+        except ImportError:
+            return None
+
+    if Config.RERANK_PROVIDER == "cohere":
+        try:
+            from haiku.rag.reranking.cohere import CohereReranker
+
+            _reranker = CohereReranker()
+            return _reranker
+        except ImportError:
+            return None
+
+    if Config.RERANK_PROVIDER == "ollama":
+        from haiku.rag.reranking.ollama import OllamaReranker
+
+        _reranker = OllamaReranker()
+        return _reranker
+
+    return None