haiku.rag 0.7.1.tar.gz → 0.7.3.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of haiku.rag has been flagged as possibly problematic.

Files changed (79)
  1. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/PKG-INFO +3 -3
  2. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/README.md +2 -2
  3. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/docs/configuration.md +36 -0
  4. haiku_rag-0.7.3/docs/installation.md +74 -0
  5. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/pyproject.toml +1 -1
  6. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/src/haiku/rag/config.py +3 -0
  7. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/src/haiku/rag/embeddings/base.py +1 -1
  8. haiku_rag-0.7.3/src/haiku/rag/embeddings/ollama.py +17 -0
  9. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/src/haiku/rag/embeddings/openai.py +5 -2
  10. haiku_rag-0.7.3/src/haiku/rag/embeddings/vllm.py +19 -0
  11. haiku_rag-0.7.3/src/haiku/rag/embeddings/voyageai.py +17 -0
  12. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/src/haiku/rag/qa/agent.py +8 -0
  13. haiku_rag-0.7.3/src/haiku/rag/reranking/vllm.py +44 -0
  14. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/src/haiku/rag/store/repositories/chunk.py +1 -7
  15. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/tests/llm_judge.py +3 -3
  16. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/tests/test_embedder.py +61 -3
  17. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/tests/test_qa.py +24 -0
  18. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/tests/test_reranker.py +21 -0
  19. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/uv.lock +1 -1
  20. haiku_rag-0.7.1/docs/installation.md +0 -34
  21. haiku_rag-0.7.1/src/haiku/rag/embeddings/ollama.py +0 -11
  22. haiku_rag-0.7.1/src/haiku/rag/embeddings/voyageai.py +0 -13
  23. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/.github/FUNDING.yml +0 -0
  24. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/.github/workflows/build-docs.yml +0 -0
  25. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/.github/workflows/build-publish.yml +0 -0
  26. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/.gitignore +0 -0
  27. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/.pre-commit-config.yaml +0 -0
  28. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/.python-version +0 -0
  29. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/LICENSE +0 -0
  30. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/docs/benchmarks.md +0 -0
  31. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/docs/cli.md +0 -0
  32. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/docs/index.md +0 -0
  33. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/docs/mcp.md +0 -0
  34. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/docs/python.md +0 -0
  35. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/docs/server.md +0 -0
  36. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/mkdocs.yml +0 -0
  37. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/src/haiku/rag/__init__.py +0 -0
  38. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/src/haiku/rag/app.py +0 -0
  39. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/src/haiku/rag/chunker.py +0 -0
  40. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/src/haiku/rag/cli.py +0 -0
  41. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/src/haiku/rag/client.py +0 -0
  42. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/src/haiku/rag/embeddings/__init__.py +0 -0
  43. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/src/haiku/rag/logging.py +0 -0
  44. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/src/haiku/rag/mcp.py +0 -0
  45. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/src/haiku/rag/migration.py +0 -0
  46. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/src/haiku/rag/monitor.py +0 -0
  47. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/src/haiku/rag/qa/__init__.py +0 -0
  48. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/src/haiku/rag/qa/prompts.py +0 -0
  49. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/src/haiku/rag/reader.py +0 -0
  50. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/src/haiku/rag/reranking/__init__.py +0 -0
  51. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/src/haiku/rag/reranking/base.py +0 -0
  52. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/src/haiku/rag/reranking/cohere.py +0 -0
  53. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/src/haiku/rag/reranking/mxbai.py +0 -0
  54. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/src/haiku/rag/store/__init__.py +0 -0
  55. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/src/haiku/rag/store/engine.py +0 -0
  56. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/src/haiku/rag/store/models/__init__.py +0 -0
  57. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/src/haiku/rag/store/models/chunk.py +0 -0
  58. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/src/haiku/rag/store/models/document.py +0 -0
  59. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/src/haiku/rag/store/repositories/__init__.py +0 -0
  60. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/src/haiku/rag/store/repositories/document.py +0 -0
  61. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/src/haiku/rag/store/repositories/settings.py +0 -0
  62. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/src/haiku/rag/store/upgrades/__init__.py +0 -0
  63. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/src/haiku/rag/utils.py +0 -0
  64. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/tests/__init__.py +0 -0
  65. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/tests/conftest.py +0 -0
  66. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/tests/generate_benchmark_db.py +0 -0
  67. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/tests/test_app.py +0 -0
  68. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/tests/test_chunk.py +0 -0
  69. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/tests/test_chunker.py +0 -0
  70. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/tests/test_cli.py +0 -0
  71. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/tests/test_client.py +0 -0
  72. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/tests/test_document.py +0 -0
  73. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/tests/test_lancedb_connection.py +0 -0
  74. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/tests/test_monitor.py +0 -0
  75. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/tests/test_reader.py +0 -0
  76. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/tests/test_rebuild.py +0 -0
  77. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/tests/test_search.py +0 -0
  78. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/tests/test_settings.py +0 -0
  79. {haiku_rag-0.7.1 → haiku_rag-0.7.3}/tests/test_utils.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: haiku.rag
- Version: 0.7.1
+ Version: 0.7.3
  Summary: Retrieval Augmented Generation (RAG) with LanceDB
  Author-email: Yiorgis Gozadinos <ggozadinos@gmail.com>
  License: MIT
@@ -47,10 +47,10 @@ Retrieval-Augmented Generation (RAG) library built on LanceDB.
  ## Features

  - **Local LanceDB**: No external servers required, supports also LanceDB cloud storage, S3, Google Cloud & Azure
- - **Multiple embedding providers**: Ollama, VoyageAI, OpenAI
+ - **Multiple embedding providers**: Ollama, VoyageAI, OpenAI, vLLM
  - **Multiple QA providers**: Any provider/model supported by Pydantic AI
  - **Native hybrid search**: Vector + full-text search with native LanceDB RRF reranking
- - **Reranking**: Default search result reranking with MixedBread AI or Cohere
+ - **Reranking**: Default search result reranking with MixedBread AI, Cohere, or vLLM
  - **Question answering**: Built-in QA agents on your documents
  - **File monitoring**: Auto-index files when run as server
  - **40+ file formats**: PDF, DOCX, HTML, Markdown, code files, URLs
@@ -9,10 +9,10 @@ Retrieval-Augmented Generation (RAG) library built on LanceDB.
  ## Features

  - **Local LanceDB**: No external servers required, supports also LanceDB cloud storage, S3, Google Cloud & Azure
- - **Multiple embedding providers**: Ollama, VoyageAI, OpenAI
+ - **Multiple embedding providers**: Ollama, VoyageAI, OpenAI, vLLM
  - **Multiple QA providers**: Any provider/model supported by Pydantic AI
  - **Native hybrid search**: Vector + full-text search with native LanceDB RRF reranking
- - **Reranking**: Default search result reranking with MixedBread AI or Cohere
+ - **Reranking**: Default search result reranking with MixedBread AI, Cohere, or vLLM
  - **Question answering**: Built-in QA agents on your documents
  - **File monitoring**: Auto-index files when run as server
  - **40+ file formats**: PDF, DOCX, HTML, Markdown, code files, URLs
@@ -53,6 +53,18 @@ EMBEDDINGS_VECTOR_DIM=1536
  OPENAI_API_KEY="your-api-key"
  ```

+ ### vLLM
+ For high-performance local inference, you can use vLLM to serve embedding models with OpenAI-compatible APIs:
+
+ ```bash
+ EMBEDDINGS_PROVIDER="vllm"
+ EMBEDDINGS_MODEL="mixedbread-ai/mxbai-embed-large-v1" # Any embedding model supported by vLLM
+ EMBEDDINGS_VECTOR_DIM=512 # Dimension depends on the model
+ VLLM_EMBEDDINGS_BASE_URL="http://localhost:8000" # vLLM server URL
+ ```
+
+ **Note:** You need to run a vLLM server separately with an embedding model loaded.
+
  ## Question Answering Providers

  Configure which LLM provider to use for question answering. Any provider and model supported by [Pydantic AI](https://ai.pydantic.dev/models/) can be used.
@@ -85,6 +97,18 @@ QA_MODEL="claude-3-5-haiku-20241022" # or claude-3-5-sonnet-20241022, etc.
  ANTHROPIC_API_KEY="your-api-key"
  ```

+ ### vLLM
+
+ For high-performance local inference, you can use vLLM to serve models with OpenAI-compatible APIs:
+
+ ```bash
+ QA_PROVIDER="vllm"
+ QA_MODEL="Qwen/Qwen3-4B" # Any model with tool support in vLLM
+ VLLM_QA_BASE_URL="http://localhost:8002" # vLLM server URL
+ ```
+
+ **Note:** You need to run a vLLM server separately with a model that supports tool calling loaded. Consult the specific model's documentation for proper vLLM serving configuration.
+
  ### Other Providers

  Any provider supported by Pydantic AI can be used. Examples include:
@@ -136,6 +160,18 @@ RERANK_MODEL="rerank-v3.5"
  COHERE_API_KEY="your-api-key"
  ```

+ ### vLLM
+
+ For high-performance local reranking using dedicated reranking models:
+
+ ```bash
+ RERANK_PROVIDER="vllm"
+ RERANK_MODEL="mixedbread-ai/mxbai-rerank-base-v2" # Any reranking model supported by vLLM
+ VLLM_RERANK_BASE_URL="http://localhost:8001" # vLLM server URL
+ ```
+
+ **Note:** vLLM reranking uses the `/rerank` API endpoint. You need to run a vLLM server separately with a reranking model loaded. Consult the specific model's documentation for proper vLLM serving configuration.
+
  ## Other Settings

  ### Database and Storage
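
The configuration hunks above only add environment variables; the integration itself rides on vLLM's OpenAI-compatible API. As a quick sanity check (a sketch, not part of the package), the embeddings endpoint configured above can be exercised directly with the `openai` client. The base URL and model name are the example values from the docs hunk:

```python
# Sketch: verify a vLLM embeddings server answers on its OpenAI-compatible API.
# Assumes a server at http://localhost:8000 serving mixedbread-ai/mxbai-embed-large-v1.
import asyncio

from openai import AsyncOpenAI


async def main() -> None:
    client = AsyncOpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
    response = await client.embeddings.create(
        model="mixedbread-ai/mxbai-embed-large-v1",
        input="hello world",
    )
    # One embedding per input; its length should match EMBEDDINGS_VECTOR_DIM.
    print(len(response.data[0].embedding))


asyncio.run(main())
```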
@@ -0,0 +1,74 @@
+ # Installation
+
+ ## Basic Installation
+
+ ```bash
+ uv pip install haiku.rag
+ ```
+
+ This includes support for:
+ - **Ollama** (default embedding provider using `mxbai-embed-large`)
+ - **OpenAI** (GPT models for QA and embeddings)
+ - **Anthropic** (Claude models for QA)
+ - **Cohere** (reranking models)
+ - **vLLM** (high-performance local inference for embeddings, QA, and reranking)
+
+ ## Provider-Specific Installation
+
+ For additional embedding providers, install with extras:
+
+ ### VoyageAI
+
+ ```bash
+ uv pip install haiku.rag[voyageai]
+ ```
+
+ ### MixedBread AI Reranking
+
+ ```bash
+ uv pip install haiku.rag[mxbai]
+ ```
+
+ ### vLLM Setup
+
+ vLLM requires no additional installation - it works with the base haiku.rag package. However, you need to run vLLM servers separately:
+
+ ```bash
+ # Install vLLM
+ pip install vllm
+
+ # Serve an embedding model
+ vllm serve mixedbread-ai/mxbai-embed-large-v1 --port 8000
+
+ # Serve a model for QA (requires tool calling support)
+ vllm serve Qwen/Qwen3-4B --port 8002 --enable-auto-tool-choice --tool-call-parser hermes
+
+ # Serve a model for reranking
+ vllm serve mixedbread-ai/mxbai-rerank-base-v2 --hf_overrides '{"architectures": ["Qwen2ForSequenceClassification"],"classifier_from_token": ["0", "1"], "method": "from_2_way_softmax"}' --port 8001
+ ```
+
+ Then configure haiku.rag to use the vLLM servers:
+
+ ```bash
+ # Embeddings
+ EMBEDDINGS_PROVIDER="vllm"
+ EMBEDDINGS_MODEL="mixedbread-ai/mxbai-embed-large-v1"
+ EMBEDDINGS_VECTOR_DIM=512
+ VLLM_EMBEDDINGS_BASE_URL="http://localhost:8000"
+
+ # QA (optional)
+ QA_PROVIDER="vllm"
+ QA_MODEL="Qwen/Qwen3-4B"
+ VLLM_QA_BASE_URL="http://localhost:8002"
+
+ # Reranking (optional)
+ RERANK_PROVIDER="vllm"
+ RERANK_MODEL="mixedbread-ai/mxbai-rerank-base-v2"
+ VLLM_RERANK_BASE_URL="http://localhost:8001"
+ ```
+
+ ## Requirements
+
+ - Python 3.10+
+ - Ollama (for default embeddings)
+ - vLLM server (for vLLM provider)
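
For context on how these settings come together, here is an illustrative end-to-end sketch mirroring the test code added in this release: it indexes one document and queries it through the vLLM QA provider. The import path for `QuestionAnswerAgent` is assumed from the module layout (`src/haiku/rag/qa/agent.py`); the database path and document text are placeholders.

```python
# Sketch: index a document and ask a question via the vLLM-backed QA provider.
# Assumes the vLLM env vars above are set and the servers are running.
import asyncio

from haiku.rag.client import HaikuRAG
from haiku.rag.qa.agent import QuestionAnswerAgent  # import path assumed


async def main() -> None:
    client = HaikuRAG("example.lancedb")  # placeholder database path
    await client.create_document(
        content="haiku.rag is a RAG library built on LanceDB.",
        uri="example-doc",
    )
    qa = QuestionAnswerAgent(client, "vllm", "Qwen/Qwen3-4B")
    print(await qa.answer("What is haiku.rag built on?"))


asyncio.run(main())
```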
@@ -1,6 +1,6 @@
  [project]
  name = "haiku.rag"
- version = "0.7.1"
+ version = "0.7.3"
  description = "Retrieval Augmented Generation (RAG) with LanceDB"
  authors = [{ name = "Yiorgis Gozadinos", email = "ggozadinos@gmail.com" }]
  license = { text = "MIT" }
@@ -33,6 +33,9 @@ class AppConfig(BaseModel):
  CONTEXT_CHUNK_RADIUS: int = 0

  OLLAMA_BASE_URL: str = "http://localhost:11434"
+ VLLM_EMBEDDINGS_BASE_URL: str = ""
+ VLLM_RERANK_BASE_URL: str = ""
+ VLLM_QA_BASE_URL: str = ""

  # Provider keys
  VOYAGE_API_KEY: str = ""
@@ -9,7 +9,7 @@ class EmbedderBase:
  self._model = model
  self._vector_dim = vector_dim

- async def embed(self, text: str) -> list[float]:
+ async def embed(self, text: str | list[str]) -> list[float] | list[list[float]]:
  raise NotImplementedError(
  "Embedder is an abstract class. Please implement the embed method in a subclass."
  )
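
The widened `embed()` signature means every embedder now accepts either a single string or a batch of strings, and must return one vector or one vector per input, in order. A hypothetical subclass (not part of the package) makes that contract explicit:

```python
# Hypothetical embedder illustrating the contract implied by the new signature:
# a str input returns one vector, a list[str] returns one vector per input.
from haiku.rag.embeddings.base import EmbedderBase


class FakeEmbedder(EmbedderBase):
    async def embed(self, text: str | list[str]) -> list[float] | list[list[float]]:
        if isinstance(text, str):
            return [0.0] * self._vector_dim
        return [[0.0] * self._vector_dim for _ in text]
```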
@@ -0,0 +1,17 @@
+ from openai import AsyncOpenAI
+
+ from haiku.rag.config import Config
+ from haiku.rag.embeddings.base import EmbedderBase
+
+
+ class Embedder(EmbedderBase):
+ async def embed(self, text: str | list[str]) -> list[float] | list[list[float]]:
+ client = AsyncOpenAI(base_url=f"{Config.OLLAMA_BASE_URL}/v1", api_key="dummy")
+ response = await client.embeddings.create(
+ model=self._model,
+ input=text,
+ )
+ if isinstance(text, str):
+ return response.data[0].embedding
+ else:
+ return [item.embedding for item in response.data]
@@ -4,10 +4,13 @@ from haiku.rag.embeddings.base import EmbedderBase


  class Embedder(EmbedderBase):
- async def embed(self, text: str) -> list[float]:
+ async def embed(self, text: str | list[str]) -> list[float] | list[list[float]]:
  client = AsyncOpenAI()
  response = await client.embeddings.create(
  model=self._model,
  input=text,
  )
- return response.data[0].embedding
+ if isinstance(text, str):
+ return response.data[0].embedding
+ else:
+ return [item.embedding for item in response.data]
@@ -0,0 +1,19 @@
+ from openai import AsyncOpenAI
+
+ from haiku.rag.config import Config
+ from haiku.rag.embeddings.base import EmbedderBase
+
+
+ class Embedder(EmbedderBase):
+ async def embed(self, text: str | list[str]) -> list[float] | list[list[float]]:
+ client = AsyncOpenAI(
+ base_url=f"{Config.VLLM_EMBEDDINGS_BASE_URL}/v1", api_key="dummy"
+ )
+ response = await client.embeddings.create(
+ model=self._model,
+ input=text,
+ )
+ if isinstance(text, str):
+ return response.data[0].embedding
+ else:
+ return [item.embedding for item in response.data]
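
A usage sketch for the new vLLM embedder, following the test added in this release; it assumes `VLLM_EMBEDDINGS_BASE_URL` is set and the server is serving `mixedbread-ai/mxbai-embed-large-v1` with 512-dimensional vectors:

```python
# Sketch: embed a single string and a batch with the new vLLM embedder.
import asyncio

from haiku.rag.embeddings.vllm import Embedder


async def main() -> None:
    embedder = Embedder("mixedbread-ai/mxbai-embed-large-v1", 512)
    single = await embedder.embed("hello world")      # list[float]
    batch = await embedder.embed(["hello", "world"])  # list[list[float]]
    print(len(single), len(batch))


asyncio.run(main())
```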
@@ -0,0 +1,17 @@
+ try:
+ from voyageai.client import Client # type: ignore
+
+ from haiku.rag.embeddings.base import EmbedderBase
+
+ class Embedder(EmbedderBase):
+ async def embed(self, text: str | list[str]) -> list[float] | list[list[float]]:
+ client = Client()
+ if isinstance(text, str):
+ res = client.embed([text], model=self._model, output_dtype="float")
+ return res.embeddings[0] # type: ignore[return-value]
+ else:
+ res = client.embed(text, model=self._model, output_dtype="float")
+ return res.embeddings # type: ignore[return-value]
+
+ except ImportError:
+ pass
@@ -2,6 +2,7 @@ from pydantic import BaseModel, Field
  from pydantic_ai import Agent, RunContext
  from pydantic_ai.models.openai import OpenAIChatModel
  from pydantic_ai.providers.ollama import OllamaProvider
+ from pydantic_ai.providers.openai import OpenAIProvider

  from haiku.rag.client import HaikuRAG
  from haiku.rag.config import Config
@@ -65,6 +66,13 @@ class QuestionAnswerAgent:
  model_name=model,
  provider=OllamaProvider(base_url=f"{Config.OLLAMA_BASE_URL}/v1"),
  )
+ elif provider == "vllm":
+ return OpenAIChatModel(
+ model_name=model,
+ provider=OpenAIProvider(
+ base_url=f"{Config.VLLM_QA_BASE_URL}/v1", api_key="none"
+ ),
+ )
  else:
  # For all other providers, use the provider:model format
  return f"{provider}:{model}"
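
The new `vllm` branch above resolves to an OpenAI-compatible chat model pointed at the configured server; every other provider still falls through to the `provider:model` string form. A standalone sketch of the equivalent Pydantic AI setup, using the example base URL and model from the docs (assumed values, not package code):

```python
# Sketch: what the "vllm" branch builds, expressed as standalone Pydantic AI code.
# Assumes VLLM_QA_BASE_URL="http://localhost:8002" and a tool-calling model served.
from pydantic_ai import Agent
from pydantic_ai.models.openai import OpenAIChatModel
from pydantic_ai.providers.openai import OpenAIProvider

model = OpenAIChatModel(
    model_name="Qwen/Qwen3-4B",
    provider=OpenAIProvider(base_url="http://localhost:8002/v1", api_key="none"),
)
agent = Agent(model)  # run with agent.run(...) inside an event loop
```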
@@ -0,0 +1,44 @@
+ import httpx
+
+ from haiku.rag.config import Config
+ from haiku.rag.reranking.base import RerankerBase
+ from haiku.rag.store.models.chunk import Chunk
+
+
+ class VLLMReranker(RerankerBase):
+ def __init__(self, model: str):
+ self._model = model
+ self._base_url = Config.VLLM_RERANK_BASE_URL
+
+ async def rerank(
+ self, query: str, chunks: list[Chunk], top_n: int = 10
+ ) -> list[tuple[Chunk, float]]:
+ if not chunks:
+ return []
+
+ # Prepare documents for reranking
+ documents = [chunk.content for chunk in chunks]
+
+ async with httpx.AsyncClient() as client:
+ response = await client.post(
+ f"{self._base_url}/v1/rerank",
+ json={"model": self._model, "query": query, "documents": documents},
+ headers={
+ "accept": "application/json",
+ "Content-Type": "application/json",
+ },
+ )
+ response.raise_for_status()
+
+ result = response.json()
+
+ # Extract scores and pair with chunks
+ scored_chunks = []
+ for item in result.get("results", []):
+ index = item["index"]
+ score = item["relevance_score"]
+ scored_chunks.append((chunks[index], score))
+
+ # Sort by score (descending) and return top_n
+ scored_chunks.sort(key=lambda x: x[1], reverse=True)
+ return scored_chunks[:top_n]
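
A usage sketch for `VLLMReranker`, mirroring the new reranker test; the chunks are built the same way the test builds them, and a rerank-capable vLLM server is assumed at `VLLM_RERANK_BASE_URL`:

```python
# Sketch: rerank a few chunks against a query with the new vLLM reranker.
import asyncio

from haiku.rag.reranking.vllm import VLLMReranker
from haiku.rag.store.models.chunk import Chunk

chunks = [
    Chunk(content="Harper Lee wrote 'To Kill a Mockingbird'.", document_id="0"),
    Chunk(content="Python is a programming language.", document_id="1"),
]


async def main() -> None:
    reranker = VLLMReranker("mixedbread-ai/mxbai-rerank-base-v2")
    ranked = await reranker.rerank("Who wrote 'To Kill a Mockingbird'?", chunks, top_n=2)
    for chunk, score in ranked:
        print(chunk.document_id, score)


asyncio.run(main())
```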
@@ -154,13 +154,7 @@ class ChunkRepository:
  """Create chunks and embeddings for a document from DoclingDocument."""
  chunk_texts = await chunker.chunk(document)

- # Generate embeddings in parallel for all chunks
- embeddings_tasks = []
- for chunk_text in chunk_texts:
- embeddings_tasks.append(self.embedder.embed(chunk_text))
-
- # Wait for all embeddings to complete
- embeddings = await asyncio.gather(*embeddings_tasks)
+ embeddings = await self.embedder.embed(chunk_texts)

  # Prepare all chunk records for batch insertion
  chunk_records = []
@@ -1,6 +1,6 @@
  from pydantic import BaseModel
  from pydantic_ai import Agent
- from pydantic_ai.models.openai import OpenAIModel
+ from pydantic_ai.models.openai import OpenAIChatModel
  from pydantic_ai.providers.ollama import OllamaProvider

  from haiku.rag.config import Config
@@ -37,9 +37,9 @@ class LLMJudgeResponseSchema(BaseModel):
  class LLMJudge:
  """LLM-as-judge for evaluating answer equivalence using Pydantic AI."""

- def __init__(self, model: str = Config.QA_MODEL):
+ def __init__(self, model: str = "qwen3"):
  # Create Ollama model
- ollama_model = OpenAIModel(
+ ollama_model = OpenAIChatModel(
  model_name=model,
  provider=OllamaProvider(base_url=f"{Config.OLLAMA_BASE_URL}/v1"),
  )
@@ -4,9 +4,11 @@ import pytest
  from haiku.rag.config import Config
  from haiku.rag.embeddings.ollama import Embedder as OllamaEmbedder
  from haiku.rag.embeddings.openai import Embedder as OpenAIEmbedder
+ from haiku.rag.embeddings.vllm import Embedder as VLLMEmbedder

  OPENAI_AVAILABLE = bool(Config.OPENAI_API_KEY)
  VOYAGEAI_AVAILABLE = bool(Config.VOYAGE_API_KEY)
+ VLLM_EMBEDDINGS_AVAILABLE = bool(Config.VLLM_EMBEDDINGS_BASE_URL)


  # Calculate cosine similarity
@@ -26,7 +28,13 @@ async def test_ollama_embedder():
  "Python is my favorite programming language.",
  "I love to travel and see new places.",
  ]
- embeddings = [np.array(await embedder.embed(phrase)) for phrase in phrases]
+
+ # Test batch embedding
+ embeddings = await embedder.embed(phrases)
+ assert isinstance(embeddings, list)
+ assert len(embeddings) == 3
+ assert all(isinstance(emb, list) for emb in embeddings)
+ embeddings = [np.array(emb) for emb in embeddings]

  test_phrase = "I am going for a camping trip."
  test_embedding = await embedder.embed(test_phrase)
@@ -56,7 +64,13 @@ async def test_openai_embedder():
  "Python is my favorite programming language.",
  "I love to travel and see new places.",
  ]
- embeddings = [np.array(await embedder.embed(phrase)) for phrase in phrases]
+
+ # Test batch embedding
+ embeddings = await embedder.embed(phrases)
+ assert isinstance(embeddings, list)
+ assert len(embeddings) == 3
+ assert all(isinstance(emb, list) for emb in embeddings)
+ embeddings = [np.array(emb) for emb in embeddings]

  test_phrase = "I am going for a camping trip."
  test_embedding = await embedder.embed(test_phrase)
@@ -89,7 +103,13 @@ async def test_voyageai_embedder():
  "Python is my favorite programming language.",
  "I love to travel and see new places.",
  ]
- embeddings = [np.array(await embedder.embed(phrase)) for phrase in phrases]
+
+ # Test batch embedding
+ embeddings = await embedder.embed(phrases)
+ assert isinstance(embeddings, list)
+ assert len(embeddings) == 3
+ assert all(isinstance(emb, list) for emb in embeddings)
+ embeddings = [np.array(emb) for emb in embeddings]

  test_phrase = "I am going for a camping trip."
  test_embedding = await embedder.embed(test_phrase)
@@ -111,3 +131,41 @@ async def test_voyageai_embedder():

  except ImportError:
  pytest.skip("VoyageAI package not installed")
+
+
+ @pytest.mark.asyncio
+ @pytest.mark.skipif(
+ not VLLM_EMBEDDINGS_AVAILABLE, reason="vLLM embeddings server not configured"
+ )
+ async def test_vllm_embedder():
+ embedder = VLLMEmbedder("mixedbread-ai/mxbai-embed-large-v1", 512)
+ phrases = [
+ "I enjoy eating great food.",
+ "Python is my favorite programming language.",
+ "I love to travel and see new places.",
+ ]
+
+ # Test batch embedding
+ embeddings = await embedder.embed(phrases)
+ assert isinstance(embeddings, list)
+ assert len(embeddings) == 3
+ assert all(isinstance(emb, list) for emb in embeddings)
+ embeddings = [np.array(emb) for emb in embeddings]
+
+ test_phrase = "I am going for a camping trip."
+ test_embedding = await embedder.embed(test_phrase)
+
+ sims = similarities(embeddings, test_embedding)
+ assert max(sims) == sims[2]
+
+ test_phrase = "When is dinner ready?"
+ test_embedding = await embedder.embed(test_phrase)
+
+ sims = similarities(embeddings, test_embedding)
+ assert max(sims) == sims[0]
+
+ test_phrase = "I work as a software developer."
+ test_embedding = await embedder.embed(test_phrase)
+
+ sims = similarities(embeddings, test_embedding)
+ assert max(sims) == sims[1]
@@ -9,6 +9,7 @@ from .llm_judge import LLMJudge

  OPENAI_AVAILABLE = bool(Config.OPENAI_API_KEY)
  ANTHROPIC_AVAILABLE = bool(Config.ANTHROPIC_API_KEY)
+ VLLM_QA_AVAILABLE = bool(Config.VLLM_QA_BASE_URL)


  @pytest.mark.asyncio
@@ -80,3 +81,26 @@ async def test_qa_anthropic(qa_corpus: Dataset, temp_db_path):
  assert is_equivalent, (
  f"Generated answer not equivalent to expected answer.\nQuestion: {question}\nGenerated: {answer}\nExpected: {expected_answer}"
  )
+
+
+ @pytest.mark.asyncio
+ @pytest.mark.skipif(not VLLM_QA_AVAILABLE, reason="vLLM QA server not configured")
+ async def test_qa_vllm(qa_corpus: Dataset, temp_db_path):
+ """Test vLLM QA with LLM judge."""
+ client = HaikuRAG(temp_db_path)
+ qa = QuestionAnswerAgent(client, "vllm", "Qwen/Qwen3-4B")
+ llm_judge = LLMJudge()
+
+ doc = qa_corpus[1]
+ await client.create_document(
+ content=doc["document_extracted"], uri=doc["document_id"]
+ )
+
+ question = doc["question"]
+ expected_answer = doc["answer"]
+ answer = await qa.answer(question)
+ is_equivalent = await llm_judge.judge_answers(question, answer, expected_answer)
+
+ assert is_equivalent, (
+ f"Generated answer not equivalent to expected answer.\nQuestion: {question}\nGenerated: {answer}\nExpected: {expected_answer}"
+ )
@@ -2,9 +2,11 @@ import pytest

  from haiku.rag.config import Config
  from haiku.rag.reranking.base import RerankerBase
+ from haiku.rag.reranking.vllm import VLLMReranker
  from haiku.rag.store.models.chunk import Chunk

  COHERE_AVAILABLE = bool(Config.COHERE_API_KEY)
+ VLLM_RERANK_AVAILABLE = bool(Config.VLLM_RERANK_BASE_URL)

  chunks = [
  Chunk(content=content, document_id=str(i))
@@ -66,3 +68,22 @@ async def test_cohere_reranker():

  except ImportError:
  pytest.skip("Cohere package not installed")
+
+
+ @pytest.mark.asyncio
+ @pytest.mark.skipif(
+ not VLLM_RERANK_AVAILABLE, reason="vLLM rerank server not configured"
+ )
+ async def test_vllm_reranker():
+ try:
+ reranker = VLLMReranker("mixedbread-ai/mxbai-rerank-base-v2")
+
+ reranked = await reranker.rerank(
+ "Who wrote 'To Kill a Mockingbird'?", chunks, top_n=2
+ )
+ assert [chunk.document_id for chunk, score in reranked] == ["0", "2"]
+ assert all(isinstance(score, float) for chunk, score in reranked)
+
+ except Exception:
+ # Skip test if vLLM rerank server is not available
+ pytest.skip("vLLM rerank server not available")
@@ -951,7 +951,7 @@ wheels = [

  [[package]]
  name = "haiku-rag"
- version = "0.7.1"
+ version = "0.7.3"
  source = { editable = "." }
  dependencies = [
  { name = "docling" },
@@ -1,34 +0,0 @@
- # Installation
-
- ## Basic Installation
-
- ```bash
- uv pip install haiku.rag
- ```
-
- This includes support for:
- - **Ollama** (default embedding provider using `mxbai-embed-large`)
- - **OpenAI** (GPT models for QA and embeddings)
- - **Anthropic** (Claude models for QA)
- - **Cohere** (reranking models)
-
- ## Provider-Specific Installation
-
- For additional embedding providers, install with extras:
-
- ### VoyageAI
-
- ```bash
- uv pip install haiku.rag[voyageai]
- ```
-
- ### MixedBread AI Reranking
-
- ```bash
- uv pip install haiku.rag[mxbai]
- ```
-
- ## Requirements
-
- - Python 3.10+
- - Ollama (for default embeddings)
@@ -1,11 +0,0 @@
- from ollama import AsyncClient
-
- from haiku.rag.config import Config
- from haiku.rag.embeddings.base import EmbedderBase
-
-
- class Embedder(EmbedderBase):
- async def embed(self, text: str) -> list[float]:
- client = AsyncClient(host=Config.OLLAMA_BASE_URL)
- res = await client.embeddings(model=self._model, prompt=text)
- return list(res["embedding"])
@@ -1,13 +0,0 @@
- try:
- from voyageai.client import Client # type: ignore
-
- from haiku.rag.embeddings.base import EmbedderBase
-
- class Embedder(EmbedderBase):
- async def embed(self, text: str) -> list[float]:
- client = Client()
- res = client.embed([text], model=self._model, output_dtype="float")
- return res.embeddings[0] # type: ignore[return-value]
-
- except ImportError:
- pass