haiku.rag 0.4.1__tar.gz → 0.4.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of haiku.rag might be problematic. Click here for more details.
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/PKG-INFO +3 -3
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/README.md +1 -1
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/docs/index.md +1 -1
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/docs/python.md +25 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/docs/server.md +2 -1
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/pyproject.toml +2 -2
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/src/haiku/rag/cli.py +17 -1
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/src/haiku/rag/client.py +7 -2
- haiku_rag-0.4.3/src/haiku/rag/reader.py +109 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/src/haiku/rag/store/engine.py +5 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/src/haiku/rag/store/models/chunk.py +2 -1
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/src/haiku/rag/store/repositories/chunk.py +11 -3
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/src/haiku/rag/store/repositories/document.py +21 -5
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/tests/test_client.py +40 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/uv.lock +1004 -294
- haiku_rag-0.4.1/src/haiku/rag/reader.py +0 -52
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/.github/FUNDING.yml +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/.github/workflows/build-docs.yml +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/.github/workflows/build-publish.yml +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/.gitignore +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/.pre-commit-config.yaml +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/.python-version +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/LICENSE +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/docs/benchmarks.md +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/docs/cli.md +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/docs/configuration.md +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/docs/installation.md +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/docs/mcp.md +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/mkdocs.yml +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/src/haiku/rag/__init__.py +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/src/haiku/rag/app.py +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/src/haiku/rag/chunker.py +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/src/haiku/rag/config.py +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/src/haiku/rag/embeddings/__init__.py +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/src/haiku/rag/embeddings/base.py +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/src/haiku/rag/embeddings/ollama.py +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/src/haiku/rag/embeddings/openai.py +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/src/haiku/rag/embeddings/voyageai.py +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/src/haiku/rag/logging.py +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/src/haiku/rag/mcp.py +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/src/haiku/rag/monitor.py +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/src/haiku/rag/qa/__init__.py +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/src/haiku/rag/qa/anthropic.py +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/src/haiku/rag/qa/base.py +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/src/haiku/rag/qa/ollama.py +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/src/haiku/rag/qa/openai.py +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/src/haiku/rag/qa/prompts.py +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/src/haiku/rag/reranking/__init__.py +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/src/haiku/rag/reranking/base.py +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/src/haiku/rag/reranking/cohere.py +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/src/haiku/rag/reranking/mxbai.py +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/src/haiku/rag/store/__init__.py +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/src/haiku/rag/store/models/__init__.py +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/src/haiku/rag/store/models/document.py +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/src/haiku/rag/store/repositories/__init__.py +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/src/haiku/rag/store/repositories/base.py +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/src/haiku/rag/store/repositories/settings.py +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/src/haiku/rag/store/upgrades/__init__.py +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/src/haiku/rag/store/upgrades/v0_3_4.py +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/src/haiku/rag/utils.py +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/tests/__init__.py +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/tests/conftest.py +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/tests/generate_benchmark_db.py +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/tests/llm_judge.py +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/tests/test_app.py +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/tests/test_chunk.py +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/tests/test_chunker.py +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/tests/test_cli.py +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/tests/test_document.py +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/tests/test_embedder.py +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/tests/test_monitor.py +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/tests/test_qa.py +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/tests/test_rebuild.py +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/tests/test_reranker.py +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/tests/test_search.py +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/tests/test_settings.py +0 -0
- {haiku_rag-0.4.1 → haiku_rag-0.4.3}/tests/test_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: haiku.rag
|
|
3
|
-
Version: 0.4.1
|
|
3
|
+
Version: 0.4.3
|
|
4
4
|
Summary: Retrieval Augmented Generation (RAG) with SQLite
|
|
5
5
|
Author-email: Yiorgis Gozadinos <ggozadinos@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -18,9 +18,9 @@ Classifier: Programming Language :: Python :: 3.11
|
|
|
18
18
|
Classifier: Programming Language :: Python :: 3.12
|
|
19
19
|
Classifier: Typing :: Typed
|
|
20
20
|
Requires-Python: >=3.10
|
|
21
|
+
Requires-Dist: docling>=2.15.0
|
|
21
22
|
Requires-Dist: fastmcp>=2.8.1
|
|
22
23
|
Requires-Dist: httpx>=0.28.1
|
|
23
|
-
Requires-Dist: markitdown[audio-transcription,docx,pdf,pptx,xlsx]>=0.1.2
|
|
24
24
|
Requires-Dist: mxbai-rerank>=0.1.6
|
|
25
25
|
Requires-Dist: ollama>=0.5.1
|
|
26
26
|
Requires-Dist: pydantic>=2.11.7
|
|
@@ -55,7 +55,7 @@ Retrieval-Augmented Generation (RAG) library on SQLite.
|
|
|
55
55
|
- **Reranking**: Default search result reranking with MixedBread AI or Cohere
|
|
56
56
|
- **Question answering**: Built-in QA agents on your documents
|
|
57
57
|
- **File monitoring**: Auto-index files when run as server
|
|
58
|
-
- **40+ file formats**: PDF, DOCX, HTML, Markdown,
|
|
58
|
+
- **40+ file formats**: PDF, DOCX, HTML, Markdown, code files, URLs
|
|
59
59
|
- **MCP server**: Expose as tools for AI assistants
|
|
60
60
|
- **CLI & Python API**: Use from command line or Python
|
|
61
61
|
|
|
@@ -13,7 +13,7 @@ Retrieval-Augmented Generation (RAG) library on SQLite.
|
|
|
13
13
|
- **Reranking**: Default search result reranking with MixedBread AI or Cohere
|
|
14
14
|
- **Question answering**: Built-in QA agents on your documents
|
|
15
15
|
- **File monitoring**: Auto-index files when run as server
|
|
16
|
-
- **40+ file formats**: PDF, DOCX, HTML, Markdown,
|
|
16
|
+
- **40+ file formats**: PDF, DOCX, HTML, Markdown, code files, URLs
|
|
17
17
|
- **MCP server**: Expose as tools for AI assistants
|
|
18
18
|
- **CLI & Python API**: Use from command line or Python
|
|
19
19
|
|
|
@@ -10,7 +10,7 @@
|
|
|
10
10
|
- **Reranking**: Optional result reranking with MixedBread AI or Cohere
|
|
11
11
|
- **Question Answering**: Built-in QA agents using Ollama, OpenAI, or Anthropic.
|
|
12
12
|
- **File monitoring**: Automatically index files when run as a server
|
|
13
|
-
- **Extended file format support**: Parse 40+ file formats including PDF, DOCX, HTML, Markdown,
|
|
13
|
+
- **Extended file format support**: Parse 40+ file formats including PDF, DOCX, HTML, Markdown, code files and more. Or add a URL!
|
|
14
14
|
- **MCP server**: Exposes functionality as MCP tools
|
|
15
15
|
- **CLI commands**: Access all functionality from your terminal
|
|
16
16
|
- **Python client**: Call `haiku.rag` from your own python applications
|
|
@@ -27,6 +27,31 @@ doc = await client.create_document(
|
|
|
27
27
|
)
|
|
28
28
|
```
|
|
29
29
|
|
|
30
|
+
With custom externally generated chunks:
|
|
31
|
+
```python
|
|
32
|
+
from haiku.rag.store.models.chunk import Chunk
|
|
33
|
+
|
|
34
|
+
# Create custom chunks with optional embeddings
|
|
35
|
+
chunks = [
|
|
36
|
+
Chunk(
|
|
37
|
+
content="This is the first chunk",
|
|
38
|
+
metadata={"section": "intro"}
|
|
39
|
+
),
|
|
40
|
+
Chunk(
|
|
41
|
+
content="This is the second chunk",
|
|
42
|
+
metadata={"section": "body"},
|
|
43
|
+
embedding=[0.1] * 1024 # Optional pre-computed embedding
|
|
44
|
+
),
|
|
45
|
+
]
|
|
46
|
+
|
|
47
|
+
doc = await client.create_document(
|
|
48
|
+
content="Full document content",
|
|
49
|
+
uri="doc://custom",
|
|
50
|
+
metadata={"source": "manual"},
|
|
51
|
+
chunks=chunks # Use provided chunks instead of auto-generating
|
|
52
|
+
)
|
|
53
|
+
```
|
|
54
|
+
|
|
30
55
|
From file:
|
|
31
56
|
```python
|
|
32
57
|
doc = await client.create_document_from_source("path/to/document.pdf")
|
|
@@ -35,7 +35,8 @@ The server can parse 40+ file formats including:
|
|
|
35
35
|
- Microsoft Office (DOCX, XLSX, PPTX)
|
|
36
36
|
- HTML and Markdown
|
|
37
37
|
- Plain text files
|
|
38
|
-
-
|
|
38
|
+
- Code files (Python, JavaScript, etc.)
|
|
39
|
+
- Images (processed via OCR)
|
|
39
40
|
- And more...
|
|
40
41
|
|
|
41
42
|
URLs are also supported for web content.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "haiku.rag"
|
|
3
|
-
version = "0.4.1"
|
|
3
|
+
version = "0.4.3"
|
|
4
4
|
description = "Retrieval Augmented Generation (RAG) with SQLite"
|
|
5
5
|
authors = [{ name = "Yiorgis Gozadinos", email = "ggozadinos@gmail.com" }]
|
|
6
6
|
license = { text = "MIT" }
|
|
@@ -22,9 +22,9 @@ classifiers = [
|
|
|
22
22
|
]
|
|
23
23
|
|
|
24
24
|
dependencies = [
|
|
25
|
+
"docling>=2.15.0",
|
|
25
26
|
"fastmcp>=2.8.1",
|
|
26
27
|
"httpx>=0.28.1",
|
|
27
|
-
"markitdown[audio-transcription,docx,pdf,pptx,xlsx]>=0.1.2",
|
|
28
28
|
"mxbai-rerank>=0.1.6",
|
|
29
29
|
"ollama>=0.5.1",
|
|
30
30
|
"pydantic>=2.11.7",
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import asyncio
|
|
2
|
+
from importlib.metadata import version
|
|
2
3
|
from pathlib import Path
|
|
3
4
|
|
|
4
5
|
import typer
|
|
@@ -26,8 +27,23 @@ async def check_version():
|
|
|
26
27
|
console.print("[yellow]Please update.[/yellow]")
|
|
27
28
|
|
|
28
29
|
|
|
30
|
+
def version_callback(value: bool):
|
|
31
|
+
if value:
|
|
32
|
+
v = version("haiku.rag")
|
|
33
|
+
console.print(f"haiku.rag version {v}")
|
|
34
|
+
raise typer.Exit()
|
|
35
|
+
|
|
36
|
+
|
|
29
37
|
@cli.callback()
|
|
30
|
-
def main():
|
|
38
|
+
def main(
|
|
39
|
+
_version: bool = typer.Option(
|
|
40
|
+
False,
|
|
41
|
+
"-v",
|
|
42
|
+
"--version",
|
|
43
|
+
callback=version_callback,
|
|
44
|
+
help="Show version and exit",
|
|
45
|
+
),
|
|
46
|
+
):
|
|
31
47
|
"""haiku.rag CLI - SQLite-based RAG system"""
|
|
32
48
|
# Run version check before any command
|
|
33
49
|
event_loop.run_until_complete(check_version())
|
|
@@ -50,7 +50,11 @@ class HaikuRAG:
|
|
|
50
50
|
return False
|
|
51
51
|
|
|
52
52
|
async def create_document(
|
|
53
|
-
self, content: str, uri: str | None = None, metadata: dict | None = None
|
|
53
|
+
self,
|
|
54
|
+
content: str,
|
|
55
|
+
uri: str | None = None,
|
|
56
|
+
metadata: dict | None = None,
|
|
57
|
+
chunks: list[Chunk] | None = None,
|
|
54
58
|
) -> Document:
|
|
55
59
|
"""Create a new document with optional URI and metadata.
|
|
56
60
|
|
|
@@ -58,6 +62,7 @@ class HaikuRAG:
|
|
|
58
62
|
content: The text content of the document.
|
|
59
63
|
uri: Optional URI identifier for the document.
|
|
60
64
|
metadata: Optional metadata dictionary.
|
|
65
|
+
chunks: Optional list of pre-created chunks to use instead of generating new ones.
|
|
61
66
|
|
|
62
67
|
Returns:
|
|
63
68
|
The created Document instance.
|
|
@@ -67,7 +72,7 @@ class HaikuRAG:
|
|
|
67
72
|
uri=uri,
|
|
68
73
|
metadata=metadata or {},
|
|
69
74
|
)
|
|
70
|
-
return await self.document_repository.create(document)
|
|
75
|
+
return await self.document_repository.create(document, chunks)
|
|
71
76
|
|
|
72
77
|
async def create_document_from_source(
|
|
73
78
|
self, source: str | Path, metadata: dict = {}
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from typing import ClassVar
|
|
3
|
+
|
|
4
|
+
from docling.document_converter import DocumentConverter
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class FileReader:
|
|
8
|
+
# Extensions supported by docling
|
|
9
|
+
docling_extensions: ClassVar[list[str]] = [
|
|
10
|
+
".asciidoc",
|
|
11
|
+
".bmp",
|
|
12
|
+
".csv",
|
|
13
|
+
".docx",
|
|
14
|
+
".html",
|
|
15
|
+
".xhtml",
|
|
16
|
+
".jpeg",
|
|
17
|
+
".jpg",
|
|
18
|
+
".md",
|
|
19
|
+
".pdf.png",
|
|
20
|
+
".pptx",
|
|
21
|
+
".tiff",
|
|
22
|
+
".xlsx",
|
|
23
|
+
".xml",
|
|
24
|
+
".webp",
|
|
25
|
+
]
|
|
26
|
+
|
|
27
|
+
# Plain text extensions that we'll read directly
|
|
28
|
+
text_extensions: ClassVar[list[str]] = [
|
|
29
|
+
".astro",
|
|
30
|
+
".c",
|
|
31
|
+
".cpp",
|
|
32
|
+
".css",
|
|
33
|
+
".go",
|
|
34
|
+
".h",
|
|
35
|
+
".hpp",
|
|
36
|
+
".java",
|
|
37
|
+
".js",
|
|
38
|
+
".json",
|
|
39
|
+
".kt",
|
|
40
|
+
".mdx",
|
|
41
|
+
".mjs",
|
|
42
|
+
".php",
|
|
43
|
+
".py",
|
|
44
|
+
".rb",
|
|
45
|
+
".rs",
|
|
46
|
+
".svelte",
|
|
47
|
+
".swift",
|
|
48
|
+
".ts",
|
|
49
|
+
".tsx",
|
|
50
|
+
".txt",
|
|
51
|
+
".vue",
|
|
52
|
+
".yaml",
|
|
53
|
+
".yml",
|
|
54
|
+
]
|
|
55
|
+
|
|
56
|
+
# Code file extensions with their markdown language identifiers for syntax highlighting
|
|
57
|
+
code_markdown_identifier: ClassVar[dict[str, str]] = {
|
|
58
|
+
".astro": "astro",
|
|
59
|
+
".c": "c",
|
|
60
|
+
".cpp": "cpp",
|
|
61
|
+
".css": "css",
|
|
62
|
+
".go": "go",
|
|
63
|
+
".h": "c",
|
|
64
|
+
".hpp": "cpp",
|
|
65
|
+
".java": "java",
|
|
66
|
+
".js": "javascript",
|
|
67
|
+
".json": "json",
|
|
68
|
+
".kt": "kotlin",
|
|
69
|
+
".mjs": "javascript",
|
|
70
|
+
".php": "php",
|
|
71
|
+
".py": "python",
|
|
72
|
+
".rb": "ruby",
|
|
73
|
+
".rs": "rust",
|
|
74
|
+
".svelte": "svelte",
|
|
75
|
+
".swift": "swift",
|
|
76
|
+
".ts": "typescript",
|
|
77
|
+
".tsx": "tsx",
|
|
78
|
+
".vue": "vue",
|
|
79
|
+
".yaml": "yaml",
|
|
80
|
+
".yml": "yaml",
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
extensions: ClassVar[list[str]] = docling_extensions + text_extensions
|
|
84
|
+
|
|
85
|
+
@staticmethod
|
|
86
|
+
def parse_file(path: Path) -> str:
|
|
87
|
+
try:
|
|
88
|
+
file_extension = path.suffix.lower()
|
|
89
|
+
|
|
90
|
+
if file_extension in FileReader.docling_extensions:
|
|
91
|
+
# Use docling for complex document formats
|
|
92
|
+
converter = DocumentConverter()
|
|
93
|
+
result = converter.convert(path)
|
|
94
|
+
return result.document.export_to_markdown()
|
|
95
|
+
elif file_extension in FileReader.text_extensions:
|
|
96
|
+
# Read plain text files directly
|
|
97
|
+
content = path.read_text(encoding="utf-8")
|
|
98
|
+
|
|
99
|
+
# Wrap code files (but not plain txt) in markdown code blocks for better presentation
|
|
100
|
+
if file_extension in FileReader.code_markdown_identifier:
|
|
101
|
+
language = FileReader.code_markdown_identifier[file_extension]
|
|
102
|
+
return f"```{language}\n{content}\n```"
|
|
103
|
+
|
|
104
|
+
return content
|
|
105
|
+
else:
|
|
106
|
+
# Fallback: try to read as text
|
|
107
|
+
return path.read_text(encoding="utf-8")
|
|
108
|
+
except Exception:
|
|
109
|
+
raise ValueError(f"Failed to parse file: {path}")
|
|
@@ -37,6 +37,11 @@ class Store:
|
|
|
37
37
|
db = sqlite3.connect(self.db_path)
|
|
38
38
|
db.enable_load_extension(True)
|
|
39
39
|
sqlite_vec.load(db)
|
|
40
|
+
|
|
41
|
+
# Enable WAL mode for better concurrency (skip for in-memory databases)
|
|
42
|
+
if self.db_path != ":memory:":
|
|
43
|
+
db.execute("PRAGMA journal_mode=WAL")
|
|
44
|
+
|
|
40
45
|
self._connection = db
|
|
41
46
|
existing_tables = [
|
|
42
47
|
row[0]
|
|
@@ -18,6 +18,8 @@ class ChunkRepository(BaseRepository[Chunk]):
|
|
|
18
18
|
"""Create a chunk in the database."""
|
|
19
19
|
if self.store._connection is None:
|
|
20
20
|
raise ValueError("Store connection is not available")
|
|
21
|
+
if entity.document_id is None:
|
|
22
|
+
raise ValueError("Chunk must have a document_id to be created")
|
|
21
23
|
|
|
22
24
|
cursor = self.store._connection.cursor()
|
|
23
25
|
cursor.execute(
|
|
@@ -34,9 +36,15 @@ class ChunkRepository(BaseRepository[Chunk]):
|
|
|
34
36
|
|
|
35
37
|
entity.id = cursor.lastrowid
|
|
36
38
|
|
|
37
|
-
# Generate and store embedding
|
|
38
|
-
embedding = await self.embedder.embed(entity.content)
|
|
39
|
-
|
|
39
|
+
# Generate and store embedding - use existing one if provided
|
|
40
|
+
if entity.embedding is not None:
|
|
41
|
+
# Use the provided embedding
|
|
42
|
+
serialized_embedding = self.store.serialize_embedding(entity.embedding)
|
|
43
|
+
else:
|
|
44
|
+
# Generate embedding from content
|
|
45
|
+
embedding = await self.embedder.embed(entity.content)
|
|
46
|
+
serialized_embedding = self.store.serialize_embedding(embedding)
|
|
47
|
+
|
|
40
48
|
cursor.execute(
|
|
41
49
|
"""
|
|
42
50
|
INSERT INTO chunk_embeddings (chunk_id, embedding)
|
|
@@ -1,8 +1,12 @@
|
|
|
1
1
|
import json
|
|
2
|
+
from typing import TYPE_CHECKING
|
|
2
3
|
|
|
3
4
|
from haiku.rag.store.models.document import Document
|
|
4
5
|
from haiku.rag.store.repositories.base import BaseRepository
|
|
5
6
|
|
|
7
|
+
if TYPE_CHECKING:
|
|
8
|
+
from haiku.rag.store.models.chunk import Chunk
|
|
9
|
+
|
|
6
10
|
|
|
7
11
|
class DocumentRepository(BaseRepository[Document]):
|
|
8
12
|
"""Repository for Document database operations."""
|
|
@@ -16,7 +20,9 @@ class DocumentRepository(BaseRepository[Document]):
|
|
|
16
20
|
chunk_repository = ChunkRepository(store)
|
|
17
21
|
self.chunk_repository = chunk_repository
|
|
18
22
|
|
|
19
|
-
async def create(self, entity: Document) -> Document:
|
|
23
|
+
async def create(
|
|
24
|
+
self, entity: Document, chunks: list["Chunk"] | None = None
|
|
25
|
+
) -> Document:
|
|
20
26
|
"""Create a document with its chunks and embeddings."""
|
|
21
27
|
if self.store._connection is None:
|
|
22
28
|
raise ValueError("Store connection is not available")
|
|
@@ -46,10 +52,20 @@ class DocumentRepository(BaseRepository[Document]):
|
|
|
46
52
|
assert document_id is not None, "Failed to create document in database"
|
|
47
53
|
entity.id = document_id
|
|
48
54
|
|
|
49
|
-
# Create chunks
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
55
|
+
# Create chunks - either use provided chunks or generate from content
|
|
56
|
+
if chunks is not None:
|
|
57
|
+
# Use provided chunks, but update their document_id and set order from list position
|
|
58
|
+
for order, chunk in enumerate(chunks):
|
|
59
|
+
chunk.document_id = document_id
|
|
60
|
+
# Ensure order is set from list position
|
|
61
|
+
chunk.metadata = chunk.metadata.copy() if chunk.metadata else {}
|
|
62
|
+
chunk.metadata["order"] = order
|
|
63
|
+
await self.chunk_repository.create(chunk, commit=False)
|
|
64
|
+
else:
|
|
65
|
+
# Create chunks and embeddings using ChunkRepository
|
|
66
|
+
await self.chunk_repository.create_chunks_for_document(
|
|
67
|
+
document_id, entity.content, commit=False
|
|
68
|
+
)
|
|
53
69
|
|
|
54
70
|
cursor.execute("COMMIT")
|
|
55
71
|
return entity
|
|
@@ -7,6 +7,7 @@ import pytest
|
|
|
7
7
|
from datasets import Dataset
|
|
8
8
|
|
|
9
9
|
from haiku.rag.client import HaikuRAG
|
|
10
|
+
from haiku.rag.store.models.chunk import Chunk
|
|
10
11
|
|
|
11
12
|
|
|
12
13
|
@pytest.mark.asyncio
|
|
@@ -449,3 +450,42 @@ async def test_client_async_context_manager():
|
|
|
449
450
|
# Context manager should have automatically closed the connection
|
|
450
451
|
# We can't easily test that the connection is closed without accessing internals,
|
|
451
452
|
# but the test passing means the context manager methods work correctly
|
|
453
|
+
|
|
454
|
+
|
|
455
|
+
@pytest.mark.asyncio
|
|
456
|
+
async def test_client_create_document_with_custom_chunks():
|
|
457
|
+
"""Test creating a document with pre-created chunks."""
|
|
458
|
+
async with HaikuRAG(":memory:") as client:
|
|
459
|
+
# Create some custom chunks with and without embeddings
|
|
460
|
+
chunks = [
|
|
461
|
+
Chunk(content="This is the first chunk", metadata={"custom": "metadata1"}),
|
|
462
|
+
Chunk(
|
|
463
|
+
content="This is the second chunk",
|
|
464
|
+
metadata={"custom": "metadata2"},
|
|
465
|
+
embedding=[0.1] * 1024,
|
|
466
|
+
), # With embedding
|
|
467
|
+
Chunk(content="This is the third chunk", metadata={"custom": "metadata3"}),
|
|
468
|
+
]
|
|
469
|
+
|
|
470
|
+
# Create document with custom chunks
|
|
471
|
+
document = await client.create_document(
|
|
472
|
+
content="Full document content", chunks=chunks
|
|
473
|
+
)
|
|
474
|
+
|
|
475
|
+
assert document.id is not None
|
|
476
|
+
assert document.content == "Full document content"
|
|
477
|
+
|
|
478
|
+
# Verify the chunks were created correctly
|
|
479
|
+
doc_chunks = await client.chunk_repository.get_by_document_id(document.id)
|
|
480
|
+
assert len(doc_chunks) == 3
|
|
481
|
+
|
|
482
|
+
# Check chunks have correct content, document_id, and order from list position
|
|
483
|
+
for i, chunk in enumerate(doc_chunks):
|
|
484
|
+
assert chunk.document_id == document.id
|
|
485
|
+
assert chunk.content == chunks[i].content
|
|
486
|
+
assert (
|
|
487
|
+
chunk.metadata["order"] == i
|
|
488
|
+
) # Order should be set from list position
|
|
489
|
+
assert (
|
|
490
|
+
chunk.metadata["custom"] == f"metadata{i + 1}"
|
|
491
|
+
) # Original metadata preserved
|