haiku.rag 0.4.2.tar.gz → 0.5.0.tar.gz

This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.

Files changed (80)
  1. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/PKG-INFO +3 -3
  2. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/README.md +1 -1
  3. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/docs/configuration.md +0 -3
  4. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/docs/index.md +1 -1
  5. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/docs/server.md +2 -1
  6. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/pyproject.toml +2 -2
  7. haiku_rag-0.5.0/src/haiku/rag/chunker.py +60 -0
  8. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/src/haiku/rag/cli.py +17 -1
  9. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/src/haiku/rag/config.py +0 -1
  10. haiku_rag-0.5.0/src/haiku/rag/reader.py +110 -0
  11. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/src/haiku/rag/store/engine.py +5 -0
  12. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/src/haiku/rag/store/repositories/settings.py +0 -1
  13. haiku_rag-0.5.0/tests/test_chunker.py +34 -0
  14. haiku_rag-0.5.0/tests/test_reader.py +22 -0
  15. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/tests/test_search.py +5 -1
  16. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/uv.lock +1004 -294
  17. haiku_rag-0.4.2/src/haiku/rag/chunker.py +0 -67
  18. haiku_rag-0.4.2/src/haiku/rag/reader.py +0 -52
  19. haiku_rag-0.4.2/tests/test_chunker.py +0 -44
  20. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/.github/FUNDING.yml +0 -0
  21. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/.github/workflows/build-docs.yml +0 -0
  22. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/.github/workflows/build-publish.yml +0 -0
  23. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/.gitignore +0 -0
  24. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/.pre-commit-config.yaml +0 -0
  25. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/.python-version +0 -0
  26. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/LICENSE +0 -0
  27. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/docs/benchmarks.md +0 -0
  28. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/docs/cli.md +0 -0
  29. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/docs/installation.md +0 -0
  30. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/docs/mcp.md +0 -0
  31. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/docs/python.md +0 -0
  32. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/mkdocs.yml +0 -0
  33. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/src/haiku/rag/__init__.py +0 -0
  34. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/src/haiku/rag/app.py +0 -0
  35. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/src/haiku/rag/client.py +0 -0
  36. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/src/haiku/rag/embeddings/__init__.py +0 -0
  37. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/src/haiku/rag/embeddings/base.py +0 -0
  38. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/src/haiku/rag/embeddings/ollama.py +0 -0
  39. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/src/haiku/rag/embeddings/openai.py +0 -0
  40. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/src/haiku/rag/embeddings/voyageai.py +0 -0
  41. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/src/haiku/rag/logging.py +0 -0
  42. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/src/haiku/rag/mcp.py +0 -0
  43. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/src/haiku/rag/monitor.py +0 -0
  44. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/src/haiku/rag/qa/__init__.py +0 -0
  45. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/src/haiku/rag/qa/anthropic.py +0 -0
  46. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/src/haiku/rag/qa/base.py +0 -0
  47. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/src/haiku/rag/qa/ollama.py +0 -0
  48. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/src/haiku/rag/qa/openai.py +0 -0
  49. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/src/haiku/rag/qa/prompts.py +0 -0
  50. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/src/haiku/rag/reranking/__init__.py +0 -0
  51. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/src/haiku/rag/reranking/base.py +0 -0
  52. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/src/haiku/rag/reranking/cohere.py +0 -0
  53. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/src/haiku/rag/reranking/mxbai.py +0 -0
  54. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/src/haiku/rag/store/__init__.py +0 -0
  55. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/src/haiku/rag/store/models/__init__.py +0 -0
  56. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/src/haiku/rag/store/models/chunk.py +0 -0
  57. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/src/haiku/rag/store/models/document.py +0 -0
  58. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/src/haiku/rag/store/repositories/__init__.py +0 -0
  59. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/src/haiku/rag/store/repositories/base.py +0 -0
  60. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/src/haiku/rag/store/repositories/chunk.py +0 -0
  61. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/src/haiku/rag/store/repositories/document.py +0 -0
  62. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/src/haiku/rag/store/upgrades/__init__.py +0 -0
  63. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/src/haiku/rag/store/upgrades/v0_3_4.py +0 -0
  64. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/src/haiku/rag/utils.py +0 -0
  65. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/tests/__init__.py +0 -0
  66. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/tests/conftest.py +0 -0
  67. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/tests/generate_benchmark_db.py +0 -0
  68. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/tests/llm_judge.py +0 -0
  69. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/tests/test_app.py +0 -0
  70. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/tests/test_chunk.py +0 -0
  71. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/tests/test_cli.py +0 -0
  72. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/tests/test_client.py +0 -0
  73. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/tests/test_document.py +0 -0
  74. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/tests/test_embedder.py +0 -0
  75. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/tests/test_monitor.py +0 -0
  76. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/tests/test_qa.py +0 -0
  77. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/tests/test_rebuild.py +0 -0
  78. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/tests/test_reranker.py +0 -0
  79. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/tests/test_settings.py +0 -0
  80. {haiku_rag-0.4.2 → haiku_rag-0.5.0}/tests/test_utils.py +0 -0

{haiku_rag-0.4.2 → haiku_rag-0.5.0}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: haiku.rag
-Version: 0.4.2
+Version: 0.5.0
 Summary: Retrieval Augmented Generation (RAG) with SQLite
 Author-email: Yiorgis Gozadinos <ggozadinos@gmail.com>
 License: MIT
@@ -18,9 +18,9 @@ Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Typing :: Typed
 Requires-Python: >=3.10
+Requires-Dist: docling>=2.15.0
 Requires-Dist: fastmcp>=2.8.1
 Requires-Dist: httpx>=0.28.1
-Requires-Dist: markitdown[audio-transcription,docx,pdf,pptx,xlsx]>=0.1.2
 Requires-Dist: mxbai-rerank>=0.1.6
 Requires-Dist: ollama>=0.5.1
 Requires-Dist: pydantic>=2.11.7
@@ -55,7 +55,7 @@ Retrieval-Augmented Generation (RAG) library on SQLite.
 - **Reranking**: Default search result reranking with MixedBread AI or Cohere
 - **Question answering**: Built-in QA agents on your documents
 - **File monitoring**: Auto-index files when run as server
-- **40+ file formats**: PDF, DOCX, HTML, Markdown, audio, URLs
+- **40+ file formats**: PDF, DOCX, HTML, Markdown, code files, URLs
 - **MCP server**: Expose as tools for AI assistants
 - **CLI & Python API**: Use from command line or Python
 

{haiku_rag-0.4.2 → haiku_rag-0.5.0}/README.md
@@ -13,7 +13,7 @@ Retrieval-Augmented Generation (RAG) library on SQLite.
 - **Reranking**: Default search result reranking with MixedBread AI or Cohere
 - **Question answering**: Built-in QA agents on your documents
 - **File monitoring**: Auto-index files when run as server
-- **40+ file formats**: PDF, DOCX, HTML, Markdown, audio, URLs
+- **40+ file formats**: PDF, DOCX, HTML, Markdown, code files, URLs
 - **MCP server**: Expose as tools for AI assistants
 - **CLI & Python API**: Use from command line or Python
 

{haiku_rag-0.4.2 → haiku_rag-0.5.0}/docs/configuration.md
@@ -150,7 +150,4 @@ DEFAULT_DATA_DIR="/path/to/data"
 ```bash
 # Chunk size for document processing
 CHUNK_SIZE=256
-
-# Chunk overlap for better context
-CHUNK_OVERLAP=32
 ```

{haiku_rag-0.4.2 → haiku_rag-0.5.0}/docs/index.md
@@ -10,7 +10,7 @@
 - **Reranking**: Optional result reranking with MixedBread AI or Cohere
 - **Question Answering**: Built-in QA agents using Ollama, OpenAI, or Anthropic.
 - **File monitoring**: Automatically index files when run as a server
-- **Extended file format support**: Parse 40+ file formats including PDF, DOCX, HTML, Markdown, audio and more. Or add a URL!
+- **Extended file format support**: Parse 40+ file formats including PDF, DOCX, HTML, Markdown, code files and more. Or add a URL!
 - **MCP server**: Exposes functionality as MCP tools
 - **CLI commands**: Access all functionality from your terminal
 - **Python client**: Call `haiku.rag` from your own python applications

{haiku_rag-0.4.2 → haiku_rag-0.5.0}/docs/server.md
@@ -35,7 +35,8 @@ The server can parse 40+ file formats including:
 - Microsoft Office (DOCX, XLSX, PPTX)
 - HTML and Markdown
 - Plain text files
-- Audio files
+- Code files (Python, JavaScript, etc.)
+- Images (processed via OCR)
 - And more...
 
 URLs are also supported for web content.

{haiku_rag-0.4.2 → haiku_rag-0.5.0}/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "haiku.rag"
-version = "0.4.2"
+version = "0.5.0"
 description = "Retrieval Augmented Generation (RAG) with SQLite"
 authors = [{ name = "Yiorgis Gozadinos", email = "ggozadinos@gmail.com" }]
 license = { text = "MIT" }
@@ -22,9 +22,9 @@ classifiers = [
 ]
 
 dependencies = [
+    "docling>=2.15.0",
     "fastmcp>=2.8.1",
     "httpx>=0.28.1",
-    "markitdown[audio-transcription,docx,pdf,pptx,xlsx]>=0.1.2",
     "mxbai-rerank>=0.1.6",
     "ollama>=0.5.1",
     "pydantic>=2.11.7",

haiku_rag-0.5.0/src/haiku/rag/chunker.py (new file)
@@ -0,0 +1,60 @@
+from io import BytesIO
+from typing import ClassVar
+
+import tiktoken
+from docling.chunking import HybridChunker  # type: ignore
+from docling.document_converter import DocumentConverter
+from docling_core.transforms.chunker.tokenizer.openai import OpenAITokenizer
+from docling_core.types.io import DocumentStream
+
+from haiku.rag.config import Config
+
+
+class Chunker:
+    """A class that chunks text into smaller pieces for embedding and retrieval.
+
+    Uses docling's structure-aware chunking to create semantically meaningful chunks
+    that respect document boundaries.
+
+    Args:
+        chunk_size: The maximum size of a chunk in tokens.
+    """
+
+    encoder: ClassVar[tiktoken.Encoding] = tiktoken.encoding_for_model("gpt-4o")
+
+    def __init__(
+        self,
+        chunk_size: int = Config.CHUNK_SIZE,
+    ):
+        self.chunk_size = chunk_size
+        tokenizer = OpenAITokenizer(
+            tokenizer=tiktoken.encoding_for_model("gpt-4o"), max_tokens=chunk_size
+        )
+
+        self.chunker = HybridChunker(tokenizer=tokenizer)  # type: ignore
+
+    async def chunk(self, text: str) -> list[str]:
+        """Split the text into chunks using docling's structure-aware chunking.
+
+        Args:
+            text: The text to be split into chunks.
+
+        Returns:
+            A list of text chunks with semantic boundaries.
+        """
+        if not text:
+            return []
+
+        # Convert to docling document
+        bytes_io = BytesIO(text.encode("utf-8"))
+        doc_stream = DocumentStream(name="text.md", stream=bytes_io)
+        converter = DocumentConverter()
+        result = converter.convert(doc_stream)
+        doc = result.document
+
+        # Chunk using docling's hybrid chunker
+        chunks = list(self.chunker.chunk(doc))
+        return [self.chunker.contextualize(chunk) for chunk in chunks]
+
+
+chunker = Chunker()
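
The new `Chunker` swaps the previous token-window splitter for docling's `HybridChunker` with an OpenAI tokenizer. Below is a minimal usage sketch of the API added above; the document path is illustrative, and it assumes haiku.rag 0.5.0 is installed.

```python
import asyncio
from pathlib import Path

from haiku.rag.chunker import Chunker


async def main() -> None:
    # Split a Markdown document into structure-aware chunks of at most ~256 tokens.
    chunker = Chunker(chunk_size=256)
    text = Path("docs/index.md").read_text(encoding="utf-8")  # illustrative path
    chunks = await chunker.chunk(text)
    for i, chunk in enumerate(chunks):
        tokens = len(Chunker.encoder.encode(chunk, disallowed_special=()))
        print(f"chunk {i}: {tokens} tokens")


asyncio.run(main())
```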

{haiku_rag-0.4.2 → haiku_rag-0.5.0}/src/haiku/rag/cli.py
@@ -1,4 +1,5 @@
 import asyncio
+from importlib.metadata import version
 from pathlib import Path
 
 import typer
@@ -26,8 +27,23 @@ async def check_version():
         console.print("[yellow]Please update.[/yellow]")
 
 
+def version_callback(value: bool):
+    if value:
+        v = version("haiku.rag")
+        console.print(f"haiku.rag version {v}")
+        raise typer.Exit()
+
+
 @cli.callback()
-def main():
+def main(
+    _version: bool = typer.Option(
+        False,
+        "-v",
+        "--version",
+        callback=version_callback,
+        help="Show version and exit",
+    ),
+):
     """haiku.rag CLI - SQLite-based RAG system"""
     # Run version check before any command
     event_loop.run_until_complete(check_version())
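
The new callback wires a standard Typer `--version`/`-v` option into the existing `cli` app. A quick sketch of exercising it with Typer's test runner; the exact output string is an assumption based on the callback above.

```python
from typer.testing import CliRunner

from haiku.rag.cli import cli

# Invoke the Typer app the same way the console script would.
runner = CliRunner()
result = runner.invoke(cli, ["--version"])
print(result.output)  # expected to contain something like "haiku.rag version 0.5.0"
```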

{haiku_rag-0.4.2 → haiku_rag-0.5.0}/src/haiku/rag/config.py
@@ -27,7 +27,6 @@ class AppConfig(BaseModel):
     QA_MODEL: str = "qwen3"
 
     CHUNK_SIZE: int = 256
-    CHUNK_OVERLAP: int = 32
 
     OLLAMA_BASE_URL: str = "http://localhost:11434"
 

haiku_rag-0.5.0/src/haiku/rag/reader.py (new file)
@@ -0,0 +1,110 @@
+from pathlib import Path
+from typing import ClassVar
+
+from docling.document_converter import DocumentConverter
+
+
+class FileReader:
+    # Extensions supported by docling
+    docling_extensions: ClassVar[list[str]] = [
+        ".asciidoc",
+        ".bmp",
+        ".csv",
+        ".docx",
+        ".html",
+        ".xhtml",
+        ".jpeg",
+        ".jpg",
+        ".md",
+        ".pdf",
+        ".png",
+        ".pptx",
+        ".tiff",
+        ".xlsx",
+        ".xml",
+        ".webp",
+    ]
+
+    # Plain text extensions that we'll read directly
+    text_extensions: ClassVar[list[str]] = [
+        ".astro",
+        ".c",
+        ".cpp",
+        ".css",
+        ".go",
+        ".h",
+        ".hpp",
+        ".java",
+        ".js",
+        ".json",
+        ".kt",
+        ".mdx",
+        ".mjs",
+        ".php",
+        ".py",
+        ".rb",
+        ".rs",
+        ".svelte",
+        ".swift",
+        ".ts",
+        ".tsx",
+        ".txt",
+        ".vue",
+        ".yaml",
+        ".yml",
+    ]
+
+    # Code file extensions with their markdown language identifiers for syntax highlighting
+    code_markdown_identifier: ClassVar[dict[str, str]] = {
+        ".astro": "astro",
+        ".c": "c",
+        ".cpp": "cpp",
+        ".css": "css",
+        ".go": "go",
+        ".h": "c",
+        ".hpp": "cpp",
+        ".java": "java",
+        ".js": "javascript",
+        ".json": "json",
+        ".kt": "kotlin",
+        ".mjs": "javascript",
+        ".php": "php",
+        ".py": "python",
+        ".rb": "ruby",
+        ".rs": "rust",
+        ".svelte": "svelte",
+        ".swift": "swift",
+        ".ts": "typescript",
+        ".tsx": "tsx",
+        ".vue": "vue",
+        ".yaml": "yaml",
+        ".yml": "yaml",
+    }
+
+    extensions: ClassVar[list[str]] = docling_extensions + text_extensions
+
+    @staticmethod
+    def parse_file(path: Path) -> str:
+        try:
+            file_extension = path.suffix.lower()
+
+            if file_extension in FileReader.docling_extensions:
+                # Use docling for complex document formats
+                converter = DocumentConverter()
+                result = converter.convert(path)
+                return result.document.export_to_markdown()
+            elif file_extension in FileReader.text_extensions:
+                # Read plain text files directly
+                content = path.read_text(encoding="utf-8")
+
+                # Wrap code files (but not plain txt) in markdown code blocks for better presentation
+                if file_extension in FileReader.code_markdown_identifier:
+                    language = FileReader.code_markdown_identifier[file_extension]
+                    return f"```{language}\n{content}\n```"
+
+                return content
+            else:
+                # Fallback: try to read as text
+                return path.read_text(encoding="utf-8")
+        except Exception:
+            raise ValueError(f"Failed to parse file: {path}")
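
`FileReader.parse_file` routes by extension: docling converts rich formats to Markdown, plain text is read directly, and known code extensions are wrapped in fenced blocks with a language identifier. A short sketch against the API above; both file paths are illustrative.

```python
from pathlib import Path

from haiku.rag.reader import FileReader

# A Python source file comes back wrapped in a fenced block.
markdown = FileReader.parse_file(Path("src/haiku/rag/cli.py"))
assert markdown.startswith("```python")

# Rich formats such as PDF go through docling and come back as Markdown.
report = FileReader.parse_file(Path("report.pdf"))  # illustrative path
print(report[:200])
```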

{haiku_rag-0.4.2 → haiku_rag-0.5.0}/src/haiku/rag/store/engine.py
@@ -37,6 +37,11 @@ class Store:
         db = sqlite3.connect(self.db_path)
         db.enable_load_extension(True)
         sqlite_vec.load(db)
+
+        # Enable WAL mode for better concurrency (skip for in-memory databases)
+        if self.db_path != ":memory:":
+            db.execute("PRAGMA journal_mode=WAL")
+
         self._connection = db
         existing_tables = [
             row[0]
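
Because WAL is a persistent property of an on-disk SQLite database, any later connection should see it once the store has switched it on. A minimal check with the standard `sqlite3` module; the database filename is an assumption, not necessarily the store's default.

```python
import sqlite3

# Open the same on-disk database the store uses and read back the journal mode.
db = sqlite3.connect("haiku.rag.sqlite")  # illustrative path
mode = db.execute("PRAGMA journal_mode").fetchone()[0]
print(mode)  # expected to print "wal" after the store has enabled it
db.close()
```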

{haiku_rag-0.4.2 → haiku_rag-0.5.0}/src/haiku/rag/store/repositories/settings.py
@@ -63,7 +63,6 @@ class SettingsRepository:
             "EMBEDDINGS_MODEL",
             "EMBEDDINGS_VECTOR_DIM",
             "CHUNK_SIZE",
-            "CHUNK_OVERLAP",
         ]
 
         errors = []

haiku_rag-0.5.0/tests/test_chunker.py (new file)
@@ -0,0 +1,34 @@
+import pytest
+from datasets import Dataset
+
+from haiku.rag.chunker import Chunker
+
+
+@pytest.mark.asyncio
+async def test_chunker(qa_corpus: Dataset):
+    chunker = Chunker()
+    doc = qa_corpus[0]["document_extracted"]
+    chunks = await Chunker().chunk(doc)
+
+    # Ensure that the text is split into multiple chunks
+    assert len(chunks) > 1
+
+    # Ensure that chunks are reasonably sized (allowing more flexibility for structure-aware chunking)
+    total_tokens = 0
+    for chunk in chunks:
+        encoded_tokens = Chunker.encoder.encode(chunk, disallowed_special=())
+        token_count = len(encoded_tokens)
+        total_tokens += token_count
+
+        # Each chunk should be reasonably sized (allowing more flexibility than the old strict limits)
+        assert (
+            token_count <= chunker.chunk_size * 1.2
+        )  # Allow some flexibility for semantic boundaries
+        assert token_count > 5  # Ensure chunks aren't too small
+
+    # Ensure that all chunks together contain roughly the same content as original
+    original_tokens = len(Chunker.encoder.encode(doc, disallowed_special=()))
+
+    # Due to structure-aware chunking, we might have some variation in token count
+    # but it should be reasonable
+    assert abs(total_tokens - original_tokens) <= original_tokens * 0.1

haiku_rag-0.5.0/tests/test_reader.py (new file)
@@ -0,0 +1,22 @@
+import tempfile
+from pathlib import Path
+
+from haiku.rag.reader import FileReader
+
+
+def test_code_file_wrapped_in_code_block():
+    """Test that code files are wrapped in markdown code blocks."""
+    python_code = '''def hello_world():
+    print("Hello, World!")
+    return "success"'''
+
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".py") as f:
+        f.write(python_code)
+        f.flush()
+        temp_path = Path(f.name)
+
+        result = FileReader.parse_file(temp_path)
+
+        assert result.startswith("```python\n")
+        assert result.endswith("\n```")
+        assert "def hello_world():" in result

{haiku_rag-0.4.2 → haiku_rag-0.5.0}/tests/test_search.py
@@ -36,7 +36,7 @@ async def test_search_qa_corpus(qa_corpus: Dataset):
         created_document = await doc_repo.create(document)
         documents.append((created_document, doc_data))
 
-    for i in range(num_documents):  # Test with first few documents
+    for i in range(5):  # Test with first few documents
         target_document, doc_data = documents[i]
         question = doc_data["question"]
 
@@ -50,6 +50,10 @@ async def test_search_qa_corpus(qa_corpus: Dataset):
         target_document_ids = {chunk.document_id for chunk, _ in fts_results}
         assert target_document.id in target_document_ids
 
+    for i in range(num_documents):  # Test with first few documents
+        target_document, doc_data = documents[i]
+        question = doc_data["question"]
+
         # Test hybrid search
         hybrid_results = await chunk_repo.search_chunks_hybrid(question, limit=5)
         target_document_ids = {chunk.document_id for chunk, _ in hybrid_results}