haiku.rag 0.4.2__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of haiku.rag might be problematic. Click here for more details.

haiku/rag/chunker.py CHANGED
@@ -1,6 +1,11 @@
1
+ from io import BytesIO
1
2
  from typing import ClassVar
2
3
 
3
4
  import tiktoken
5
+ from docling.chunking import HybridChunker # type: ignore
6
+ from docling.document_converter import DocumentConverter
7
+ from docling_core.transforms.chunker.tokenizer.openai import OpenAITokenizer
8
+ from docling_core.types.io import DocumentStream
4
9
 
5
10
  from haiku.rag.config import Config
6
11
 
@@ -8,9 +13,11 @@ from haiku.rag.config import Config
8
13
  class Chunker:
9
14
  """A class that chunks text into smaller pieces for embedding and retrieval.
10
15
 
16
+ Uses docling's structure-aware chunking to create semantically meaningful chunks
17
+ that respect document boundaries.
18
+
11
19
  Args:
12
20
  chunk_size: The maximum size of a chunk in tokens.
13
- chunk_overlap: The number of tokens of overlap between chunks.
14
21
  """
15
22
 
16
23
  encoder: ClassVar[tiktoken.Encoding] = tiktoken.encoding_for_model("gpt-4o")
@@ -18,50 +25,36 @@ class Chunker:
18
25
  def __init__(
19
26
  self,
20
27
  chunk_size: int = Config.CHUNK_SIZE,
21
- chunk_overlap: int = Config.CHUNK_OVERLAP,
22
28
  ):
23
29
  self.chunk_size = chunk_size
24
- self.chunk_overlap = chunk_overlap
30
+ tokenizer = OpenAITokenizer(
31
+ tokenizer=tiktoken.encoding_for_model("gpt-4o"), max_tokens=chunk_size
32
+ )
33
+
34
+ self.chunker = HybridChunker(tokenizer=tokenizer) # type: ignore
25
35
 
26
36
  async def chunk(self, text: str) -> list[str]:
27
- """Split the text into chunks based on token boundaries.
37
+ """Split the text into chunks using docling's structure-aware chunking.
28
38
 
29
39
  Args:
30
40
  text: The text to be split into chunks.
31
41
 
32
42
  Returns:
33
- A list of text chunks with token-based boundaries and overlap.
43
+ A list of text chunks with semantic boundaries.
34
44
  """
35
45
  if not text:
36
46
  return []
37
47
 
38
- encoded_tokens = self.encoder.encode(text, disallowed_special=())
39
-
40
- if self.chunk_size > len(encoded_tokens):
41
- return [text]
42
-
43
- chunks = []
44
- i = 0
45
- split_id_counter = 0
46
- while i < len(encoded_tokens):
47
- # Overlap
48
- start_i = i
49
- end_i = min(i + self.chunk_size, len(encoded_tokens))
50
-
51
- chunk_tokens = encoded_tokens[start_i:end_i]
52
- chunk_text = self.encoder.decode(chunk_tokens)
53
-
54
- chunks.append(chunk_text)
55
- split_id_counter += 1
56
-
57
- # Exit loop if this was the last possible chunk
58
- if end_i == len(encoded_tokens):
59
- break
48
+ # Convert to docling document
49
+ bytes_io = BytesIO(text.encode("utf-8"))
50
+ doc_stream = DocumentStream(name="text.md", stream=bytes_io)
51
+ converter = DocumentConverter()
52
+ result = converter.convert(doc_stream)
53
+ doc = result.document
60
54
 
61
- i += (
62
- self.chunk_size - self.chunk_overlap
63
- ) # Step forward, considering overlap
64
- return chunks
55
+ # Chunk using docling's hybrid chunker
56
+ chunks = list(self.chunker.chunk(doc))
57
+ return [self.chunker.contextualize(chunk) for chunk in chunks]
65
58
 
66
59
 
67
60
  chunker = Chunker()
haiku/rag/cli.py CHANGED
@@ -1,4 +1,5 @@
1
1
  import asyncio
2
+ from importlib.metadata import version
2
3
  from pathlib import Path
3
4
 
4
5
  import typer
@@ -26,8 +27,23 @@ async def check_version():
26
27
  console.print("[yellow]Please update.[/yellow]")
27
28
 
28
29
 
30
+ def version_callback(value: bool):
31
+ if value:
32
+ v = version("haiku.rag")
33
+ console.print(f"haiku.rag version {v}")
34
+ raise typer.Exit()
35
+
36
+
29
37
  @cli.callback()
30
- def main():
38
+ def main(
39
+ _version: bool = typer.Option(
40
+ False,
41
+ "-v",
42
+ "--version",
43
+ callback=version_callback,
44
+ help="Show version and exit",
45
+ ),
46
+ ):
31
47
  """haiku.rag CLI - SQLite-based RAG system"""
32
48
  # Run version check before any command
33
49
  event_loop.run_until_complete(check_version())
haiku/rag/config.py CHANGED
@@ -27,7 +27,6 @@ class AppConfig(BaseModel):
27
27
  QA_MODEL: str = "qwen3"
28
28
 
29
29
  CHUNK_SIZE: int = 256
30
- CHUNK_OVERLAP: int = 32
31
30
 
32
31
  OLLAMA_BASE_URL: str = "http://localhost:11434"
33
32
 
haiku/rag/reader.py CHANGED
@@ -1,32 +1,46 @@
1
1
  from pathlib import Path
2
2
  from typing import ClassVar
3
3
 
4
- from markitdown import MarkItDown
4
+ from docling.document_converter import DocumentConverter
5
5
 
6
6
 
7
7
  class FileReader:
8
- extensions: ClassVar[list[str]] = [
8
+ # Extensions supported by docling
9
+ docling_extensions: ClassVar[list[str]] = [
10
+ ".asciidoc",
11
+ ".bmp",
12
+ ".csv",
13
+ ".docx",
14
+ ".html",
15
+ ".xhtml",
16
+ ".jpeg",
17
+ ".jpg",
18
+ ".md",
19
+ ".pdf",
20
+ ".png",
21
+ ".pptx",
22
+ ".tiff",
23
+ ".xlsx",
24
+ ".xml",
25
+ ".webp",
26
+ ]
27
+
28
+ # Plain text extensions that we'll read directly
29
+ text_extensions: ClassVar[list[str]] = [
9
30
  ".astro",
10
31
  ".c",
11
32
  ".cpp",
12
33
  ".css",
13
- ".csv",
14
- ".docx",
15
34
  ".go",
16
35
  ".h",
17
36
  ".hpp",
18
- ".html",
19
37
  ".java",
20
38
  ".js",
21
39
  ".json",
22
40
  ".kt",
23
- ".md",
24
41
  ".mdx",
25
42
  ".mjs",
26
- ".mp3",
27
- ".pdf",
28
43
  ".php",
29
- ".pptx",
30
44
  ".py",
31
45
  ".rb",
32
46
  ".rs",
@@ -36,17 +50,61 @@ class FileReader:
36
50
  ".tsx",
37
51
  ".txt",
38
52
  ".vue",
39
- ".wav",
40
- ".xml",
41
- ".xlsx",
42
53
  ".yaml",
43
54
  ".yml",
44
55
  ]
45
56
 
57
+ # Code file extensions with their markdown language identifiers for syntax highlighting
58
+ code_markdown_identifier: ClassVar[dict[str, str]] = {
59
+ ".astro": "astro",
60
+ ".c": "c",
61
+ ".cpp": "cpp",
62
+ ".css": "css",
63
+ ".go": "go",
64
+ ".h": "c",
65
+ ".hpp": "cpp",
66
+ ".java": "java",
67
+ ".js": "javascript",
68
+ ".json": "json",
69
+ ".kt": "kotlin",
70
+ ".mjs": "javascript",
71
+ ".php": "php",
72
+ ".py": "python",
73
+ ".rb": "ruby",
74
+ ".rs": "rust",
75
+ ".svelte": "svelte",
76
+ ".swift": "swift",
77
+ ".ts": "typescript",
78
+ ".tsx": "tsx",
79
+ ".vue": "vue",
80
+ ".yaml": "yaml",
81
+ ".yml": "yaml",
82
+ }
83
+
84
+ extensions: ClassVar[list[str]] = docling_extensions + text_extensions
85
+
46
86
  @staticmethod
47
87
  def parse_file(path: Path) -> str:
48
88
  try:
49
- reader = MarkItDown()
50
- return reader.convert(path).text_content
89
+ file_extension = path.suffix.lower()
90
+
91
+ if file_extension in FileReader.docling_extensions:
92
+ # Use docling for complex document formats
93
+ converter = DocumentConverter()
94
+ result = converter.convert(path)
95
+ return result.document.export_to_markdown()
96
+ elif file_extension in FileReader.text_extensions:
97
+ # Read plain text files directly
98
+ content = path.read_text(encoding="utf-8")
99
+
100
+ # Wrap code files (but not plain txt) in markdown code blocks for better presentation
101
+ if file_extension in FileReader.code_markdown_identifier:
102
+ language = FileReader.code_markdown_identifier[file_extension]
103
+ return f"```{language}\n{content}\n```"
104
+
105
+ return content
106
+ else:
107
+ # Fallback: try to read as text
108
+ return path.read_text(encoding="utf-8")
51
109
  except Exception:
52
110
  raise ValueError(f"Failed to parse file: {path}")
haiku/rag/store/engine.py CHANGED
@@ -37,6 +37,11 @@ class Store:
37
37
  db = sqlite3.connect(self.db_path)
38
38
  db.enable_load_extension(True)
39
39
  sqlite_vec.load(db)
40
+
41
+ # Enable WAL mode for better concurrency (skip for in-memory databases)
42
+ if self.db_path != ":memory:":
43
+ db.execute("PRAGMA journal_mode=WAL")
44
+
40
45
  self._connection = db
41
46
  existing_tables = [
42
47
  row[0]
haiku/rag/store/repositories/settings.py CHANGED
@@ -63,7 +63,6 @@ class SettingsRepository:
63
63
  "EMBEDDINGS_MODEL",
64
64
  "EMBEDDINGS_VECTOR_DIM",
65
65
  "CHUNK_SIZE",
66
- "CHUNK_OVERLAP",
67
66
  ]
68
67
 
69
68
  errors = []
haiku_rag-0.5.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: haiku.rag
3
- Version: 0.4.2
3
+ Version: 0.5.0
4
4
  Summary: Retrieval Augmented Generation (RAG) with SQLite
5
5
  Author-email: Yiorgis Gozadinos <ggozadinos@gmail.com>
6
6
  License: MIT
@@ -18,9 +18,9 @@ Classifier: Programming Language :: Python :: 3.11
18
18
  Classifier: Programming Language :: Python :: 3.12
19
19
  Classifier: Typing :: Typed
20
20
  Requires-Python: >=3.10
21
+ Requires-Dist: docling>=2.15.0
21
22
  Requires-Dist: fastmcp>=2.8.1
22
23
  Requires-Dist: httpx>=0.28.1
23
- Requires-Dist: markitdown[audio-transcription,docx,pdf,pptx,xlsx]>=0.1.2
24
24
  Requires-Dist: mxbai-rerank>=0.1.6
25
25
  Requires-Dist: ollama>=0.5.1
26
26
  Requires-Dist: pydantic>=2.11.7
@@ -55,7 +55,7 @@ Retrieval-Augmented Generation (RAG) library on SQLite.
55
55
  - **Reranking**: Default search result reranking with MixedBread AI or Cohere
56
56
  - **Question answering**: Built-in QA agents on your documents
57
57
  - **File monitoring**: Auto-index files when run as server
58
- - **40+ file formats**: PDF, DOCX, HTML, Markdown, audio, URLs
58
+ - **40+ file formats**: PDF, DOCX, HTML, Markdown, code files, URLs
59
59
  - **MCP server**: Expose as tools for AI assistants
60
60
  - **CLI & Python API**: Use from command line or Python
61
61
 
haiku_rag-0.5.0.dist-info/RECORD CHANGED
@@ -1,13 +1,13 @@
1
1
  haiku/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  haiku/rag/app.py,sha256=FpLVyP1-zAq_XPmU8CPVLkuIAeuhBOGvMqhYS8RbN40,7649
3
- haiku/rag/chunker.py,sha256=MbCtP66OfTFoIBvqmVT9T9c87fozsYYzAQzJJEfPBVI,1812
4
- haiku/rag/cli.py,sha256=oCj65JcV2MEhzA2okbLHAK1I0FrClIKuYZx2jtbjbqE,5628
3
+ haiku/rag/chunker.py,sha256=P2slbmoABygYRlqjOGzPBEOYsBZNTnNpE6bnW_dkVOE,1850
4
+ haiku/rag/cli.py,sha256=k7EhLkvTncxsdh5TYrg8BHLYh_lfyzupsWGj1dEEdqY,5992
5
5
  haiku/rag/client.py,sha256=MZNIpMm6MS3P6vjLqiCztT2dBOM7-bZOosX5IpbHJbI,12724
6
- haiku/rag/config.py,sha256=_Ss54kmfxVAJupExLKaYjYUlFxJgb7hEEdbG4-isapY,1662
6
+ haiku/rag/config.py,sha256=GXTWC3vasBMaWju-yh8Es3CidBz1ftqRH6E5PHpgsSQ,1634
7
7
  haiku/rag/logging.py,sha256=zTTGpGq5tPdcd7RpCbd9EGw1IZlQDbYkrCg9t9pqRc4,580
8
8
  haiku/rag/mcp.py,sha256=tMN6fNX7ZtAER1R6DL1GkC9HZozTC4HzuQs199p7icI,4551
9
9
  haiku/rag/monitor.py,sha256=r386nkhdlsU8UECwIuVwnrSlgMk3vNIuUZGNIzkZuec,2770
10
- haiku/rag/reader.py,sha256=S7-Z72pDvSHedvgt4-RkTOwZadG88Oed9keJ69SVITk,962
10
+ haiku/rag/reader.py,sha256=s5dinZ-WffioiRH7OWZtE2v7FHRPd1PkqpPYsXtwqtc,2927
11
11
  haiku/rag/utils.py,sha256=Ez_tvNlRO_D8c2CBZ83Hs9Gmzcqdq4cmw_V5GBdKy_8,2214
12
12
  haiku/rag/embeddings/__init__.py,sha256=yFBlxS0jBiVHl_rWz5kb43t6Ha132U1ZGdlIPfhzPdg,1491
13
13
  haiku/rag/embeddings/base.py,sha256=NTQvuzbZPu0LBo5wAu3qGyJ4xXUaRAt1fjBO0ygWn_Y,465
@@ -25,7 +25,7 @@ haiku/rag/reranking/base.py,sha256=LM9yUSSJ414UgBZhFTgxGprlRqzfTe4I1vgjricz2JY,4
25
25
  haiku/rag/reranking/cohere.py,sha256=1iTdiaa8vvb6oHVB2qpWzUOVkyfUcimVSZp6Qr4aq4c,1049
26
26
  haiku/rag/reranking/mxbai.py,sha256=46sVTsTIkzIX9THgM3u8HaEmgY7evvEyB-N54JTHvK8,867
27
27
  haiku/rag/store/__init__.py,sha256=hq0W0DAC7ysqhWSP2M2uHX8cbG6kbr-sWHxhq6qQcY0,103
28
- haiku/rag/store/engine.py,sha256=4ouAD0s-TFwEoEHjVVw_KnV6aaw5nwhe9fdT8PRXfok,6061
28
+ haiku/rag/store/engine.py,sha256=cOMBToLilI1Di1qQrFzGLqtRMsuvtiX0Q5RNIEzQy9w,6232
29
29
  haiku/rag/store/models/__init__.py,sha256=s0E72zneGlowvZrFWaNxHYjOAUjgWdLxzdYsnvNRVlY,88
30
30
  haiku/rag/store/models/chunk.py,sha256=9-vIxW75-kMTelIhgVIMd_WhP-Drc1q65vjaWMP8w1E,364
31
31
  haiku/rag/store/models/document.py,sha256=TVXVY-nQs-1vCORQEs9rA7zOtndeGC4dgCoujLAS054,396
@@ -33,11 +33,11 @@ haiku/rag/store/repositories/__init__.py,sha256=uIBhxjQh-4o3O-ck8b7BQ58qXQTuJdPv
33
33
  haiku/rag/store/repositories/base.py,sha256=cm3VyQXhtxvRfk1uJHpA0fDSxMpYN-mjQmRiDiLsQ68,1008
34
34
  haiku/rag/store/repositories/chunk.py,sha256=UyvHhKb1ESZePoTp2GneAARdfKoocEdfPOwgWPPQ0v8,16878
35
35
  haiku/rag/store/repositories/document.py,sha256=fXIWevJaOe6x2cK4u9cQxiEGD0ntKQb9y3VRqklQypE,7920
36
- haiku/rag/store/repositories/settings.py,sha256=dme3_ulQdQvyF9daavSjAd-SjZ5hh0MJoxP7iXgap-A,2492
36
+ haiku/rag/store/repositories/settings.py,sha256=qZLXvLsErnCWL0nBQQNfRnatHzCKhtUDLvUK9k-W_fU,2463
37
37
  haiku/rag/store/upgrades/__init__.py,sha256=kKS1YWT_P-CYKhKtokOLTIFNKf9jlfjFFr8lyIMeogM,100
38
38
  haiku/rag/store/upgrades/v0_3_4.py,sha256=GLogKZdZ40NX1vBHKdOJju7fFzNUCHoEnjSZg17Hm2U,663
39
- haiku_rag-0.4.2.dist-info/METADATA,sha256=0ctTSGB6uqGl2INUcNxnCphxwrLTlK7KVdKKXXB35mg,4235
40
- haiku_rag-0.4.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
41
- haiku_rag-0.4.2.dist-info/entry_points.txt,sha256=G1U3nAkNd5YDYd4v0tuYFbriz0i-JheCsFuT9kIoGCI,48
42
- haiku_rag-0.4.2.dist-info/licenses/LICENSE,sha256=eXZrWjSk9PwYFNK9yUczl3oPl95Z4V9UXH7bPN46iPo,1065
43
- haiku_rag-0.4.2.dist-info/RECORD,,
39
+ haiku_rag-0.5.0.dist-info/METADATA,sha256=Z29lOzGgaD2PJ6OxZc53QuMzFdosEZCdm7HZYOUNN3M,4198
40
+ haiku_rag-0.5.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
41
+ haiku_rag-0.5.0.dist-info/entry_points.txt,sha256=G1U3nAkNd5YDYd4v0tuYFbriz0i-JheCsFuT9kIoGCI,48
42
+ haiku_rag-0.5.0.dist-info/licenses/LICENSE,sha256=eXZrWjSk9PwYFNK9yUczl3oPl95Z4V9UXH7bPN46iPo,1065
43
+ haiku_rag-0.5.0.dist-info/RECORD,,