haiku.rag 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of haiku.rag might be problematic. Click here for more details.

haiku/rag/cli.py CHANGED
@@ -1,4 +1,5 @@
1
1
  import asyncio
2
+ from importlib.metadata import version
2
3
  from pathlib import Path
3
4
 
4
5
  import typer
@@ -26,8 +27,23 @@ async def check_version():
26
27
  console.print("[yellow]Please update.[/yellow]")
27
28
 
28
29
 
30
+ def version_callback(value: bool):
31
+ if value:
32
+ v = version("haiku.rag")
33
+ console.print(f"haiku.rag version {v}")
34
+ raise typer.Exit()
35
+
36
+
29
37
  @cli.callback()
30
- def main():
38
+ def main(
39
+ _version: bool = typer.Option(
40
+ False,
41
+ "-v",
42
+ "--version",
43
+ callback=version_callback,
44
+ help="Show version and exit",
45
+ ),
46
+ ):
31
47
  """haiku.rag CLI - SQLite-based RAG system"""
32
48
  # Run version check before any command
33
49
  event_loop.run_until_complete(check_version())
haiku/rag/client.py CHANGED
@@ -50,7 +50,11 @@ class HaikuRAG:
50
50
  return False
51
51
 
52
52
  async def create_document(
53
- self, content: str, uri: str | None = None, metadata: dict | None = None
53
+ self,
54
+ content: str,
55
+ uri: str | None = None,
56
+ metadata: dict | None = None,
57
+ chunks: list[Chunk] | None = None,
54
58
  ) -> Document:
55
59
  """Create a new document with optional URI and metadata.
56
60
 
@@ -58,6 +62,7 @@ class HaikuRAG:
58
62
  content: The text content of the document.
59
63
  uri: Optional URI identifier for the document.
60
64
  metadata: Optional metadata dictionary.
65
+ chunks: Optional list of pre-created chunks to use instead of generating new ones.
61
66
 
62
67
  Returns:
63
68
  The created Document instance.
@@ -67,7 +72,7 @@ class HaikuRAG:
67
72
  uri=uri,
68
73
  metadata=metadata or {},
69
74
  )
70
- return await self.document_repository.create(document)
75
+ return await self.document_repository.create(document, chunks)
71
76
 
72
77
  async def create_document_from_source(
73
78
  self, source: str | Path, metadata: dict = {}
haiku/rag/reader.py CHANGED
@@ -1,32 +1,45 @@
1
1
  from pathlib import Path
2
2
  from typing import ClassVar
3
3
 
4
- from markitdown import MarkItDown
4
+ from docling.document_converter import DocumentConverter
5
5
 
6
6
 
7
7
  class FileReader:
8
- extensions: ClassVar[list[str]] = [
8
+ # Extensions supported by docling
9
+ docling_extensions: ClassVar[list[str]] = [
10
+ ".asciidoc",
11
+ ".bmp",
12
+ ".csv",
13
+ ".docx",
14
+ ".html",
15
+ ".xhtml",
16
+ ".jpeg",
17
+ ".jpg",
18
+ ".md",
19
+ ".pdf.png",
20
+ ".pptx",
21
+ ".tiff",
22
+ ".xlsx",
23
+ ".xml",
24
+ ".webp",
25
+ ]
26
+
27
+ # Plain text extensions that we'll read directly
28
+ text_extensions: ClassVar[list[str]] = [
9
29
  ".astro",
10
30
  ".c",
11
31
  ".cpp",
12
32
  ".css",
13
- ".csv",
14
- ".docx",
15
33
  ".go",
16
34
  ".h",
17
35
  ".hpp",
18
- ".html",
19
36
  ".java",
20
37
  ".js",
21
38
  ".json",
22
39
  ".kt",
23
- ".md",
24
40
  ".mdx",
25
41
  ".mjs",
26
- ".mp3",
27
- ".pdf",
28
42
  ".php",
29
- ".pptx",
30
43
  ".py",
31
44
  ".rb",
32
45
  ".rs",
@@ -36,17 +49,61 @@ class FileReader:
36
49
  ".tsx",
37
50
  ".txt",
38
51
  ".vue",
39
- ".wav",
40
- ".xml",
41
- ".xlsx",
42
52
  ".yaml",
43
53
  ".yml",
44
54
  ]
45
55
 
56
+ # Code file extensions with their markdown language identifiers for syntax highlighting
57
+ code_markdown_identifier: ClassVar[dict[str, str]] = {
58
+ ".astro": "astro",
59
+ ".c": "c",
60
+ ".cpp": "cpp",
61
+ ".css": "css",
62
+ ".go": "go",
63
+ ".h": "c",
64
+ ".hpp": "cpp",
65
+ ".java": "java",
66
+ ".js": "javascript",
67
+ ".json": "json",
68
+ ".kt": "kotlin",
69
+ ".mjs": "javascript",
70
+ ".php": "php",
71
+ ".py": "python",
72
+ ".rb": "ruby",
73
+ ".rs": "rust",
74
+ ".svelte": "svelte",
75
+ ".swift": "swift",
76
+ ".ts": "typescript",
77
+ ".tsx": "tsx",
78
+ ".vue": "vue",
79
+ ".yaml": "yaml",
80
+ ".yml": "yaml",
81
+ }
82
+
83
+ extensions: ClassVar[list[str]] = docling_extensions + text_extensions
84
+
46
85
  @staticmethod
47
86
  def parse_file(path: Path) -> str:
48
87
  try:
49
- reader = MarkItDown()
50
- return reader.convert(path).text_content
88
+ file_extension = path.suffix.lower()
89
+
90
+ if file_extension in FileReader.docling_extensions:
91
+ # Use docling for complex document formats
92
+ converter = DocumentConverter()
93
+ result = converter.convert(path)
94
+ return result.document.export_to_markdown()
95
+ elif file_extension in FileReader.text_extensions:
96
+ # Read plain text files directly
97
+ content = path.read_text(encoding="utf-8")
98
+
99
+ # Wrap code files (but not plain txt) in markdown code blocks for better presentation
100
+ if file_extension in FileReader.code_markdown_identifier:
101
+ language = FileReader.code_markdown_identifier[file_extension]
102
+ return f"```{language}\n{content}\n```"
103
+
104
+ return content
105
+ else:
106
+ # Fallback: try to read as text
107
+ return path.read_text(encoding="utf-8")
51
108
  except Exception:
52
109
  raise ValueError(f"Failed to parse file: {path}")
haiku/rag/store/engine.py CHANGED
@@ -37,6 +37,11 @@ class Store:
37
37
  db = sqlite3.connect(self.db_path)
38
38
  db.enable_load_extension(True)
39
39
  sqlite_vec.load(db)
40
+
41
+ # Enable WAL mode for better concurrency (skip for in-memory databases)
42
+ if self.db_path != ":memory:":
43
+ db.execute("PRAGMA journal_mode=WAL")
44
+
40
45
  self._connection = db
41
46
  existing_tables = [
42
47
  row[0]
@@ -7,8 +7,9 @@ class Chunk(BaseModel):
7
7
  """
8
8
 
9
9
  id: int | None = None
10
- document_id: int
10
+ document_id: int | None = None
11
11
  content: str
12
12
  metadata: dict = {}
13
13
  document_uri: str | None = None
14
14
  document_meta: dict = {}
15
+ embedding: list[float] | None = None
@@ -18,6 +18,8 @@ class ChunkRepository(BaseRepository[Chunk]):
18
18
  """Create a chunk in the database."""
19
19
  if self.store._connection is None:
20
20
  raise ValueError("Store connection is not available")
21
+ if entity.document_id is None:
22
+ raise ValueError("Chunk must have a document_id to be created")
21
23
 
22
24
  cursor = self.store._connection.cursor()
23
25
  cursor.execute(
@@ -34,9 +36,15 @@ class ChunkRepository(BaseRepository[Chunk]):
34
36
 
35
37
  entity.id = cursor.lastrowid
36
38
 
37
- # Generate and store embedding
38
- embedding = await self.embedder.embed(entity.content)
39
- serialized_embedding = self.store.serialize_embedding(embedding)
39
+ # Generate and store embedding - use existing one if provided
40
+ if entity.embedding is not None:
41
+ # Use the provided embedding
42
+ serialized_embedding = self.store.serialize_embedding(entity.embedding)
43
+ else:
44
+ # Generate embedding from content
45
+ embedding = await self.embedder.embed(entity.content)
46
+ serialized_embedding = self.store.serialize_embedding(embedding)
47
+
40
48
  cursor.execute(
41
49
  """
42
50
  INSERT INTO chunk_embeddings (chunk_id, embedding)
@@ -1,8 +1,12 @@
1
1
  import json
2
+ from typing import TYPE_CHECKING
2
3
 
3
4
  from haiku.rag.store.models.document import Document
4
5
  from haiku.rag.store.repositories.base import BaseRepository
5
6
 
7
+ if TYPE_CHECKING:
8
+ from haiku.rag.store.models.chunk import Chunk
9
+
6
10
 
7
11
  class DocumentRepository(BaseRepository[Document]):
8
12
  """Repository for Document database operations."""
@@ -16,7 +20,9 @@ class DocumentRepository(BaseRepository[Document]):
16
20
  chunk_repository = ChunkRepository(store)
17
21
  self.chunk_repository = chunk_repository
18
22
 
19
- async def create(self, entity: Document) -> Document:
23
+ async def create(
24
+ self, entity: Document, chunks: list["Chunk"] | None = None
25
+ ) -> Document:
20
26
  """Create a document with its chunks and embeddings."""
21
27
  if self.store._connection is None:
22
28
  raise ValueError("Store connection is not available")
@@ -46,10 +52,20 @@ class DocumentRepository(BaseRepository[Document]):
46
52
  assert document_id is not None, "Failed to create document in database"
47
53
  entity.id = document_id
48
54
 
49
- # Create chunks and embeddings using ChunkRepository
50
- await self.chunk_repository.create_chunks_for_document(
51
- document_id, entity.content, commit=False
52
- )
55
+ # Create chunks - either use provided chunks or generate from content
56
+ if chunks is not None:
57
+ # Use provided chunks, but update their document_id and set order from list position
58
+ for order, chunk in enumerate(chunks):
59
+ chunk.document_id = document_id
60
+ # Ensure order is set from list position
61
+ chunk.metadata = chunk.metadata.copy() if chunk.metadata else {}
62
+ chunk.metadata["order"] = order
63
+ await self.chunk_repository.create(chunk, commit=False)
64
+ else:
65
+ # Create chunks and embeddings using ChunkRepository
66
+ await self.chunk_repository.create_chunks_for_document(
67
+ document_id, entity.content, commit=False
68
+ )
53
69
 
54
70
  cursor.execute("COMMIT")
55
71
  return entity
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: haiku.rag
3
- Version: 0.4.1
3
+ Version: 0.4.3
4
4
  Summary: Retrieval Augmented Generation (RAG) with SQLite
5
5
  Author-email: Yiorgis Gozadinos <ggozadinos@gmail.com>
6
6
  License: MIT
@@ -18,9 +18,9 @@ Classifier: Programming Language :: Python :: 3.11
18
18
  Classifier: Programming Language :: Python :: 3.12
19
19
  Classifier: Typing :: Typed
20
20
  Requires-Python: >=3.10
21
+ Requires-Dist: docling>=2.15.0
21
22
  Requires-Dist: fastmcp>=2.8.1
22
23
  Requires-Dist: httpx>=0.28.1
23
- Requires-Dist: markitdown[audio-transcription,docx,pdf,pptx,xlsx]>=0.1.2
24
24
  Requires-Dist: mxbai-rerank>=0.1.6
25
25
  Requires-Dist: ollama>=0.5.1
26
26
  Requires-Dist: pydantic>=2.11.7
@@ -55,7 +55,7 @@ Retrieval-Augmented Generation (RAG) library on SQLite.
55
55
  - **Reranking**: Default search result reranking with MixedBread AI or Cohere
56
56
  - **Question answering**: Built-in QA agents on your documents
57
57
  - **File monitoring**: Auto-index files when run as server
58
- - **40+ file formats**: PDF, DOCX, HTML, Markdown, audio, URLs
58
+ - **40+ file formats**: PDF, DOCX, HTML, Markdown, code files, URLs
59
59
  - **MCP server**: Expose as tools for AI assistants
60
60
  - **CLI & Python API**: Use from command line or Python
61
61
 
@@ -1,13 +1,13 @@
1
1
  haiku/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  haiku/rag/app.py,sha256=FpLVyP1-zAq_XPmU8CPVLkuIAeuhBOGvMqhYS8RbN40,7649
3
3
  haiku/rag/chunker.py,sha256=MbCtP66OfTFoIBvqmVT9T9c87fozsYYzAQzJJEfPBVI,1812
4
- haiku/rag/cli.py,sha256=oCj65JcV2MEhzA2okbLHAK1I0FrClIKuYZx2jtbjbqE,5628
5
- haiku/rag/client.py,sha256=gqHsRAZqM5s4-c-RjeR4HanOKqPuK0z_MtsfmZMvT-w,12553
4
+ haiku/rag/cli.py,sha256=k7EhLkvTncxsdh5TYrg8BHLYh_lfyzupsWGj1dEEdqY,5992
5
+ haiku/rag/client.py,sha256=MZNIpMm6MS3P6vjLqiCztT2dBOM7-bZOosX5IpbHJbI,12724
6
6
  haiku/rag/config.py,sha256=_Ss54kmfxVAJupExLKaYjYUlFxJgb7hEEdbG4-isapY,1662
7
7
  haiku/rag/logging.py,sha256=zTTGpGq5tPdcd7RpCbd9EGw1IZlQDbYkrCg9t9pqRc4,580
8
8
  haiku/rag/mcp.py,sha256=tMN6fNX7ZtAER1R6DL1GkC9HZozTC4HzuQs199p7icI,4551
9
9
  haiku/rag/monitor.py,sha256=r386nkhdlsU8UECwIuVwnrSlgMk3vNIuUZGNIzkZuec,2770
10
- haiku/rag/reader.py,sha256=S7-Z72pDvSHedvgt4-RkTOwZadG88Oed9keJ69SVITk,962
10
+ haiku/rag/reader.py,sha256=dLz3yyc5r8dzdqCc2VViC3fADpScw4lxXueKiu-cI7c,2915
11
11
  haiku/rag/utils.py,sha256=Ez_tvNlRO_D8c2CBZ83Hs9Gmzcqdq4cmw_V5GBdKy_8,2214
12
12
  haiku/rag/embeddings/__init__.py,sha256=yFBlxS0jBiVHl_rWz5kb43t6Ha132U1ZGdlIPfhzPdg,1491
13
13
  haiku/rag/embeddings/base.py,sha256=NTQvuzbZPu0LBo5wAu3qGyJ4xXUaRAt1fjBO0ygWn_Y,465
@@ -25,19 +25,19 @@ haiku/rag/reranking/base.py,sha256=LM9yUSSJ414UgBZhFTgxGprlRqzfTe4I1vgjricz2JY,4
25
25
  haiku/rag/reranking/cohere.py,sha256=1iTdiaa8vvb6oHVB2qpWzUOVkyfUcimVSZp6Qr4aq4c,1049
26
26
  haiku/rag/reranking/mxbai.py,sha256=46sVTsTIkzIX9THgM3u8HaEmgY7evvEyB-N54JTHvK8,867
27
27
  haiku/rag/store/__init__.py,sha256=hq0W0DAC7ysqhWSP2M2uHX8cbG6kbr-sWHxhq6qQcY0,103
28
- haiku/rag/store/engine.py,sha256=4ouAD0s-TFwEoEHjVVw_KnV6aaw5nwhe9fdT8PRXfok,6061
28
+ haiku/rag/store/engine.py,sha256=cOMBToLilI1Di1qQrFzGLqtRMsuvtiX0Q5RNIEzQy9w,6232
29
29
  haiku/rag/store/models/__init__.py,sha256=s0E72zneGlowvZrFWaNxHYjOAUjgWdLxzdYsnvNRVlY,88
30
- haiku/rag/store/models/chunk.py,sha256=lmbPOOTz-N4PXhrA5XCUxyRcSTZBo135fqkV1mwnGcE,309
30
+ haiku/rag/store/models/chunk.py,sha256=9-vIxW75-kMTelIhgVIMd_WhP-Drc1q65vjaWMP8w1E,364
31
31
  haiku/rag/store/models/document.py,sha256=TVXVY-nQs-1vCORQEs9rA7zOtndeGC4dgCoujLAS054,396
32
32
  haiku/rag/store/repositories/__init__.py,sha256=uIBhxjQh-4o3O-ck8b7BQ58qXQTuJdPvrDIHVhY5T1A,263
33
33
  haiku/rag/store/repositories/base.py,sha256=cm3VyQXhtxvRfk1uJHpA0fDSxMpYN-mjQmRiDiLsQ68,1008
34
- haiku/rag/store/repositories/chunk.py,sha256=gik7ZPOK3gCoG6tU1pGueAZBPmJxIb7obYFUhwINrYg,16497
35
- haiku/rag/store/repositories/document.py,sha256=xpWOpjHFbhVwNJ1gpusEKNY6l_Qyibg9y_bdHCwcfpk,7133
34
+ haiku/rag/store/repositories/chunk.py,sha256=UyvHhKb1ESZePoTp2GneAARdfKoocEdfPOwgWPPQ0v8,16878
35
+ haiku/rag/store/repositories/document.py,sha256=fXIWevJaOe6x2cK4u9cQxiEGD0ntKQb9y3VRqklQypE,7920
36
36
  haiku/rag/store/repositories/settings.py,sha256=dme3_ulQdQvyF9daavSjAd-SjZ5hh0MJoxP7iXgap-A,2492
37
37
  haiku/rag/store/upgrades/__init__.py,sha256=kKS1YWT_P-CYKhKtokOLTIFNKf9jlfjFFr8lyIMeogM,100
38
38
  haiku/rag/store/upgrades/v0_3_4.py,sha256=GLogKZdZ40NX1vBHKdOJju7fFzNUCHoEnjSZg17Hm2U,663
39
- haiku_rag-0.4.1.dist-info/METADATA,sha256=Vqg_r9uBqdKh3V4dUgPGzx40cNUtXodIELD9_sU2xYs,4235
40
- haiku_rag-0.4.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
41
- haiku_rag-0.4.1.dist-info/entry_points.txt,sha256=G1U3nAkNd5YDYd4v0tuYFbriz0i-JheCsFuT9kIoGCI,48
42
- haiku_rag-0.4.1.dist-info/licenses/LICENSE,sha256=eXZrWjSk9PwYFNK9yUczl3oPl95Z4V9UXH7bPN46iPo,1065
43
- haiku_rag-0.4.1.dist-info/RECORD,,
39
+ haiku_rag-0.4.3.dist-info/METADATA,sha256=T2ZHdGL_zd1eSfEjFolh3R_zJpuWmUhKsnNkYLKtT7E,4198
40
+ haiku_rag-0.4.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
41
+ haiku_rag-0.4.3.dist-info/entry_points.txt,sha256=G1U3nAkNd5YDYd4v0tuYFbriz0i-JheCsFuT9kIoGCI,48
42
+ haiku_rag-0.4.3.dist-info/licenses/LICENSE,sha256=eXZrWjSk9PwYFNK9yUczl3oPl95Z4V9UXH7bPN46iPo,1065
43
+ haiku_rag-0.4.3.dist-info/RECORD,,