haiku.rag 0.8.0__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of haiku.rag might be problematic. Click here for more details.

haiku/rag/config.py CHANGED
@@ -32,6 +32,10 @@ class AppConfig(BaseModel):
32
32
  CHUNK_SIZE: int = 256
33
33
  CONTEXT_CHUNK_RADIUS: int = 0
34
34
 
35
+ # Optional dotted path or file path to a callable that preprocesses
36
+ # markdown content before chunking. Examples:
37
+ MARKDOWN_PREPROCESSOR: str = ""
38
+
35
39
  OLLAMA_BASE_URL: str = "http://localhost:11434"
36
40
  VLLM_EMBEDDINGS_BASE_URL: str = ""
37
41
  VLLM_RERANK_BASE_URL: str = ""
haiku/rag/store/engine.py CHANGED
@@ -209,6 +209,21 @@ class Store:
209
209
  # LanceDB connections are automatically managed
210
210
  pass
211
211
 
212
+ def current_table_versions(self) -> dict[str, int]:
213
+ """Capture current versions of key tables for rollback using LanceDB's API."""
214
+ return {
215
+ "documents": int(self.documents_table.version),
216
+ "chunks": int(self.chunks_table.version),
217
+ "settings": int(self.settings_table.version),
218
+ }
219
+
220
+ def restore_table_versions(self, versions: dict[str, int]) -> bool:
221
+ """Restore tables to the provided versions using LanceDB's API."""
222
+ self.documents_table.restore(int(versions["documents"]))
223
+ self.chunks_table.restore(int(versions["chunks"]))
224
+ self.settings_table.restore(int(versions["settings"]))
225
+ return True
226
+
212
227
  @property
213
228
  def _connection(self):
214
229
  """Compatibility property for repositories expecting _connection."""
haiku/rag/store/repositories/chunk.py CHANGED
@@ -1,4 +1,5 @@
1
1
  import asyncio
2
+ import inspect
2
3
  import json
3
4
  import logging
4
5
  from uuid import uuid4
@@ -11,6 +12,7 @@ from haiku.rag.config import Config
11
12
  from haiku.rag.embeddings import get_embedder
12
13
  from haiku.rag.store.engine import DocumentRecord, Store
13
14
  from haiku.rag.store.models.chunk import Chunk
15
+ from haiku.rag.utils import load_callable, text_to_docling_document
14
16
 
15
17
  logger = logging.getLogger(__name__)
16
18
 
@@ -152,7 +154,28 @@ class ChunkRepository:
152
154
  self, document_id: str, document: DoclingDocument
153
155
  ) -> list[Chunk]:
154
156
  """Create chunks and embeddings for a document from DoclingDocument."""
155
- chunk_texts = await chunker.chunk(document)
157
+ # Optionally preprocess markdown before chunking
158
+ processed_document = document
159
+ preprocessor_path = Config.MARKDOWN_PREPROCESSOR
160
+ if preprocessor_path:
161
+ try:
162
+ pre_fn = load_callable(preprocessor_path)
163
+ markdown = document.export_to_markdown()
164
+ result = pre_fn(markdown)
165
+ if inspect.isawaitable(result):
166
+ result = await result # type: ignore[assignment]
167
+ processed_markdown = result
168
+ if not isinstance(processed_markdown, str):
169
+ raise ValueError("Preprocessor must return a markdown string")
170
+ processed_document = text_to_docling_document(
171
+ processed_markdown, name="content.md"
172
+ )
173
+ except Exception as e:
174
+ logger.warning(
175
+ f"Failed to apply MARKDOWN_PREPROCESSOR '{preprocessor_path}': {e}. Proceeding without preprocessing."
176
+ )
177
+
178
+ chunk_texts = await chunker.chunk(processed_document)
156
179
 
157
180
  embeddings = await self.embedder.embed(chunk_texts)
158
181
 
haiku/rag/store/repositories/document.py CHANGED
@@ -171,44 +171,64 @@ class DocumentRepository:
171
171
  chunks: list["Chunk"] | None = None,
172
172
  ) -> Document:
173
173
  """Create a document with its chunks and embeddings."""
174
+ # Snapshot table versions for versioned rollback (if supported)
175
+ versions = self.store.current_table_versions()
176
+
174
177
  # Create the document
175
178
  created_doc = await self.create(entity)
176
179
 
177
- # Create chunks if not provided
178
- if chunks is None:
179
- assert created_doc.id is not None, (
180
- "Document ID should not be None after creation"
181
- )
182
- await self.chunk_repository.create_chunks_for_document(
183
- created_doc.id, docling_document
184
- )
185
- else:
186
- # Use provided chunks, set order from list position
187
- assert created_doc.id is not None, (
188
- "Document ID should not be None after creation"
189
- )
190
- for order, chunk in enumerate(chunks):
191
- chunk.document_id = created_doc.id
192
- chunk.metadata["order"] = order
193
- await self.chunk_repository.create(chunk)
194
-
195
- return created_doc
180
+ # Attempt to create chunks; on failure, prefer version rollback
181
+ try:
182
+ # Create chunks if not provided
183
+ if chunks is None:
184
+ assert created_doc.id is not None, (
185
+ "Document ID should not be None after creation"
186
+ )
187
+ await self.chunk_repository.create_chunks_for_document(
188
+ created_doc.id, docling_document
189
+ )
190
+ else:
191
+ # Use provided chunks, set order from list position
192
+ assert created_doc.id is not None, (
193
+ "Document ID should not be None after creation"
194
+ )
195
+ for order, chunk in enumerate(chunks):
196
+ chunk.document_id = created_doc.id
197
+ chunk.metadata["order"] = order
198
+ await self.chunk_repository.create(chunk)
199
+
200
+ return created_doc
201
+ except Exception:
202
+ # Roll back to the captured versions and re-raise
203
+ self.store.restore_table_versions(versions)
204
+ raise
196
205
 
197
206
  async def _update_with_docling(
198
207
  self, entity: Document, docling_document: DoclingDocument
199
208
  ) -> Document:
200
209
  """Update a document and regenerate its chunks."""
201
- # Delete existing chunks
202
210
  assert entity.id is not None, "Document ID is required for update"
211
+
212
+ # Snapshot table versions for versioned rollback
213
+ versions = self.store.current_table_versions()
214
+
215
+ # Delete existing chunks before writing new ones
203
216
  await self.chunk_repository.delete_by_document_id(entity.id)
204
217
 
205
- # Update the document
206
- updated_doc = await self.update(entity)
218
+ try:
219
+ # Update the document
220
+ updated_doc = await self.update(entity)
207
221
 
208
- # Create new chunks
209
- assert updated_doc.id is not None, "Document ID should not be None after update"
210
- await self.chunk_repository.create_chunks_for_document(
211
- updated_doc.id, docling_document
212
- )
222
+ # Create new chunks
223
+ assert updated_doc.id is not None, (
224
+ "Document ID should not be None after update"
225
+ )
226
+ await self.chunk_repository.create_chunks_for_document(
227
+ updated_doc.id, docling_document
228
+ )
213
229
 
214
- return updated_doc
230
+ return updated_doc
231
+ except Exception:
232
+ # Roll back to the captured versions and re-raise
233
+ self.store.restore_table_versions(versions)
234
+ raise
haiku/rag/utils.py CHANGED
@@ -1,10 +1,13 @@
1
1
  import asyncio
2
+ import importlib
3
+ import importlib.util
2
4
  import sys
3
5
  from collections.abc import Callable
4
6
  from functools import wraps
5
7
  from importlib import metadata
6
8
  from io import BytesIO
7
9
  from pathlib import Path
10
+ from types import ModuleType
8
11
 
9
12
  import httpx
10
13
  from docling.document_converter import DocumentConverter
@@ -106,3 +109,54 @@ def text_to_docling_document(text: str, name: str = "content.md") -> DoclingDocu
106
109
  converter = DocumentConverter()
107
110
  result = converter.convert(doc_stream)
108
111
  return result.document
112
+
113
+
114
+ def load_callable(path: str):
115
+ """Load a callable from a dotted path or file path.
116
+
117
+ Supported formats:
118
+ - "package.module:func" or "package.module.func"
119
+ - "path/to/file.py:func"
120
+
121
+ Returns the loaded callable. Raises ValueError on failure.
122
+ """
123
+ if not path:
124
+ raise ValueError("Empty callable path provided")
125
+
126
+ module_part = None
127
+ func_name = None
128
+
129
+ if ":" in path:
130
+ module_part, func_name = path.split(":", 1)
131
+ else:
132
+ # split by last dot for module.attr
133
+ if "." in path:
134
+ module_part, func_name = path.rsplit(".", 1)
135
+ else:
136
+ raise ValueError(
137
+ "Invalid callable path format. Use 'module:func' or 'module.func' or 'file.py:func'."
138
+ )
139
+
140
+ # Try file path first
141
+ mod: ModuleType | None = None
142
+ module_path = Path(module_part)
143
+ if module_path.suffix == ".py" and module_path.exists():
144
+ spec = importlib.util.spec_from_file_location(module_path.stem, module_path)
145
+ if spec and spec.loader:
146
+ mod = importlib.util.module_from_spec(spec)
147
+ spec.loader.exec_module(mod)
148
+ else:
149
+ # Import as a module path
150
+ try:
151
+ mod = importlib.import_module(module_part)
152
+ except Exception as e:
153
+ raise ValueError(f"Failed to import module '{module_part}': {e}")
154
+
155
+ if not hasattr(mod, func_name):
156
+ raise ValueError(f"Callable '{func_name}' not found in module '{module_part}'")
157
+ func = getattr(mod, func_name)
158
+ if not callable(func):
159
+ raise ValueError(
160
+ f"Attribute '{func_name}' in module '{module_part}' is not callable"
161
+ )
162
+ return func
haiku_rag-0.8.1.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: haiku.rag
3
- Version: 0.8.0
3
+ Version: 0.8.1
4
4
  Summary: Retrieval Augmented Generation (RAG) with LanceDB
5
5
  Author-email: Yiorgis Gozadinos <ggozadinos@gmail.com>
6
6
  License: MIT
haiku_rag-0.8.1.dist-info/RECORD CHANGED
@@ -3,13 +3,13 @@ haiku/rag/app.py,sha256=XlL6PNPSqeBKF6bemvdSfXEnQghywudwZv-C116NuZU,8254
3
3
  haiku/rag/chunker.py,sha256=PVe6ysv8UlacUd4Zb3_8RFWIaWDXnzBAy2VDJ4TaUsE,1555
4
4
  haiku/rag/cli.py,sha256=houkHTeVc89BA3zPksCjUooEnScSg1Ez_BIHBH6cmJQ,8920
5
5
  haiku/rag/client.py,sha256=NJVGXzVzpoVy1sttz_xEU7mXWtObKT8pGpvo5pZyzwc,21288
6
- haiku/rag/config.py,sha256=3H41da9BU1R1y2JJHD0cOSErX_VSM1UXA7M2JSOxFXE,1795
6
+ haiku/rag/config.py,sha256=k5SSh7nYIFKX5LcWYu4bP-4GV5Y-Wq1UzrLUtRAM5Pw,1954
7
7
  haiku/rag/logging.py,sha256=dm65AwADpcQsH5OAPtRA-4hsw0w5DK-sGOvzYkj6jzw,1720
8
8
  haiku/rag/mcp.py,sha256=bR9Y-Nz-hvjiql20Y0KE0hwNGwyjmPGX8K9d-qmXptY,4683
9
9
  haiku/rag/migration.py,sha256=M--KnSF3lxgKjxmokb4vuzGH-pV8eg0C_8e7jvPqW8Y,11058
10
10
  haiku/rag/monitor.py,sha256=r386nkhdlsU8UECwIuVwnrSlgMk3vNIuUZGNIzkZuec,2770
11
11
  haiku/rag/reader.py,sha256=qkPTMJuQ_o4sK-8zpDl9WFYe_MJ7aL_gUw6rczIpW-g,3274
12
- haiku/rag/utils.py,sha256=c8F0ECsFSqvQxzxINAOAnvShoOnJPLsOaNE3JEY2JSc,3230
12
+ haiku/rag/utils.py,sha256=aiuPu_rrfpyIvJJq0o5boUIIvCdNzdpKwAIPYYn3iG8,4965
13
13
  haiku/rag/embeddings/__init__.py,sha256=44IfDITGIFTflGT6UEmiYOwpWFVbYv5smLY59D0YeCs,1419
14
14
  haiku/rag/embeddings/base.py,sha256=BnSviKrlzjv3L0sZJs_T-pxfawd-bcTak-rsX-D2f3A,497
15
15
  haiku/rag/embeddings/ollama.py,sha256=LuLlHH6RGoO9_gFCIlbmesuXOj017gTw6z-p8Ez0CfE,595
@@ -25,17 +25,17 @@ haiku/rag/reranking/cohere.py,sha256=1iTdiaa8vvb6oHVB2qpWzUOVkyfUcimVSZp6Qr4aq4c
25
25
  haiku/rag/reranking/mxbai.py,sha256=46sVTsTIkzIX9THgM3u8HaEmgY7evvEyB-N54JTHvK8,867
26
26
  haiku/rag/reranking/vllm.py,sha256=xVGH9ss-ISWdJ5SKUUHUbTqBo7PIEmA_SQv0ScdJ6XA,1479
27
27
  haiku/rag/store/__init__.py,sha256=hq0W0DAC7ysqhWSP2M2uHX8cbG6kbr-sWHxhq6qQcY0,103
28
- haiku/rag/store/engine.py,sha256=uzw09IOebaKo8b_FyvVHMUQMDVKfBpN7WGfuY3fKiEE,7757
28
+ haiku/rag/store/engine.py,sha256=fNrykqMX7PRSCt4LSRfuJ66OLrb8BVYq2bpbfI2iaWU,8455
29
29
  haiku/rag/store/models/__init__.py,sha256=s0E72zneGlowvZrFWaNxHYjOAUjgWdLxzdYsnvNRVlY,88
30
30
  haiku/rag/store/models/chunk.py,sha256=ZNyTfO6lh3rXWLVYO3TZcitbL4LSUGr42fR6jQQ5iQc,364
31
31
  haiku/rag/store/models/document.py,sha256=zSSpt6pyrMJAIXGQvIcqojcqUzwZnhp3WxVokaWxNRc,396
32
32
  haiku/rag/store/repositories/__init__.py,sha256=Olv5dLfBQINRV3HrsfUpjzkZ7Qm7goEYyMNykgo_DaY,291
33
- haiku/rag/store/repositories/chunk.py,sha256=v4y4eh4yIf6zJaWfHxljvnmb12dmvwdinzmxQt8Lvhs,13343
34
- haiku/rag/store/repositories/document.py,sha256=lP8Lo82KTP-qwXFRpYZ46WjeAdAsHwZ5pJcrXdz4g0U,6988
33
+ haiku/rag/store/repositories/chunk.py,sha256=n4VMVFPhKj7K2V7llehrpH0wGa-3XYvl6gPYc5H09Vw,14445
34
+ haiku/rag/store/repositories/document.py,sha256=XoLCrMrZqs0iCZoHlDOfRDaVUux77Vdu5iZczduF1rY,7812
35
35
  haiku/rag/store/repositories/settings.py,sha256=wx3fuP_5CpPflZHRrIkeoer6ml-iD0qXERh5k6MQRzI,5291
36
36
  haiku/rag/store/upgrades/__init__.py,sha256=wUiEoSiHTahvuagx93E4FB07v123AhdbOjwUkPusiIg,14
37
- haiku_rag-0.8.0.dist-info/METADATA,sha256=OZfvP7S7MBndpjjTg59UaD9JgB_W39pXpYAjyULjn8A,4610
38
- haiku_rag-0.8.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
39
- haiku_rag-0.8.0.dist-info/entry_points.txt,sha256=G1U3nAkNd5YDYd4v0tuYFbriz0i-JheCsFuT9kIoGCI,48
40
- haiku_rag-0.8.0.dist-info/licenses/LICENSE,sha256=eXZrWjSk9PwYFNK9yUczl3oPl95Z4V9UXH7bPN46iPo,1065
41
- haiku_rag-0.8.0.dist-info/RECORD,,
37
+ haiku_rag-0.8.1.dist-info/METADATA,sha256=jPmTXHAXvT99zs0mFw_UXt80j8APLNkJuP7KspJaVro,4610
38
+ haiku_rag-0.8.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
39
+ haiku_rag-0.8.1.dist-info/entry_points.txt,sha256=G1U3nAkNd5YDYd4v0tuYFbriz0i-JheCsFuT9kIoGCI,48
40
+ haiku_rag-0.8.1.dist-info/licenses/LICENSE,sha256=eXZrWjSk9PwYFNK9yUczl3oPl95Z4V9UXH7bPN46iPo,1065
41
+ haiku_rag-0.8.1.dist-info/RECORD,,