haiku.rag 0.11.1__py3-none-any.whl → 0.11.2__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of haiku.rag might be problematic. Click here for more details.

haiku/rag/client.py CHANGED
@@ -1,4 +1,5 @@
1
1
  import hashlib
2
+ import logging
2
3
  import mimetypes
3
4
  import tempfile
4
5
  from collections.abc import AsyncGenerator
@@ -18,6 +19,8 @@ from haiku.rag.store.repositories.document import DocumentRepository
18
19
  from haiku.rag.store.repositories.settings import SettingsRepository
19
20
  from haiku.rag.utils import text_to_docling_document
20
21
 
22
+ logger = logging.getLogger(__name__)
23
+
21
24
 
22
25
  class HaikuRAG:
23
26
  """High-level haiku-rag client."""
@@ -538,8 +541,8 @@ class HaikuRAG:
538
541
  """Rebuild the database by deleting all chunks and re-indexing all documents.
539
542
 
540
543
  For documents with URIs:
541
- - Deletes the document and re-adds it from source if source exists
542
- - Skips documents where source no longer exists
544
+ - Re-adds from source if source exists
545
+ - Re-embeds from existing content if source is missing
543
546
 
544
547
  For documents without URIs:
545
548
  - Re-creates chunks from existing content
@@ -559,29 +562,51 @@ class HaikuRAG:
559
562
  for doc in documents:
560
563
  assert doc.id is not None, "Document ID should not be None"
561
564
  if doc.uri:
562
- # Document has a URI - delete and try to re-add from source
563
- try:
564
- # Delete the old document first
565
- await self.delete_document(doc.id)
565
+ # Document has a URI - check if source is accessible
566
+ source_accessible = False
567
+ parsed_url = urlparse(doc.uri)
566
568
 
567
- # Try to re-create from source (this creates the document with chunks)
568
- new_doc = await self.create_document_from_source(
569
- source=doc.uri, metadata=doc.metadata or {}
569
+ try:
570
+ if parsed_url.scheme == "file":
571
+ # Check if file exists
572
+ source_path = Path(parsed_url.path)
573
+ source_accessible = source_path.exists()
574
+ elif parsed_url.scheme in ("http", "https"):
575
+ # For URLs, we'll try to create and catch errors
576
+ source_accessible = True
577
+ else:
578
+ source_accessible = False
579
+ except Exception:
580
+ source_accessible = False
581
+
582
+ if source_accessible:
583
+ # Source exists - delete and recreate from source
584
+ try:
585
+ await self.delete_document(doc.id)
586
+ new_doc = await self.create_document_from_source(
587
+ source=doc.uri, metadata=doc.metadata or {}
588
+ )
589
+ assert new_doc.id is not None, (
590
+ "New document ID should not be None"
591
+ )
592
+ yield new_doc.id
593
+ except Exception as e:
594
+ logger.error(
595
+ "Error recreating document from source %s: %s",
596
+ doc.uri,
597
+ e,
598
+ )
599
+ continue
600
+ else:
601
+ # Source missing - re-embed from existing content
602
+ logger.warning(
603
+ "Source missing for %s, re-embedding from content", doc.uri
570
604
  )
571
-
572
- assert new_doc.id is not None, "New document ID should not be None"
573
- yield new_doc.id
574
-
575
- except (FileNotFoundError, ValueError, OSError) as e:
576
- # Source doesn't exist or can't be accessed - document already deleted, skip
577
- print(f"Skipping document with URI {doc.uri}: {e}")
578
- continue
579
- except Exception as e:
580
- # Unexpected error - log it and skip
581
- print(
582
- f"Unexpected error processing document with URI {doc.uri}: {e}"
605
+ docling_document = text_to_docling_document(doc.content)
606
+ await self.chunk_repository.create_chunks_for_document(
607
+ doc.id, docling_document
583
608
  )
584
- continue
609
+ yield doc.id
585
610
  else:
586
611
  # Document without URI - re-create chunks from existing content
587
612
  docling_document = text_to_docling_document(doc.content)
haiku/rag/config.py CHANGED
@@ -20,8 +20,8 @@ class AppConfig(BaseModel):
20
20
  MONITOR_DIRECTORIES: list[Path] = []
21
21
 
22
22
  EMBEDDINGS_PROVIDER: str = "ollama"
23
- EMBEDDINGS_MODEL: str = "mxbai-embed-large"
24
- EMBEDDINGS_VECTOR_DIM: int = 1024
23
+ EMBEDDINGS_MODEL: str = "qwen3-embedding"
24
+ EMBEDDINGS_VECTOR_DIM: int = 4096
25
25
 
26
26
  RERANK_PROVIDER: str = ""
27
27
  RERANK_MODEL: str = ""
@@ -7,6 +7,8 @@ from haiku.rag.embeddings.base import EmbedderBase
7
7
  class Embedder(EmbedderBase):
8
8
  async def embed(self, text: str | list[str]) -> list[float] | list[list[float]]:
9
9
  client = AsyncOpenAI(base_url=f"{Config.OLLAMA_BASE_URL}/v1", api_key="dummy")
10
+ if not text:
11
+ return []
10
12
  response = await client.embeddings.create(
11
13
  model=self._model,
12
14
  input=text,
@@ -6,6 +6,8 @@ from haiku.rag.embeddings.base import EmbedderBase
6
6
  class Embedder(EmbedderBase):
7
7
  async def embed(self, text: str | list[str]) -> list[float] | list[list[float]]:
8
8
  client = AsyncOpenAI()
9
+ if not text:
10
+ return []
9
11
  response = await client.embeddings.create(
10
12
  model=self._model,
11
13
  input=text,
@@ -9,6 +9,8 @@ class Embedder(EmbedderBase):
9
9
  client = AsyncOpenAI(
10
10
  base_url=f"{Config.VLLM_EMBEDDINGS_BASE_URL}/v1", api_key="dummy"
11
11
  )
12
+ if not text:
13
+ return []
12
14
  response = await client.embeddings.create(
13
15
  model=self._model,
14
16
  input=text,
@@ -6,6 +6,8 @@ try:
6
6
  class Embedder(EmbedderBase):
7
7
  async def embed(self, text: str | list[str]) -> list[float] | list[list[float]]:
8
8
  client = Client()
9
+ if not text:
10
+ return []
9
11
  if isinstance(text, str):
10
12
  res = client.embed([text], model=self._model, output_dtype="float")
11
13
  return res.embeddings[0] # type: ignore[return-value]
haiku/rag/monitor.py CHANGED
@@ -1,13 +1,13 @@
1
+ import logging
1
2
  from pathlib import Path
2
3
 
3
4
  from watchfiles import Change, DefaultFilter, awatch
4
5
 
5
6
  from haiku.rag.client import HaikuRAG
6
- from haiku.rag.logging import get_logger
7
7
  from haiku.rag.reader import FileReader
8
8
  from haiku.rag.store.models.document import Document
9
9
 
10
- logger = get_logger()
10
+ logger = logging.getLogger(__name__)
11
11
 
12
12
 
13
13
  class FileFilter(DefaultFilter):
@@ -1,3 +1,5 @@
1
+ import os
2
+
1
3
  from haiku.rag.config import Config
2
4
  from haiku.rag.reranking.base import RerankerBase
3
5
 
@@ -17,6 +19,7 @@ def get_reranker() -> RerankerBase | None:
17
19
  try:
18
20
  from haiku.rag.reranking.mxbai import MxBAIReranker
19
21
 
22
+ os.environ["TOKENIZERS_PARALLELISM"] = "true"
20
23
  _reranker = MxBAIReranker()
21
24
  return _reranker
22
25
  except ImportError:
@@ -133,17 +133,17 @@ class SettingsRepository:
133
133
 
134
134
  if stored_provider and stored_provider != current_provider:
135
135
  incompatible_changes.append(
136
- f"Embedding provider changed from '{stored_provider}' to '{current_provider}'"
136
+ f"Stored (db) embedding provider: '{stored_provider}' -> Environment (current) embedding provider: '{current_provider}'"
137
137
  )
138
138
 
139
139
  if stored_model and stored_model != current_model:
140
140
  incompatible_changes.append(
141
- f"Embedding model changed from '{stored_model}' to '{current_model}'"
141
+ f"Stored (db) embedding model '{stored_model}' -> Environment (current) embedding model '{current_model}'"
142
142
  )
143
143
 
144
144
  if stored_vector_dim and stored_vector_dim != current_vector_dim:
145
145
  incompatible_changes.append(
146
- f"Vector dimension changed from {stored_vector_dim} to {current_vector_dim}"
146
+ f"Stored (db) embedding vector dimension {stored_vector_dim} -> Environment (current) embedding vector dimension {current_vector_dim}"
147
147
  )
148
148
 
149
149
  if incompatible_changes:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: haiku.rag
3
- Version: 0.11.1
3
+ Version: 0.11.2
4
4
  Summary: Agentic Retrieval Augmented Generation (RAG) with LanceDB
5
5
  Author-email: Yiorgis Gozadinos <ggozadinos@gmail.com>
6
6
  License: MIT
@@ -2,24 +2,24 @@ haiku/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  haiku/rag/app.py,sha256=TRFwMP9mzLaM7EPc7dhsPODKZxCDkSSgPCnGAdj65VU,17929
3
3
  haiku/rag/chunker.py,sha256=PVe6ysv8UlacUd4Zb3_8RFWIaWDXnzBAy2VDJ4TaUsE,1555
4
4
  haiku/rag/cli.py,sha256=wreAxyXSRnn7f09t9SGe4uAXQjlieUQIpNpOapJT7y8,12910
5
- haiku/rag/client.py,sha256=iUaa6YUac3CXFniIm8DsaaNsiyHsi4cp8-fPhF5XuVU,22925
6
- haiku/rag/config.py,sha256=SEV2OzaKavYwHZ0LmRzBj-0dbI6YFIRuNiTw9el7SO0,2307
5
+ haiku/rag/client.py,sha256=fz5bZP1KNWbc9cvpEC8puMBHEvt-vVtFjRMItt_WD0M,23920
6
+ haiku/rag/config.py,sha256=c2WoaieI3-HAWb6lCmVnJHY22NXl2SGLsndRbiqCzeA,2305
7
7
  haiku/rag/logging.py,sha256=dm65AwADpcQsH5OAPtRA-4hsw0w5DK-sGOvzYkj6jzw,1720
8
8
  haiku/rag/mcp.py,sha256=H7XibtSNUviFeaJVsXzHiRqUm0nJCpA7A1QHuBv6SKQ,5057
9
9
  haiku/rag/migration.py,sha256=zm0-60PiS1hIQnZz65B7qfsgM7GwZVXFqMFowjpVBs8,11058
10
- haiku/rag/monitor.py,sha256=r386nkhdlsU8UECwIuVwnrSlgMk3vNIuUZGNIzkZuec,2770
10
+ haiku/rag/monitor.py,sha256=VP3bqY0mEodOP60eN4RMldgrL1ti5gMjuDuQ-_vBvFc,2759
11
11
  haiku/rag/reader.py,sha256=aW8LG0X31kVWS7kU2tKVpe8RqP3Ne_oIidd_X3UDLH0,3307
12
12
  haiku/rag/utils.py,sha256=dBzhKaOHI9KRiJqHErcXUnqtnXY2AgOK8PCLA3rhO0A,6115
13
13
  haiku/rag/embeddings/__init__.py,sha256=44IfDITGIFTflGT6UEmiYOwpWFVbYv5smLY59D0YeCs,1419
14
14
  haiku/rag/embeddings/base.py,sha256=BnSviKrlzjv3L0sZJs_T-pxfawd-bcTak-rsX-D2f3A,497
15
- haiku/rag/embeddings/ollama.py,sha256=LuLlHH6RGoO9_gFCIlbmesuXOj017gTw6z-p8Ez0CfE,595
16
- haiku/rag/embeddings/openai.py,sha256=fIFCk-jpUtaW0xsnrQnJ824O0UCjaGG2sgvBzREhilc,503
17
- haiku/rag/embeddings/vllm.py,sha256=vhaUnCn6VMkfSluLhWKtSV-sekFaPsp4pKo2N7-SBCY,626
18
- haiku/rag/embeddings/voyageai.py,sha256=UW-MW4tJKnPB6Fs2P7A3yt-ZeRm46H9npckchSriPX8,661
15
+ haiku/rag/embeddings/ollama.py,sha256=c1BeKTgpymniZw1sm4iAIdK5vA0MYoRzHLcd2_pFA44,638
16
+ haiku/rag/embeddings/openai.py,sha256=bwoUVlzu9UtbDpN7CtG6OPt0d5tfJNeje4lR81Btpl0,546
17
+ haiku/rag/embeddings/vllm.py,sha256=7ocp9D9bD1R5rqRIC4-Vih9VlKQNuD429k8-9wu234E,669
18
+ haiku/rag/embeddings/voyageai.py,sha256=I4kVdT2KPtwcbjxD22GWJmgcIQIEEHpkOY2_QbFh7mQ,712
19
19
  haiku/rag/qa/__init__.py,sha256=Sl7Kzrg9CuBOcMF01wc1NtQhUNWjJI0MhIHfCWrb8V4,434
20
20
  haiku/rag/qa/agent.py,sha256=rtUkEmnD8lMHIxpPPVY6TdmF4aSlZnLjad5eDefrlBw,3145
21
21
  haiku/rag/qa/prompts.py,sha256=Lqwn3m4zCsu_CJiC4s9cLsuPNbb9nq6j2PqEF3lw1eA,3380
22
- haiku/rag/reranking/__init__.py,sha256=IRXHs4qPu6VbGJQpzSwhgtVWWumURH_vEoVFE-extlo,894
22
+ haiku/rag/reranking/__init__.py,sha256=95ApqN51rcog9MLkTh_uNE69qOVozO1Z6KMbZZj8nH0,963
23
23
  haiku/rag/reranking/base.py,sha256=LM9yUSSJ414UgBZhFTgxGprlRqzfTe4I1vgjricz2JY,405
24
24
  haiku/rag/reranking/cohere.py,sha256=1iTdiaa8vvb6oHVB2qpWzUOVkyfUcimVSZp6Qr4aq4c,1049
25
25
  haiku/rag/reranking/mxbai.py,sha256=uveGFIdmNmepd2EQsvYr64wv0ra2_wB845hdSZXy5Cw,908
@@ -44,12 +44,12 @@ haiku/rag/store/models/document.py,sha256=cZXy_jEti-hnhq7FKhuhCfd99ccY9fIHMLovB_
44
44
  haiku/rag/store/repositories/__init__.py,sha256=Olv5dLfBQINRV3HrsfUpjzkZ7Qm7goEYyMNykgo_DaY,291
45
45
  haiku/rag/store/repositories/chunk.py,sha256=UfajEWf5VmMuSozGRDlWBjJNR0ngvOVFDrp6_augzBg,15217
46
46
  haiku/rag/store/repositories/document.py,sha256=C9GbIl8sa2-Djaml4hlaPTtjV2HwHaz_Wzs35sdbdhg,7876
47
- haiku/rag/store/repositories/settings.py,sha256=7XMBMavU8zRgdBoQzQg0Obfa7UKjuVnBugidTC6sEW0,5548
47
+ haiku/rag/store/repositories/settings.py,sha256=ObrDrzxHn-yA1WcbgIoJoVmAbVvQHAFvEdRyJFt5Opc,5685
48
48
  haiku/rag/store/upgrades/__init__.py,sha256=RQ8A6rEXBASLb5PD9vdDnEas_m_GgRzzdVu4B88Snqc,1975
49
49
  haiku/rag/store/upgrades/v0_10_1.py,sha256=qNGnxj6hoHaHJ1rKTiALfw0c9NQOi0KAK-VZCD_073A,1959
50
50
  haiku/rag/store/upgrades/v0_9_3.py,sha256=NrjNilQSgDtFWRbL3ZUtzQzJ8tf9u0dDRJtnDFwwbdw,3322
51
- haiku_rag-0.11.1.dist-info/METADATA,sha256=hNE2XuctQAN0UeUYW1l0eVYDioV0iHNy3ZfMmDjFI_M,6542
52
- haiku_rag-0.11.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
53
- haiku_rag-0.11.1.dist-info/entry_points.txt,sha256=G1U3nAkNd5YDYd4v0tuYFbriz0i-JheCsFuT9kIoGCI,48
54
- haiku_rag-0.11.1.dist-info/licenses/LICENSE,sha256=eXZrWjSk9PwYFNK9yUczl3oPl95Z4V9UXH7bPN46iPo,1065
55
- haiku_rag-0.11.1.dist-info/RECORD,,
51
+ haiku_rag-0.11.2.dist-info/METADATA,sha256=Eij1eM8K5MOYM1QZccm96n6v97r9x-lS3RG8tdPcXPw,6542
52
+ haiku_rag-0.11.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
53
+ haiku_rag-0.11.2.dist-info/entry_points.txt,sha256=G1U3nAkNd5YDYd4v0tuYFbriz0i-JheCsFuT9kIoGCI,48
54
+ haiku_rag-0.11.2.dist-info/licenses/LICENSE,sha256=eXZrWjSk9PwYFNK9yUczl3oPl95Z4V9UXH7bPN46iPo,1065
55
+ haiku_rag-0.11.2.dist-info/RECORD,,