haiku.rag 0.11.1__py3-none-any.whl → 0.11.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of haiku.rag might be problematic. Click here for more details.
- haiku/rag/client.py +47 -22
- haiku/rag/config.py +2 -2
- haiku/rag/embeddings/ollama.py +2 -0
- haiku/rag/embeddings/openai.py +2 -0
- haiku/rag/embeddings/vllm.py +2 -0
- haiku/rag/embeddings/voyageai.py +2 -0
- haiku/rag/monitor.py +2 -2
- haiku/rag/reranking/__init__.py +3 -0
- haiku/rag/store/repositories/settings.py +3 -3
- {haiku_rag-0.11.1.dist-info → haiku_rag-0.11.2.dist-info}/METADATA +1 -1
- {haiku_rag-0.11.1.dist-info → haiku_rag-0.11.2.dist-info}/RECORD +14 -14
- {haiku_rag-0.11.1.dist-info → haiku_rag-0.11.2.dist-info}/WHEEL +0 -0
- {haiku_rag-0.11.1.dist-info → haiku_rag-0.11.2.dist-info}/entry_points.txt +0 -0
- {haiku_rag-0.11.1.dist-info → haiku_rag-0.11.2.dist-info}/licenses/LICENSE +0 -0
haiku/rag/client.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import hashlib
|
|
2
|
+
import logging
|
|
2
3
|
import mimetypes
|
|
3
4
|
import tempfile
|
|
4
5
|
from collections.abc import AsyncGenerator
|
|
@@ -18,6 +19,8 @@ from haiku.rag.store.repositories.document import DocumentRepository
|
|
|
18
19
|
from haiku.rag.store.repositories.settings import SettingsRepository
|
|
19
20
|
from haiku.rag.utils import text_to_docling_document
|
|
20
21
|
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
21
24
|
|
|
22
25
|
class HaikuRAG:
|
|
23
26
|
"""High-level haiku-rag client."""
|
|
@@ -538,8 +541,8 @@ class HaikuRAG:
|
|
|
538
541
|
"""Rebuild the database by deleting all chunks and re-indexing all documents.
|
|
539
542
|
|
|
540
543
|
For documents with URIs:
|
|
541
|
-
-
|
|
542
|
-
-
|
|
544
|
+
- Re-adds from source if source exists
|
|
545
|
+
- Re-embeds from existing content if source is missing
|
|
543
546
|
|
|
544
547
|
For documents without URIs:
|
|
545
548
|
- Re-creates chunks from existing content
|
|
@@ -559,29 +562,51 @@ class HaikuRAG:
|
|
|
559
562
|
for doc in documents:
|
|
560
563
|
assert doc.id is not None, "Document ID should not be None"
|
|
561
564
|
if doc.uri:
|
|
562
|
-
# Document has a URI -
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
await self.delete_document(doc.id)
|
|
565
|
+
# Document has a URI - check if source is accessible
|
|
566
|
+
source_accessible = False
|
|
567
|
+
parsed_url = urlparse(doc.uri)
|
|
566
568
|
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
569
|
+
try:
|
|
570
|
+
if parsed_url.scheme == "file":
|
|
571
|
+
# Check if file exists
|
|
572
|
+
source_path = Path(parsed_url.path)
|
|
573
|
+
source_accessible = source_path.exists()
|
|
574
|
+
elif parsed_url.scheme in ("http", "https"):
|
|
575
|
+
# For URLs, we'll try to create and catch errors
|
|
576
|
+
source_accessible = True
|
|
577
|
+
else:
|
|
578
|
+
source_accessible = False
|
|
579
|
+
except Exception:
|
|
580
|
+
source_accessible = False
|
|
581
|
+
|
|
582
|
+
if source_accessible:
|
|
583
|
+
# Source exists - delete and recreate from source
|
|
584
|
+
try:
|
|
585
|
+
await self.delete_document(doc.id)
|
|
586
|
+
new_doc = await self.create_document_from_source(
|
|
587
|
+
source=doc.uri, metadata=doc.metadata or {}
|
|
588
|
+
)
|
|
589
|
+
assert new_doc.id is not None, (
|
|
590
|
+
"New document ID should not be None"
|
|
591
|
+
)
|
|
592
|
+
yield new_doc.id
|
|
593
|
+
except Exception as e:
|
|
594
|
+
logger.error(
|
|
595
|
+
"Error recreating document from source %s: %s",
|
|
596
|
+
doc.uri,
|
|
597
|
+
e,
|
|
598
|
+
)
|
|
599
|
+
continue
|
|
600
|
+
else:
|
|
601
|
+
# Source missing - re-embed from existing content
|
|
602
|
+
logger.warning(
|
|
603
|
+
"Source missing for %s, re-embedding from content", doc.uri
|
|
570
604
|
)
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
except (FileNotFoundError, ValueError, OSError) as e:
|
|
576
|
-
# Source doesn't exist or can't be accessed - document already deleted, skip
|
|
577
|
-
print(f"Skipping document with URI {doc.uri}: {e}")
|
|
578
|
-
continue
|
|
579
|
-
except Exception as e:
|
|
580
|
-
# Unexpected error - log it and skip
|
|
581
|
-
print(
|
|
582
|
-
f"Unexpected error processing document with URI {doc.uri}: {e}"
|
|
605
|
+
docling_document = text_to_docling_document(doc.content)
|
|
606
|
+
await self.chunk_repository.create_chunks_for_document(
|
|
607
|
+
doc.id, docling_document
|
|
583
608
|
)
|
|
584
|
-
|
|
609
|
+
yield doc.id
|
|
585
610
|
else:
|
|
586
611
|
# Document without URI - re-create chunks from existing content
|
|
587
612
|
docling_document = text_to_docling_document(doc.content)
|
haiku/rag/config.py
CHANGED
|
@@ -20,8 +20,8 @@ class AppConfig(BaseModel):
|
|
|
20
20
|
MONITOR_DIRECTORIES: list[Path] = []
|
|
21
21
|
|
|
22
22
|
EMBEDDINGS_PROVIDER: str = "ollama"
|
|
23
|
-
EMBEDDINGS_MODEL: str = "
|
|
24
|
-
EMBEDDINGS_VECTOR_DIM: int =
|
|
23
|
+
EMBEDDINGS_MODEL: str = "qwen3-embedding"
|
|
24
|
+
EMBEDDINGS_VECTOR_DIM: int = 4096
|
|
25
25
|
|
|
26
26
|
RERANK_PROVIDER: str = ""
|
|
27
27
|
RERANK_MODEL: str = ""
|
haiku/rag/embeddings/ollama.py
CHANGED
|
@@ -7,6 +7,8 @@ from haiku.rag.embeddings.base import EmbedderBase
|
|
|
7
7
|
class Embedder(EmbedderBase):
|
|
8
8
|
async def embed(self, text: str | list[str]) -> list[float] | list[list[float]]:
|
|
9
9
|
client = AsyncOpenAI(base_url=f"{Config.OLLAMA_BASE_URL}/v1", api_key="dummy")
|
|
10
|
+
if not text:
|
|
11
|
+
return []
|
|
10
12
|
response = await client.embeddings.create(
|
|
11
13
|
model=self._model,
|
|
12
14
|
input=text,
|
haiku/rag/embeddings/openai.py
CHANGED
|
@@ -6,6 +6,8 @@ from haiku.rag.embeddings.base import EmbedderBase
|
|
|
6
6
|
class Embedder(EmbedderBase):
|
|
7
7
|
async def embed(self, text: str | list[str]) -> list[float] | list[list[float]]:
|
|
8
8
|
client = AsyncOpenAI()
|
|
9
|
+
if not text:
|
|
10
|
+
return []
|
|
9
11
|
response = await client.embeddings.create(
|
|
10
12
|
model=self._model,
|
|
11
13
|
input=text,
|
haiku/rag/embeddings/vllm.py
CHANGED
haiku/rag/embeddings/voyageai.py
CHANGED
|
@@ -6,6 +6,8 @@ try:
|
|
|
6
6
|
class Embedder(EmbedderBase):
|
|
7
7
|
async def embed(self, text: str | list[str]) -> list[float] | list[list[float]]:
|
|
8
8
|
client = Client()
|
|
9
|
+
if not text:
|
|
10
|
+
return []
|
|
9
11
|
if isinstance(text, str):
|
|
10
12
|
res = client.embed([text], model=self._model, output_dtype="float")
|
|
11
13
|
return res.embeddings[0] # type: ignore[return-value]
|
haiku/rag/monitor.py
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
|
+
import logging
|
|
1
2
|
from pathlib import Path
|
|
2
3
|
|
|
3
4
|
from watchfiles import Change, DefaultFilter, awatch
|
|
4
5
|
|
|
5
6
|
from haiku.rag.client import HaikuRAG
|
|
6
|
-
from haiku.rag.logging import get_logger
|
|
7
7
|
from haiku.rag.reader import FileReader
|
|
8
8
|
from haiku.rag.store.models.document import Document
|
|
9
9
|
|
|
10
|
-
logger =
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
11
|
|
|
12
12
|
|
|
13
13
|
class FileFilter(DefaultFilter):
|
haiku/rag/reranking/__init__.py
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
import os
|
|
2
|
+
|
|
1
3
|
from haiku.rag.config import Config
|
|
2
4
|
from haiku.rag.reranking.base import RerankerBase
|
|
3
5
|
|
|
@@ -17,6 +19,7 @@ def get_reranker() -> RerankerBase | None:
|
|
|
17
19
|
try:
|
|
18
20
|
from haiku.rag.reranking.mxbai import MxBAIReranker
|
|
19
21
|
|
|
22
|
+
os.environ["TOKENIZERS_PARALLELISM"] = "true"
|
|
20
23
|
_reranker = MxBAIReranker()
|
|
21
24
|
return _reranker
|
|
22
25
|
except ImportError:
|
haiku/rag/store/repositories/settings.py
CHANGED
|
@@ -133,17 +133,17 @@ class SettingsRepository:
|
|
|
133
133
|
|
|
134
134
|
if stored_provider and stored_provider != current_provider:
|
|
135
135
|
incompatible_changes.append(
|
|
136
|
-
f"
|
|
136
|
+
f"Stored (db) embedding provider: '{stored_provider}' -> Environment (current) embedding provider: '{current_provider}'"
|
|
137
137
|
)
|
|
138
138
|
|
|
139
139
|
if stored_model and stored_model != current_model:
|
|
140
140
|
incompatible_changes.append(
|
|
141
|
-
f"
|
|
141
|
+
f"Stored (db) embedding model '{stored_model}' -> Environment (current) embedding model '{current_model}'"
|
|
142
142
|
)
|
|
143
143
|
|
|
144
144
|
if stored_vector_dim and stored_vector_dim != current_vector_dim:
|
|
145
145
|
incompatible_changes.append(
|
|
146
|
-
f"
|
|
146
|
+
f"Stored (db) embedding vector dimension {stored_vector_dim} -> Environment (current) embedding vector dimension {current_vector_dim}"
|
|
147
147
|
)
|
|
148
148
|
|
|
149
149
|
if incompatible_changes:
|
{haiku_rag-0.11.1.dist-info → haiku_rag-0.11.2.dist-info}/RECORD
CHANGED
|
@@ -2,24 +2,24 @@ haiku/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
|
2
2
|
haiku/rag/app.py,sha256=TRFwMP9mzLaM7EPc7dhsPODKZxCDkSSgPCnGAdj65VU,17929
|
|
3
3
|
haiku/rag/chunker.py,sha256=PVe6ysv8UlacUd4Zb3_8RFWIaWDXnzBAy2VDJ4TaUsE,1555
|
|
4
4
|
haiku/rag/cli.py,sha256=wreAxyXSRnn7f09t9SGe4uAXQjlieUQIpNpOapJT7y8,12910
|
|
5
|
-
haiku/rag/client.py,sha256=
|
|
6
|
-
haiku/rag/config.py,sha256=
|
|
5
|
+
haiku/rag/client.py,sha256=fz5bZP1KNWbc9cvpEC8puMBHEvt-vVtFjRMItt_WD0M,23920
|
|
6
|
+
haiku/rag/config.py,sha256=c2WoaieI3-HAWb6lCmVnJHY22NXl2SGLsndRbiqCzeA,2305
|
|
7
7
|
haiku/rag/logging.py,sha256=dm65AwADpcQsH5OAPtRA-4hsw0w5DK-sGOvzYkj6jzw,1720
|
|
8
8
|
haiku/rag/mcp.py,sha256=H7XibtSNUviFeaJVsXzHiRqUm0nJCpA7A1QHuBv6SKQ,5057
|
|
9
9
|
haiku/rag/migration.py,sha256=zm0-60PiS1hIQnZz65B7qfsgM7GwZVXFqMFowjpVBs8,11058
|
|
10
|
-
haiku/rag/monitor.py,sha256=
|
|
10
|
+
haiku/rag/monitor.py,sha256=VP3bqY0mEodOP60eN4RMldgrL1ti5gMjuDuQ-_vBvFc,2759
|
|
11
11
|
haiku/rag/reader.py,sha256=aW8LG0X31kVWS7kU2tKVpe8RqP3Ne_oIidd_X3UDLH0,3307
|
|
12
12
|
haiku/rag/utils.py,sha256=dBzhKaOHI9KRiJqHErcXUnqtnXY2AgOK8PCLA3rhO0A,6115
|
|
13
13
|
haiku/rag/embeddings/__init__.py,sha256=44IfDITGIFTflGT6UEmiYOwpWFVbYv5smLY59D0YeCs,1419
|
|
14
14
|
haiku/rag/embeddings/base.py,sha256=BnSviKrlzjv3L0sZJs_T-pxfawd-bcTak-rsX-D2f3A,497
|
|
15
|
-
haiku/rag/embeddings/ollama.py,sha256=
|
|
16
|
-
haiku/rag/embeddings/openai.py,sha256=
|
|
17
|
-
haiku/rag/embeddings/vllm.py,sha256=
|
|
18
|
-
haiku/rag/embeddings/voyageai.py,sha256=
|
|
15
|
+
haiku/rag/embeddings/ollama.py,sha256=c1BeKTgpymniZw1sm4iAIdK5vA0MYoRzHLcd2_pFA44,638
|
|
16
|
+
haiku/rag/embeddings/openai.py,sha256=bwoUVlzu9UtbDpN7CtG6OPt0d5tfJNeje4lR81Btpl0,546
|
|
17
|
+
haiku/rag/embeddings/vllm.py,sha256=7ocp9D9bD1R5rqRIC4-Vih9VlKQNuD429k8-9wu234E,669
|
|
18
|
+
haiku/rag/embeddings/voyageai.py,sha256=I4kVdT2KPtwcbjxD22GWJmgcIQIEEHpkOY2_QbFh7mQ,712
|
|
19
19
|
haiku/rag/qa/__init__.py,sha256=Sl7Kzrg9CuBOcMF01wc1NtQhUNWjJI0MhIHfCWrb8V4,434
|
|
20
20
|
haiku/rag/qa/agent.py,sha256=rtUkEmnD8lMHIxpPPVY6TdmF4aSlZnLjad5eDefrlBw,3145
|
|
21
21
|
haiku/rag/qa/prompts.py,sha256=Lqwn3m4zCsu_CJiC4s9cLsuPNbb9nq6j2PqEF3lw1eA,3380
|
|
22
|
-
haiku/rag/reranking/__init__.py,sha256=
|
|
22
|
+
haiku/rag/reranking/__init__.py,sha256=95ApqN51rcog9MLkTh_uNE69qOVozO1Z6KMbZZj8nH0,963
|
|
23
23
|
haiku/rag/reranking/base.py,sha256=LM9yUSSJ414UgBZhFTgxGprlRqzfTe4I1vgjricz2JY,405
|
|
24
24
|
haiku/rag/reranking/cohere.py,sha256=1iTdiaa8vvb6oHVB2qpWzUOVkyfUcimVSZp6Qr4aq4c,1049
|
|
25
25
|
haiku/rag/reranking/mxbai.py,sha256=uveGFIdmNmepd2EQsvYr64wv0ra2_wB845hdSZXy5Cw,908
|
|
@@ -44,12 +44,12 @@ haiku/rag/store/models/document.py,sha256=cZXy_jEti-hnhq7FKhuhCfd99ccY9fIHMLovB_
|
|
|
44
44
|
haiku/rag/store/repositories/__init__.py,sha256=Olv5dLfBQINRV3HrsfUpjzkZ7Qm7goEYyMNykgo_DaY,291
|
|
45
45
|
haiku/rag/store/repositories/chunk.py,sha256=UfajEWf5VmMuSozGRDlWBjJNR0ngvOVFDrp6_augzBg,15217
|
|
46
46
|
haiku/rag/store/repositories/document.py,sha256=C9GbIl8sa2-Djaml4hlaPTtjV2HwHaz_Wzs35sdbdhg,7876
|
|
47
|
-
haiku/rag/store/repositories/settings.py,sha256=
|
|
47
|
+
haiku/rag/store/repositories/settings.py,sha256=ObrDrzxHn-yA1WcbgIoJoVmAbVvQHAFvEdRyJFt5Opc,5685
|
|
48
48
|
haiku/rag/store/upgrades/__init__.py,sha256=RQ8A6rEXBASLb5PD9vdDnEas_m_GgRzzdVu4B88Snqc,1975
|
|
49
49
|
haiku/rag/store/upgrades/v0_10_1.py,sha256=qNGnxj6hoHaHJ1rKTiALfw0c9NQOi0KAK-VZCD_073A,1959
|
|
50
50
|
haiku/rag/store/upgrades/v0_9_3.py,sha256=NrjNilQSgDtFWRbL3ZUtzQzJ8tf9u0dDRJtnDFwwbdw,3322
|
|
51
|
-
haiku_rag-0.11.
|
|
52
|
-
haiku_rag-0.11.
|
|
53
|
-
haiku_rag-0.11.
|
|
54
|
-
haiku_rag-0.11.
|
|
55
|
-
haiku_rag-0.11.
|
|
51
|
+
haiku_rag-0.11.2.dist-info/METADATA,sha256=Eij1eM8K5MOYM1QZccm96n6v97r9x-lS3RG8tdPcXPw,6542
|
|
52
|
+
haiku_rag-0.11.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
53
|
+
haiku_rag-0.11.2.dist-info/entry_points.txt,sha256=G1U3nAkNd5YDYd4v0tuYFbriz0i-JheCsFuT9kIoGCI,48
|
|
54
|
+
haiku_rag-0.11.2.dist-info/licenses/LICENSE,sha256=eXZrWjSk9PwYFNK9yUczl3oPl95Z4V9UXH7bPN46iPo,1065
|
|
55
|
+
haiku_rag-0.11.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|