ragit 0.8.2__tar.gz → 0.10.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ragit-0.8.2/ragit.egg-info → ragit-0.10.1}/PKG-INFO +9 -22
- {ragit-0.8.2 → ragit-0.10.1}/README.md +6 -18
- {ragit-0.8.2 → ragit-0.10.1}/pyproject.toml +2 -3
- {ragit-0.8.2 → ragit-0.10.1}/ragit/__init__.py +27 -15
- {ragit-0.8.2 → ragit-0.10.1}/ragit/assistant.py +189 -9
- ragit-0.10.1/ragit/config.py +204 -0
- {ragit-0.8.2 → ragit-0.10.1}/ragit/core/experiment/experiment.py +7 -1
- ragit-0.10.1/ragit/exceptions.py +271 -0
- ragit-0.10.1/ragit/loaders.py +401 -0
- ragit-0.10.1/ragit/logging.py +194 -0
- ragit-0.10.1/ragit/monitor.py +307 -0
- {ragit-0.8.2 → ragit-0.10.1}/ragit/providers/__init__.py +1 -13
- ragit-0.10.1/ragit/providers/ollama.py +670 -0
- {ragit-0.8.2 → ragit-0.10.1}/ragit/version.py +1 -1
- {ragit-0.8.2 → ragit-0.10.1/ragit.egg-info}/PKG-INFO +9 -22
- {ragit-0.8.2 → ragit-0.10.1}/ragit.egg-info/SOURCES.txt +3 -1
- {ragit-0.8.2 → ragit-0.10.1}/ragit.egg-info/requires.txt +1 -3
- ragit-0.8.2/ragit/config.py +0 -60
- ragit-0.8.2/ragit/loaders.py +0 -245
- ragit-0.8.2/ragit/providers/ollama.py +0 -446
- ragit-0.8.2/ragit/providers/sentence_transformers.py +0 -225
- {ragit-0.8.2 → ragit-0.10.1}/LICENSE +0 -0
- {ragit-0.8.2 → ragit-0.10.1}/ragit/core/__init__.py +0 -0
- {ragit-0.8.2 → ragit-0.10.1}/ragit/core/experiment/__init__.py +0 -0
- {ragit-0.8.2 → ragit-0.10.1}/ragit/core/experiment/results.py +0 -0
- {ragit-0.8.2 → ragit-0.10.1}/ragit/providers/base.py +0 -0
- {ragit-0.8.2 → ragit-0.10.1}/ragit/providers/function_adapter.py +0 -0
- {ragit-0.8.2 → ragit-0.10.1}/ragit/utils/__init__.py +0 -0
- {ragit-0.8.2 → ragit-0.10.1}/ragit.egg-info/dependency_links.txt +0 -0
- {ragit-0.8.2 → ragit-0.10.1}/ragit.egg-info/top_level.txt +0 -0
- {ragit-0.8.2 → ragit-0.10.1}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ragit
|
|
3
|
-
Version: 0.8.2
|
|
3
|
+
Version: 0.10.1
|
|
4
4
|
Summary: Automatic RAG Pattern Optimization Engine
|
|
5
5
|
Author: RODMENA LIMITED
|
|
6
6
|
Maintainer-email: RODMENA LIMITED <info@rodmena.co.uk>
|
|
@@ -16,7 +16,7 @@ Classifier: Programming Language :: Python :: 3.13
|
|
|
16
16
|
Classifier: Programming Language :: Python :: 3.14
|
|
17
17
|
Classifier: Operating System :: MacOS :: MacOS X
|
|
18
18
|
Classifier: Operating System :: POSIX :: Linux
|
|
19
|
-
Requires-Python:
|
|
19
|
+
Requires-Python: >=3.12
|
|
20
20
|
Description-Content-Type: text/markdown
|
|
21
21
|
License-File: LICENSE
|
|
22
22
|
Requires-Dist: requests>=2.31.0
|
|
@@ -28,6 +28,7 @@ Requires-Dist: scikit-learn>=1.5.0
|
|
|
28
28
|
Requires-Dist: tqdm>=4.66.0
|
|
29
29
|
Requires-Dist: trio>=0.24.0
|
|
30
30
|
Requires-Dist: httpx>=0.27.0
|
|
31
|
+
Requires-Dist: resilient-circuit>=0.4.7
|
|
31
32
|
Provides-Extra: dev
|
|
32
33
|
Requires-Dist: ragit[test]; extra == "dev"
|
|
33
34
|
Requires-Dist: pytest; extra == "dev"
|
|
@@ -39,8 +40,6 @@ Provides-Extra: test
|
|
|
39
40
|
Requires-Dist: pytest; extra == "test"
|
|
40
41
|
Requires-Dist: pytest-cov; extra == "test"
|
|
41
42
|
Requires-Dist: pytest-mock; extra == "test"
|
|
42
|
-
Provides-Extra: transformers
|
|
43
|
-
Requires-Dist: sentence-transformers>=2.2.0; extra == "transformers"
|
|
44
43
|
Provides-Extra: docs
|
|
45
44
|
Requires-Dist: sphinx>=7.0; extra == "docs"
|
|
46
45
|
Requires-Dist: sphinx-rtd-theme>=2.0; extra == "docs"
|
|
@@ -55,14 +54,11 @@ RAG toolkit for Python. Document loading, chunking, vector search, LLM integrati
|
|
|
55
54
|
|
|
56
55
|
```bash
|
|
57
56
|
pip install ragit
|
|
58
|
-
|
|
59
|
-
# For offline embedding
|
|
60
|
-
pip install ragit[transformers]
|
|
61
57
|
```
|
|
62
58
|
|
|
63
59
|
## Quick Start
|
|
64
60
|
|
|
65
|
-
You must provide an embedding source: custom function,
|
|
61
|
+
You must provide an embedding source: custom function, Ollama, or any provider.
|
|
66
62
|
|
|
67
63
|
### Custom Embedding Function
|
|
68
64
|
|
|
@@ -90,26 +86,17 @@ assistant = RAGAssistant("docs/", embed_fn=my_embed, generate_fn=my_generate)
|
|
|
90
86
|
answer = assistant.ask("How does authentication work?")
|
|
91
87
|
```
|
|
92
88
|
|
|
93
|
-
###
|
|
94
|
-
|
|
95
|
-
Models are downloaded automatically on first use (~90MB for default model).
|
|
89
|
+
### With Ollama (nomic-embed-text)
|
|
96
90
|
|
|
97
91
|
```python
|
|
98
92
|
from ragit import RAGAssistant
|
|
99
|
-
from ragit.providers import SentenceTransformersProvider
|
|
93
|
+
from ragit.providers import OllamaProvider
|
|
100
94
|
|
|
101
|
-
# Uses
|
|
102
|
-
assistant = RAGAssistant("docs/", provider=SentenceTransformersProvider())
|
|
103
|
-
|
|
104
|
-
# Or specify a model
|
|
105
|
-
assistant = RAGAssistant(
|
|
106
|
-
"docs/",
|
|
107
|
-
provider=SentenceTransformersProvider(model_name="all-mpnet-base-v2")
|
|
108
|
-
)
|
|
95
|
+
# Uses nomic-embed-text for embeddings (768d)
|
|
96
|
+
assistant = RAGAssistant("docs/", provider=OllamaProvider())
|
|
97
|
+
results = assistant.retrieve("search query")
|
|
109
98
|
```
|
|
110
99
|
|
|
111
|
-
Available models: `all-MiniLM-L6-v2` (384d), `all-mpnet-base-v2` (768d), `paraphrase-MiniLM-L6-v2` (384d)
|
|
112
|
-
|
|
113
100
|
## Core API
|
|
114
101
|
|
|
115
102
|
```python
|
|
@@ -6,14 +6,11 @@ RAG toolkit for Python. Document loading, chunking, vector search, LLM integrati
|
|
|
6
6
|
|
|
7
7
|
```bash
|
|
8
8
|
pip install ragit
|
|
9
|
-
|
|
10
|
-
# For offline embedding
|
|
11
|
-
pip install ragit[transformers]
|
|
12
9
|
```
|
|
13
10
|
|
|
14
11
|
## Quick Start
|
|
15
12
|
|
|
16
|
-
You must provide an embedding source: custom function,
|
|
13
|
+
You must provide an embedding source: custom function, Ollama, or any provider.
|
|
17
14
|
|
|
18
15
|
### Custom Embedding Function
|
|
19
16
|
|
|
@@ -41,26 +38,17 @@ assistant = RAGAssistant("docs/", embed_fn=my_embed, generate_fn=my_generate)
|
|
|
41
38
|
answer = assistant.ask("How does authentication work?")
|
|
42
39
|
```
|
|
43
40
|
|
|
44
|
-
###
|
|
45
|
-
|
|
46
|
-
Models are downloaded automatically on first use (~90MB for default model).
|
|
41
|
+
### With Ollama (nomic-embed-text)
|
|
47
42
|
|
|
48
43
|
```python
|
|
49
44
|
from ragit import RAGAssistant
|
|
50
|
-
from ragit.providers import SentenceTransformersProvider
|
|
45
|
+
from ragit.providers import OllamaProvider
|
|
51
46
|
|
|
52
|
-
# Uses
|
|
53
|
-
assistant = RAGAssistant("docs/", provider=SentenceTransformersProvider())
|
|
54
|
-
|
|
55
|
-
# Or specify a model
|
|
56
|
-
assistant = RAGAssistant(
|
|
57
|
-
"docs/",
|
|
58
|
-
provider=SentenceTransformersProvider(model_name="all-mpnet-base-v2")
|
|
59
|
-
)
|
|
47
|
+
# Uses nomic-embed-text for embeddings (768d)
|
|
48
|
+
assistant = RAGAssistant("docs/", provider=OllamaProvider())
|
|
49
|
+
results = assistant.retrieve("search query")
|
|
60
50
|
```
|
|
61
51
|
|
|
62
|
-
Available models: `all-MiniLM-L6-v2` (384d), `all-mpnet-base-v2` (768d), `paraphrase-MiniLM-L6-v2` (384d)
|
|
63
|
-
|
|
64
52
|
## Core API
|
|
65
53
|
|
|
66
54
|
```python
|
|
@@ -10,7 +10,7 @@ maintainers = [
|
|
|
10
10
|
{ name = "RODMENA LIMITED", email = "info@rodmena.co.uk" },
|
|
11
11
|
]
|
|
12
12
|
readme = "README.md"
|
|
13
|
-
requires-python = ">=3.12
|
|
13
|
+
requires-python = ">=3.12"
|
|
14
14
|
classifiers = [
|
|
15
15
|
"Development Status :: 2 - Pre-Alpha",
|
|
16
16
|
"Natural Language :: English",
|
|
@@ -40,6 +40,7 @@ dependencies = [
|
|
|
40
40
|
"tqdm>=4.66.0",
|
|
41
41
|
"trio>=0.24.0",
|
|
42
42
|
"httpx>=0.27.0",
|
|
43
|
+
"resilient-circuit>=0.4.7",
|
|
43
44
|
]
|
|
44
45
|
|
|
45
46
|
[project.urls]
|
|
@@ -59,8 +60,6 @@ dev = [
|
|
|
59
60
|
|
|
60
61
|
test = ["pytest", "pytest-cov", "pytest-mock"]
|
|
61
62
|
|
|
62
|
-
transformers = ["sentence-transformers>=2.2.0"]
|
|
63
|
-
|
|
64
63
|
docs = [
|
|
65
64
|
"sphinx>=7.0",
|
|
66
65
|
"sphinx-rtd-theme>=2.0",
|
|
@@ -16,11 +16,7 @@ Quick Start
|
|
|
16
16
|
>>> assistant = RAGAssistant("docs/", embed_fn=my_embed)
|
|
17
17
|
>>> results = assistant.retrieve("How do I create a REST API?")
|
|
18
18
|
>>>
|
|
19
|
-
>>> # With
|
|
20
|
-
>>> from ragit.providers import SentenceTransformersProvider
|
|
21
|
-
>>> assistant = RAGAssistant("docs/", provider=SentenceTransformersProvider())
|
|
22
|
-
>>>
|
|
23
|
-
>>> # With Ollama (explicit)
|
|
19
|
+
>>> # With Ollama
|
|
24
20
|
>>> from ragit.providers import OllamaProvider
|
|
25
21
|
>>> assistant = RAGAssistant("docs/", provider=OllamaProvider())
|
|
26
22
|
>>> answer = assistant.ask("How do I create a REST API?")
|
|
@@ -63,14 +59,27 @@ from ragit.core.experiment.experiment import ( # noqa: E402
|
|
|
63
59
|
RagitExperiment,
|
|
64
60
|
)
|
|
65
61
|
from ragit.core.experiment.results import EvaluationResult, ExperimentResults # noqa: E402
|
|
62
|
+
from ragit.exceptions import ( # noqa: E402
|
|
63
|
+
ConfigurationError,
|
|
64
|
+
EvaluationError,
|
|
65
|
+
ExceptionAggregator,
|
|
66
|
+
GenerationError,
|
|
67
|
+
IndexingError,
|
|
68
|
+
ProviderError,
|
|
69
|
+
RagitError,
|
|
70
|
+
RetrievalError,
|
|
71
|
+
)
|
|
66
72
|
from ragit.loaders import ( # noqa: E402
|
|
67
73
|
chunk_by_separator,
|
|
68
74
|
chunk_document,
|
|
69
75
|
chunk_rst_sections,
|
|
70
76
|
chunk_text,
|
|
77
|
+
deduplicate_documents,
|
|
78
|
+
generate_document_id,
|
|
71
79
|
load_directory,
|
|
72
80
|
load_text,
|
|
73
81
|
)
|
|
82
|
+
from ragit.monitor import ExecutionMonitor # noqa: E402
|
|
74
83
|
from ragit.providers import ( # noqa: E402
|
|
75
84
|
BaseEmbeddingProvider,
|
|
76
85
|
BaseLLMProvider,
|
|
@@ -89,6 +98,8 @@ __all__ = [
|
|
|
89
98
|
"chunk_document",
|
|
90
99
|
"chunk_by_separator",
|
|
91
100
|
"chunk_rst_sections",
|
|
101
|
+
"generate_document_id",
|
|
102
|
+
"deduplicate_documents",
|
|
92
103
|
# Core classes
|
|
93
104
|
"Document",
|
|
94
105
|
"Chunk",
|
|
@@ -97,6 +108,17 @@ __all__ = [
|
|
|
97
108
|
"FunctionProvider",
|
|
98
109
|
"BaseLLMProvider",
|
|
99
110
|
"BaseEmbeddingProvider",
|
|
111
|
+
# Exceptions
|
|
112
|
+
"RagitError",
|
|
113
|
+
"ConfigurationError",
|
|
114
|
+
"ProviderError",
|
|
115
|
+
"IndexingError",
|
|
116
|
+
"RetrievalError",
|
|
117
|
+
"GenerationError",
|
|
118
|
+
"EvaluationError",
|
|
119
|
+
"ExceptionAggregator",
|
|
120
|
+
# Monitoring
|
|
121
|
+
"ExecutionMonitor",
|
|
100
122
|
# Optimization
|
|
101
123
|
"RagitExperiment",
|
|
102
124
|
"BenchmarkQuestion",
|
|
@@ -104,13 +126,3 @@ __all__ = [
|
|
|
104
126
|
"EvaluationResult",
|
|
105
127
|
"ExperimentResults",
|
|
106
128
|
]
|
|
107
|
-
|
|
108
|
-
# Conditionally add SentenceTransformersProvider if available
|
|
109
|
-
try:
|
|
110
|
-
from ragit.providers import ( # noqa: E402
|
|
111
|
-
SentenceTransformersProvider as SentenceTransformersProvider,
|
|
112
|
-
)
|
|
113
|
-
|
|
114
|
-
__all__ += ["SentenceTransformersProvider"]
|
|
115
|
-
except ImportError:
|
|
116
|
-
pass
|
|
@@ -19,6 +19,7 @@ from numpy.typing import NDArray
|
|
|
19
19
|
|
|
20
20
|
from ragit.core.experiment.experiment import Chunk, Document
|
|
21
21
|
from ragit.loaders import chunk_document, chunk_rst_sections, load_directory, load_text
|
|
22
|
+
from ragit.logging import log_operation
|
|
22
23
|
from ragit.providers.base import BaseEmbeddingProvider, BaseLLMProvider
|
|
23
24
|
from ragit.providers.function_adapter import FunctionProvider
|
|
24
25
|
|
|
@@ -76,13 +77,9 @@ class RAGAssistant:
|
|
|
76
77
|
>>> assistant = RAGAssistant(docs, embed_fn=my_embed, generate_fn=my_llm)
|
|
77
78
|
>>> answer = assistant.ask("What is X?")
|
|
78
79
|
>>>
|
|
79
|
-
>>> # With
|
|
80
|
+
>>> # With Ollama provider (supports nomic-embed-text)
|
|
80
81
|
>>> from ragit.providers import OllamaProvider
|
|
81
82
|
>>> assistant = RAGAssistant(docs, provider=OllamaProvider())
|
|
82
|
-
>>>
|
|
83
|
-
>>> # With SentenceTransformers (offline)
|
|
84
|
-
>>> from ragit.providers import SentenceTransformersProvider
|
|
85
|
-
>>> assistant = RAGAssistant(docs, provider=SentenceTransformersProvider())
|
|
86
83
|
"""
|
|
87
84
|
|
|
88
85
|
def __init__(
|
|
@@ -126,8 +123,7 @@ class RAGAssistant:
|
|
|
126
123
|
"Must provide embed_fn or provider for embeddings. "
|
|
127
124
|
"Examples:\n"
|
|
128
125
|
" RAGAssistant(docs, embed_fn=my_embed_function)\n"
|
|
129
|
-
" RAGAssistant(docs, provider=OllamaProvider())\n"
|
|
130
|
-
" RAGAssistant(docs, provider=SentenceTransformersProvider())"
|
|
126
|
+
" RAGAssistant(docs, provider=OllamaProvider())"
|
|
131
127
|
)
|
|
132
128
|
|
|
133
129
|
self.embedding_model = embedding_model or "default"
|
|
@@ -181,7 +177,7 @@ class RAGAssistant:
|
|
|
181
177
|
for doc in self.documents:
|
|
182
178
|
# Use RST section chunking for .rst files, otherwise regular chunking
|
|
183
179
|
if doc.metadata.get("filename", "").endswith(".rst"):
|
|
184
|
-
chunks = chunk_rst_sections(doc.content, doc.id
|
|
180
|
+
chunks = chunk_rst_sections(doc.content, doc.id)
|
|
185
181
|
else:
|
|
186
182
|
chunks = chunk_document(doc, self.chunk_size, self.chunk_overlap)
|
|
187
183
|
all_chunks.extend(chunks)
|
|
@@ -225,7 +221,7 @@ class RAGAssistant:
|
|
|
225
221
|
new_chunks: list[Chunk] = []
|
|
226
222
|
for doc in new_docs:
|
|
227
223
|
if doc.metadata.get("filename", "").endswith(".rst"):
|
|
228
|
-
chunks = chunk_rst_sections(doc.content, doc.id
|
|
224
|
+
chunks = chunk_rst_sections(doc.content, doc.id)
|
|
229
225
|
else:
|
|
230
226
|
chunks = chunk_document(doc, self.chunk_size, self.chunk_overlap)
|
|
231
227
|
new_chunks.extend(chunks)
|
|
@@ -378,6 +374,190 @@ class RAGAssistant:
|
|
|
378
374
|
|
|
379
375
|
return [(self._chunks[i], float(similarities[i])) for i in top_indices]
|
|
380
376
|
|
|
377
|
+
def retrieve_with_context(
|
|
378
|
+
self,
|
|
379
|
+
query: str,
|
|
380
|
+
top_k: int = 3,
|
|
381
|
+
window_size: int = 1,
|
|
382
|
+
min_score: float = 0.0,
|
|
383
|
+
) -> list[tuple[Chunk, float]]:
|
|
384
|
+
"""
|
|
385
|
+
Retrieve chunks with adjacent context expansion (window search).
|
|
386
|
+
|
|
387
|
+
For each retrieved chunk, also includes adjacent chunks from the
|
|
388
|
+
same document to provide more context. This is useful when relevant
|
|
389
|
+
information spans multiple chunks.
|
|
390
|
+
|
|
391
|
+
Pattern inspired by ai4rag window_search.
|
|
392
|
+
|
|
393
|
+
Parameters
|
|
394
|
+
----------
|
|
395
|
+
query : str
|
|
396
|
+
Search query.
|
|
397
|
+
top_k : int
|
|
398
|
+
Number of initial chunks to retrieve (default: 3).
|
|
399
|
+
window_size : int
|
|
400
|
+
Number of adjacent chunks to include on each side (default: 1).
|
|
401
|
+
Set to 0 to disable window expansion.
|
|
402
|
+
min_score : float
|
|
403
|
+
Minimum similarity score threshold (default: 0.0).
|
|
404
|
+
|
|
405
|
+
Returns
|
|
406
|
+
-------
|
|
407
|
+
list[tuple[Chunk, float]]
|
|
408
|
+
List of (chunk, similarity_score) tuples, sorted by relevance.
|
|
409
|
+
Adjacent chunks have slightly lower scores.
|
|
410
|
+
|
|
411
|
+
Examples
|
|
412
|
+
--------
|
|
413
|
+
>>> # Get chunks with 1 adjacent chunk on each side
|
|
414
|
+
>>> results = assistant.retrieve_with_context("query", window_size=1)
|
|
415
|
+
>>> for chunk, score in results:
|
|
416
|
+
... print(f"{score:.2f}: {chunk.content[:50]}...")
|
|
417
|
+
"""
|
|
418
|
+
with log_operation("retrieve_with_context", query_len=len(query), top_k=top_k, window_size=window_size) as ctx:
|
|
419
|
+
# Get initial results (more than top_k to account for filtering)
|
|
420
|
+
results = self.retrieve(query, top_k * 2)
|
|
421
|
+
|
|
422
|
+
# Apply minimum score threshold
|
|
423
|
+
if min_score > 0:
|
|
424
|
+
results = [(chunk, score) for chunk, score in results if score >= min_score]
|
|
425
|
+
|
|
426
|
+
if window_size == 0 or not results:
|
|
427
|
+
ctx["expanded_chunks"] = len(results)
|
|
428
|
+
return results[:top_k]
|
|
429
|
+
|
|
430
|
+
# Build chunk index for fast lookup
|
|
431
|
+
chunk_to_idx = {id(chunk): i for i, chunk in enumerate(self._chunks)}
|
|
432
|
+
|
|
433
|
+
expanded_results: list[tuple[Chunk, float]] = []
|
|
434
|
+
seen_indices: set[int] = set()
|
|
435
|
+
|
|
436
|
+
for chunk, score in results[:top_k]:
|
|
437
|
+
chunk_idx = chunk_to_idx.get(id(chunk))
|
|
438
|
+
if chunk_idx is None:
|
|
439
|
+
expanded_results.append((chunk, score))
|
|
440
|
+
continue
|
|
441
|
+
|
|
442
|
+
# Get window of adjacent chunks from same document
|
|
443
|
+
start_idx = max(0, chunk_idx - window_size)
|
|
444
|
+
end_idx = min(len(self._chunks), chunk_idx + window_size + 1)
|
|
445
|
+
|
|
446
|
+
for idx in range(start_idx, end_idx):
|
|
447
|
+
if idx in seen_indices:
|
|
448
|
+
continue
|
|
449
|
+
|
|
450
|
+
adjacent_chunk = self._chunks[idx]
|
|
451
|
+
# Only include adjacent chunks from same document
|
|
452
|
+
if adjacent_chunk.doc_id == chunk.doc_id:
|
|
453
|
+
seen_indices.add(idx)
|
|
454
|
+
# Original chunk keeps full score, adjacent get 80%
|
|
455
|
+
adj_score = score if idx == chunk_idx else score * 0.8
|
|
456
|
+
expanded_results.append((adjacent_chunk, adj_score))
|
|
457
|
+
|
|
458
|
+
# Sort by score (highest first)
|
|
459
|
+
expanded_results.sort(key=lambda x: (-x[1], self._chunks.index(x[0]) if x[0] in self._chunks else 0))
|
|
460
|
+
ctx["expanded_chunks"] = len(expanded_results)
|
|
461
|
+
|
|
462
|
+
return expanded_results
|
|
463
|
+
|
|
464
|
+
def get_context_with_window(
|
|
465
|
+
self,
|
|
466
|
+
query: str,
|
|
467
|
+
top_k: int = 3,
|
|
468
|
+
window_size: int = 1,
|
|
469
|
+
min_score: float = 0.0,
|
|
470
|
+
) -> str:
|
|
471
|
+
"""
|
|
472
|
+
Get formatted context with adjacent chunk expansion.
|
|
473
|
+
|
|
474
|
+
Merges overlapping text from adjacent chunks intelligently.
|
|
475
|
+
|
|
476
|
+
Parameters
|
|
477
|
+
----------
|
|
478
|
+
query : str
|
|
479
|
+
Search query.
|
|
480
|
+
top_k : int
|
|
481
|
+
Number of initial chunks to retrieve.
|
|
482
|
+
window_size : int
|
|
483
|
+
Number of adjacent chunks on each side.
|
|
484
|
+
min_score : float
|
|
485
|
+
Minimum similarity score threshold.
|
|
486
|
+
|
|
487
|
+
Returns
|
|
488
|
+
-------
|
|
489
|
+
str
|
|
490
|
+
Formatted context string with merged chunks.
|
|
491
|
+
"""
|
|
492
|
+
results = self.retrieve_with_context(query, top_k, window_size, min_score)
|
|
493
|
+
|
|
494
|
+
if not results:
|
|
495
|
+
return ""
|
|
496
|
+
|
|
497
|
+
# Group chunks by document to merge properly
|
|
498
|
+
doc_chunks: dict[str, list[tuple[Chunk, float]]] = {}
|
|
499
|
+
for chunk, score in results:
|
|
500
|
+
doc_id = chunk.doc_id or "unknown"
|
|
501
|
+
if doc_id not in doc_chunks:
|
|
502
|
+
doc_chunks[doc_id] = []
|
|
503
|
+
doc_chunks[doc_id].append((chunk, score))
|
|
504
|
+
|
|
505
|
+
merged_sections: list[str] = []
|
|
506
|
+
|
|
507
|
+
for _doc_id, chunks in doc_chunks.items():
|
|
508
|
+
# Sort chunks by their position in the original list
|
|
509
|
+
chunks.sort(key=lambda x: self._chunks.index(x[0]) if x[0] in self._chunks else 0)
|
|
510
|
+
|
|
511
|
+
# Merge overlapping text
|
|
512
|
+
merged_content = []
|
|
513
|
+
for chunk, _ in chunks:
|
|
514
|
+
if merged_content:
|
|
515
|
+
# Check for overlap with previous chunk
|
|
516
|
+
prev_content = merged_content[-1]
|
|
517
|
+
non_overlapping = self._get_non_overlapping_text(prev_content, chunk.content)
|
|
518
|
+
if non_overlapping != chunk.content:
|
|
519
|
+
# Found overlap, extend previous chunk
|
|
520
|
+
merged_content[-1] = prev_content + non_overlapping
|
|
521
|
+
else:
|
|
522
|
+
# No overlap, add as new section
|
|
523
|
+
merged_content.append(chunk.content)
|
|
524
|
+
else:
|
|
525
|
+
merged_content.append(chunk.content)
|
|
526
|
+
|
|
527
|
+
merged_sections.append("\n".join(merged_content))
|
|
528
|
+
|
|
529
|
+
return "\n\n---\n\n".join(merged_sections)
|
|
530
|
+
|
|
531
|
+
def _get_non_overlapping_text(self, str1: str, str2: str) -> str:
|
|
532
|
+
"""
|
|
533
|
+
Find non-overlapping portion of str2 when appending after str1.
|
|
534
|
+
|
|
535
|
+
Detects overlap where the end of str1 matches the beginning of str2,
|
|
536
|
+
and returns only the non-overlapping portion of str2.
|
|
537
|
+
|
|
538
|
+
Pattern from ai4rag vector_store/utils.py.
|
|
539
|
+
|
|
540
|
+
Parameters
|
|
541
|
+
----------
|
|
542
|
+
str1 : str
|
|
543
|
+
First string (previous content).
|
|
544
|
+
str2 : str
|
|
545
|
+
Second string (content to potentially append).
|
|
546
|
+
|
|
547
|
+
Returns
|
|
548
|
+
-------
|
|
549
|
+
str
|
|
550
|
+
Non-overlapping portion of str2, or full str2 if no overlap.
|
|
551
|
+
"""
|
|
552
|
+
# Limit overlap search to avoid O(n^2) for large strings
|
|
553
|
+
max_overlap = min(len(str1), len(str2), 200)
|
|
554
|
+
|
|
555
|
+
for i in range(max_overlap, 0, -1):
|
|
556
|
+
if str1[-i:] == str2[:i]:
|
|
557
|
+
return str2[i:]
|
|
558
|
+
|
|
559
|
+
return str2
|
|
560
|
+
|
|
381
561
|
def get_context(self, query: str, top_k: int = 3) -> str:
|
|
382
562
|
"""
|
|
383
563
|
Get formatted context string from retrieved chunks.
|
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright RODMENA LIMITED 2025
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
#
|
|
5
|
+
"""
|
|
6
|
+
Ragit configuration management with Pydantic validation.
|
|
7
|
+
|
|
8
|
+
Loads configuration from environment variables and .env files.
|
|
9
|
+
Validates all configuration values at startup.
|
|
10
|
+
|
|
11
|
+
Note: As of v0.8.0, ragit no longer has default LLM or embedding models.
|
|
12
|
+
Users must explicitly configure providers.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
import os
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
|
|
18
|
+
from dotenv import load_dotenv
|
|
19
|
+
from pydantic import BaseModel, Field, field_validator
|
|
20
|
+
|
|
21
|
+
# Note: We define ConfigValidationError locally to avoid circular imports,
|
|
22
|
+
# but ragit.exceptions.ConfigurationError can be used elsewhere
|
|
23
|
+
|
|
24
|
+
# Load .env file from current working directory or project root
|
|
25
|
+
_env_path = Path.cwd() / ".env"
|
|
26
|
+
if _env_path.exists():
|
|
27
|
+
load_dotenv(_env_path)
|
|
28
|
+
else:
|
|
29
|
+
# Try to find .env in parent directories
|
|
30
|
+
for parent in Path.cwd().parents:
|
|
31
|
+
_env_path = parent / ".env"
|
|
32
|
+
if _env_path.exists():
|
|
33
|
+
load_dotenv(_env_path)
|
|
34
|
+
break
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class ConfigValidationError(Exception):
|
|
38
|
+
"""Raised when configuration validation fails."""
|
|
39
|
+
|
|
40
|
+
pass
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class RagitConfig(BaseModel):
|
|
44
|
+
"""Validated ragit configuration.
|
|
45
|
+
|
|
46
|
+
All configuration values are validated at startup. Invalid values
|
|
47
|
+
raise ConfigValidationError with a descriptive message.
|
|
48
|
+
|
|
49
|
+
Attributes
|
|
50
|
+
----------
|
|
51
|
+
ollama_base_url : str
|
|
52
|
+
Ollama server URL (default: http://localhost:11434)
|
|
53
|
+
ollama_embedding_url : str
|
|
54
|
+
Embedding API URL (defaults to ollama_base_url)
|
|
55
|
+
ollama_api_key : str | None
|
|
56
|
+
API key for authentication
|
|
57
|
+
ollama_timeout : int
|
|
58
|
+
Request timeout in seconds (1-600)
|
|
59
|
+
default_llm_model : str | None
|
|
60
|
+
Default LLM model name
|
|
61
|
+
default_embedding_model : str | None
|
|
62
|
+
Default embedding model name
|
|
63
|
+
log_level : str
|
|
64
|
+
Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
|
|
65
|
+
"""
|
|
66
|
+
|
|
67
|
+
ollama_base_url: str = Field(default="http://localhost:11434")
|
|
68
|
+
ollama_embedding_url: str | None = None
|
|
69
|
+
ollama_api_key: str | None = None
|
|
70
|
+
ollama_timeout: int = Field(default=120, gt=0, le=600)
|
|
71
|
+
default_llm_model: str | None = None
|
|
72
|
+
default_embedding_model: str | None = None
|
|
73
|
+
log_level: str = Field(default="INFO")
|
|
74
|
+
|
|
75
|
+
@field_validator("ollama_base_url", "ollama_embedding_url", mode="before")
|
|
76
|
+
@classmethod
|
|
77
|
+
def validate_url(cls, v: str | None) -> str | None:
|
|
78
|
+
"""Validate URL format."""
|
|
79
|
+
if v is None:
|
|
80
|
+
return v
|
|
81
|
+
v = str(v).strip().rstrip("/")
|
|
82
|
+
if not v:
|
|
83
|
+
return None
|
|
84
|
+
if not v.startswith(("http://", "https://")):
|
|
85
|
+
raise ValueError(f"URL must start with http:// or https://: {v}")
|
|
86
|
+
return v
|
|
87
|
+
|
|
88
|
+
@field_validator("log_level", mode="before")
|
|
89
|
+
@classmethod
|
|
90
|
+
def validate_log_level(cls, v: str) -> str:
|
|
91
|
+
"""Validate log level is a valid Python logging level."""
|
|
92
|
+
valid_levels = {"DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"}
|
|
93
|
+
v = str(v).upper().strip()
|
|
94
|
+
if v not in valid_levels:
|
|
95
|
+
raise ValueError(f"Invalid log level: {v}. Must be one of {valid_levels}")
|
|
96
|
+
return v
|
|
97
|
+
|
|
98
|
+
@field_validator("ollama_api_key", mode="before")
|
|
99
|
+
@classmethod
|
|
100
|
+
def validate_api_key(cls, v: str | None) -> str | None:
|
|
101
|
+
"""Treat empty string as None."""
|
|
102
|
+
if v is not None and not str(v).strip():
|
|
103
|
+
return None
|
|
104
|
+
return v
|
|
105
|
+
|
|
106
|
+
@field_validator("ollama_timeout", mode="before")
|
|
107
|
+
@classmethod
|
|
108
|
+
def validate_timeout(cls, v: int | str) -> int:
|
|
109
|
+
"""Parse and validate timeout value."""
|
|
110
|
+
try:
|
|
111
|
+
timeout = int(v)
|
|
112
|
+
except (ValueError, TypeError) as e:
|
|
113
|
+
raise ValueError(f"Invalid timeout value '{v}': must be an integer") from e
|
|
114
|
+
return timeout
|
|
115
|
+
|
|
116
|
+
model_config = {"extra": "forbid"}
|
|
117
|
+
|
|
118
|
+
# Uppercase aliases for backwards compatibility
|
|
119
|
+
@property
|
|
120
|
+
def OLLAMA_BASE_URL(self) -> str:
|
|
121
|
+
return self.ollama_base_url
|
|
122
|
+
|
|
123
|
+
@property
|
|
124
|
+
def OLLAMA_EMBEDDING_URL(self) -> str:
|
|
125
|
+
return self.ollama_embedding_url or self.ollama_base_url
|
|
126
|
+
|
|
127
|
+
@property
|
|
128
|
+
def OLLAMA_API_KEY(self) -> str | None:
|
|
129
|
+
return self.ollama_api_key
|
|
130
|
+
|
|
131
|
+
@property
|
|
132
|
+
def OLLAMA_TIMEOUT(self) -> int:
|
|
133
|
+
return self.ollama_timeout
|
|
134
|
+
|
|
135
|
+
@property
|
|
136
|
+
def DEFAULT_LLM_MODEL(self) -> str | None:
|
|
137
|
+
return self.default_llm_model
|
|
138
|
+
|
|
139
|
+
@property
|
|
140
|
+
def DEFAULT_EMBEDDING_MODEL(self) -> str | None:
|
|
141
|
+
return self.default_embedding_model
|
|
142
|
+
|
|
143
|
+
@property
|
|
144
|
+
def LOG_LEVEL(self) -> str:
|
|
145
|
+
return self.log_level
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def _safe_get_env(key: str, default: str | None = None) -> str | None:
|
|
149
|
+
"""Get environment variable, returning None for empty strings."""
|
|
150
|
+
value = os.getenv(key, default)
|
|
151
|
+
if value is not None and not value.strip():
|
|
152
|
+
return default
|
|
153
|
+
return value
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def _safe_get_int_env(key: str, default: int) -> int | str:
|
|
157
|
+
"""Get environment variable as int, returning raw string if invalid."""
|
|
158
|
+
value = os.getenv(key)
|
|
159
|
+
if value is None:
|
|
160
|
+
return default
|
|
161
|
+
try:
|
|
162
|
+
return int(value)
|
|
163
|
+
except ValueError:
|
|
164
|
+
# Return the raw string so Pydantic can give a better error message
|
|
165
|
+
return value
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def load_config() -> RagitConfig:
|
|
169
|
+
"""Load and validate configuration from environment variables.
|
|
170
|
+
|
|
171
|
+
Returns
|
|
172
|
+
-------
|
|
173
|
+
RagitConfig
|
|
174
|
+
Validated configuration object.
|
|
175
|
+
|
|
176
|
+
Raises
|
|
177
|
+
------
|
|
178
|
+
ConfigValidationError
|
|
179
|
+
If configuration validation fails.
|
|
180
|
+
"""
|
|
181
|
+
try:
|
|
182
|
+
return RagitConfig(
|
|
183
|
+
ollama_base_url=_safe_get_env("OLLAMA_BASE_URL", "http://localhost:11434") or "http://localhost:11434",
|
|
184
|
+
ollama_embedding_url=_safe_get_env("OLLAMA_EMBEDDING_URL") or _safe_get_env("OLLAMA_BASE_URL"),
|
|
185
|
+
ollama_api_key=_safe_get_env("OLLAMA_API_KEY"),
|
|
186
|
+
ollama_timeout=_safe_get_int_env("OLLAMA_TIMEOUT", 120),
|
|
187
|
+
default_llm_model=_safe_get_env("RAGIT_DEFAULT_LLM_MODEL"),
|
|
188
|
+
default_embedding_model=_safe_get_env("RAGIT_DEFAULT_EMBEDDING_MODEL"),
|
|
189
|
+
log_level=_safe_get_env("RAGIT_LOG_LEVEL", "INFO") or "INFO",
|
|
190
|
+
)
|
|
191
|
+
except Exception as e:
|
|
192
|
+
raise ConfigValidationError(f"Configuration error: {e}") from e
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
# Singleton instance - validates configuration at import time
|
|
196
|
+
try:
|
|
197
|
+
config = load_config()
|
|
198
|
+
except ConfigValidationError as e:
|
|
199
|
+
# Re-raise with clear message
|
|
200
|
+
raise ConfigValidationError(str(e)) from e
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
# Backwards compatibility alias
|
|
204
|
+
Config = RagitConfig
|
|
@@ -45,7 +45,13 @@ class Document:
|
|
|
45
45
|
|
|
46
46
|
@dataclass
|
|
47
47
|
class Chunk:
|
|
48
|
-
"""A document chunk.
|
|
48
|
+
"""A document chunk with optional rich metadata.
|
|
49
|
+
|
|
50
|
+
Metadata can include:
|
|
51
|
+
- document_id: SHA256 hash for deduplication and window search
|
|
52
|
+
- sequence_number: Order within the document
|
|
53
|
+
- chunk_start/chunk_end: Character positions in original text
|
|
54
|
+
"""
|
|
49
55
|
|
|
50
56
|
content: str
|
|
51
57
|
doc_id: str
|