ragit 0.8__py3-none-any.whl → 0.8.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,131 @@
1
+ #
2
+ # Copyright RODMENA LIMITED 2025
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+ """
6
+ Ragit experiment results.
7
+ """
8
+
9
+ from collections.abc import Iterator
10
+ from dataclasses import asdict, dataclass, field
11
+ from typing import Any
12
+
13
+
14
@dataclass
class EvaluationResult:
    """
    Outcome of evaluating a single RAG configuration.

    Parameters
    ----------
    pattern_name : str
        Name of the RAG pattern (e.g., "Pattern_1").
    indexing_params : dict[str, Any]
        Hyperparameters used during indexing (chunk_size, overlap, etc.).
    inference_params : dict[str, Any]
        Hyperparameters used during inference (num_chunks, llm_model, etc.).
    scores : dict[str, dict]
        Per-metric evaluation scores (answer_correctness, context_relevance,
        faithfulness).
    execution_time : float
        Wall-clock evaluation time in seconds.
    final_score : float
        Combined score used to rank configurations during optimization.
    """

    pattern_name: str
    indexing_params: dict[str, Any]
    inference_params: dict[str, Any]
    scores: dict[str, dict[str, float]]
    execution_time: float
    final_score: float

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dict (deep) copy of this result."""
        return asdict(self)

    def __repr__(self) -> str:
        # Compact one-line summary; the full field dump is available via to_dict().
        summary = f"EvaluationResult(name={self.pattern_name}, score={self.final_score:.3f}, time={self.execution_time:.1f}s)"
        return summary
50
+
51
+
52
@dataclass
class ExperimentResults:
    """
    Container for the EvaluationResult objects produced by an experiment.

    Attributes
    ----------
    evaluations : list[EvaluationResult]
        All evaluation results collected so far.
    """

    evaluations: list[EvaluationResult] = field(default_factory=list)

    def __len__(self) -> int:
        """Number of stored results."""
        return len(self.evaluations)

    def __iter__(self) -> Iterator[EvaluationResult]:
        """Iterate over results in insertion order."""
        return iter(self.evaluations)

    def __bool__(self) -> bool:
        """True when at least one result has been recorded."""
        return len(self.evaluations) > 0

    def add(self, result: EvaluationResult) -> None:
        """Record a new evaluation result."""
        self.evaluations.append(result)

    def is_cached(
        self,
        indexing_params: dict[str, Any],
        inference_params: dict[str, Any],
    ) -> float | None:
        """
        Look up a previously evaluated configuration.

        Returns
        -------
        float or None
            Final score of the matching evaluation, or None if this exact
            parameter combination has not been evaluated yet.
        """
        hit = next(
            (
                ev
                for ev in self.evaluations
                if ev.indexing_params == indexing_params
                and ev.inference_params == inference_params
            ),
            None,
        )
        return hit.final_score if hit is not None else None

    @property
    def scores(self) -> list[float]:
        """Final scores of all stored results, in insertion order."""
        return [result.final_score for result in self.evaluations]

    def sorted(self, reverse: bool = True) -> list[EvaluationResult]:
        """
        Return results ordered by final score.

        Parameters
        ----------
        reverse : bool
            If True (default), highest scores come first.

        Returns
        -------
        list[EvaluationResult]
            A new sorted list; the stored insertion order is untouched.
        """
        ranked = list(self.evaluations)
        ranked.sort(key=lambda ev: ev.final_score, reverse=reverse)
        return ranked

    def get_best(self, k: int = 1) -> list[EvaluationResult]:
        """
        Return the k highest-scoring results.

        Parameters
        ----------
        k : int
            Number of results to return (default: 1).

        Returns
        -------
        list[EvaluationResult]
            Top k results, best first.
        """
        return self.sorted()[:k]
ragit/loaders.py ADDED
@@ -0,0 +1,245 @@
1
+ #
2
+ # Copyright RODMENA LIMITED 2025
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+ """
6
+ Document loading and chunking utilities.
7
+
8
+ Provides simple functions to load documents from files and chunk text.
9
+ """
10
+
11
+ import re
12
+ from pathlib import Path
13
+ from typing import Any
14
+
15
+ from ragit.core.experiment.experiment import Chunk, Document
16
+
17
+
18
def load_text(path: str | Path) -> Document:
    """
    Read a single text file into a Document.

    Parameters
    ----------
    path : str or Path
        Path to the text file (.txt, .md, .rst, etc.)

    Returns
    -------
    Document
        Document whose id is the file stem, whose content is the file
        text decoded as UTF-8, and whose metadata records the source
        path and filename.

    Examples
    --------
    >>> doc = load_text("docs/tutorial.rst")
    >>> print(doc.id, len(doc.content))
    """
    file_path = Path(path)
    source_meta = {"source": str(file_path), "filename": file_path.name}
    return Document(
        id=file_path.stem,
        content=file_path.read_text(encoding="utf-8"),
        metadata=source_meta,
    )
40
+
41
+
42
def load_directory(path: str | Path, pattern: str = "*.txt", recursive: bool = False) -> list[Document]:
    """
    Load every file in a directory that matches a glob pattern.

    Parameters
    ----------
    path : str or Path
        Directory to scan.
    pattern : str
        Glob pattern for files (default: "*.txt").
    recursive : bool
        If True, descend into subdirectories (default: False).

    Returns
    -------
    list[Document]
        Documents loaded from the matching files, in sorted path order.

    Examples
    --------
    >>> docs = load_directory("docs/", "*.rst")
    >>> docs = load_directory("docs/", "**/*.md", recursive=True)
    """
    base = Path(path)
    finder = base.rglob if recursive else base.glob
    # Sorting gives a deterministic document order regardless of
    # filesystem enumeration order.
    return [load_text(match) for match in sorted(finder(pattern)) if match.is_file()]
74
+
75
+
76
def chunk_text(
    text: str,
    chunk_size: int = 512,
    chunk_overlap: int = 50,
    doc_id: str = "doc",
    metadata: dict[str, Any] | None = None,
) -> list[Chunk]:
    """
    Split text into overlapping, fixed-size chunks.

    Parameters
    ----------
    text : str
        Text to chunk.
    chunk_size : int
        Maximum characters per chunk (default: 512). Must be positive.
    chunk_overlap : int
        Characters shared between consecutive chunks (default: 50).
        Must be non-negative and smaller than ``chunk_size``.
    doc_id : str
        Document ID for the chunks (default: "doc").
    metadata : dict, optional
        Metadata to attach to each chunk (default: None).

    Returns
    -------
    list[Chunk]
        Whitespace-stripped, non-empty chunks with contiguous indices.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not positive, ``chunk_overlap`` is negative,
        or ``chunk_overlap`` is not smaller than ``chunk_size``.

    Examples
    --------
    >>> chunks = chunk_text("Long document...", chunk_size=256, chunk_overlap=50)
    """
    # Validate up front: a non-positive chunk_size or a negative overlap
    # would slip past the overlap-vs-size check below and make the scan
    # silently skip text or emit no chunks at all.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if chunk_overlap < 0:
        raise ValueError("chunk_overlap must be non-negative")
    if chunk_overlap >= chunk_size:
        raise ValueError("chunk_overlap must be less than chunk_size")

    chunks: list[Chunk] = []
    chunk_metadata = metadata or {}
    start = 0
    chunk_idx = 0

    while start < len(text):
        end = start + chunk_size
        chunk_content = text[start:end].strip()

        if chunk_content:
            chunks.append(
                Chunk(content=chunk_content, doc_id=doc_id, chunk_index=chunk_idx, metadata=chunk_metadata.copy())
            )
            chunk_idx += 1

        # Once a chunk reaches the end of the text, stop: any remaining
        # characters lie entirely within this chunk's overlap region and
        # have already been emitted.
        if end >= len(text):
            break
        start = end - chunk_overlap

    return chunks
131
+
132
+
133
def chunk_document(doc: Document, chunk_size: int = 512, chunk_overlap: int = 50) -> list[Chunk]:
    """
    Chunk a Document's content, propagating its id and metadata.

    Parameters
    ----------
    doc : Document
        Document to chunk.
    chunk_size : int
        Maximum characters per chunk.
    chunk_overlap : int
        Overlap between chunks.

    Returns
    -------
    list[Chunk]
        Chunks produced from the document's content.
    """
    # Delegate to chunk_text so both entry points share one chunking policy.
    return chunk_text(
        doc.content,
        chunk_size,
        chunk_overlap,
        doc.id,
        metadata=doc.metadata,
    )
152
+
153
+
154
def chunk_by_separator(
    text: str, separator: str = "\n\n", doc_id: str = "doc", metadata: dict[str, Any] | None = None
) -> list[Chunk]:
    """
    Split text by a separator (e.g., paragraphs, sections).

    Parameters
    ----------
    text : str
        Text to split.
    separator : str
        Separator string (default: double newline for paragraphs).
    doc_id : str
        Document ID for the chunks.
    metadata : dict, optional
        Metadata to attach to each chunk (default: None).

    Returns
    -------
    list[Chunk]
        Non-empty, whitespace-stripped chunks with contiguous
        ``chunk_index`` values (0, 1, 2, ...), matching the numbering
        used by ``chunk_text`` and ``chunk_rst_sections``.

    Examples
    --------
    >>> chunks = chunk_by_separator(text, separator="\\n---\\n")
    """
    chunk_metadata = metadata or {}
    chunks: list[Chunk] = []

    for part in text.split(separator):
        content = part.strip()
        if not content:
            # Skip empty segments (leading/trailing or doubled separators)
            # without consuming an index, so chunk indices stay contiguous
            # instead of inheriting gaps from the raw split positions.
            continue
        chunks.append(
            Chunk(content=content, doc_id=doc_id, chunk_index=len(chunks), metadata=chunk_metadata.copy())
        )

    return chunks
190
+
191
+
192
def chunk_rst_sections(text: str, doc_id: str = "doc", metadata: dict[str, Any] | None = None) -> list[Chunk]:
    """
    Split an RST document into chunks at section headers.

    A section header is a line of text followed by an underline made of
    RST adornment characters (=, -, ~, etc.).

    Parameters
    ----------
    text : str
        RST document text.
    doc_id : str
        Document ID for the chunks.
    metadata : dict, optional
        Metadata to attach to each chunk (default: None).

    Returns
    -------
    list[Chunk]
        One chunk per section; any content before the first header becomes
        its own leading chunk. When no header is found the whole text is
        returned as a single chunk (or an empty list for whitespace-only
        input).
    """
    # Title line followed by an adornment-only underline. The trailing
    # newline is checked with a lookahead instead of being consumed:
    # finditer matches are non-overlapping, so consuming it would make a
    # header separated from the previous one by only a single newline
    # unmatchable (its required leading "\n" would already be eaten).
    pattern = r"\n([^\n]+)\n([=\-~`\'\"^_*+#]+)(?=\n)"
    chunk_metadata = metadata or {}

    matches = list(re.finditer(pattern, text))

    if not matches:
        # No sections found: return the whole text as one chunk.
        stripped = text.strip()
        if not stripped:
            return []
        return [Chunk(content=stripped, doc_id=doc_id, chunk_index=0, metadata=chunk_metadata.copy())]

    chunks: list[Chunk] = []

    # Content before the first header becomes its own leading chunk.
    preamble = text[: matches[0].start()].strip()
    if preamble:
        chunks.append(Chunk(content=preamble, doc_id=doc_id, chunk_index=0, metadata=chunk_metadata.copy()))

    # Each section runs from its header to the start of the next one
    # (or to the end of the text for the last section).
    for i, match in enumerate(matches):
        section_start = match.start()
        section_end = matches[i + 1].start() if i + 1 < len(matches) else len(text)

        section_content = text[section_start:section_end].strip()
        if section_content:
            chunks.append(
                Chunk(content=section_content, doc_id=doc_id, chunk_index=len(chunks), metadata=chunk_metadata.copy())
            )

    return chunks
@@ -0,0 +1,47 @@
1
+ #
2
+ # Copyright RODMENA LIMITED 2025
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+ """
6
+ Ragit Providers - LLM and Embedding providers for RAG optimization.
7
+
8
+ Supported providers:
9
+ - OllamaProvider: Connect to local or remote Ollama servers
10
+ - FunctionProvider: Wrap custom embedding/LLM functions
11
+ - SentenceTransformersProvider: Offline embedding (requires ragit[transformers])
12
+
13
+ Base classes for implementing custom providers:
14
+ - BaseLLMProvider: Abstract base for LLM providers
15
+ - BaseEmbeddingProvider: Abstract base for embedding providers
16
+ """
17
+
18
+ from ragit.providers.base import (
19
+ BaseEmbeddingProvider,
20
+ BaseLLMProvider,
21
+ EmbeddingResponse,
22
+ LLMResponse,
23
+ )
24
+ from ragit.providers.function_adapter import FunctionProvider
25
+ from ragit.providers.ollama import OllamaProvider
26
+
27
# Explicit public API of ragit.providers; names not listed here are internal.
__all__ = [
    # Base classes
    "BaseLLMProvider",
    "BaseEmbeddingProvider",
    "LLMResponse",
    "EmbeddingResponse",
    # Built-in providers
    "OllamaProvider",
    "FunctionProvider",
]

# Conditionally export SentenceTransformersProvider if available
try:
    # The "X as X" form marks the name as an intentional public re-export
    # for type checkers (e.g. mypy/pyright with no-implicit-reexport).
    from ragit.providers.sentence_transformers import (
        SentenceTransformersProvider as SentenceTransformersProvider,
    )

    __all__ += ["SentenceTransformersProvider"]
except ImportError:
    # sentence-transformers not installed, SentenceTransformersProvider not available
    pass
@@ -0,0 +1,147 @@
1
+ #
2
+ # Copyright RODMENA LIMITED 2025
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+ """
6
+ Base provider interfaces for LLM and Embedding providers.
7
+
8
+ These abstract classes define the interface that all providers must implement,
9
+ making it easy to add new providers (Gemini, Claude, OpenAI, etc.)
10
+ """
11
+
12
+ from abc import ABC, abstractmethod
13
+ from dataclasses import dataclass
14
+
15
+
16
@dataclass
class LLMResponse:
    """Response from an LLM call."""

    # Generated completion text.
    text: str
    # Identifier of the model that produced the response.
    model: str
    # Name of the provider that served the call (e.g., 'ollama').
    provider: str
    # Optional token-usage counters reported by the provider, if any.
    usage: dict[str, int] | None = None
24
+
25
+
26
@dataclass(frozen=True)
class EmbeddingResponse:
    """Response from an embedding call (immutable)."""

    # Embedding vector; a tuple (not a list) keeps the frozen dataclass hashable.
    embedding: tuple[float, ...]
    # Identifier of the embedding model used.
    model: str
    # Name of the provider that served the call.
    provider: str
    # Length of the embedding vector.
    dimensions: int
34
+
35
+
36
class BaseLLMProvider(ABC):
    """
    Abstract base class for LLM providers.

    Implement this to add support for new LLM providers like Gemini, Claude, etc.
    """

    # Note: the abstract methods below use their docstring as the method
    # body; a trailing `pass` would be redundant.

    @property
    @abstractmethod
    def provider_name(self) -> str:
        """Return the provider name (e.g., 'ollama', 'gemini', 'claude')."""

    @abstractmethod
    def generate(
        self,
        prompt: str,
        model: str,
        system_prompt: str | None = None,
        temperature: float = 0.7,
        max_tokens: int | None = None,
    ) -> LLMResponse:
        """
        Generate text from the LLM.

        Parameters
        ----------
        prompt : str
            The user prompt/query.
        model : str
            Model identifier (e.g., 'llama3', 'qwen3-vl:235b-instruct-cloud').
        system_prompt : str, optional
            System prompt for context/instructions.
        temperature : float
            Sampling temperature (0.0 to 1.0).
        max_tokens : int, optional
            Maximum tokens to generate.

        Returns
        -------
        LLMResponse
            The generated response.
        """

    @abstractmethod
    def is_available(self) -> bool:
        """Check if the provider is available and configured."""
85
+
86
+
87
class BaseEmbeddingProvider(ABC):
    """
    Abstract base class for embedding providers.

    Implement this to add support for new embedding providers.
    """

    # Note: the abstract methods below use their docstring as the method
    # body; a trailing `pass` would be redundant.

    @property
    @abstractmethod
    def provider_name(self) -> str:
        """Return the provider name."""

    @property
    @abstractmethod
    def dimensions(self) -> int:
        """Return the embedding dimensions for the current model."""

    @abstractmethod
    def embed(self, text: str, model: str) -> EmbeddingResponse:
        """
        Generate embedding for text.

        Parameters
        ----------
        text : str
            Text to embed.
        model : str
            Model identifier (e.g., 'nomic-embed-text').

        Returns
        -------
        EmbeddingResponse
            The embedding response.
        """

    @abstractmethod
    def embed_batch(self, texts: list[str], model: str) -> list[EmbeddingResponse]:
        """
        Generate embeddings for multiple texts.

        Parameters
        ----------
        texts : list[str]
            Texts to embed.
        model : str
            Model identifier.

        Returns
        -------
        list[EmbeddingResponse]
            List of embedding responses.
        """

    @abstractmethod
    def is_available(self) -> bool:
        """Check if the provider is available and configured."""