ragit 0.8-py3-none-any.whl → 0.8.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ragit/loaders.py ADDED
@@ -0,0 +1,219 @@
+ #
+ # Copyright RODMENA LIMITED 2025
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ """
+ Document loading and chunking utilities.
+
+ Provides simple functions to load documents from files and chunk text.
+ """
+
+ import re
+ from pathlib import Path
+
+ from ragit.core.experiment.experiment import Chunk, Document
+
+
+ def load_text(path: str | Path) -> Document:
+     """
+     Load a single text file as a Document.
+
+     Parameters
+     ----------
+     path : str or Path
+         Path to the text file (.txt, .md, .rst, etc.)
+
+     Returns
+     -------
+     Document
+         Document with file content and metadata.
+
+     Examples
+     --------
+     >>> doc = load_text("docs/tutorial.rst")
+     >>> print(doc.id, len(doc.content))
+     """
+     path = Path(path)
+     content = path.read_text(encoding="utf-8")
+     return Document(id=path.stem, content=content, metadata={"source": str(path), "filename": path.name})
+
+
+ def load_directory(path: str | Path, pattern: str = "*.txt", recursive: bool = False) -> list[Document]:
+     """
+     Load all matching files from a directory as Documents.
+
+     Parameters
+     ----------
+     path : str or Path
+         Directory path.
+     pattern : str
+         Glob pattern for files (default: "*.txt").
+     recursive : bool
+         If True, search recursively (default: False).
+
+     Returns
+     -------
+     list[Document]
+         List of loaded documents.
+
+     Examples
+     --------
+     >>> docs = load_directory("docs/", "*.rst")
+     >>> docs = load_directory("docs/", "*.md", recursive=True)
+     """
+     path = Path(path)
+     glob_method = path.rglob if recursive else path.glob
+     documents = []
+
+     for file_path in sorted(glob_method(pattern)):
+         if file_path.is_file():
+             documents.append(load_text(file_path))
+
+     return documents
+
+
+ def chunk_text(text: str, chunk_size: int = 512, chunk_overlap: int = 50, doc_id: str = "doc") -> list[Chunk]:
+     """
+     Split text into overlapping chunks.
+
+     Parameters
+     ----------
+     text : str
+         Text to chunk.
+     chunk_size : int
+         Maximum characters per chunk (default: 512).
+     chunk_overlap : int
+         Overlap between chunks (default: 50).
+     doc_id : str
+         Document ID for the chunks (default: "doc").
+
+     Returns
+     -------
+     list[Chunk]
+         List of text chunks.
+
+     Examples
+     --------
+     >>> chunks = chunk_text("Long document...", chunk_size=256, chunk_overlap=50)
+     """
+     if chunk_overlap >= chunk_size:
+         raise ValueError("chunk_overlap must be less than chunk_size")
+
+     chunks = []
+     start = 0
+     chunk_idx = 0
+
+     while start < len(text):
+         end = start + chunk_size
+         content = text[start:end].strip()
+
+         if content:
+             chunks.append(Chunk(content=content, doc_id=doc_id, chunk_index=chunk_idx))
+             chunk_idx += 1
+
+         start = end - chunk_overlap
+         if start >= len(text) - chunk_overlap:
+             break
+
+     return chunks
+
+
+ def chunk_document(doc: Document, chunk_size: int = 512, chunk_overlap: int = 50) -> list[Chunk]:
+     """
+     Split a Document into overlapping chunks.
+
+     Parameters
+     ----------
+     doc : Document
+         Document to chunk.
+     chunk_size : int
+         Maximum characters per chunk.
+     chunk_overlap : int
+         Overlap between chunks.
+
+     Returns
+     -------
+     list[Chunk]
+         List of chunks from the document.
+     """
+     return chunk_text(doc.content, chunk_size, chunk_overlap, doc.id)
+
+
+ def chunk_by_separator(text: str, separator: str = "\n\n", doc_id: str = "doc") -> list[Chunk]:
+     """
+     Split text by a separator (e.g., paragraphs, sections).
+
+     Parameters
+     ----------
+     text : str
+         Text to split.
+     separator : str
+         Separator string (default: double newline for paragraphs).
+     doc_id : str
+         Document ID for the chunks.
+
+     Returns
+     -------
+     list[Chunk]
+         List of chunks.
+
+     Examples
+     --------
+     >>> chunks = chunk_by_separator(text, separator="\\n---\\n")
+     """
+     parts = text.split(separator)
+     chunks = []
+
+     for idx, part in enumerate(parts):
+         content = part.strip()
+         if content:
+             chunks.append(Chunk(content=content, doc_id=doc_id, chunk_index=idx))
+
+     return chunks
+
+
+ def chunk_rst_sections(text: str, doc_id: str = "doc") -> list[Chunk]:
+     """
+     Split an RST document by section headers.
+
+     Parameters
+     ----------
+     text : str
+         RST document text.
+     doc_id : str
+         Document ID for the chunks.
+
+     Returns
+     -------
+     list[Chunk]
+         List of section chunks.
+     """
+     # Match RST section headers (title followed by underline of =, -, ~, etc.)
+     pattern = r"\n([^\n]+)\n([=\-~`\'\"^_*+#]+)\n"
+
+     # Find all section positions
+     matches = list(re.finditer(pattern, text))
+
+     if not matches:
+         # No sections found; return the whole text as one chunk
+         return [Chunk(content=text.strip(), doc_id=doc_id, chunk_index=0)] if text.strip() else []
+
+     chunks = []
+
+     # Handle content before the first section
+     first_pos = matches[0].start()
+     if first_pos > 0:
+         pre_content = text[:first_pos].strip()
+         if pre_content:
+             chunks.append(Chunk(content=pre_content, doc_id=doc_id, chunk_index=0))
+
+     # Extract each section
+     for i, match in enumerate(matches):
+         start = match.start()
+         end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
+
+         section_content = text[start:end].strip()
+         if section_content:
+             chunks.append(Chunk(content=section_content, doc_id=doc_id, chunk_index=len(chunks)))
+
+     return chunks
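
A quick usage sketch of the new module, combining directory loading with fixed-size chunking (the "docs/" path and ".rst" pattern are illustrative; Document and Chunk come from ragit.core.experiment.experiment, as imported above):

    from ragit.loaders import chunk_document, load_directory

    # Load every reStructuredText file under docs/ (illustrative path)
    docs = load_directory("docs/", "*.rst")

    # 512-character windows with a 50-character overlap between neighbors
    chunks = []
    for doc in docs:
        chunks.extend(chunk_document(doc, chunk_size=512, chunk_overlap=50))

    print(f"{len(docs)} documents -> {len(chunks)} chunks")
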
ragit/providers/__init__.py ADDED
@@ -0,0 +1,47 @@
+ #
+ # Copyright RODMENA LIMITED 2025
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ """
+ Ragit Providers - LLM and Embedding providers for RAG optimization.
+
+ Supported providers:
+ - OllamaProvider: Connect to local or remote Ollama servers
+ - FunctionProvider: Wrap custom embedding/LLM functions
+ - SentenceTransformersProvider: Offline embedding (requires ragit[transformers])
+
+ Base classes for implementing custom providers:
+ - BaseLLMProvider: Abstract base for LLM providers
+ - BaseEmbeddingProvider: Abstract base for embedding providers
+ """
+
+ from ragit.providers.base import (
+     BaseEmbeddingProvider,
+     BaseLLMProvider,
+     EmbeddingResponse,
+     LLMResponse,
+ )
+ from ragit.providers.function_adapter import FunctionProvider
+ from ragit.providers.ollama import OllamaProvider
+
+ __all__ = [
+     # Base classes
+     "BaseLLMProvider",
+     "BaseEmbeddingProvider",
+     "LLMResponse",
+     "EmbeddingResponse",
+     # Built-in providers
+     "OllamaProvider",
+     "FunctionProvider",
+ ]
+
+ # Conditionally export SentenceTransformersProvider if available
+ try:
+     from ragit.providers.sentence_transformers import (
+         SentenceTransformersProvider as SentenceTransformersProvider,
+     )
+
+     __all__ += ["SentenceTransformersProvider"]
+ except ImportError:
+     # sentence-transformers not installed, SentenceTransformersProvider not available
+     pass
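
Because the export is conditional, callers can probe for the optional offline provider at import time and fall back otherwise. A minimal sketch (constructor arguments are not shown in this diff, so the no-argument calls below are an assumption):

    try:
        from ragit.providers import SentenceTransformersProvider as DefaultProvider  # needs ragit[transformers]
    except ImportError:
        from ragit.providers import OllamaProvider as DefaultProvider  # fall back to an Ollama server

    provider = DefaultProvider()  # assumed default constructor
    print(provider.provider_name, provider.is_available())
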
ragit/providers/base.py ADDED
@@ -0,0 +1,147 @@
+ #
+ # Copyright RODMENA LIMITED 2025
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ """
+ Base provider interfaces for LLM and Embedding providers.
+
+ These abstract classes define the interface that all providers must implement,
+ making it easy to add new providers (Gemini, Claude, OpenAI, etc.).
+ """
+
+ from abc import ABC, abstractmethod
+ from dataclasses import dataclass
+
+
+ @dataclass
+ class LLMResponse:
+     """Response from an LLM call."""
+
+     text: str
+     model: str
+     provider: str
+     usage: dict[str, int] | None = None
+
+
+ @dataclass(frozen=True)
+ class EmbeddingResponse:
+     """Response from an embedding call (immutable)."""
+
+     embedding: tuple[float, ...]
+     model: str
+     provider: str
+     dimensions: int
+
+
+ class BaseLLMProvider(ABC):
+     """
+     Abstract base class for LLM providers.
+
+     Implement this to add support for new LLM providers like Gemini, Claude, etc.
+     """
+
+     @property
+     @abstractmethod
+     def provider_name(self) -> str:
+         """Return the provider name (e.g., 'ollama', 'gemini', 'claude')."""
+         pass
+
+     @abstractmethod
+     def generate(
+         self,
+         prompt: str,
+         model: str,
+         system_prompt: str | None = None,
+         temperature: float = 0.7,
+         max_tokens: int | None = None,
+     ) -> LLMResponse:
+         """
+         Generate text from the LLM.
+
+         Parameters
+         ----------
+         prompt : str
+             The user prompt/query.
+         model : str
+             Model identifier (e.g., 'llama3', 'qwen3-vl:235b-instruct-cloud').
+         system_prompt : str, optional
+             System prompt for context/instructions.
+         temperature : float
+             Sampling temperature (0.0 to 1.0).
+         max_tokens : int, optional
+             Maximum tokens to generate.
+
+         Returns
+         -------
+         LLMResponse
+             The generated response.
+         """
+         pass
+
+     @abstractmethod
+     def is_available(self) -> bool:
+         """Check if the provider is available and configured."""
+         pass
+
+
+ class BaseEmbeddingProvider(ABC):
+     """
+     Abstract base class for embedding providers.
+
+     Implement this to add support for new embedding providers.
+     """
+
+     @property
+     @abstractmethod
+     def provider_name(self) -> str:
+         """Return the provider name."""
+         pass
+
+     @property
+     @abstractmethod
+     def dimensions(self) -> int:
+         """Return the embedding dimensions for the current model."""
+         pass
+
+     @abstractmethod
+     def embed(self, text: str, model: str) -> EmbeddingResponse:
+         """
+         Generate embedding for text.
+
+         Parameters
+         ----------
+         text : str
+             Text to embed.
+         model : str
+             Model identifier (e.g., 'nomic-embed-text').
+
+         Returns
+         -------
+         EmbeddingResponse
+             The embedding response.
+         """
+         pass
+
+     @abstractmethod
+     def embed_batch(self, texts: list[str], model: str) -> list[EmbeddingResponse]:
+         """
+         Generate embeddings for multiple texts.
+
+         Parameters
+         ----------
+         texts : list[str]
+             Texts to embed.
+         model : str
+             Model identifier.
+
+         Returns
+         -------
+         list[EmbeddingResponse]
+             List of embedding responses.
+         """
+         pass
+
+     @abstractmethod
+     def is_available(self) -> bool:
+         """Check if the provider is available and configured."""
+         pass
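
The two ABCs above fully determine what a custom provider must supply. As a sketch, a toy echo provider written against this interface (illustrative only, not part of the package):

    from ragit.providers.base import BaseLLMProvider, LLMResponse


    class EchoProvider(BaseLLMProvider):
        """Toy LLM provider that returns the prompt unchanged; handy in tests."""

        @property
        def provider_name(self) -> str:
            return "echo"

        def generate(
            self,
            prompt: str,
            model: str,
            system_prompt: str | None = None,
            temperature: float = 0.7,
            max_tokens: int | None = None,
        ) -> LLMResponse:
            # No real inference: echo the prompt back with provenance metadata
            return LLMResponse(text=prompt, model=model, provider=self.provider_name, usage=None)

        def is_available(self) -> bool:
            return True
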
ragit/providers/function_adapter.py ADDED
@@ -0,0 +1,237 @@
+ #
+ # Copyright RODMENA LIMITED 2025
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ """
+ Function-based provider adapter for pluggable embedding and LLM functions.
+
+ This module provides a simple adapter that wraps user-provided functions
+ into the provider interface, enabling easy integration with custom
+ embedding and LLM implementations.
+ """
+
+ import inspect
+ from collections.abc import Callable
+
+ from ragit.providers.base import (
+     BaseEmbeddingProvider,
+     BaseLLMProvider,
+     EmbeddingResponse,
+     LLMResponse,
+ )
+
+
+ class FunctionProvider(BaseLLMProvider, BaseEmbeddingProvider):
+     """
+     Adapter that wraps user-provided embedding and generation functions.
+
+     This provider allows users to bring their own embedding and/or LLM functions
+     without implementing the full provider interface.
+
+     Parameters
+     ----------
+     embed_fn : Callable[[str], list[float]], optional
+         Function that takes text and returns an embedding vector.
+         Example: `lambda text: openai.embeddings.create(input=text).data[0].embedding`
+     generate_fn : Callable, optional
+         Function for text generation. Supports two signatures:
+         - (prompt: str) -> str
+         - (prompt: str, system_prompt: str) -> str
+     embedding_dimensions : int, optional
+         Embedding dimensions. Auto-detected on first call if not provided.
+
+     Examples
+     --------
+     >>> # Simple embedding function
+     >>> def my_embed(text: str) -> list[float]:
+     ...     return openai.embeddings.create(input=text).data[0].embedding
+     >>>
+     >>> # Use with RAGAssistant (retrieval-only)
+     >>> assistant = RAGAssistant(docs, embed_fn=my_embed)
+     >>> results = assistant.retrieve("query")
+     >>>
+     >>> # With an LLM for full RAG
+     >>> def my_llm(prompt: str, system_prompt: str | None = None) -> str:
+     ...     return openai.chat.completions.create(
+     ...         messages=[{"role": "user", "content": prompt}]
+     ...     ).choices[0].message.content
+     >>>
+     >>> assistant = RAGAssistant(docs, embed_fn=my_embed, generate_fn=my_llm)
+     >>> answer = assistant.ask("What is X?")
+     """
+
+     def __init__(
+         self,
+         embed_fn: Callable[[str], list[float]] | None = None,
+         generate_fn: Callable[..., str] | None = None,
+         embedding_dimensions: int | None = None,
+     ) -> None:
+         self._embed_fn = embed_fn
+         self._generate_fn = generate_fn
+         self._embedding_dimensions = embedding_dimensions
+         self._generate_fn_signature: int | None = None  # Number of args (1 or 2)
+
+         # Detect the generate_fn signature if provided
+         if generate_fn is not None:
+             self._detect_generate_signature()
+
+     def _detect_generate_signature(self) -> None:
+         """Detect whether generate_fn accepts 1 or 2 arguments."""
+         if self._generate_fn is None:
+             return
+
+         sig = inspect.signature(self._generate_fn)
+         params = [
+             p
+             for p in sig.parameters.values()
+             if p.default is inspect.Parameter.empty and p.kind not in (p.VAR_POSITIONAL, p.VAR_KEYWORD)
+         ]
+         # Count required parameters
+         required_count = len(params)
+
+         if required_count == 1:
+             self._generate_fn_signature = 1
+         else:
+             # Zero or several required parameters: call with (prompt, system_prompt)
+             self._generate_fn_signature = 2
+
+     @property
+     def provider_name(self) -> str:
+         return "function"
+
+     @property
+     def dimensions(self) -> int:
+         if self._embedding_dimensions is None:
+             raise ValueError("Embedding dimensions not yet determined. Call embed() first or provide dimensions.")
+         return self._embedding_dimensions
+
+     @property
+     def has_embedding(self) -> bool:
+         """Check if an embedding function is configured."""
+         return self._embed_fn is not None
+
+     @property
+     def has_llm(self) -> bool:
+         """Check if an LLM generation function is configured."""
+         return self._generate_fn is not None
+
+     def is_available(self) -> bool:
+         """Check if the provider has at least one function configured."""
+         return self._embed_fn is not None or self._generate_fn is not None
+
+     def embed(self, text: str, model: str = "") -> EmbeddingResponse:
+         """
+         Generate an embedding using the provided function.
+
+         Parameters
+         ----------
+         text : str
+             Text to embed.
+         model : str
+             Model identifier (ignored, kept for interface compatibility).
+
+         Returns
+         -------
+         EmbeddingResponse
+             The embedding response.
+
+         Raises
+         ------
+         ValueError
+             If no embedding function was provided.
+         """
+         if self._embed_fn is None:
+             raise ValueError("No embedding function configured. Provide embed_fn to use embeddings.")
+
+         raw_embedding = self._embed_fn(text)
+
+         # Convert to tuple for immutability
+         embedding_tuple: tuple[float, ...] = tuple(raw_embedding)
+
+         # Auto-detect dimensions on first call
+         if self._embedding_dimensions is None:
+             self._embedding_dimensions = len(embedding_tuple)
+
+         return EmbeddingResponse(
+             embedding=embedding_tuple,
+             model=model or "function",
+             provider=self.provider_name,
+             dimensions=len(embedding_tuple),
+         )
+
+     def embed_batch(self, texts: list[str], model: str = "") -> list[EmbeddingResponse]:
+         """
+         Generate embeddings for multiple texts.
+
+         Iterates over embed_fn for each text. For providers with native batch
+         support, users should implement their own BatchEmbeddingProvider.
+
+         Parameters
+         ----------
+         texts : list[str]
+             Texts to embed.
+         model : str
+             Model identifier (ignored).
+
+         Returns
+         -------
+         list[EmbeddingResponse]
+             List of embedding responses.
+         """
+         return [self.embed(text, model) for text in texts]
+
+     def generate(
+         self,
+         prompt: str,
+         model: str = "",
+         system_prompt: str | None = None,
+         temperature: float = 0.7,
+         max_tokens: int | None = None,
+     ) -> LLMResponse:
+         """
+         Generate text using the provided function.
+
+         Parameters
+         ----------
+         prompt : str
+             The user prompt.
+         model : str
+             Model identifier (ignored, kept for interface compatibility).
+         system_prompt : str, optional
+             System prompt for context.
+         temperature : float
+             Sampling temperature (ignored if the function doesn't support it).
+         max_tokens : int, optional
+             Maximum tokens (ignored if the function doesn't support it).
+
+         Returns
+         -------
+         LLMResponse
+             The generated response.
+
+         Raises
+         ------
+         NotImplementedError
+             If no generation function was provided.
+         """
+         if self._generate_fn is None:
+             raise NotImplementedError(
+                 "No LLM configured. Provide generate_fn or a provider with LLM support "
+                 "to use ask(), generate(), or generate_code() methods."
+             )
+
+         # Call with the appropriate signature
+         if self._generate_fn_signature == 1:
+             # Single argument - prepend the system prompt to the prompt if provided
+             full_prompt = f"{system_prompt}\n\n{prompt}" if system_prompt else prompt
+             text = self._generate_fn(full_prompt)
+         else:
+             # Two arguments - pass separately
+             text = self._generate_fn(prompt, system_prompt)
+
+         return LLMResponse(
+             text=text,
+             model=model or "function",
+             provider=self.provider_name,
+             usage=None,
+         )
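
End to end, the adapter can be exercised without any external service. A small sketch with toy functions (the 9-dimensional count-based embedding is illustrative only):

    from ragit.providers.function_adapter import FunctionProvider


    def toy_embed(text: str) -> list[float]:
        # Fixed-length toy vector: character counts for a few letters
        return [float(text.count(c)) for c in "aeiou rst"]


    def toy_llm(prompt: str) -> str:
        return f"echo: {prompt}"


    provider = FunctionProvider(embed_fn=toy_embed, generate_fn=toy_llm)

    resp = provider.embed("hello world")
    print(resp.dimensions)  # 9, auto-detected on the first call

    # toy_llm takes a single argument, so the system prompt is prepended to the prompt
    out = provider.generate("hi", system_prompt="be brief")
    print(out.text)
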