ragit 0.3__py3-none-any.whl → 0.10.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ragit/exceptions.py ADDED
@@ -0,0 +1,271 @@
+ #
+ # Copyright RODMENA LIMITED 2025
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ """
+ Custom exception hierarchy for ragit.
+
+ Provides structured exceptions for different failure types,
+ enabling better error handling and debugging.
+
+ Pattern inspired by ai4rag exception_handler.py.
+ """
+
+ from typing import Any
+
+
+ class RagitError(Exception):
+     """Base exception for all ragit errors.
+
+     All ragit-specific exceptions inherit from this class,
+     making it easy to catch all ragit errors with a single handler.
+
+     Parameters
+     ----------
+     message : str
+         Human-readable error message.
+     original_exception : Exception, optional
+         The underlying exception that caused this error.
+
+     Examples
+     --------
+     >>> try:
+     ...     provider.embed("text", "model")
+     ... except RagitError as e:
+     ...     print(f"Ragit error: {e}")
+     ...     if e.original_exception:
+     ...         print(f"Caused by: {e.original_exception}")
+     """
+
+     def __init__(self, message: str, original_exception: Exception | None = None):
+         self.message = message
+         self.original_exception = original_exception
+         super().__init__(self._format_message())
+
+     def _format_message(self) -> str:
+         """Format the error message, including original exception if present."""
+         if self.original_exception:
+             return f"{self.message}: {self.original_exception}"
+         return self.message
+
+
+ class ConfigurationError(RagitError):
+     """Configuration validation or loading failed.
+
+     Raised when:
+     - Environment variables have invalid values
+     - Required configuration is missing
+     - URL formats are invalid
+     """
+
+     pass
+
+
+ class ProviderError(RagitError):
+     """Provider communication or operation failed.
+
+     Raised when:
+     - Network connection to provider fails
+     - Provider returns an error response
+     - Provider timeout occurs
+     """
+
+     pass
+
+
+ class IndexingError(RagitError):
+     """Document indexing or embedding failed.
+
+     Raised when:
+     - Embedding generation fails
+     - Document chunking fails
+     - Index building fails
+     """
+
+     pass
+
+
+ class RetrievalError(RagitError):
+     """Retrieval operation failed.
+
+     Raised when:
+     - Query embedding fails
+     - Search operation fails
+     - No results can be retrieved
+     """
+
+     pass
+
+
+ class GenerationError(RagitError):
+     """LLM generation failed.
+
+     Raised when:
+     - LLM call fails
+     - Response parsing fails
+     - Context exceeds model limits
+     """
+
+     pass
+
+
+ class EvaluationError(RagitError):
+     """Evaluation or scoring failed.
+
+     Raised when:
+     - Metric calculation fails
+     - Benchmark validation fails
+     - Score extraction fails
+     """
+
+     pass
+
+
+ class ExceptionAggregator:
+     """Collect and report exceptions during batch operations.
+
+     Useful for operations that should continue even when some
+     items fail, then report all failures at the end.
+
+     Pattern from ai4rag exception_handler.py.
+
+     Examples
+     --------
+     >>> aggregator = ExceptionAggregator()
+     >>> for doc in documents:
+     ...     try:
+     ...         process(doc)
+     ...     except Exception as e:
+     ...         aggregator.record(f"doc:{doc.id}", e)
+     >>> if aggregator.has_errors:
+     ...     print(aggregator.get_summary())
+     """
+
+     def __init__(self) -> None:
+         self._exceptions: list[tuple[str, Exception]] = []
+
+     def record(self, context: str, exception: Exception) -> None:
+         """Record an exception with context.
+
+         Parameters
+         ----------
+         context : str
+             Description of where/why the exception occurred.
+         exception : Exception
+             The exception that was raised.
+         """
+         self._exceptions.append((context, exception))
+
+     @property
+     def has_errors(self) -> bool:
+         """Check if any errors have been recorded."""
+         return len(self._exceptions) > 0
+
+     @property
+     def error_count(self) -> int:
+         """Get the number of recorded errors."""
+         return len(self._exceptions)
+
+     @property
+     def exceptions(self) -> list[tuple[str, Exception]]:
+         """Get all recorded exceptions with their contexts."""
+         return list(self._exceptions)
+
+     def get_by_type(self, exc_type: type[Exception]) -> list[tuple[str, Exception]]:
+         """Get exceptions of a specific type.
+
+         Parameters
+         ----------
+         exc_type : type
+             The exception type to filter by.
+
+         Returns
+         -------
+         list[tuple[str, Exception]]
+             Exceptions matching the type with their contexts.
+         """
+         return [(ctx, exc) for ctx, exc in self._exceptions if isinstance(exc, exc_type)]
+
+     def get_summary(self) -> str:
+         """Get a summary of all recorded errors.
+
+         Returns
+         -------
+         str
+             Human-readable summary of errors.
+         """
+         if not self._exceptions:
+             return "No errors recorded"
+
+         # Group by exception type
+         by_type: dict[str, int] = {}
+         for _, exc in self._exceptions:
+             exc_type = type(exc).__name__
+             by_type[exc_type] = by_type.get(exc_type, 0) + 1
+
+         most_common = max(by_type.items(), key=lambda x: x[1])
+         type_summary = ", ".join(f"{t}:{c}" for t, c in sorted(by_type.items(), key=lambda x: -x[1]))
+
+         return f"{self.error_count} errors ({type_summary}). Most common: {most_common[0]} ({most_common[1]}x)"
+
+     def get_details(self) -> str:
+         """Get detailed information about all errors.
+
+         Returns
+         -------
+         str
+             Detailed error information with contexts.
+         """
+         if not self._exceptions:
+             return "No errors recorded"
+
+         lines = [f"Total errors: {self.error_count}", ""]
+         for i, (context, exc) in enumerate(self._exceptions, 1):
+             lines.append(f"{i}. [{context}] {type(exc).__name__}: {exc}")
+
+         return "\n".join(lines)
+
+     def raise_if_errors(self, message: str = "Operation failed") -> None:
+         """Raise RagitError if any errors were recorded.
+
+         Parameters
+         ----------
+         message : str
+             Base message for the raised error.
+
+         Raises
+         ------
+         RagitError
+             If any errors were recorded.
+         """
+         if self.has_errors:
+             raise RagitError(f"{message}: {self.get_summary()}")
+
+     def clear(self) -> None:
+         """Clear all recorded exceptions."""
+         self._exceptions.clear()
+
+     def merge_from(self, other: "ExceptionAggregator") -> None:
+         """Merge exceptions from another aggregator.
+
+         Parameters
+         ----------
+         other : ExceptionAggregator
+             Another aggregator to merge from.
+         """
+         self._exceptions.extend(other._exceptions)
+
+     def to_dict(self) -> dict[str, Any]:
+         """Export as dictionary for JSON serialization.
+
+         Returns
+         -------
+         dict
+             Dictionary representation of aggregated errors.
+         """
+         return {
+             "error_count": self.error_count,
+             "errors": [
+                 {"context": ctx, "type": type(exc).__name__, "message": str(exc)} for ctx, exc in self._exceptions
+             ],
+         }
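
The exception hierarchy and ExceptionAggregator above support fail-soft batch work: record each per-item failure with its context, keep processing, and surface one consolidated RagitError at the end. Below is a minimal sketch of that pattern against the API added in this file; flaky_indexer and the chunk strings are hypothetical stand-ins for real indexing work.

from collections.abc import Callable

from ragit.exceptions import ExceptionAggregator, IndexingError, RagitError


def index_all(chunks: list[str], index_one: Callable[[str], None]) -> None:
    """Index every chunk, collecting failures instead of stopping at the first one."""
    aggregator = ExceptionAggregator()
    for i, chunk in enumerate(chunks):
        try:
            index_one(chunk)
        except Exception as exc:
            # Wrap the low-level failure so callers can catch IndexingError / RagitError
            aggregator.record(f"chunk:{i}", IndexingError("indexing failed", exc))
    # Raises a single RagitError with a type-grouped summary if anything was recorded
    aggregator.raise_if_errors("Indexing completed with errors")


def flaky_indexer(chunk: str) -> None:
    # Hypothetical stand-in for embedding/indexing one chunk
    if "bad" in chunk:
        raise ValueError(f"cannot embed {chunk!r}")


try:
    index_all(["ok-1", "bad-2", "ok-3"], flaky_indexer)
except RagitError as err:
    print(err)  # e.g. "Indexing completed with errors: 1 errors (IndexingError:1). ..."
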
ragit/loaders.py ADDED
@@ -0,0 +1,401 @@
+ #
+ # Copyright RODMENA LIMITED 2025
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ """
+ Document loading and chunking utilities.
+
+ Provides simple functions to load documents from files and chunk text.
+
+ Includes ai4rag-inspired patterns:
+ - Auto-generated document IDs via SHA256 hash
+ - Sequence numbering for chunk ordering
+ - Deduplication via content hashing
+ """
+
+ import hashlib
+ import re
+ from pathlib import Path
+
+ from ragit.core.experiment.experiment import Chunk, Document
+
+
+ def generate_document_id(content: str) -> str:
+     """
+     Generate a unique document ID from content using SHA256 hash.
+
+     Pattern from ai4rag langchain_chunker.py.
+
+     Parameters
+     ----------
+     content : str
+         Document content to hash.
+
+     Returns
+     -------
+     str
+         16-character hex string (first 64 bits of SHA256).
+
+     Examples
+     --------
+     >>> doc_id = generate_document_id("Hello, world!")
+     >>> len(doc_id)
+     16
+     """
+     return hashlib.sha256(content.encode()).hexdigest()[:16]
+
+
+ def deduplicate_documents(documents: list[Document]) -> list[Document]:
+     """
+     Remove duplicate documents based on content hash.
+
+     Pattern from ai4rag chroma.py.
+
+     Parameters
+     ----------
+     documents : list[Document]
+         Documents to deduplicate.
+
+     Returns
+     -------
+     list[Document]
+         Unique documents (first occurrence kept).
+
+     Examples
+     --------
+     >>> unique_docs = deduplicate_documents(docs)
+     >>> print(f"Removed {len(docs) - len(unique_docs)} duplicates")
+     """
+     seen_hashes: set[str] = set()
+     unique_docs: list[Document] = []
+
+     for doc in documents:
+         content_hash = generate_document_id(doc.content)
+         if content_hash not in seen_hashes:
+             seen_hashes.add(content_hash)
+             unique_docs.append(doc)
+
+     return unique_docs
+
+
+ def load_text(path: str | Path) -> Document:
+     """
+     Load a single text file as a Document.
+
+     Parameters
+     ----------
+     path : str or Path
+         Path to the text file (.txt, .md, .rst, etc.)
+
+     Returns
+     -------
+     Document
+         Document with file content and metadata.
+
+     Examples
+     --------
+     >>> doc = load_text("docs/tutorial.rst")
+     >>> print(doc.id, len(doc.content))
+     """
+     path = Path(path)
+     content = path.read_text(encoding="utf-8")
+     return Document(id=path.stem, content=content, metadata={"source": str(path), "filename": path.name})
+
+
+ def load_directory(path: str | Path, pattern: str = "*.txt", recursive: bool = False) -> list[Document]:
+     """
+     Load all matching files from a directory as Documents.
+
+     Parameters
+     ----------
+     path : str or Path
+         Directory path.
+     pattern : str
+         Glob pattern for files (default: "*.txt").
+     recursive : bool
+         If True, search recursively (default: False).
+
+     Returns
+     -------
+     list[Document]
+         List of loaded documents.
+
+     Examples
+     --------
+     >>> docs = load_directory("docs/", "*.rst")
+     >>> docs = load_directory("docs/", "**/*.md", recursive=True)
+     """
+     path = Path(path)
+     glob_method = path.rglob if recursive else path.glob
+     documents = []
+
+     for file_path in sorted(glob_method(pattern)):
+         if file_path.is_file():
+             documents.append(load_text(file_path))
+
+     return documents
+
+
+ def chunk_text(
+     text: str,
+     chunk_size: int = 512,
+     chunk_overlap: int = 50,
+     doc_id: str | None = None,
+     include_metadata: bool = True,
+ ) -> list[Chunk]:
+     """
+     Split text into overlapping chunks with rich metadata.
+
+     Includes ai4rag-inspired metadata:
+     - document_id: SHA256 hash for deduplication and window search
+     - sequence_number: Order within the document
+     - chunk_start/chunk_end: Character positions in original text
+
+     Parameters
+     ----------
+     text : str
+         Text to chunk.
+     chunk_size : int
+         Maximum characters per chunk (default: 512).
+     chunk_overlap : int
+         Overlap between chunks (default: 50).
+     doc_id : str, optional
+         Document ID for the chunks. If None, generates from content hash.
+     include_metadata : bool
+         Include rich metadata in chunks (default: True).
+
+     Returns
+     -------
+     list[Chunk]
+         List of text chunks with metadata.
+
+     Examples
+     --------
+     >>> chunks = chunk_text("Long document...", chunk_size=256)
+     >>> print(chunks[0].metadata)
+     {'document_id': 'a1b2c3...', 'sequence_number': 0, 'chunk_start': 0, 'chunk_end': 256}
+     """
+     if chunk_overlap >= chunk_size:
+         raise ValueError("chunk_overlap must be less than chunk_size")
+
+     # Generate document ID if not provided
+     effective_doc_id = doc_id or generate_document_id(text)
+
+     chunks = []
+     start = 0
+     sequence_number = 0
+
+     while start < len(text):
+         end = min(start + chunk_size, len(text))
+         chunk_content = text[start:end].strip()
+
+         if chunk_content:
+             metadata = {}
+             if include_metadata:
+                 metadata = {
+                     "document_id": effective_doc_id,
+                     "sequence_number": sequence_number,
+                     "chunk_start": start,
+                     "chunk_end": end,
+                 }
+
+             chunks.append(
+                 Chunk(
+                     content=chunk_content,
+                     doc_id=effective_doc_id,
+                     chunk_index=sequence_number,
+                     metadata=metadata,
+                 )
+             )
+             sequence_number += 1
+
+         start = end - chunk_overlap
+         if start >= len(text) - chunk_overlap:
+             break
+
+     return chunks
+
+
+ def chunk_document(
+     doc: Document,
+     chunk_size: int = 512,
+     chunk_overlap: int = 50,
+     include_metadata: bool = True,
+ ) -> list[Chunk]:
+     """
+     Split a Document into overlapping chunks with rich metadata.
+
+     Parameters
+     ----------
+     doc : Document
+         Document to chunk.
+     chunk_size : int
+         Maximum characters per chunk.
+     chunk_overlap : int
+         Overlap between chunks.
+     include_metadata : bool
+         Include rich metadata in chunks (default: True).
+
+     Returns
+     -------
+     list[Chunk]
+         List of chunks from the document with metadata.
+     """
+     chunks = chunk_text(doc.content, chunk_size, chunk_overlap, doc.id, include_metadata)
+
+     # Merge document metadata into chunk metadata
+     if doc.metadata and include_metadata:
+         for chunk in chunks:
+             chunk.metadata = {**doc.metadata, **chunk.metadata}
+
+     return chunks
+
+
+ def chunk_by_separator(
+     text: str,
+     separator: str = "\n\n",
+     doc_id: str | None = None,
+     include_metadata: bool = True,
+ ) -> list[Chunk]:
+     """
+     Split text by a separator (e.g., paragraphs, sections).
+
+     Parameters
+     ----------
+     text : str
+         Text to split.
+     separator : str
+         Separator string (default: double newline for paragraphs).
+     doc_id : str, optional
+         Document ID for the chunks. If None, generates from content hash.
+     include_metadata : bool
+         Include rich metadata in chunks (default: True).
+
+     Returns
+     -------
+     list[Chunk]
+         List of chunks with metadata.
+
+     Examples
+     --------
+     >>> chunks = chunk_by_separator(text, separator="\\n---\\n")
+     """
+     effective_doc_id = doc_id or generate_document_id(text)
+     parts = text.split(separator)
+     chunks = []
+     current_pos = 0
+
+     for _idx, part in enumerate(parts):
+         content = part.strip()
+         if content:
+             metadata = {}
+             if include_metadata:
+                 # Find actual position in original text
+                 part_start = text.find(part, current_pos)
+                 part_end = part_start + len(part) if part_start >= 0 else current_pos + len(part)
+                 metadata = {
+                     "document_id": effective_doc_id,
+                     "sequence_number": len(chunks),
+                     "chunk_start": part_start if part_start >= 0 else current_pos,
+                     "chunk_end": part_end,
+                 }
+                 current_pos = part_end
+
+             chunks.append(
+                 Chunk(
+                     content=content,
+                     doc_id=effective_doc_id,
+                     chunk_index=len(chunks),
+                     metadata=metadata,
+                 )
+             )
+
+     return chunks
+
+
+ def chunk_rst_sections(
+     text: str,
+     doc_id: str | None = None,
+     include_metadata: bool = True,
+ ) -> list[Chunk]:
+     """
+     Split RST document by section headers with rich metadata.
+
+     Parameters
+     ----------
+     text : str
+         RST document text.
+     doc_id : str, optional
+         Document ID for the chunks. If None, generates from content hash.
+     include_metadata : bool
+         Include rich metadata in chunks (default: True).
+
+     Returns
+     -------
+     list[Chunk]
+         List of section chunks with metadata.
+     """
+     effective_doc_id = doc_id or generate_document_id(text)
+
+     # Match RST section headers (title followed by underline of =, -, ~, etc.)
+     pattern = r"\n([^\n]+)\n([=\-~`\'\"^_*+#]+)\n"
+
+     # Find all section positions
+     matches = list(re.finditer(pattern, text))
+
+     if not matches:
+         # No sections found, return whole text as one chunk
+         if text.strip():
+             metadata = {}
+             if include_metadata:
+                 metadata = {
+                     "document_id": effective_doc_id,
+                     "sequence_number": 0,
+                     "chunk_start": 0,
+                     "chunk_end": len(text),
+                 }
+             return [Chunk(content=text.strip(), doc_id=effective_doc_id, chunk_index=0, metadata=metadata)]
+         return []
+
+     chunks = []
+
+     # Handle content before first section
+     first_pos = matches[0].start()
+     if first_pos > 0:
+         pre_content = text[:first_pos].strip()
+         if pre_content:
+             metadata = {}
+             if include_metadata:
+                 metadata = {
+                     "document_id": effective_doc_id,
+                     "sequence_number": 0,
+                     "chunk_start": 0,
+                     "chunk_end": first_pos,
+                 }
+             chunks.append(Chunk(content=pre_content, doc_id=effective_doc_id, chunk_index=0, metadata=metadata))
+
+     # Extract each section
+     for i, match in enumerate(matches):
+         start = match.start()
+         end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
+
+         section_content = text[start:end].strip()
+         if section_content:
+             metadata = {}
+             if include_metadata:
+                 metadata = {
+                     "document_id": effective_doc_id,
+                     "sequence_number": len(chunks),
+                     "chunk_start": start,
+                     "chunk_end": end,
+                 }
+             chunks.append(
+                 Chunk(
+                     content=section_content,
+                     doc_id=effective_doc_id,
+                     chunk_index=len(chunks),
+                     metadata=metadata,
+                 )
+             )
+
+     return chunks
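
End to end, these loaders compose as load → deduplicate → chunk. Here is a short sketch under the assumption of a local docs/ directory of .rst files; the path and chunking parameters are illustrative.

from ragit.loaders import chunk_document, deduplicate_documents, load_directory

# Load every .rst file under docs/ (recursively), drop exact-content duplicates,
# then split each Document into overlapping character chunks.
docs = deduplicate_documents(load_directory("docs/", "*.rst", recursive=True))

chunks = []
for doc in docs:
    chunks.extend(chunk_document(doc, chunk_size=512, chunk_overlap=50))

for chunk in chunks[:3]:
    # Each chunk keeps the source-file metadata plus document_id, sequence_number,
    # and character offsets, which supports window-style retrieval later on.
    print(chunk.doc_id, chunk.metadata.get("sequence_number"), len(chunk.content))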