ragit-0.3-py3-none-any.whl → ragit-0.10.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ragit/__init__.py +128 -2
- ragit/assistant.py +757 -0
- ragit/config.py +204 -0
- ragit/core/__init__.py +5 -0
- ragit/core/experiment/__init__.py +22 -0
- ragit/core/experiment/experiment.py +577 -0
- ragit/core/experiment/results.py +131 -0
- ragit/exceptions.py +271 -0
- ragit/loaders.py +401 -0
- ragit/logging.py +194 -0
- ragit/monitor.py +307 -0
- ragit/providers/__init__.py +35 -0
- ragit/providers/base.py +147 -0
- ragit/providers/function_adapter.py +237 -0
- ragit/providers/ollama.py +670 -0
- ragit/utils/__init__.py +105 -0
- ragit/version.py +5 -0
- ragit-0.10.1.dist-info/METADATA +153 -0
- ragit-0.10.1.dist-info/RECORD +22 -0
- {ragit-0.3.dist-info → ragit-0.10.1.dist-info}/WHEEL +1 -1
- ragit-0.10.1.dist-info/licenses/LICENSE +201 -0
- ragit/main.py +0 -384
- ragit-0.3.dist-info/METADATA +0 -163
- ragit-0.3.dist-info/RECORD +0 -6
- {ragit-0.3.dist-info → ragit-0.10.1.dist-info}/top_level.txt +0 -0
ragit/exceptions.py
ADDED
@@ -0,0 +1,271 @@
#
# Copyright RODMENA LIMITED 2025
# SPDX-License-Identifier: Apache-2.0
#
"""
Custom exception hierarchy for ragit.

Provides structured exceptions for different failure types,
enabling better error handling and debugging.

Pattern inspired by ai4rag exception_handler.py.
"""

from typing import Any


class RagitError(Exception):
    """Base exception for all ragit errors.

    All ragit-specific exceptions inherit from this class,
    making it easy to catch all ragit errors with a single handler.

    Parameters
    ----------
    message : str
        Human-readable error message.
    original_exception : Exception, optional
        The underlying exception that caused this error.

    Examples
    --------
    >>> try:
    ...     provider.embed("text", "model")
    ... except RagitError as e:
    ...     print(f"Ragit error: {e}")
    ...     if e.original_exception:
    ...         print(f"Caused by: {e.original_exception}")
    """

    def __init__(self, message: str, original_exception: Exception | None = None):
        self.message = message
        self.original_exception = original_exception
        super().__init__(self._format_message())

    def _format_message(self) -> str:
        """Format the error message, including original exception if present."""
        if self.original_exception:
            return f"{self.message}: {self.original_exception}"
        return self.message


class ConfigurationError(RagitError):
    """Configuration validation or loading failed.

    Raised when:
    - Environment variables have invalid values
    - Required configuration is missing
    - URL formats are invalid
    """

    pass


class ProviderError(RagitError):
    """Provider communication or operation failed.

    Raised when:
    - Network connection to provider fails
    - Provider returns an error response
    - Provider timeout occurs
    """

    pass


class IndexingError(RagitError):
    """Document indexing or embedding failed.

    Raised when:
    - Embedding generation fails
    - Document chunking fails
    - Index building fails
    """

    pass


class RetrievalError(RagitError):
    """Retrieval operation failed.

    Raised when:
    - Query embedding fails
    - Search operation fails
    - No results can be retrieved
    """

    pass


class GenerationError(RagitError):
    """LLM generation failed.

    Raised when:
    - LLM call fails
    - Response parsing fails
    - Context exceeds model limits
    """

    pass


class EvaluationError(RagitError):
    """Evaluation or scoring failed.

    Raised when:
    - Metric calculation fails
    - Benchmark validation fails
    - Score extraction fails
    """

    pass


class ExceptionAggregator:
    """Collect and report exceptions during batch operations.

    Useful for operations that should continue even when some
    items fail, then report all failures at the end.

    Pattern from ai4rag exception_handler.py.

    Examples
    --------
    >>> aggregator = ExceptionAggregator()
    >>> for doc in documents:
    ...     try:
    ...         process(doc)
    ...     except Exception as e:
    ...         aggregator.record(f"doc:{doc.id}", e)
    >>> if aggregator.has_errors:
    ...     print(aggregator.get_summary())
    """

    def __init__(self) -> None:
        self._exceptions: list[tuple[str, Exception]] = []

    def record(self, context: str, exception: Exception) -> None:
        """Record an exception with context.

        Parameters
        ----------
        context : str
            Description of where/why the exception occurred.
        exception : Exception
            The exception that was raised.
        """
        self._exceptions.append((context, exception))

    @property
    def has_errors(self) -> bool:
        """Check if any errors have been recorded."""
        return len(self._exceptions) > 0

    @property
    def error_count(self) -> int:
        """Get the number of recorded errors."""
        return len(self._exceptions)

    @property
    def exceptions(self) -> list[tuple[str, Exception]]:
        """Get all recorded exceptions with their contexts."""
        return list(self._exceptions)

    def get_by_type(self, exc_type: type[Exception]) -> list[tuple[str, Exception]]:
        """Get exceptions of a specific type.

        Parameters
        ----------
        exc_type : type
            The exception type to filter by.

        Returns
        -------
        list[tuple[str, Exception]]
            Exceptions matching the type with their contexts.
        """
        return [(ctx, exc) for ctx, exc in self._exceptions if isinstance(exc, exc_type)]

    def get_summary(self) -> str:
        """Get a summary of all recorded errors.

        Returns
        -------
        str
            Human-readable summary of errors.
        """
        if not self._exceptions:
            return "No errors recorded"

        # Group by exception type
        by_type: dict[str, int] = {}
        for _, exc in self._exceptions:
            exc_type = type(exc).__name__
            by_type[exc_type] = by_type.get(exc_type, 0) + 1

        most_common = max(by_type.items(), key=lambda x: x[1])
        type_summary = ", ".join(f"{t}:{c}" for t, c in sorted(by_type.items(), key=lambda x: -x[1]))

        return f"{self.error_count} errors ({type_summary}). Most common: {most_common[0]} ({most_common[1]}x)"

    def get_details(self) -> str:
        """Get detailed information about all errors.

        Returns
        -------
        str
            Detailed error information with contexts.
        """
        if not self._exceptions:
            return "No errors recorded"

        lines = [f"Total errors: {self.error_count}", ""]
        for i, (context, exc) in enumerate(self._exceptions, 1):
            lines.append(f"{i}. [{context}] {type(exc).__name__}: {exc}")

        return "\n".join(lines)

    def raise_if_errors(self, message: str = "Operation failed") -> None:
        """Raise RagitError if any errors were recorded.

        Parameters
        ----------
        message : str
            Base message for the raised error.

        Raises
        ------
        RagitError
            If any errors were recorded.
        """
        if self.has_errors:
            raise RagitError(f"{message}: {self.get_summary()}")

    def clear(self) -> None:
        """Clear all recorded exceptions."""
        self._exceptions.clear()

    def merge_from(self, other: "ExceptionAggregator") -> None:
        """Merge exceptions from another aggregator.

        Parameters
        ----------
        other : ExceptionAggregator
            Another aggregator to merge from.
        """
        self._exceptions.extend(other._exceptions)

    def to_dict(self) -> dict[str, Any]:
        """Export as dictionary for JSON serialization.

        Returns
        -------
        dict
            Dictionary representation of aggregated errors.
        """
        return {
            "error_count": self.error_count,
            "errors": [
                {"context": ctx, "type": type(exc).__name__, "message": str(exc)} for ctx, exc in self._exceptions
            ],
        }
ragit/loaders.py
ADDED
@@ -0,0 +1,401 @@
#
# Copyright RODMENA LIMITED 2025
# SPDX-License-Identifier: Apache-2.0
#
"""
Document loading and chunking utilities.

Provides simple functions to load documents from files and chunk text.

Includes ai4rag-inspired patterns:
- Auto-generated document IDs via SHA256 hash
- Sequence numbering for chunk ordering
- Deduplication via content hashing
"""

import hashlib
import re
from pathlib import Path

from ragit.core.experiment.experiment import Chunk, Document


def generate_document_id(content: str) -> str:
    """
    Generate a unique document ID from content using SHA256 hash.

    Pattern from ai4rag langchain_chunker.py.

    Parameters
    ----------
    content : str
        Document content to hash.

    Returns
    -------
    str
        16-character hex string (first 64 bits of SHA256).

    Examples
    --------
    >>> doc_id = generate_document_id("Hello, world!")
    >>> len(doc_id)
    16
    """
    return hashlib.sha256(content.encode()).hexdigest()[:16]


def deduplicate_documents(documents: list[Document]) -> list[Document]:
    """
    Remove duplicate documents based on content hash.

    Pattern from ai4rag chroma.py.

    Parameters
    ----------
    documents : list[Document]
        Documents to deduplicate.

    Returns
    -------
    list[Document]
        Unique documents (first occurrence kept).

    Examples
    --------
    >>> unique_docs = deduplicate_documents(docs)
    >>> print(f"Removed {len(docs) - len(unique_docs)} duplicates")
    """
    seen_hashes: set[str] = set()
    unique_docs: list[Document] = []

    for doc in documents:
        content_hash = generate_document_id(doc.content)
        if content_hash not in seen_hashes:
            seen_hashes.add(content_hash)
            unique_docs.append(doc)

    return unique_docs


def load_text(path: str | Path) -> Document:
    """
    Load a single text file as a Document.

    Parameters
    ----------
    path : str or Path
        Path to the text file (.txt, .md, .rst, etc.)

    Returns
    -------
    Document
        Document with file content and metadata.

    Examples
    --------
    >>> doc = load_text("docs/tutorial.rst")
    >>> print(doc.id, len(doc.content))
    """
    path = Path(path)
    content = path.read_text(encoding="utf-8")
    return Document(id=path.stem, content=content, metadata={"source": str(path), "filename": path.name})


def load_directory(path: str | Path, pattern: str = "*.txt", recursive: bool = False) -> list[Document]:
    """
    Load all matching files from a directory as Documents.

    Parameters
    ----------
    path : str or Path
        Directory path.
    pattern : str
        Glob pattern for files (default: "*.txt").
    recursive : bool
        If True, search recursively (default: False).

    Returns
    -------
    list[Document]
        List of loaded documents.

    Examples
    --------
    >>> docs = load_directory("docs/", "*.rst")
    >>> docs = load_directory("docs/", "**/*.md", recursive=True)
    """
    path = Path(path)
    glob_method = path.rglob if recursive else path.glob
    documents = []

    for file_path in sorted(glob_method(pattern)):
        if file_path.is_file():
            documents.append(load_text(file_path))

    return documents


def chunk_text(
    text: str,
    chunk_size: int = 512,
    chunk_overlap: int = 50,
    doc_id: str | None = None,
    include_metadata: bool = True,
) -> list[Chunk]:
    """
    Split text into overlapping chunks with rich metadata.

    Includes ai4rag-inspired metadata:
    - document_id: SHA256 hash for deduplication and window search
    - sequence_number: Order within the document
    - chunk_start/chunk_end: Character positions in original text

    Parameters
    ----------
    text : str
        Text to chunk.
    chunk_size : int
        Maximum characters per chunk (default: 512).
    chunk_overlap : int
        Overlap between chunks (default: 50).
    doc_id : str, optional
        Document ID for the chunks. If None, generates from content hash.
    include_metadata : bool
        Include rich metadata in chunks (default: True).

    Returns
    -------
    list[Chunk]
        List of text chunks with metadata.

    Examples
    --------
    >>> chunks = chunk_text("Long document...", chunk_size=256)
    >>> print(chunks[0].metadata)
    {'document_id': 'a1b2c3...', 'sequence_number': 0, 'chunk_start': 0, 'chunk_end': 256}
    """
    if chunk_overlap >= chunk_size:
        raise ValueError("chunk_overlap must be less than chunk_size")

    # Generate document ID if not provided
    effective_doc_id = doc_id or generate_document_id(text)

    chunks = []
    start = 0
    sequence_number = 0

    while start < len(text):
        end = min(start + chunk_size, len(text))
        chunk_content = text[start:end].strip()

        if chunk_content:
            metadata = {}
            if include_metadata:
                metadata = {
                    "document_id": effective_doc_id,
                    "sequence_number": sequence_number,
                    "chunk_start": start,
                    "chunk_end": end,
                }

            chunks.append(
                Chunk(
                    content=chunk_content,
                    doc_id=effective_doc_id,
                    chunk_index=sequence_number,
                    metadata=metadata,
                )
            )
            sequence_number += 1

        start = end - chunk_overlap
        if start >= len(text) - chunk_overlap:
            break

    return chunks


def chunk_document(
    doc: Document,
    chunk_size: int = 512,
    chunk_overlap: int = 50,
    include_metadata: bool = True,
) -> list[Chunk]:
    """
    Split a Document into overlapping chunks with rich metadata.

    Parameters
    ----------
    doc : Document
        Document to chunk.
    chunk_size : int
        Maximum characters per chunk.
    chunk_overlap : int
        Overlap between chunks.
    include_metadata : bool
        Include rich metadata in chunks (default: True).

    Returns
    -------
    list[Chunk]
        List of chunks from the document with metadata.
    """
    chunks = chunk_text(doc.content, chunk_size, chunk_overlap, doc.id, include_metadata)

    # Merge document metadata into chunk metadata
    if doc.metadata and include_metadata:
        for chunk in chunks:
            chunk.metadata = {**doc.metadata, **chunk.metadata}

    return chunks


def chunk_by_separator(
    text: str,
    separator: str = "\n\n",
    doc_id: str | None = None,
    include_metadata: bool = True,
) -> list[Chunk]:
    """
    Split text by a separator (e.g., paragraphs, sections).

    Parameters
    ----------
    text : str
        Text to split.
    separator : str
        Separator string (default: double newline for paragraphs).
    doc_id : str, optional
        Document ID for the chunks. If None, generates from content hash.
    include_metadata : bool
        Include rich metadata in chunks (default: True).

    Returns
    -------
    list[Chunk]
        List of chunks with metadata.

    Examples
    --------
    >>> chunks = chunk_by_separator(text, separator="\\n---\\n")
    """
    effective_doc_id = doc_id or generate_document_id(text)
    parts = text.split(separator)
    chunks = []
    current_pos = 0

    for _idx, part in enumerate(parts):
        content = part.strip()
        if content:
            metadata = {}
            if include_metadata:
                # Find actual position in original text
                part_start = text.find(part, current_pos)
                part_end = part_start + len(part) if part_start >= 0 else current_pos + len(part)
                metadata = {
                    "document_id": effective_doc_id,
                    "sequence_number": len(chunks),
                    "chunk_start": part_start if part_start >= 0 else current_pos,
                    "chunk_end": part_end,
                }
                current_pos = part_end

            chunks.append(
                Chunk(
                    content=content,
                    doc_id=effective_doc_id,
                    chunk_index=len(chunks),
                    metadata=metadata,
                )
            )

    return chunks


def chunk_rst_sections(
    text: str,
    doc_id: str | None = None,
    include_metadata: bool = True,
) -> list[Chunk]:
    """
    Split RST document by section headers with rich metadata.

    Parameters
    ----------
    text : str
        RST document text.
    doc_id : str, optional
        Document ID for the chunks. If None, generates from content hash.
    include_metadata : bool
        Include rich metadata in chunks (default: True).

    Returns
    -------
    list[Chunk]
        List of section chunks with metadata.
    """
    effective_doc_id = doc_id or generate_document_id(text)

    # Match RST section headers (title followed by underline of =, -, ~, etc.)
    pattern = r"\n([^\n]+)\n([=\-~`\'\"^_*+#]+)\n"

    # Find all section positions
    matches = list(re.finditer(pattern, text))

    if not matches:
        # No sections found, return whole text as one chunk
        if text.strip():
            metadata = {}
            if include_metadata:
                metadata = {
                    "document_id": effective_doc_id,
                    "sequence_number": 0,
                    "chunk_start": 0,
                    "chunk_end": len(text),
                }
            return [Chunk(content=text.strip(), doc_id=effective_doc_id, chunk_index=0, metadata=metadata)]
        return []

    chunks = []

    # Handle content before first section
    first_pos = matches[0].start()
    if first_pos > 0:
        pre_content = text[:first_pos].strip()
        if pre_content:
            metadata = {}
            if include_metadata:
                metadata = {
                    "document_id": effective_doc_id,
                    "sequence_number": 0,
                    "chunk_start": 0,
                    "chunk_end": first_pos,
                }
            chunks.append(Chunk(content=pre_content, doc_id=effective_doc_id, chunk_index=0, metadata=metadata))

    # Extract each section
    for i, match in enumerate(matches):
        start = match.start()
        end = matches[i + 1].start() if i + 1 < len(matches) else len(text)

        section_content = text[start:end].strip()
        if section_content:
            metadata = {}
            if include_metadata:
                metadata = {
                    "document_id": effective_doc_id,
                    "sequence_number": len(chunks),
                    "chunk_start": start,
                    "chunk_end": end,
                }
            chunks.append(
                Chunk(
                    content=section_content,
                    doc_id=effective_doc_id,
                    chunk_index=len(chunks),
                    metadata=metadata,
                )
            )

    return chunks
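
Taken together, these loaders cover the ingest side of a RAG pipeline: load files, deduplicate, then chunk. A short usage sketch, assuming a local docs/ directory of .rst files (the path is hypothetical and the downstream index/provider wiring is omitted):

from ragit.loaders import chunk_document, deduplicate_documents, load_directory

# Load and deduplicate reStructuredText sources (hypothetical docs/ path).
docs = deduplicate_documents(load_directory("docs/", "*.rst"))

# 512-character windows with 50 characters of overlap; each chunk carries
# document_id, sequence_number and chunk_start/chunk_end for window-style retrieval.
chunks = []
for doc in docs:
    chunks.extend(chunk_document(doc, chunk_size=512, chunk_overlap=50))

print(f"{len(docs)} documents -> {len(chunks)} chunks")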