ragit 0.8.1__py3-none-any.whl → 0.8.2__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- ragit/assistant.py +139 -4
- ragit/core/experiment/experiment.py +3 -4
- ragit/loaders.py +37 -11
- ragit/version.py +1 -1
- {ragit-0.8.1.dist-info → ragit-0.8.2.dist-info}/METADATA +1 -1
- {ragit-0.8.1.dist-info → ragit-0.8.2.dist-info}/RECORD +9 -9
- {ragit-0.8.1.dist-info → ragit-0.8.2.dist-info}/WHEEL +1 -1
- {ragit-0.8.1.dist-info → ragit-0.8.2.dist-info}/licenses/LICENSE +0 -0
- {ragit-0.8.1.dist-info → ragit-0.8.2.dist-info}/top_level.txt +0 -0
ragit/assistant.py
CHANGED
@@ -116,8 +116,7 @@ class RAGAssistant:
             # Use explicit provider
             if not isinstance(provider, BaseEmbeddingProvider):
                 raise ValueError(
-                    "Provider must implement BaseEmbeddingProvider for embeddings. "
-                    "Alternatively, provide embed_fn."
+                    "Provider must implement BaseEmbeddingProvider for embeddings. Alternatively, provide embed_fn."
                 )
             self._embedding_provider = provider
             if isinstance(provider, BaseLLMProvider):
@@ -156,7 +155,20 @@ class RAGAssistant:

         if path.is_dir():
             docs: list[Document] = []
-            for pattern in (
+            for pattern in (
+                "*.txt",
+                "*.md",
+                "*.rst",
+                "*.py",
+                "*.js",
+                "*.ts",
+                "*.go",
+                "*.java",
+                "*.c",
+                "*.cpp",
+                "*.h",
+                "*.hpp",
+            ):
                 docs.extend(load_directory(path, pattern))
             return docs
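This hunk widens the set of file types indexed from a directory to include common source-code extensions. For reference, a minimal sketch of the equivalent loading loop using load_directory from ragit.loaders; the docs/ path is hypothetical:

from pathlib import Path

from ragit.loaders import load_directory

path = Path("docs")  # hypothetical source tree
docs = []
# Mirrors the pattern tuple added above: plain text, markup, and now
# common source-code extensions.
for pattern in ("*.txt", "*.md", "*.rst", "*.py", "*.js", "*.ts",
                "*.go", "*.java", "*.c", "*.cpp", "*.h", "*.hpp"):
    docs.extend(load_directory(path, pattern))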
@@ -169,7 +181,7 @@ class RAGAssistant:
         for doc in self.documents:
             # Use RST section chunking for .rst files, otherwise regular chunking
             if doc.metadata.get("filename", "").endswith(".rst"):
-                chunks = chunk_rst_sections(doc.content, doc.id)
+                chunks = chunk_rst_sections(doc.content, doc.id, metadata=doc.metadata)
             else:
                 chunks = chunk_document(doc, self.chunk_size, self.chunk_overlap)
             all_chunks.extend(chunks)
@@ -194,6 +206,129 @@ class RAGAssistant:
         self._chunks = tuple(all_chunks)
         self._embedding_matrix = embedding_matrix / norms

+    def add_documents(self, documents: list[Document] | str | Path) -> int:
+        """Add documents to the existing index incrementally.
+
+        Args:
+            documents: Documents to add.
+
+        Returns:
+            Number of chunks added.
+        """
+        new_docs = self._load_documents(documents)
+        if not new_docs:
+            return 0
+
+        self.documents.extend(new_docs)
+
+        # Chunk new docs
+        new_chunks: list[Chunk] = []
+        for doc in new_docs:
+            if doc.metadata.get("filename", "").endswith(".rst"):
+                chunks = chunk_rst_sections(doc.content, doc.id, metadata=doc.metadata)
+            else:
+                chunks = chunk_document(doc, self.chunk_size, self.chunk_overlap)
+            new_chunks.extend(chunks)
+
+        if not new_chunks:
+            return 0
+
+        # Embed new chunks
+        texts = [chunk.content for chunk in new_chunks]
+        responses = self._embedding_provider.embed_batch(texts, self.embedding_model)
+
+        new_matrix = np.array([response.embedding for response in responses], dtype=np.float64)
+
+        # Normalize
+        norms = np.linalg.norm(new_matrix, axis=1, keepdims=True)
+        norms[norms == 0] = 1
+        new_matrix_norm = new_matrix / norms
+
+        # Update state
+        current_chunks = list(self._chunks)
+        current_chunks.extend(new_chunks)
+        self._chunks = tuple(current_chunks)
+
+        if self._embedding_matrix is None:
+            self._embedding_matrix = new_matrix_norm
+        else:
+            self._embedding_matrix = np.vstack((self._embedding_matrix, new_matrix_norm))
+
+        return len(new_chunks)
+
+    def remove_documents(self, source_path_pattern: str) -> int:
+        """Remove documents matching a source path pattern.
+
+        Args:
+            source_path_pattern: Glob pattern to match 'source' metadata.
+
+        Returns:
+            Number of chunks removed.
+        """
+        import fnmatch
+
+        if not self._chunks:
+            return 0
+
+        indices_to_keep = []
+        kept_chunks = []
+        removed_count = 0
+
+        for i, chunk in enumerate(self._chunks):
+            source = chunk.metadata.get("source", "")
+            if not source or not fnmatch.fnmatch(source, source_path_pattern):
+                indices_to_keep.append(i)
+                kept_chunks.append(chunk)
+            else:
+                removed_count += 1
+
+        if removed_count == 0:
+            return 0
+
+        self._chunks = tuple(kept_chunks)
+
+        if self._embedding_matrix is not None:
+            if not kept_chunks:
+                self._embedding_matrix = None
+            else:
+                self._embedding_matrix = self._embedding_matrix[indices_to_keep]
+
+        # Also remove from self.documents
+        self.documents = [
+            doc for doc in self.documents if not fnmatch.fnmatch(doc.metadata.get("source", ""), source_path_pattern)
+        ]
+
+        return removed_count
+
+    def update_documents(self, documents: list[Document] | str | Path) -> int:
+        """Update existing documents (remove old, add new).
+
+        Uses document source path to identify what to remove.
+
+        Args:
+            documents: New versions of documents.
+
+        Returns:
+            Number of chunks added.
+        """
+        new_docs = self._load_documents(documents)
+        if not new_docs:
+            return 0
+
+        # Identify sources to remove
+        sources_to_remove = set()
+        for doc in new_docs:
+            source = doc.metadata.get("source")
+            if source:
+                sources_to_remove.add(source)
+
+        # Remove old versions
+        for source in sources_to_remove:
+            self.remove_documents(source)
+
+        # Add new versions
+        return self.add_documents(new_docs)
+
     def retrieve(self, query: str, top_k: int = 3) -> list[tuple[Chunk, float]]:
         """
         Retrieve relevant chunks for a query.
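To show how the three methods added above compose, a minimal sketch; the constructor call and file paths are hypothetical, while the method names, argument types, and return values come from the diff:

from ragit.assistant import RAGAssistant

# Hypothetical construction; only the three method calls below are the API
# introduced in 0.8.2. Each returns a chunk count.
assistant = RAGAssistant("docs/")

assistant.add_documents("notes/")            # index new files incrementally
assistant.remove_documents("docs/old/*.md")  # fnmatch glob against chunk 'source' metadata
assistant.update_documents("docs/guide.md")  # remove old chunks for the source, then re-add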
ragit/core/experiment/experiment.py
CHANGED

@@ -51,6 +51,7 @@ class Chunk:
     doc_id: str
     chunk_index: int
     embedding: tuple[float, ...] | list[float] | None = None
+    metadata: dict[str, Any] = field(default_factory=dict)


 @dataclass
@@ -203,8 +204,7 @@ class RagitExperiment:
         elif provider is not None:
             if not isinstance(provider, BaseEmbeddingProvider):
                 raise ValueError(
-                    "Provider must implement BaseEmbeddingProvider for embeddings. "
-                    "Alternatively, provide embed_fn."
+                    "Provider must implement BaseEmbeddingProvider for embeddings. Alternatively, provide embed_fn."
                 )
             self._embedding_provider = provider
             if isinstance(provider, BaseLLMProvider):

@@ -220,8 +220,7 @@ class RagitExperiment:
         # LLM is required for evaluation
         if self._llm_provider is None:
             raise ValueError(
-                "RagitExperiment requires LLM for evaluation. "
-                "Provide generate_fn or a provider with LLM support."
+                "RagitExperiment requires LLM for evaluation. Provide generate_fn or a provider with LLM support."
             )

     @property
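For illustration, a small sketch of the Chunk dataclass with its new metadata field; the field names come from the hunks above, and the values are made up:

from ragit.core.experiment.experiment import Chunk

# Illustrative values; metadata defaults to an empty dict when omitted.
chunk = Chunk(
    content="Installation\n============\n...",
    doc_id="guide",
    chunk_index=0,
    metadata={"source": "docs/guide.rst", "filename": "guide.rst"},
)
print(chunk.metadata["source"])  # docs/guide.rst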
ragit/loaders.py
CHANGED
@@ -10,6 +10,7 @@ Provides simple functions to load documents from files and chunk text.

 import re
 from pathlib import Path
+from typing import Any

 from ragit.core.experiment.experiment import Chunk, Document

@@ -72,7 +73,13 @@ def load_directory(path: str | Path, pattern: str = "*.txt", recursive: bool = F
     return documents


-def chunk_text(text: str, chunk_size: int = 512, chunk_overlap: int = 50, doc_id: str = "doc") -> list[Chunk]:
+def chunk_text(
+    text: str,
+    chunk_size: int = 512,
+    chunk_overlap: int = 50,
+    doc_id: str = "doc",
+    metadata: dict[str, Any] | None = None,
+) -> list[Chunk]:
     """
     Split text into overlapping chunks.

@@ -86,6 +93,8 @@ def chunk_text(text: str, chunk_size: int = 512, chunk_overlap: int = 50, doc_id
         Overlap between chunks (default: 50).
     doc_id : str
         Document ID for the chunks (default: "doc").
+    metadata : dict, optional
+        Metadata to attach to each chunk (default: None).

     Returns
     -------
@@ -102,13 +111,16 @@ def chunk_text(text: str, chunk_size: int = 512, chunk_overlap: int = 50, doc_id
     chunks = []
     start = 0
     chunk_idx = 0
+    chunk_metadata = metadata or {}

     while start < len(text):
         end = start + chunk_size
-
+        chunk_content = text[start:end].strip()

-        if
-            chunks.append(
+        if chunk_content:
+            chunks.append(
+                Chunk(content=chunk_content, doc_id=doc_id, chunk_index=chunk_idx, metadata=chunk_metadata.copy())
+            )
             chunk_idx += 1

         start = end - chunk_overlap
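A short usage sketch of the updated chunk_text signature; the input text and metadata values are illustrative:

from ragit.loaders import chunk_text

text = "word " * 500  # illustrative input
chunks = chunk_text(
    text,
    chunk_size=512,
    chunk_overlap=50,
    doc_id="doc-1",
    metadata={"source": "notes/example.txt"},
)
# Windows advance by chunk_size - chunk_overlap = 462 characters, and every
# chunk receives its own copy of the metadata dict.
assert all(c.metadata["source"] == "notes/example.txt" for c in chunks)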
@@ -136,10 +148,12 @@ def chunk_document(doc: Document, chunk_size: int = 512, chunk_overlap: int = 50
     list[Chunk]
         List of chunks from the document.
     """
-    return chunk_text(doc.content, chunk_size, chunk_overlap, doc.id)
+    return chunk_text(doc.content, chunk_size, chunk_overlap, doc.id, metadata=doc.metadata)


-def chunk_by_separator(text: str, separator: str = "\n\n", doc_id: str = "doc") -> list[Chunk]:
+def chunk_by_separator(
+    text: str, separator: str = "\n\n", doc_id: str = "doc", metadata: dict[str, Any] | None = None
+) -> list[Chunk]:
     """
     Split text by a separator (e.g., paragraphs, sections).
@@ -151,6 +165,8 @@ def chunk_by_separator(text: str, separator: str = "\n\n", doc_id: str = "doc")
         Separator string (default: double newline for paragraphs).
     doc_id : str
         Document ID for the chunks.
+    metadata : dict, optional
+        Metadata to attach to each chunk (default: None).

     Returns
     -------
@@ -163,16 +179,17 @@ def chunk_by_separator(text: str, separator: str = "\n\n", doc_id: str = "doc")
     """
     parts = text.split(separator)
     chunks = []
+    chunk_metadata = metadata or {}

     for idx, part in enumerate(parts):
         content = part.strip()
         if content:
-            chunks.append(Chunk(content=content, doc_id=doc_id, chunk_index=idx))
+            chunks.append(Chunk(content=content, doc_id=doc_id, chunk_index=idx, metadata=chunk_metadata.copy()))

     return chunks


-def chunk_rst_sections(text: str, doc_id: str = "doc") -> list[Chunk]:
+def chunk_rst_sections(text: str, doc_id: str = "doc", metadata: dict[str, Any] | None = None) -> list[Chunk]:
     """
     Split RST document by section headers.
@@ -182,6 +199,8 @@ def chunk_rst_sections(text: str, doc_id: str = "doc") -> list[Chunk]:
         RST document text.
     doc_id : str
         Document ID for the chunks.
+    metadata : dict, optional
+        Metadata to attach to each chunk (default: None).

     Returns
     -------
@@ -190,13 +209,18 @@ def chunk_rst_sections(text: str, doc_id: str = "doc") -> list[Chunk]:
     """
     # Match RST section headers (title followed by underline of =, -, ~, etc.)
     pattern = r"\n([^\n]+)\n([=\-~`\'\"^_*+#]+)\n"
+    chunk_metadata = metadata or {}

     # Find all section positions
     matches = list(re.finditer(pattern, text))

     if not matches:
         # No sections found, return whole text as one chunk
-        return
+        return (
+            [Chunk(content=text.strip(), doc_id=doc_id, chunk_index=0, metadata=chunk_metadata.copy())]
+            if text.strip()
+            else []
+        )

     chunks = []
@@ -205,7 +229,7 @@ def chunk_rst_sections(text: str, doc_id: str = "doc") -> list[Chunk]:
     if first_pos > 0:
         pre_content = text[:first_pos].strip()
         if pre_content:
-            chunks.append(Chunk(content=pre_content, doc_id=doc_id, chunk_index=0))
+            chunks.append(Chunk(content=pre_content, doc_id=doc_id, chunk_index=0, metadata=chunk_metadata.copy()))

     # Extract each section
     for i, match in enumerate(matches):

@@ -214,6 +238,8 @@ def chunk_rst_sections(text: str, doc_id: str = "doc") -> list[Chunk]:

         section_content = text[start:end].strip()
         if section_content:
-            chunks.append(
+            chunks.append(
+                Chunk(content=section_content, doc_id=doc_id, chunk_index=len(chunks), metadata=chunk_metadata.copy())
+            )

     return chunks
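A sketch of the RST section chunker with metadata attached; the document text is illustrative:

from ragit.loaders import chunk_rst_sections

rst = """Intro paragraph before any section.

Install
=======
pip install ragit

Usage
-----
See the docs.
"""
chunks = chunk_rst_sections(rst, doc_id="guide", metadata={"filename": "guide.rst"})
# One chunk for the preamble, one per section; each carries its own copy
# of the metadata dict.
for c in chunks:
    print(c.chunk_index, c.metadata["filename"])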
ragit/version.py
CHANGED

{ragit-0.8.1.dist-info → ragit-0.8.2.dist-info}/RECORD
CHANGED

@@ -1,11 +1,11 @@
 ragit/__init__.py,sha256=JUkL7ivgr4o4nZak-96P1C-pzKdNuN3Tl0X0WvpeXBU,3142
-ragit/assistant.py,sha256=
+ragit/assistant.py,sha256=LNof1zJAQWLIfhd7aPmKCpPQDCShpt9ezeM2nQ8ouyQ,18777
 ragit/config.py,sha256=7XnueNO4h22ibeWd1akHnfVoGSD8xE5vuOCMYeQOOU4,1898
-ragit/loaders.py,sha256=
-ragit/version.py,sha256=
+ragit/loaders.py,sha256=1JXgDLorvmtaDaRpbnKEqQjbQ4O5yfZxlb4QRUdGr58,6415
+ragit/version.py,sha256=WCqbf2oV6eXhq3DvqECcVFop-dseJIExoMxZ4fCtkvs,97
 ragit/core/__init__.py,sha256=j53PFfoSMXwSbK1rRHpMbo8mX2i4R1LJ5kvTxBd7-0w,100
 ragit/core/experiment/__init__.py,sha256=4vAPOOYlY5Dcr2gOolyhBSPGIUxZKwEkgQffxS9BodA,452
-ragit/core/experiment/experiment.py,sha256=
+ragit/core/experiment/experiment.py,sha256=aANDJ-XlMB0ijT8SBsPkb2U-lM3cChOuRO3oP9u3XxA,19331
 ragit/core/experiment/results.py,sha256=KHpN3YSLJ83_JUfIMccRPS-q7LEt0S9p8ehDRawk_4k,3487
 ragit/providers/__init__.py,sha256=tKWjUV31OZprD8k9aUUidtDMg7C_dWBXN7igtxeB8Ec,1339
 ragit/providers/base.py,sha256=MJ8mVeXuGWhkX2XGTbkWIY3cVoTOPr4h5XBXw8rAX2Q,3434

@@ -13,8 +13,8 @@ ragit/providers/function_adapter.py,sha256=A-TQhBgBWbuO_w1sy795Dxep1FOCBpAlWpXCK
 ragit/providers/ollama.py,sha256=YJH5a9nQHnP0NrIK7G9PqjV5A53f9JxmEJDAJ6d297M,15410
 ragit/providers/sentence_transformers.py,sha256=tTkd4HpE1MyfFJAwur-a7w-GlBxe93HlyM_dRffDrdY,6996
 ragit/utils/__init__.py,sha256=-UsE5oJSnmEnBDswl-ph0A09Iu8yKNbPhd1-_7Lcb8Y,3051
-ragit-0.8.
-ragit-0.8.
-ragit-0.8.
-ragit-0.8.
-ragit-0.8.
+ragit-0.8.2.dist-info/licenses/LICENSE,sha256=tAkwu8-AdEyGxGoSvJ2gVmQdcicWw3j1ZZueVV74M-E,11357
+ragit-0.8.2.dist-info/METADATA,sha256=wlBpVj_aHxR7ZWy5yzpo2Wt-IoLcVlFGo4oBXGzMajY,4888
+ragit-0.8.2.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
+ragit-0.8.2.dist-info/top_level.txt,sha256=pkPbG7yrw61wt9_y_xcLE2vq2a55fzockASD0yq0g4s,6
+ragit-0.8.2.dist-info/RECORD,,
{ragit-0.8.1.dist-info → ragit-0.8.2.dist-info}/licenses/LICENSE
File without changes

{ragit-0.8.1.dist-info → ragit-0.8.2.dist-info}/top_level.txt
File without changes