ragit 0.8.1__py3-none-any.whl → 0.8.2__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
ragit/assistant.py CHANGED
@@ -116,8 +116,7 @@ class RAGAssistant:
         # Use explicit provider
         if not isinstance(provider, BaseEmbeddingProvider):
             raise ValueError(
-                "Provider must implement BaseEmbeddingProvider for embeddings. "
-                "Alternatively, provide embed_fn."
+                "Provider must implement BaseEmbeddingProvider for embeddings. Alternatively, provide embed_fn."
             )
         self._embedding_provider = provider
         if isinstance(provider, BaseLLMProvider):
@@ -156,7 +155,20 @@ class RAGAssistant:

         if path.is_dir():
             docs: list[Document] = []
-            for pattern in ("*.txt", "*.md", "*.rst"):
+            for pattern in (
+                "*.txt",
+                "*.md",
+                "*.rst",
+                "*.py",
+                "*.js",
+                "*.ts",
+                "*.go",
+                "*.java",
+                "*.c",
+                "*.cpp",
+                "*.h",
+                "*.hpp",
+            ):
                 docs.extend(load_directory(path, pattern))
             return docs

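The expanded pattern list means directory ingestion now picks up common source-code files alongside prose formats. As a rough illustration of what 0.8.2 indexes, using the load_directory helper from ragit/loaders.py shown later in this diff (the my_project/ path is hypothetical):

    from ragit.loaders import load_directory

    docs = []
    # Same patterns as the loop above: prose formats plus source files.
    for pattern in ("*.txt", "*.md", "*.rst", "*.py", "*.js", "*.ts",
                    "*.go", "*.java", "*.c", "*.cpp", "*.h", "*.hpp"):
        docs.extend(load_directory("my_project/", pattern))
    print(f"loaded {len(docs)} documents")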
@@ -169,7 +181,7 @@ class RAGAssistant:
         for doc in self.documents:
             # Use RST section chunking for .rst files, otherwise regular chunking
             if doc.metadata.get("filename", "").endswith(".rst"):
-                chunks = chunk_rst_sections(doc.content, doc.id)
+                chunks = chunk_rst_sections(doc.content, doc.id, metadata=doc.metadata)
             else:
                 chunks = chunk_document(doc, self.chunk_size, self.chunk_overlap)
             all_chunks.extend(chunks)
@@ -194,6 +206,129 @@ class RAGAssistant:
         self._chunks = tuple(all_chunks)
         self._embedding_matrix = embedding_matrix / norms

+    def add_documents(self, documents: list[Document] | str | Path) -> int:
+        """Add documents to the existing index incrementally.
+
+        Args:
+            documents: Documents to add.
+
+        Returns:
+            Number of chunks added.
+        """
+        new_docs = self._load_documents(documents)
+        if not new_docs:
+            return 0
+
+        self.documents.extend(new_docs)
+
+        # Chunk new docs
+        new_chunks: list[Chunk] = []
+        for doc in new_docs:
+            if doc.metadata.get("filename", "").endswith(".rst"):
+                chunks = chunk_rst_sections(doc.content, doc.id, metadata=doc.metadata)
+            else:
+                chunks = chunk_document(doc, self.chunk_size, self.chunk_overlap)
+            new_chunks.extend(chunks)
+
+        if not new_chunks:
+            return 0
+
+        # Embed new chunks
+        texts = [chunk.content for chunk in new_chunks]
+        responses = self._embedding_provider.embed_batch(texts, self.embedding_model)
+
+        new_matrix = np.array([response.embedding for response in responses], dtype=np.float64)
+
+        # Normalize
+        norms = np.linalg.norm(new_matrix, axis=1, keepdims=True)
+        norms[norms == 0] = 1
+        new_matrix_norm = new_matrix / norms
+
+        # Update state
+        current_chunks = list(self._chunks)
+        current_chunks.extend(new_chunks)
+        self._chunks = tuple(current_chunks)
+
+        if self._embedding_matrix is None:
+            self._embedding_matrix = new_matrix_norm
+        else:
+            self._embedding_matrix = np.vstack((self._embedding_matrix, new_matrix_norm))
+
+        return len(new_chunks)
+
+    def remove_documents(self, source_path_pattern: str) -> int:
+        """Remove documents matching a source path pattern.
+
+        Args:
+            source_path_pattern: Glob pattern to match 'source' metadata.
+
+        Returns:
+            Number of chunks removed.
+        """
+        import fnmatch
+
+        if not self._chunks:
+            return 0
+
+        indices_to_keep = []
+        kept_chunks = []
+        removed_count = 0
+
+        for i, chunk in enumerate(self._chunks):
+            source = chunk.metadata.get("source", "")
+            if not source or not fnmatch.fnmatch(source, source_path_pattern):
+                indices_to_keep.append(i)
+                kept_chunks.append(chunk)
+            else:
+                removed_count += 1
+
+        if removed_count == 0:
+            return 0
+
+        self._chunks = tuple(kept_chunks)
+
+        if self._embedding_matrix is not None:
+            if not kept_chunks:
+                self._embedding_matrix = None
+            else:
+                self._embedding_matrix = self._embedding_matrix[indices_to_keep]
+
+        # Also remove from self.documents
+        self.documents = [
+            doc for doc in self.documents if not fnmatch.fnmatch(doc.metadata.get("source", ""), source_path_pattern)
+        ]
+
+        return removed_count
+
+    def update_documents(self, documents: list[Document] | str | Path) -> int:
+        """Update existing documents (remove old, add new).
+
+        Uses document source path to identify what to remove.
+
+        Args:
+            documents: New versions of documents.
+
+        Returns:
+            Number of chunks added.
+        """
+        new_docs = self._load_documents(documents)
+        if not new_docs:
+            return 0
+
+        # Identify sources to remove
+        sources_to_remove = set()
+        for doc in new_docs:
+            source = doc.metadata.get("source")
+            if source:
+                sources_to_remove.add(source)
+
+        # Remove old versions
+        for source in sources_to_remove:
+            self.remove_documents(source)
+
+        # Add new versions
+        return self.add_documents(new_docs)
+
     def retrieve(self, query: str, top_k: int = 3) -> list[tuple[Chunk, float]]:
         """
         Retrieve relevant chunks for a query.
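Together these three methods give RAGAssistant an incremental index: add_documents chunks and embeds only the new material and stacks the normalized rows onto the existing embedding matrix, remove_documents drops chunks (and their matrix rows) whose 'source' metadata matches a glob, and update_documents composes the two. A minimal usage sketch; the constructor arguments and the provider object are assumptions for illustration, not taken from this diff:

    from pathlib import Path
    from ragit.assistant import RAGAssistant

    # provider stands in for any BaseEmbeddingProvider implementation.
    assistant = RAGAssistant(documents=Path("docs/"), provider=provider)

    added = assistant.add_documents(Path("docs/new_guide.rst"))    # chunks added
    removed = assistant.remove_documents("docs/old/*.md")          # chunks removed
    refreshed = assistant.update_documents(Path("docs/index.md"))  # remove + re-add

Note that remove_documents matches against per-chunk 'source' metadata, so chunks whose documents were loaded without a source path are always kept.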
ragit/core/experiment/experiment.py CHANGED
@@ -51,6 +51,7 @@ class Chunk:
     doc_id: str
     chunk_index: int
     embedding: tuple[float, ...] | list[float] | None = None
+    metadata: dict[str, Any] = field(default_factory=dict)


 @dataclass
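Because the new metadata field has a default_factory, existing Chunk construction sites keep working unchanged. A small sketch, assuming the import path used by ragit/loaders.py below:

    from ragit.core.experiment.experiment import Chunk

    plain = Chunk(content="hello", doc_id="d1", chunk_index=0)
    assert plain.metadata == {}  # default is a fresh empty dict per instance

    tagged = Chunk(content="hello", doc_id="d1", chunk_index=0,
                   metadata={"source": "docs/intro.rst"})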
@@ -203,8 +204,7 @@ class RagitExperiment:
         elif provider is not None:
             if not isinstance(provider, BaseEmbeddingProvider):
                 raise ValueError(
-                    "Provider must implement BaseEmbeddingProvider for embeddings. "
-                    "Alternatively, provide embed_fn."
+                    "Provider must implement BaseEmbeddingProvider for embeddings. Alternatively, provide embed_fn."
                 )
             self._embedding_provider = provider
             if isinstance(provider, BaseLLMProvider):
@@ -220,8 +220,7 @@ class RagitExperiment:
         # LLM is required for evaluation
         if self._llm_provider is None:
             raise ValueError(
-                "RagitExperiment requires LLM for evaluation. "
-                "Provide generate_fn or a provider with LLM support."
+                "RagitExperiment requires LLM for evaluation. Provide generate_fn or a provider with LLM support."
             )

     @property
ragit/loaders.py CHANGED
@@ -10,6 +10,7 @@ Provides simple functions to load documents from files and chunk text.
 import re
 from pathlib import Path
+from typing import Any

 from ragit.core.experiment.experiment import Chunk, Document

@@ -72,7 +73,13 @@ def load_directory(path: str | Path, pattern: str = "*.txt", recursive: bool = F
     return documents


-def chunk_text(text: str, chunk_size: int = 512, chunk_overlap: int = 50, doc_id: str = "doc") -> list[Chunk]:
+def chunk_text(
+    text: str,
+    chunk_size: int = 512,
+    chunk_overlap: int = 50,
+    doc_id: str = "doc",
+    metadata: dict[str, Any] | None = None,
+) -> list[Chunk]:
     """
     Split text into overlapping chunks.

@@ -86,6 +93,8 @@ def chunk_text(text: str, chunk_size: int = 512, chunk_overlap: int = 50, doc_id
         Overlap between chunks (default: 50).
     doc_id : str
         Document ID for the chunks (default: "doc").
+    metadata : dict, optional
+        Metadata to attach to each chunk (default: None).

     Returns
     -------
@@ -102,13 +111,16 @@ def chunk_text(text: str, chunk_size: int = 512, chunk_overlap: int = 50, doc_id
     chunks = []
     start = 0
     chunk_idx = 0
+    chunk_metadata = metadata or {}

     while start < len(text):
         end = start + chunk_size
-        chunk_text = text[start:end].strip()
+        chunk_content = text[start:end].strip()

-        if chunk_text:
-            chunks.append(Chunk(content=chunk_text, doc_id=doc_id, chunk_index=chunk_idx))
+        if chunk_content:
+            chunks.append(
+                Chunk(content=chunk_content, doc_id=doc_id, chunk_index=chunk_idx, metadata=chunk_metadata.copy())
+            )
         chunk_idx += 1

         start = end - chunk_overlap
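Two things change here: the local variable no longer shadows the chunk_text function name, and every emitted chunk now carries its own copy of the supplied metadata. A worked example of the overlap arithmetic under the signature above:

    from ragit.loaders import chunk_text

    chunks = chunk_text("abcdefghij", chunk_size=4, chunk_overlap=1,
                        doc_id="demo", metadata={"source": "demo.txt"})
    # Each window advances by chunk_size - chunk_overlap = 3 characters:
    # "abcd", "defg", "ghij", "j"
    print([c.content for c in chunks])
    print(chunks[0].metadata)  # {'source': 'demo.txt'}

Passing chunk_metadata.copy() to each Chunk means mutating one chunk's metadata later cannot leak into its siblings.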
@@ -136,10 +148,12 @@ def chunk_document(doc: Document, chunk_size: int = 512, chunk_overlap: int = 50
     list[Chunk]
         List of chunks from the document.
     """
-    return chunk_text(doc.content, chunk_size, chunk_overlap, doc.id)
+    return chunk_text(doc.content, chunk_size, chunk_overlap, doc.id, metadata=doc.metadata)


-def chunk_by_separator(text: str, separator: str = "\n\n", doc_id: str = "doc") -> list[Chunk]:
+def chunk_by_separator(
+    text: str, separator: str = "\n\n", doc_id: str = "doc", metadata: dict[str, Any] | None = None
+) -> list[Chunk]:
     """
     Split text by a separator (e.g., paragraphs, sections).

@@ -151,6 +165,8 @@ def chunk_by_separator(text: str, separator: str = "\n\n", doc_id: str = "doc")
         Separator string (default: double newline for paragraphs).
     doc_id : str
         Document ID for the chunks.
+    metadata : dict, optional
+        Metadata to attach to each chunk (default: None).

     Returns
     -------
@@ -163,16 +179,17 @@ def chunk_by_separator(text: str, separator: str = "\n\n", doc_id: str = "doc")
     """
     parts = text.split(separator)
     chunks = []
+    chunk_metadata = metadata or {}

     for idx, part in enumerate(parts):
         content = part.strip()
         if content:
-            chunks.append(Chunk(content=content, doc_id=doc_id, chunk_index=idx))
+            chunks.append(Chunk(content=content, doc_id=doc_id, chunk_index=idx, metadata=chunk_metadata.copy()))

     return chunks


-def chunk_rst_sections(text: str, doc_id: str = "doc") -> list[Chunk]:
+def chunk_rst_sections(text: str, doc_id: str = "doc", metadata: dict[str, Any] | None = None) -> list[Chunk]:
     """
     Split RST document by section headers.

@@ -182,6 +199,8 @@ def chunk_rst_sections(text: str, doc_id: str = "doc") -> list[Chunk]:
         RST document text.
     doc_id : str
         Document ID for the chunks.
+    metadata : dict, optional
+        Metadata to attach to each chunk (default: None).

     Returns
     -------
@@ -190,13 +209,18 @@ def chunk_rst_sections(text: str, doc_id: str = "doc") -> list[Chunk]:
     """
     # Match RST section headers (title followed by underline of =, -, ~, etc.)
     pattern = r"\n([^\n]+)\n([=\-~`\'\"^_*+#]+)\n"
+    chunk_metadata = metadata or {}

     # Find all section positions
     matches = list(re.finditer(pattern, text))

     if not matches:
         # No sections found, return whole text as one chunk
-        return [Chunk(content=text.strip(), doc_id=doc_id, chunk_index=0)] if text.strip() else []
+        return (
+            [Chunk(content=text.strip(), doc_id=doc_id, chunk_index=0, metadata=chunk_metadata.copy())]
+            if text.strip()
+            else []
+        )

     chunks = []

@@ -205,7 +229,7 @@ def chunk_rst_sections(text: str, doc_id: str = "doc") -> list[Chunk]:
     if first_pos > 0:
         pre_content = text[:first_pos].strip()
         if pre_content:
-            chunks.append(Chunk(content=pre_content, doc_id=doc_id, chunk_index=0))
+            chunks.append(Chunk(content=pre_content, doc_id=doc_id, chunk_index=0, metadata=chunk_metadata.copy()))

     # Extract each section
     for i, match in enumerate(matches):
@@ -214,6 +238,8 @@ def chunk_rst_sections(text: str, doc_id: str = "doc") -> list[Chunk]:

         section_content = text[start:end].strip()
         if section_content:
-            chunks.append(Chunk(content=section_content, doc_id=doc_id, chunk_index=len(chunks)))
+            chunks.append(
+                Chunk(content=section_content, doc_id=doc_id, chunk_index=len(chunks), metadata=chunk_metadata.copy())
+            )

     return chunks
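With these changes, every chunk the RST chunker emits (the pre-section preamble, each section body, and the no-sections fallback) carries a copy of the supplied metadata. A short sketch against the functions shown above:

    from ragit.loaders import chunk_rst_sections

    rst = (
        "Intro paragraph before any section.\n"
        "\n"
        "Install\n"
        "=======\n"
        "pip install ragit\n"
        "\n"
        "Usage\n"
        "-----\n"
        "See the examples for details.\n"
    )
    chunks = chunk_rst_sections(rst, doc_id="readme",
                                metadata={"filename": "README.rst"})
    # Three chunks: the preamble, then one per underlined section title,
    # each with metadata == {"filename": "README.rst"}
    for c in chunks:
        print(c.chunk_index, c.content.splitlines()[0], c.metadata)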
ragit/version.py CHANGED
@@ -2,4 +2,4 @@
 # Copyright RODMENA LIMITED 2025
 # SPDX-License-Identifier: Apache-2.0
 #
-__version__ = "0.8.1"
+__version__ = "0.8.2"
ragit-0.8.2.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ragit
-Version: 0.8.1
+Version: 0.8.2
 Summary: Automatic RAG Pattern Optimization Engine
 Author: RODMENA LIMITED
 Maintainer-email: RODMENA LIMITED <info@rodmena.co.uk>
ragit-0.8.2.dist-info/RECORD CHANGED
@@ -1,11 +1,11 @@
 ragit/__init__.py,sha256=JUkL7ivgr4o4nZak-96P1C-pzKdNuN3Tl0X0WvpeXBU,3142
-ragit/assistant.py,sha256=FW8LVqEOA1nemTMdTZhb79aONeHsQM8tHADxCQ47p1Y,14705
+ragit/assistant.py,sha256=LNof1zJAQWLIfhd7aPmKCpPQDCShpt9ezeM2nQ8ouyQ,18777
 ragit/config.py,sha256=7XnueNO4h22ibeWd1akHnfVoGSD8xE5vuOCMYeQOOU4,1898
-ragit/loaders.py,sha256=keusuPzXPBiLDVj4hKfPCcge-rm-cnzNRk50fGXvTJs,5571
-ragit/version.py,sha256=_qpX4vMVMSqb-_4jdv6EZJ3tkvFsyu_Pj00vRC6T2sg,97
+ragit/loaders.py,sha256=1JXgDLorvmtaDaRpbnKEqQjbQ4O5yfZxlb4QRUdGr58,6415
+ragit/version.py,sha256=WCqbf2oV6eXhq3DvqECcVFop-dseJIExoMxZ4fCtkvs,97
 ragit/core/__init__.py,sha256=j53PFfoSMXwSbK1rRHpMbo8mX2i4R1LJ5kvTxBd7-0w,100
 ragit/core/experiment/__init__.py,sha256=4vAPOOYlY5Dcr2gOolyhBSPGIUxZKwEkgQffxS9BodA,452
-ragit/core/experiment/experiment.py,sha256=WQZWRLbLPuGpG0tpCZCEz3sKgSv4CNimmABbOLR_oKs,19314
+ragit/core/experiment/experiment.py,sha256=aANDJ-XlMB0ijT8SBsPkb2U-lM3cChOuRO3oP9u3XxA,19331
 ragit/core/experiment/results.py,sha256=KHpN3YSLJ83_JUfIMccRPS-q7LEt0S9p8ehDRawk_4k,3487
 ragit/providers/__init__.py,sha256=tKWjUV31OZprD8k9aUUidtDMg7C_dWBXN7igtxeB8Ec,1339
 ragit/providers/base.py,sha256=MJ8mVeXuGWhkX2XGTbkWIY3cVoTOPr4h5XBXw8rAX2Q,3434
@@ -13,8 +13,8 @@ ragit/providers/function_adapter.py,sha256=A-TQhBgBWbuO_w1sy795Dxep1FOCBpAlWpXCK
 ragit/providers/ollama.py,sha256=YJH5a9nQHnP0NrIK7G9PqjV5A53f9JxmEJDAJ6d297M,15410
 ragit/providers/sentence_transformers.py,sha256=tTkd4HpE1MyfFJAwur-a7w-GlBxe93HlyM_dRffDrdY,6996
 ragit/utils/__init__.py,sha256=-UsE5oJSnmEnBDswl-ph0A09Iu8yKNbPhd1-_7Lcb8Y,3051
-ragit-0.8.1.dist-info/licenses/LICENSE,sha256=tAkwu8-AdEyGxGoSvJ2gVmQdcicWw3j1ZZueVV74M-E,11357
-ragit-0.8.1.dist-info/METADATA,sha256=OaOeM-ujuMlkfjiNcXRUC6JpIApFgkvP536nHsaLW0g,4888
-ragit-0.8.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-ragit-0.8.1.dist-info/top_level.txt,sha256=pkPbG7yrw61wt9_y_xcLE2vq2a55fzockASD0yq0g4s,6
-ragit-0.8.1.dist-info/RECORD,,
+ragit-0.8.2.dist-info/licenses/LICENSE,sha256=tAkwu8-AdEyGxGoSvJ2gVmQdcicWw3j1ZZueVV74M-E,11357
+ragit-0.8.2.dist-info/METADATA,sha256=wlBpVj_aHxR7ZWy5yzpo2Wt-IoLcVlFGo4oBXGzMajY,4888
+ragit-0.8.2.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
+ragit-0.8.2.dist-info/top_level.txt,sha256=pkPbG7yrw61wt9_y_xcLE2vq2a55fzockASD0yq0g4s,6
+ragit-0.8.2.dist-info/RECORD,,
ragit-0.8.2.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (80.9.0)
+Generator: setuptools (80.10.1)
 Root-Is-Purelib: true
 Tag: py3-none-any