piragi-0.1.0-py3-none-any.whl

ragi/change_detection.py ADDED
@@ -0,0 +1,211 @@
+ """Change detection for automatic updates."""
+
+ import hashlib
+ import os
+ import time
+ from typing import Any, Dict, Optional
+ from urllib.parse import urlparse
+
+ import requests
+
+
+ class ChangeDetector:
+     """Detects changes in files and URLs for automatic updates."""
+
+     @staticmethod
+     def compute_content_hash(content: str) -> str:
+         """
+         Compute SHA256 hash of content.
+
+         Args:
+             content: Content to hash
+
+         Returns:
+             Hex digest of SHA256 hash
+         """
+         return hashlib.sha256(content.encode("utf-8")).hexdigest()
+
+     @staticmethod
+     def is_url(source: str) -> bool:
+         """Check if source is a URL."""
+         parsed = urlparse(source)
+         return parsed.scheme in ("http", "https")
+
+     @staticmethod
+     def check_file_changed(
+         source: str, stored_mtime: Optional[float], stored_hash: str
+     ) -> bool:
+         """
+         Check if a file has changed using mtime and content hash.
+
+         Args:
+             source: File path
+             stored_mtime: Previously stored modification time
+             stored_hash: Previously stored content hash
+
+         Returns:
+             True if file changed, False otherwise
+         """
+         if not os.path.exists(source):
+             return False
+
+         # Quick check: modification time
+         current_mtime = os.path.getmtime(source)
+         if stored_mtime and current_mtime == stored_mtime:
+             # File hasn't been touched, definitely not changed
+             return False
+
+         # Modification time changed, check actual content
+         try:
+             with open(source, "r", encoding="utf-8", errors="ignore") as f:
+                 content = f.read()
+                 current_hash = ChangeDetector.compute_content_hash(content)
+                 return current_hash != stored_hash
+         except Exception:
+             # If we can't read, assume changed to be safe
+             return True
+
+     @staticmethod
+     def check_url_changed(
+         source: str,
+         stored_etag: Optional[str],
+         stored_last_modified: Optional[str],
+         timeout: int = 10,
+     ) -> Dict[str, Any]:
+         """
+         Check if a URL has changed using HTTP headers.
+         Uses conditional requests for minimal latency.
+
+         Args:
+             source: URL
+             stored_etag: Previously stored ETag
+             stored_last_modified: Previously stored Last-Modified
+             timeout: Request timeout in seconds
+
+         Returns:
+             Dict with 'changed' bool and optional 'etag', 'last_modified'
+         """
+         try:
+             headers = {}
+
+             # Add conditional request headers
+             if stored_etag:
+                 headers["If-None-Match"] = stored_etag
+             if stored_last_modified:
+                 headers["If-Modified-Since"] = stored_last_modified
+
+             # Send HEAD request first (faster, no body download)
+             response = requests.head(source, headers=headers, timeout=timeout, allow_redirects=True)
+
+             # 304 Not Modified - content hasn't changed
+             if response.status_code == 304:
+                 return {"changed": False}
+
+             # If HEAD not supported, try GET with same conditional headers
+             if response.status_code == 405:  # Method Not Allowed
+                 response = requests.get(
+                     source, headers=headers, timeout=timeout, stream=True, allow_redirects=True
+                 )
+                 # Close connection immediately without downloading body
+                 response.close()
+
+             # 200 OK - content might have changed
+             if response.status_code == 200:
+                 new_etag = response.headers.get("ETag")
+                 new_last_modified = response.headers.get("Last-Modified")
+
+                 # If server provides ETag or Last-Modified, use them
+                 if new_etag and new_etag == stored_etag:
+                     return {"changed": False}
+                 if new_last_modified and new_last_modified == stored_last_modified:
+                     return {"changed": False}
+
+                 # Headers changed or not available, assume content changed
+                 return {
+                     "changed": True,
+                     "etag": new_etag,
+                     "last_modified": new_last_modified,
+                 }
+
+             # Other status codes - assume changed to be safe
+             return {"changed": True}
+
+         except Exception as e:
+             # Network error - can't verify, assume not changed
+             # This prevents errors from forcing unnecessary updates
+             return {"changed": False, "error": str(e)}
+
+     @staticmethod
+     def get_file_metadata(source: str, content: str) -> Dict[str, Any]:
+         """
+         Get metadata for a file source.
+
+         Args:
+             source: File path
+             content: File content
+
+         Returns:
+             Metadata dict with mtime and content_hash
+         """
+         mtime = os.path.getmtime(source) if os.path.exists(source) else None
+         content_hash = ChangeDetector.compute_content_hash(content)
+
+         return {
+             "source": source,
+             "last_checked": time.time(),
+             "content_hash": content_hash,
+             "mtime": mtime,
+             "etag": None,
+             "last_modified": None,
+             "check_interval": 300.0,  # 5 minutes default
+         }
+
+     @staticmethod
+     def get_url_metadata(
+         source: str, content: str, timeout: int = 10
+     ) -> Dict[str, Any]:
+         """
+         Get metadata for a URL source.
+
+         Args:
+             source: URL
+             content: URL content
+             timeout: Request timeout
+
+         Returns:
+             Metadata dict with etag, last_modified, and content_hash
+         """
+         content_hash = ChangeDetector.compute_content_hash(content)
+
+         # Fetch HTTP headers
+         try:
+             response = requests.head(source, timeout=timeout, allow_redirects=True)
+             etag = response.headers.get("ETag")
+             last_modified = response.headers.get("Last-Modified")
+         except Exception:
+             etag = None
+             last_modified = None
+
+         return {
+             "source": source,
+             "last_checked": time.time(),
+             "content_hash": content_hash,
+             "mtime": None,
+             "etag": etag,
+             "last_modified": last_modified,
+             "check_interval": 300.0,  # 5 minutes default for URLs
+         }
+
+     @staticmethod
+     def should_check_now(last_checked: float, check_interval: float) -> bool:
+         """
+         Determine if enough time has passed to check for updates.
+
+         Args:
+             last_checked: Unix timestamp of last check
+             check_interval: Seconds between checks
+
+         Returns:
+             True if should check now
+         """
+         return (time.time() - last_checked) >= check_interval
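
For orientation, a minimal usage sketch of ChangeDetector (the file name, URL, and stored validators below are illustrative, not part of the package):

    from ragi.change_detection import ChangeDetector

    # File source: capture metadata once, then poll cheaply (mtime first, hash only on change).
    content = open("notes.md", encoding="utf-8").read()  # hypothetical file
    meta = ChangeDetector.get_file_metadata("notes.md", content)
    if ChangeDetector.should_check_now(meta["last_checked"], meta["check_interval"]):
        changed = ChangeDetector.check_file_changed(
            "notes.md", meta["mtime"], meta["content_hash"]
        )

    # URL source: conditional request using stored ETag / Last-Modified validators.
    result = ChangeDetector.check_url_changed(
        "https://example.com/doc.html", stored_etag=None, stored_last_modified=None
    )
    print(result["changed"])

Note the asymmetric failure modes: an unreadable file is reported as changed to be safe, while a network error is reported as unchanged so transient outages do not force re-indexing.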
ragi/chunking.py ADDED
@@ -0,0 +1,150 @@
+ """Smart chunking strategies for documents."""
+
+ import re
+ from typing import List
+
+ from transformers import AutoTokenizer
+
+ from .types import Chunk, Document
+
+
+ class Chunker:
+     """Smart document chunker with markdown awareness."""
+
+     def __init__(
+         self,
+         chunk_size: int = 512,
+         chunk_overlap: int = 50,
+         tokenizer_name: str = "nvidia/llama-embed-nemotron-8b",
+     ) -> None:
+         """
+         Initialize the chunker.
+
+         Args:
+             chunk_size: Target chunk size in tokens
+             chunk_overlap: Number of tokens to overlap between chunks
+             tokenizer_name: Tokenizer to use (default: nvidia/llama-embed-nemotron-8b)
+         """
+         self.chunk_size = chunk_size
+         self.chunk_overlap = chunk_overlap
+         self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
+
+     def chunk_document(self, document: Document) -> List[Chunk]:
+         """
+         Chunk a document into smaller pieces.
+
+         Args:
+             document: Document to chunk
+
+         Returns:
+             List of chunks
+         """
+         # Split by markdown headers first to respect document structure
+         sections = self._split_by_headers(document.content)
+
+         chunks = []
+         chunk_index = 0
+
+         for section in sections:
+             section_chunks = self._chunk_text(section, document.source, chunk_index)
+             chunks.extend(section_chunks)
+             chunk_index += len(section_chunks)
+
+         # Add document metadata to all chunks
+         for chunk in chunks:
+             chunk.metadata.update(document.metadata)
+
+         return chunks
+
+     def _split_by_headers(self, text: str) -> List[str]:
+         """Split text by markdown headers while preserving structure."""
+         # Pattern to match markdown headers (# Header)
+         header_pattern = r"^(#{1,6}\s+.+)$"
+
+         lines = text.split("\n")
+         sections = []
+         current_section = []
+
+         for line in lines:
+             if re.match(header_pattern, line.strip()):
+                 # Save previous section if it exists
+                 if current_section:
+                     sections.append("\n".join(current_section))
+                 current_section = [line]
+             else:
+                 current_section.append(line)
+
+         # Add the last section
+         if current_section:
+             sections.append("\n".join(current_section))
+
+         return sections if sections else [text]
+
+     def _chunk_text(self, text: str, source: str, start_index: int) -> List[Chunk]:
+         """
+         Chunk text into token-sized pieces with overlap.
+
+         Args:
+             text: Text to chunk
+             source: Source identifier
+             start_index: Starting chunk index
+
+         Returns:
+             List of chunks
+         """
+         tokens = self.tokenizer.encode(text, add_special_tokens=False)
+
+         if len(tokens) <= self.chunk_size:
+             return [
+                 Chunk(
+                     text=text,
+                     source=source,
+                     chunk_index=start_index,
+                     metadata={},
+                 )
+             ]
+
+         chunks = []
+         start = 0
+         chunk_idx = start_index
+
+         while start < len(tokens):
+             end = start + self.chunk_size
+             chunk_tokens = tokens[start:end]
+
+             # Decode back to text
+             chunk_text = self.tokenizer.decode(chunk_tokens, skip_special_tokens=True)
+
+             # Try to break at sentence boundary if possible
+             if end < len(tokens):
+                 chunk_text = self._break_at_sentence(chunk_text)
+
+             chunks.append(
+                 Chunk(
+                     text=chunk_text.strip(),
+                     source=source,
+                     chunk_index=chunk_idx,
+                     metadata={},
+                 )
+             )
+
+             # Move start with overlap
+             start = end - self.chunk_overlap
+             chunk_idx += 1
+
+         return chunks
+
+     def _break_at_sentence(self, text: str) -> str:
+         """Try to break text at a sentence boundary."""
+         # Look for sentence endings
+         sentence_endings = [". ", ".\n", "? ", "?\n", "! ", "!\n"]
+
+         for ending in sentence_endings:
+             if ending in text:
+                 # Find the last occurrence
+                 idx = text.rfind(ending)
+                 if idx > len(text) * 0.5:  # Only if it's in the latter half
+                     return text[: idx + len(ending)]
+
+         # If no good break point, return as is
+         return text
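
A quick sketch of how the chunker is used on its own (the Document constructor fields are inferred from chunk_document, which reads .content, .source, and .metadata; the sample file and text are illustrative):

    from ragi.chunking import Chunker
    from ragi.types import Document

    # First use downloads the nvidia/llama-embed-nemotron-8b tokenizer from Hugging Face.
    chunker = Chunker(chunk_size=512, chunk_overlap=50)

    doc = Document(
        source="guide.md",  # hypothetical source
        content="# Install\n\npip install piragi\n\n# Usage\n\nAsk questions.",
        metadata={"type": "docs"},
    )
    for chunk in chunker.chunk_document(doc):
        print(chunk.chunk_index, chunk.source, len(chunk.text))

Because splitting happens per header section and overlap is applied in token space, a chunk never spans two markdown sections; a section that fits within chunk_size tokens simply becomes a single chunk.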
ragi/core.py ADDED
@@ -0,0 +1,318 @@
+ """Core Ragi class - the main interface."""
+
+ from typing import Any, Dict, List, Optional, Union
+
+ from .chunking import Chunker
+ from .embeddings import EmbeddingGenerator
+ from .loader import DocumentLoader
+ from .retrieval import Retriever
+ from .store import VectorStore
+ from .types import Answer, Document
+ from .async_updater import AsyncUpdater
+ from .change_detection import ChangeDetector
+
+
+ class Ragi:
+     """
+     Zero-setup RAG library with auto-chunking, embeddings, and smart citations.
+
+     Examples:
+         >>> from ragi import Ragi
+         >>>
+         >>> # Simple - uses free local models
+         >>> kb = Ragi("./docs")
+         >>>
+         >>> # Custom config
+         >>> kb = Ragi("./docs", config={
+         ...     "llm": {"model": "gpt-4o-mini"},
+         ...     "embedding": {"device": "cuda"}
+         ... })
+         >>>
+         >>> # Ask questions
+         >>> answer = kb.ask("How do I install this?")
+         >>> print(answer.text)
+         >>>
+         >>> # Callable shorthand
+         >>> answer = kb("What's the API?")
+     """
+
+     def __init__(
+         self,
+         sources: Union[str, List[str], None] = None,
+         persist_dir: str = ".ragi",
+         config: Optional[Dict[str, Any]] = None,
+     ) -> None:
+         """
+         Initialize Ragi with optional document sources.
+
+         Args:
+             sources: File paths, URLs, or glob patterns to load
+             persist_dir: Directory to persist vector database
+             config: Configuration dict with optional sections:
+                 - llm: LLM configuration
+                     - model: Model name (default: "llama3.2")
+                     - base_url: API base URL (default: "http://localhost:11434/v1")
+                     - api_key: API key (default: "not-needed")
+                 - embedding: Embedding configuration
+                     - model: Model name (default: "nvidia/llama-embed-nemotron-8b")
+                     - device: Device to use for local models (default: None for auto-detect)
+                     - base_url: API base URL for remote embeddings (optional)
+                     - api_key: API key for remote embeddings (optional)
+                 - chunk: Chunking configuration
+                     - size: Chunk size in tokens (default: 512)
+                     - overlap: Overlap in tokens (default: 50)
+                 - auto_update: Auto-update configuration (enabled by default)
+                     - enabled: Enable background updates (default: True)
+                     - interval: Check interval in seconds (default: 300)
+                     - workers: Number of background workers (default: 2)
+
+         Examples:
+             >>> # Use defaults
+             >>> kb = Ragi("./docs")
+             >>>
+             >>> # Custom LLM
+             >>> kb = Ragi("./docs", config={
+             ...     "llm": {"model": "gpt-4o-mini", "api_key": "sk-..."}
+             ... })
+             >>>
+             >>> # Full config
+             >>> kb = Ragi("./docs", config={
+             ...     "llm": {"model": "llama3.2"},
+             ...     "embedding": {"device": "cuda"},
+             ...     "chunk": {"size": 1024, "overlap": 200}
+             ... })
+         """
+         # Initialize config
+         cfg = config or {}
+
+         # Initialize components
+         self.loader = DocumentLoader()
+
+         # Chunking
+         chunk_cfg = cfg.get("chunk", {})
+         self.chunker = Chunker(
+             chunk_size=chunk_cfg.get("size", 512),
+             chunk_overlap=chunk_cfg.get("overlap", 50),
+         )
+
+         # Embeddings
+         embed_cfg = cfg.get("embedding", {})
+         self.embedder = EmbeddingGenerator(
+             model=embed_cfg.get("model", "nvidia/llama-embed-nemotron-8b"),
+             device=embed_cfg.get("device"),
+             base_url=embed_cfg.get("base_url"),
+             api_key=embed_cfg.get("api_key"),
+         )
+
+         # Vector store
+         self.store = VectorStore(persist_dir=persist_dir)
+
+         # LLM
+         llm_cfg = cfg.get("llm", {})
+         self.retriever = Retriever(
+             model=llm_cfg.get("model", "llama3.2"),
+             api_key=llm_cfg.get("api_key"),
+             base_url=llm_cfg.get("base_url"),
+         )
+
+         # State for filtering
+         self._filters: Optional[Dict[str, Any]] = None
+
+         # Auto-update setup
+         auto_update_cfg = cfg.get("auto_update", {})
+         self._auto_update_enabled = auto_update_cfg.get("enabled", True)
+         self._updater: Optional[AsyncUpdater] = None
+         self._tracked_sources: Dict[str, Document] = {}
+
+         if self._auto_update_enabled:
+             interval = auto_update_cfg.get("interval", 300.0)
+             workers = auto_update_cfg.get("workers", 2)
+
+             self._updater = AsyncUpdater(
+                 refresh_callback=self._background_refresh,
+                 check_interval=interval,
+                 max_workers=workers,
+             )
+             self._updater.start()
+
+         # Load initial sources if provided
+         if sources:
+             self.add(sources)
+
+     def add(self, sources: Union[str, List[str]]) -> "Ragi":
+         """
+         Add documents to the knowledge base.
+
+         Args:
+             sources: File paths, URLs, or glob patterns
+
+         Returns:
+             Self for chaining
+         """
+         # Load documents
+         documents = self.loader.load(sources)
+
+         # Chunk documents
+         all_chunks = []
+         for doc in documents:
+             chunks = self.chunker.chunk_document(doc)
+             all_chunks.extend(chunks)
+
+         # Generate embeddings
+         chunks_with_embeddings = self.embedder.embed_chunks(all_chunks)
+
+         # Store in vector database
+         self.store.add_chunks(chunks_with_embeddings)
+
+         # Register sources for auto-update
+         if self._auto_update_enabled and self._updater:
+             for doc in documents:
+                 self._tracked_sources[doc.source] = doc
+                 # Register with updater
+                 if ChangeDetector.is_url(doc.source):
+                     metadata = ChangeDetector.get_url_metadata(doc.source, doc.content)
+                 else:
+                     metadata = ChangeDetector.get_file_metadata(doc.source, doc.content)
+
+                 self._updater.register_source(
+                     doc.source, doc.content, check_interval=None
+                 )
+
+         return self
+
+     def _background_refresh(self, source: Union[str, List[str]]) -> None:
+         """
+         Internal method called by background updater.
+         Refreshes sources without user interaction.
+
+         Args:
+             source: Source(s) to refresh
+         """
+         # This is called from background thread, so be careful with state
+         self.refresh(source)
+
+     def ask(
+         self,
+         query: str,
+         top_k: int = 5,
+         system_prompt: Optional[str] = None,
+     ) -> Answer:
+         """
+         Ask a question and get an answer with citations.
+
+         Args:
+             query: Question to ask
+             top_k: Number of relevant chunks to retrieve
+             system_prompt: Optional custom system prompt for answer generation
+
+         Returns:
+             Answer with citations
+         """
+         # Generate query embedding
+         query_embedding = self.embedder.embed_query(query)
+
+         # Search for relevant chunks
+         citations = self.store.search(
+             query_embedding=query_embedding,
+             top_k=top_k,
+             filters=self._filters,
+         )
+
+         # Generate answer
+         answer = self.retriever.generate_answer(
+             query=query,
+             citations=citations,
+             system_prompt=system_prompt,
+         )
+
+         # Reset filters after use
+         self._filters = None
+
+         return answer
+
+     def filter(self, **kwargs: Any) -> "Ragi":
+         """
+         Filter documents by metadata for the next query.
+
+         Args:
+             **kwargs: Metadata key-value pairs to filter by
+
+         Returns:
+             Self for chaining
+
+         Examples:
+             >>> kb.filter(type="api").ask("How does auth work?")
+             >>> kb.filter(source="docs/guide.pdf").ask("What's in the guide?")
+         """
+         self._filters = kwargs
+         return self
+
+     def __call__(self, query: str, top_k: int = 5) -> Answer:
+         """
+         Callable shorthand for ask().
+
+         Args:
+             query: Question to ask
+             top_k: Number of relevant chunks to retrieve
+
+         Returns:
+             Answer with citations
+         """
+         return self.ask(query, top_k=top_k)
+
+     def count(self) -> int:
+         """Return the number of chunks in the knowledge base."""
+         return self.store.count()
+
+     def refresh(self, sources: Union[str, List[str]]) -> "Ragi":
+         """
+         Refresh specific sources by deleting old chunks and re-adding.
+         Useful when documents have been updated.
+
+         Args:
+             sources: File paths, URLs, or glob patterns to refresh
+
+         Returns:
+             Self for chaining
+
+         Examples:
+             >>> # Refresh a single file
+             >>> kb.refresh("./docs/api.md")
+             >>>
+             >>> # Refresh multiple files
+             >>> kb.refresh(["./docs/*.pdf", "./README.md"])
+         """
+         # Load documents to get their actual source paths
+         documents = self.loader.load(sources)
+
+         # Delete old chunks for each source
+         for doc in documents:
+             self.store.delete_by_source(doc.source)
+
+         # Re-add the documents
+         all_chunks = []
+         for doc in documents:
+             chunks = self.chunker.chunk_document(doc)
+             all_chunks.extend(chunks)
+
+         # Generate embeddings
+         chunks_with_embeddings = self.embedder.embed_chunks(all_chunks)
+
+         # Store in vector database
+         self.store.add_chunks(chunks_with_embeddings)
+
+         return self
+
+     def clear(self) -> None:
+         """Clear all data from the knowledge base."""
+         # Stop auto-updater if running
+         if self._updater:
+             self._updater.stop()
+             self._tracked_sources.clear()
+
+         self.store.clear()
+
+     def __del__(self):
+         """Cleanup on deletion."""
+         if hasattr(self, "_updater") and self._updater:
+             self._updater.stop()
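
Putting the three files together, an end-to-end sketch (the paths, URL, and questions are illustrative; the defaults assume an Ollama server at http://localhost:11434/v1 serving llama3.2):

    from ragi import Ragi

    # Index local markdown plus a web page; auto-update is on by default,
    # so a background AsyncUpdater re-checks sources every 300 seconds.
    kb = Ragi(["./docs/*.md", "https://example.com/changelog.html"])

    answer = kb.ask("How do I install this?", top_k=3)
    print(answer.text)

    # Metadata filters apply to the next query only, then reset.
    answer = kb.filter(type="api")("How does auth work?")

    print(kb.count())  # number of chunks currently stored
    kb.clear()         # stops the background updater and wipes the vector store

One caveat visible in the code: filter() stores state on the shared instance and ask() clears it, so filters apply exactly once and are not isolated across threads.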