piragi 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,149 @@
1
+ Metadata-Version: 2.4
2
+ Name: piragi
3
+ Version: 0.1.0
4
+ Summary: The best RAG interface yet - Built-in vector store, embeddings, citations, and auto-updates
5
+ Project-URL: Homepage, https://github.com/hemanth/ragi
6
+ Project-URL: Documentation, https://github.com/hemanth/ragi#readme
7
+ Project-URL: Repository, https://github.com/hemanth/ragi
8
+ Project-URL: Issues, https://github.com/hemanth/ragi/issues
9
+ Author-email: Hemanth HM <hemanth.hm@gmail.com>
10
+ License-Expression: MIT
11
+ License-File: LICENSE
12
+ Keywords: ai,embeddings,llm,rag,retrieval,vector-search
13
+ Classifier: Development Status :: 3 - Alpha
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.9
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Requires-Python: >=3.9
22
+ Requires-Dist: lancedb>=0.3.0
23
+ Requires-Dist: markitdown[all]>=0.0.1
24
+ Requires-Dist: openai>=1.0.0
25
+ Requires-Dist: pydantic>=2.0.0
26
+ Requires-Dist: python-dotenv>=1.0.0
27
+ Requires-Dist: sentence-transformers>=2.2.0
28
+ Requires-Dist: torch>=2.0.0
29
+ Requires-Dist: transformers>=4.51.0
30
+ Provides-Extra: dev
31
+ Requires-Dist: black>=23.0.0; extra == 'dev'
32
+ Requires-Dist: mypy>=1.0.0; extra == 'dev'
33
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == 'dev'
34
+ Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
35
+ Requires-Dist: pytest>=7.0.0; extra == 'dev'
36
+ Requires-Dist: ruff>=0.1.0; extra == 'dev'
37
+ Description-Content-Type: text/markdown
38
+
39
+ # Ragi
40
+
41
+ **The best RAG interface yet.**
42
+
43
+ ```python
44
+ from ragi import Ragi
45
+
46
+ kb = Ragi(["./docs", "./code/**/*.py", "https://api.example.com/docs"])
47
+ answer = kb.ask("How do I deploy this?")
48
+ ```
49
+
50
+ That's it. Built-in vector store, embeddings, citations, and auto-updates. Free & local by default.
51
+
52
+ ---
53
+
54
+ ## Installation
55
+
56
+ ```bash
57
+ pip install piragi
58
+
59
+ # Optional: Install Ollama for local LLM
60
+ curl -fsSL https://ollama.com/install.sh | sh
61
+ ollama pull llama3.2
62
+ ```
63
+
64
+ ---
65
+
66
+ ## Features
67
+
68
+ - **Simple Setup** - Works with free local models out of the box
69
+ - **All Formats** - PDF, Word, Excel, Markdown, Code, URLs, Images, Audio
70
+ - **Auto-Updates** - Background refresh, queries never blocked
71
+ - **Smart Citations** - Every answer includes sources
72
+ - **OpenAI Compatible** - Drop-in support for any OpenAI-compatible API
73
+
74
+ ---
75
+
76
+ ## Examples
77
+
78
+ ```python
79
+ # Basic
80
+ kb = Ragi("./docs")
81
+ answer = kb("What is this?")
82
+
83
+ # Multiple sources
84
+ kb = Ragi(["./docs/*.pdf", "https://api.docs.com", "./code/**/*.py"])
85
+
86
+ # OpenAI
87
+ kb = Ragi("./docs", config={
88
+ "llm": {"model": "gpt-4o-mini", "api_key": "sk-..."},
89
+ "embedding": {"model": "text-embedding-3-small", "api_key": "sk-..."}
90
+ })
91
+
92
+ # Filter
93
+ answer = kb.filter(file_type="pdf").ask("What's in the PDFs?")
94
+ ```
95
+
96
+ ---
97
+
98
+ ## Configuration
99
+
100
+ ```python
101
+ # Defaults (all optional)
102
+ config = {
103
+ "llm": {
104
+ "model": "llama3.2",
105
+ "base_url": "http://localhost:11434/v1"
106
+ },
107
+ "embedding": {
108
+ "model": "nvidia/llama-embed-nemotron-8b"
109
+ },
110
+ "auto_update": {
111
+ "enabled": True,
112
+ "interval": 300 # seconds
113
+ }
114
+ }
115
+ ```
116
+
117
+ ---
118
+
119
+ ## Auto-Updates
120
+
121
+ Changes are detected and refreshed automatically in the background, adding zero query latency.
122
+
123
+ ```python
124
+ kb = Ragi(["./docs", "https://api.docs.com"])
125
+ # That's it - auto-updates enabled by default
126
+
127
+ # Disable if needed
128
+ kb = Ragi("./docs", config={"auto_update": {"enabled": False}})
129
+ ```
130
+
131
+ ---
132
+
133
+ ## API
134
+
135
+ ```python
136
+ kb = Ragi(sources, persist_dir=".ragi", config=None)
137
+ kb.add("./more-docs")
138
+ kb.ask(query, top_k=5)
139
+ kb(query) # Shorthand
140
+ kb.filter(**metadata).ask(query)
141
+ kb.count()
142
+ kb.clear()
143
+ ```
144
+
145
+ Full docs: [API.md](API.md)
146
+
147
+ ---
148
+
149
+ MIT License | **Ragi** = **R**etrieval **A**ugmented **G**eneration **I**nterface
@@ -0,0 +1,14 @@
1
+ ragi/__init__.py,sha256=BKXHlsfnnCP9bIc-WfbztwKAUbQUAI38oOy8iEHMO9k,781
2
+ ragi/async_updater.py,sha256=rjiaMUPSJ0cDzCdKRdifZ1yU6c70D_N0t9zkj281Jws,11218
3
+ ragi/change_detection.py,sha256=m2-bMywK9WDVrjniKJyTrrkUzzsOvGXAWf5ZFIFVLiU,6863
4
+ ragi/chunking.py,sha256=L9tRiLL0Hz10bD4gaLc-eYyviTTfXka2_QdPpZ_FBmQ,4525
5
+ ragi/core.py,sha256=IT558mpiUhE7JpeTczC3f05vtSs8U_MOEk5MPiO1btQ,10194
6
+ ragi/embeddings.py,sha256=tPVB8fc73BWXMvwAbOfOmiyVYrrs84d9lJfeKf5l00M,5094
7
+ ragi/loader.py,sha256=WBiDn6iuJLGHq31O0Q2nuiDNVtc3hzbCOIxwevqOtMo,3978
8
+ ragi/retrieval.py,sha256=uTMqLCZwpjI03_xVq8OIGmtKyrbCc6-sVpoNH2eh8r4,3788
9
+ ragi/store.py,sha256=7mYDnD9DW065nklrUjgngVDeY97rF8G4i-tLfI469wc,5176
10
+ ragi/types.py,sha256=5EY50FJNB7CceGQkpX-5O3vLkWRsElKvpf0oxizxPOs,2142
11
+ piragi-0.1.0.dist-info/METADATA,sha256=NDSacGNkfd4Ms_Yit9CKIGFtWpq_2TGoeM7F3BRrP9M,3747
12
+ piragi-0.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
13
+ piragi-0.1.0.dist-info/licenses/LICENSE,sha256=p8azJxT7o36aVnN6FDXGGTs0JjfOMab1J2REesqgoJM,1074
14
+ piragi-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.27.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Ragi Contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
ragi/__init__.py ADDED
@@ -0,0 +1,28 @@
1
+ """
2
+ Ragi - Zero-setup RAG library with auto-chunking, embeddings, and smart citations.
3
+
4
+ Example:
5
+ >>> from ragi import Ragi
6
+ >>>
7
+ >>> # One-liner setup and query
8
+ >>> kb = Ragi("./docs")
9
+ >>> answer = kb.ask("How do I install this?")
10
+ >>>
11
+ >>> # Access answer and citations
12
+ >>> print(answer.text)
13
+ >>> for citation in answer.citations:
14
+ ... print(f"Source: {citation.source}")
15
+ ... print(f"Relevance: {citation.score:.2f}")
16
+ >>>
17
+ >>> # Callable shorthand
18
+ >>> answer = kb("What's the API?")
19
+ >>>
20
+ >>> # Filter by metadata
21
+ >>> answer = kb.filter(type="documentation").ask("How to configure?")
22
+ """
23
+
24
+ from .core import Ragi
25
+ from .types import Answer, Citation
26
+
27
+ __version__ = "0.1.0"  # Package version string; same value as the Version field in the wheel METADATA.
28
+ __all__ = ["Ragi", "Answer", "Citation"]  # Names exported by `from ragi import *` (the public API).
ragi/async_updater.py ADDED
@@ -0,0 +1,345 @@
1
+ """Async background updater for automatic document refresh."""
2
+
3
+ import asyncio
4
+ import threading
5
+ import time
6
+ from typing import Any, Callable, Dict, List, Optional, Union
7
+ from queue import Queue, Empty
8
+ import logging
9
+
10
+ from .change_detection import ChangeDetector
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class UpdateTask:
    """A request to re-check a single source for changes.

    Tasks are orderable so they can be ranked in a heap or priority queue:
    a task with a higher ``priority`` sorts first, and among equal priorities
    the task created earlier sorts first.

    Attributes:
        source: File path or URL to check.
        priority: Urgency of the task; higher values are more urgent.
        created_at: ``time.time()`` timestamp taken when the task was built.
    """

    def __init__(self, source: str, priority: int = 0):
        self.source = source
        self.priority = priority
        self.created_at = time.time()

    def __lt__(self, other):
        """Return True when this task should be processed before ``other``."""
        if not isinstance(other, UpdateTask):
            return NotImplemented
        # Negate priority so "higher = more urgent" maps onto min-first ordering.
        return (-self.priority, self.created_at) < (
            -other.priority,
            other.created_at,
        )

    def __repr__(self):
        return (
            f"{type(self).__name__}(source={self.source!r}, "
            f"priority={self.priority!r}, created_at={self.created_at!r})"
        )
22
+
23
+
24
class AsyncUpdater:
    """
    Background updater that checks and refreshes sources in worker threads.

    A scheduler thread periodically scans the registered sources and queues a
    check for every source that is due. Worker threads take queued checks,
    ask ``ChangeDetector`` whether the source changed and, if so, invoke the
    refresh callback. Queries are therefore never blocked by update checks.

    Queued checks are served highest priority first; checks with the same
    priority are served in the order they were queued.
    """

    # Seconds the scheduler waits between scans for due sources.
    _SCHEDULER_TICK = 10.0
    # Seconds a worker blocks on the queue before re-checking ``self.running``.
    _WORKER_POLL = 1.0

    def __init__(
        self,
        refresh_callback: Callable[[Union[str, List[str]]], None],
        check_interval: float = 300.0,
        max_workers: int = 2,
    ):
        """
        Initialize the async updater.

        Args:
            refresh_callback: Function to call when refresh is needed (e.g., kb.refresh)
            check_interval: Default interval between checks in seconds
            max_workers: Number of background workers
        """
        # Local import: PriorityQueue is not among the module-level imports.
        from queue import PriorityQueue

        self.refresh_callback = refresh_callback
        self.check_interval = check_interval
        self.max_workers = max_workers

        # Source metadata tracking (source -> metadata dict from ChangeDetector)
        self.sources_metadata: Dict[str, Dict[str, Any]] = {}

        # Update queue and control. Entries are (-priority, sequence, task)
        # tuples so that higher priorities come out first and ties stay FIFO.
        self.update_queue: Queue = PriorityQueue()
        self.running = False
        self.workers: List[threading.Thread] = []
        self._lock = threading.RLock()
        # Set by stop() so the scheduler wakes up immediately instead of
        # sleeping out its full tick.
        self._stop_event = threading.Event()
        # Monotonic counter used as the FIFO tie-breaker in the queue.
        self._sequence = 0
        # Sources the scheduler has queued and that no worker has finished
        # yet; prevents the same due source from being queued on every tick.
        self._scheduled = set()

        # Statistics
        self.stats = {
            "checks_performed": 0,
            "updates_performed": 0,
            "last_check_time": None,
        }

    def _enqueue(self, task: "UpdateTask") -> None:
        """Put ``task`` on the queue, ordered by priority and then arrival."""
        with self._lock:
            self._sequence += 1
            sequence = self._sequence
        self.update_queue.put((-task.priority, sequence, task))

    def register_source(
        self,
        source: str,
        content: str,
        check_interval: Optional[float] = None,
    ) -> None:
        """
        Register a source for automatic update checking.

        Args:
            source: File path or URL
            content: Current content
            check_interval: Custom check interval (uses default if None)
        """
        with self._lock:
            if ChangeDetector.is_url(source):
                metadata = ChangeDetector.get_url_metadata(source, content)
            else:
                metadata = ChangeDetector.get_file_metadata(source, content)

            if check_interval is not None:
                metadata["check_interval"] = check_interval
            else:
                # Fall back to this updater's default only when the detector
                # did not supply an interval of its own.
                # NOTE(review): if ChangeDetector always sets its own
                # "check_interval", the constructor's check_interval still has
                # no effect for such sources - confirm the intended precedence.
                metadata.setdefault("check_interval", self.check_interval)

            self.sources_metadata[source] = metadata
            logger.info(f"Registered source for auto-update: {source}")

    def unregister_source(self, source: str) -> None:
        """Remove a source from auto-update tracking."""
        with self._lock:
            if source in self.sources_metadata:
                del self.sources_metadata[source]
                logger.info(f"Unregistered source: {source}")

    def start(self) -> None:
        """Start background update workers and the scheduler thread."""
        if self.running:
            logger.warning("Updater already running")
            return

        # Clear a stop request left over from a previous stop() call.
        self._stop_event.clear()
        self.running = True

        # Start worker threads
        for i in range(self.max_workers):
            worker = threading.Thread(
                target=self._worker_loop,
                name=f"AsyncUpdater-Worker-{i}",
                daemon=True,
            )
            worker.start()
            self.workers.append(worker)

        # Start scheduler thread
        scheduler = threading.Thread(
            target=self._scheduler_loop,
            name="AsyncUpdater-Scheduler",
            daemon=True,
        )
        scheduler.start()
        self.workers.append(scheduler)

        logger.info(f"Started {self.max_workers} update workers + scheduler")

    def stop(self) -> None:
        """Stop background workers, waiting up to 5 seconds for each thread."""
        self.running = False
        # Wake the scheduler right away rather than waiting out its tick.
        self._stop_event.set()
        logger.info("Stopping async updater...")

        # Wait for workers to finish current tasks
        current = threading.current_thread()
        for worker in self.workers:
            # A thread cannot join itself (e.g. stop() called from a callback).
            if worker is not current and worker.is_alive():
                worker.join(timeout=5.0)

        self.workers.clear()
        logger.info("Async updater stopped")

    def _scheduler_loop(self) -> None:
        """
        Main scheduler loop that checks which sources need updating.
        Runs in background thread.
        """
        while self.running:
            try:
                current_time = time.time()

                with self._lock:
                    # Sources that are due and not already waiting in the queue.
                    due = [
                        source
                        for source, metadata in self.sources_metadata.items()
                        if source not in self._scheduled
                        and ChangeDetector.should_check_now(
                            metadata["last_checked"], metadata["check_interval"]
                        )
                    ]

                # Queue checks
                for source in due:
                    with self._lock:
                        self._scheduled.add(source)
                    self._enqueue(UpdateTask(source))

                with self._lock:
                    self.stats["last_check_time"] = current_time

            except Exception as e:
                logger.error(f"Scheduler error: {e}")

            # Wait for the next scan; returns early as soon as stop() is called.
            self._stop_event.wait(self._SCHEDULER_TICK)

    def _worker_loop(self) -> None:
        """
        Worker loop that processes update tasks.
        Runs in background thread.
        """
        while self.running:
            # Get task from queue (timeout to allow checking self.running)
            try:
                _, _, task = self.update_queue.get(timeout=self._WORKER_POLL)
            except Empty:
                continue

            try:
                # Process the update task
                self._check_and_refresh(task.source)
            except Exception as e:
                logger.error(f"Worker error: {e}")
            finally:
                # Always release the source and balance the get(), even if
                # processing failed, so the source can be scheduled again and
                # queue.join() cannot hang.
                with self._lock:
                    self._scheduled.discard(task.source)
                self.update_queue.task_done()

    def _check_and_refresh(self, source: str) -> None:
        """
        Check if source changed and refresh if needed.

        Errors raised by the change check or by the refresh callback are
        logged and swallowed so that a failing source cannot kill a worker.

        Args:
            source: Source to check
        """
        try:
            with self._lock:
                registered = self.sources_metadata.get(source)
                if registered is None:
                    return

                # Work on a snapshot so the slow check runs without the lock.
                metadata = dict(registered)
                self.stats["checks_performed"] += 1

            # Check for changes
            changed = False
            new_metadata = {}

            if ChangeDetector.is_url(source):
                result = ChangeDetector.check_url_changed(
                    source,
                    metadata.get("etag"),
                    metadata.get("last_modified"),
                )
                changed = result.get("changed", False)
                if "etag" in result:
                    new_metadata["etag"] = result["etag"]
                if "last_modified" in result:
                    new_metadata["last_modified"] = result["last_modified"]
            else:
                changed = ChangeDetector.check_file_changed(
                    source, metadata.get("mtime"), metadata["content_hash"]
                )

            # Update last_checked timestamp
            with self._lock:
                current = self.sources_metadata.get(source)
                if current is None:
                    # Unregistered while the check was running: nothing to
                    # record and nothing to refresh.
                    return
                current["last_checked"] = time.time()
                if new_metadata:
                    current.update(new_metadata)

            # Refresh if changed
            if changed:
                logger.info(f"Change detected in {source}, refreshing...")
                self.refresh_callback(source)
                with self._lock:
                    self.stats["updates_performed"] += 1

                # NOTE(review): the stored "mtime"/"content_hash" are not
                # updated after a refresh, so a changed file will presumably
                # be reported as changed again on every later check. Fixing
                # this needs the refreshed content - confirm against
                # ChangeDetector and the refresh callback.

        except Exception as e:
            logger.error(f"Error checking source {source}: {e}")

    def queue_update(self, source: str, priority: int = 0) -> None:
        """
        Manually queue a source for update check.

        Args:
            source: Source to check
            priority: Priority (higher = more urgent)
        """
        self._enqueue(UpdateTask(source, priority))
        logger.debug(f"Queued update check for {source}")

    def get_stats(self) -> Dict[str, Any]:
        """Get updater statistics."""
        with self._lock:
            return {
                **self.stats,
                "registered_sources": len(self.sources_metadata),
                "queue_size": self.update_queue.qsize(),
                "running": self.running,
            }
269
+
270
+
271
class AsyncUpdaterMixin:
    """
    Mixin to add async update capabilities to Ragi.

    The mixin must come before the knowledge-base class in the MRO so that
    its ``add`` override wraps the base implementation. The host class is
    expected to provide ``add`` and ``refresh``.

    Usage:
        class AutoUpdateRagi(AsyncUpdaterMixin, Ragi):
            pass

        kb = AutoUpdateRagi("./docs", auto_update=True)
    """

    def __init__(self, *args, auto_update: bool = False, **kwargs):
        """
        Initialize with optional auto-update.

        Args:
            auto_update: Enable background auto-updates
            **kwargs: Forwarded to the base class. When ``auto_update`` is
                true, ``kwargs["config"]["auto_update"]`` may provide:
                - interval: Check interval in seconds (default 300.0)
                - workers: Number of background workers (default 2)
        """
        updater: Optional[AsyncUpdater] = None

        if auto_update:
            # Extract auto-update config; tolerate config=None and a missing
            # or None "auto_update" section.
            config = kwargs.get("config") or {}
            auto_config = config.get("auto_update") or {}

            updater = AsyncUpdater(
                refresh_callback=self.refresh,
                check_interval=auto_config.get("interval", 300.0),
                max_workers=auto_config.get("workers", 2),
            )

        # Set the attributes BEFORE delegating: if the base initializer calls
        # self.add(...), the override below reads them.
        self.auto_update_enabled = auto_update
        self.updater = updater

        super().__init__(*args, **kwargs)

        # Re-assert after the base initializer, matching the original
        # post-condition that these attributes hold the mixin's values.
        self.auto_update_enabled = auto_update
        self.updater = updater

        # Start background workers only once the object is fully constructed.
        if updater is not None:
            updater.start()

    def add(self, sources, **kwargs):
        """Override add to register sources with auto-updater."""
        result = super().add(sources, **kwargs)

        # getattr keeps this safe on a partially initialized instance.
        updater = getattr(self, "updater", None)

        # Register sources with updater if auto-update enabled
        if getattr(self, "auto_update_enabled", False) and updater:
            # Note: Would need access to loaded documents to register properly
            # This is a simplified version
            if isinstance(sources, (list, tuple)):
                source_list = list(sources)
            else:
                source_list = [sources]
            for source in source_list:
                # Register with default metadata: the content is not known
                # here, so an empty string is passed as a placeholder.
                updater.register_source(source, "", check_interval=None)

        return result

    def get_update_stats(self) -> Dict[str, Any]:
        """Get auto-update statistics."""
        updater = getattr(self, "updater", None)
        if updater:
            return updater.get_stats()
        return {"auto_update_enabled": False}

    def stop_auto_update(self) -> None:
        """Stop auto-update background workers."""
        updater = getattr(self, "updater", None)
        if updater:
            updater.stop()

    def __del__(self):
        """Cleanup on deletion."""
        # getattr: __del__ can run on an instance whose __init__ failed early.
        updater = getattr(self, "updater", None)
        if updater:
            updater.stop()