piragi 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,24 @@
1
+ # LLM Configuration (Optional - defaults to Ollama on localhost)
2
+ # For Ollama (default):
3
+ LLM_BASE_URL=http://localhost:11434/v1
4
+ LLM_API_KEY=not-needed
5
+
6
+ # For OpenAI:
7
+ # LLM_BASE_URL=https://api.openai.com/v1
8
+ # LLM_API_KEY=sk-your-openai-key-here
9
+
10
+ # For other OpenAI-compatible APIs (e.g., LM Studio, vLLM, etc.):
11
+ # LLM_BASE_URL=http://localhost:1234/v1
12
+ # LLM_API_KEY=your-api-key-or-not-needed
13
+
14
+ # Embedding Configuration (Optional - defaults to local sentence-transformers)
15
+ # For local models (default - no API needed):
16
+ # Uses sentence-transformers library, no base_url needed
17
+
18
+ # For OpenAI embeddings:
19
+ # EMBEDDING_BASE_URL=https://api.openai.com/v1
20
+ # EMBEDDING_API_KEY=sk-your-openai-key-here
21
+
22
+ # For other OpenAI-compatible embedding APIs:
23
+ # EMBEDDING_BASE_URL=http://localhost:8080/v1
24
+ # EMBEDDING_API_KEY=your-api-key-or-not-needed
@@ -0,0 +1,53 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ *.egg-info/
20
+ .installed.cfg
21
+ *.egg
22
+
23
+ # Virtual environments
24
+ venv/
25
+ venv_*/
26
+ env/
27
+ ENV/
28
+ .venv
29
+
30
+ # IDEs
31
+ .vscode/
32
+ .idea/
33
+ *.swp
34
+ *.swo
35
+ *~
36
+
37
+ # Testing
38
+ .pytest_cache/
39
+ .coverage
40
+ htmlcov/
41
+ .tox/
42
+
43
+ # Environment
44
+ .env
45
+ .env.local
46
+
47
+ # LanceDB
48
+ *.lance
49
+ lancedb/
50
+
51
+ # OS
52
+ .DS_Store
53
+ Thumbs.db
piragi-0.1.0/API.md ADDED
@@ -0,0 +1,428 @@
1
+ # API Reference
2
+
3
+ Complete API documentation for Ragi.
4
+
5
+ ## Main Class
6
+
7
+ ### `Ragi`
8
+
9
+ The main interface for creating and querying RAG systems.
10
+
11
+ ```python
12
+ from ragi import Ragi
13
+ ```
14
+
15
+ #### Constructor
16
+
17
+ ```python
18
+ Ragi(
19
+ sources: Union[str, List[str], None] = None,
20
+ persist_dir: str = ".ragi",
21
+ config: Optional[Dict[str, Any]] = None,
22
+ )
23
+ ```
24
+
25
+ **Parameters:**
26
+ - `sources` - File paths, URLs, or glob patterns to load initially
27
+ - `persist_dir` - Directory to persist vector database (default: `.ragi`)
28
+ - `config` - Optional configuration dict with nested sections:
29
+ - `llm` - LLM configuration:
30
+ - `model` - Model name (default: `llama3.2`)
31
+ - `api_key` - API key (default: env `LLM_API_KEY` or `"not-needed"`)
32
+ - `base_url` - API base URL (default: env `LLM_BASE_URL` or `"http://localhost:11434/v1"`)
33
+ - `embedding` - Embedding configuration:
34
+ - `model` - Model name (default: `nvidia/llama-embed-nemotron-8b`)
35
+ - `device` - Device to use for local models (default: auto-detect)
36
+ - `base_url` - API base URL for remote embeddings (optional)
37
+ - `api_key` - API key for remote embeddings (optional, defaults to env `EMBEDDING_API_KEY`)
38
+ - `chunk` - Chunking configuration:
39
+ - `size` - Target chunk size in tokens (default: 512)
40
+ - `overlap` - Number of tokens to overlap (default: 50)
41
+
42
+ **Examples:**
43
+ ```python
44
+ # Basic initialization (uses free local models)
45
+ kb = Ragi("./docs")
46
+
47
+ # With public embedding model
48
+ kb = Ragi("./docs", config={
49
+ "embedding": {"model": "sentence-transformers/all-MiniLM-L6-v2"}
50
+ })
51
+
52
+ # Custom Ollama model
53
+ kb = Ragi("./docs", config={
54
+ "llm": {"model": "mistral"},
55
+ "embedding": {"model": "sentence-transformers/all-MiniLM-L6-v2"}
56
+ })
57
+
58
+ # With OpenAI-compatible API (LLM only, local embeddings)
59
+ kb = Ragi("./docs", config={
60
+ "llm": {
61
+ "model": "gpt-4o-mini",
62
+ "api_key": "sk-...",
63
+ "base_url": "https://api.openai.com/v1"
64
+ },
65
+ "embedding": {"model": "sentence-transformers/all-MiniLM-L6-v2"}
66
+ })
67
+
68
+ # With OpenAI for both LLM and embeddings
69
+ kb = Ragi("./docs", config={
70
+ "llm": {
71
+ "model": "gpt-4o-mini",
72
+ "api_key": "sk-...",
73
+ "base_url": "https://api.openai.com/v1"
74
+ },
75
+ "embedding": {
76
+ "model": "text-embedding-3-small",
77
+ "base_url": "https://api.openai.com/v1",
78
+ "api_key": "sk-..."
79
+ }
80
+ })
81
+
82
+ # Custom chunking
83
+ kb = Ragi("./docs", config={
84
+ "chunk": {"size": 1024, "overlap": 100},
85
+ "embedding": {"model": "sentence-transformers/all-MiniLM-L6-v2"}
86
+ })
87
+
88
+ # Empty initialization (add documents later)
89
+ kb = Ragi(persist_dir=".my_kb")
90
+ kb.add("./docs")
91
+ ```
92
+
93
+ #### Methods
94
+
95
+ ##### `add(sources: Union[str, List[str]]) -> Ragi`
96
+
97
+ Add documents to the knowledge base.
98
+
99
+ **Parameters:**
100
+ - `sources` - File paths, URLs, or glob patterns
101
+
102
+ **Returns:** Self for chaining
103
+
104
+ **Examples:**
105
+ ```python
106
+ # Single file
107
+ kb.add("./README.md")
108
+
109
+ # Multiple files
110
+ kb.add(["./docs/*.pdf", "./src/**/*.py"])
111
+
112
+ # Chaining
113
+ kb.add("./docs").add("./src")
114
+
115
+ # URLs
116
+ kb.add("https://example.com/guide")
117
+ ```
118
+
119
+ ##### `ask(query: str, top_k: int = 5, system_prompt: Optional[str] = None) -> Answer`
120
+
121
+ Ask a question and get an answer with citations.
122
+
123
+ **Parameters:**
124
+ - `query` - Question to ask
125
+ - `top_k` - Number of relevant chunks to retrieve (default: 5)
126
+ - `system_prompt` - Custom system prompt for answer generation
127
+
128
+ **Returns:** `Answer` object with text and citations
129
+
130
+ **Examples:**
131
+ ```python
132
+ answer = kb.ask("How do I install this?")
133
+ print(answer.text)
134
+
135
+ # More context
136
+ answer = kb.ask("How does auth work?", top_k=10)
137
+
138
+ # Custom prompt
139
+ prompt = "Answer concisely with code examples when relevant."
140
+ answer = kb.ask("Show me usage examples", system_prompt=prompt)
141
+ ```
142
+
143
+ ##### `__call__(query: str, top_k: int = 5) -> Answer`
144
+
145
+ Callable shorthand for `ask()`.
146
+
147
+ **Parameters:**
148
+ - `query` - Question to ask
149
+ - `top_k` - Number of relevant chunks to retrieve
150
+
151
+ **Returns:** `Answer` object
152
+
153
+ **Examples:**
154
+ ```python
155
+ # These are equivalent:
156
+ answer = kb.ask("What is this?")
157
+ answer = kb("What is this?")
158
+ ```
159
+
160
+ ##### `filter(**kwargs) -> Ragi`
161
+
162
+ Filter documents by metadata for the next query.
163
+
164
+ **Parameters:**
165
+ - `**kwargs` - Metadata key-value pairs to filter by
166
+
167
+ **Returns:** Self for chaining
168
+
169
+ **Examples:**
170
+ ```python
171
+ # Filter by file type
172
+ answer = kb.filter(file_type="pdf").ask("What's in the PDFs?")
173
+
174
+ # Filter by custom metadata
175
+ answer = kb.filter(category="api", version="v2").ask("How does it work?")
176
+
177
+ # Multiple filters
178
+ answer = kb.filter(author="Alice", topic="security").ask("Security guidelines?")
179
+ ```
180
+
181
+ ##### `count() -> int`
182
+
183
+ Return the number of chunks in the knowledge base.
184
+
185
+ **Returns:** Number of chunks
186
+
187
+ **Examples:**
188
+ ```python
189
+ print(f"Knowledge base contains {kb.count()} chunks")
190
+ ```
191
+
192
+ ##### `refresh(sources: Union[str, List[str]]) -> Ragi`
193
+
194
+ Refresh specific sources by deleting their old chunks and re-adding them. Useful when documents have been updated.
195
+
196
+ **Parameters:**
197
+ - `sources` - File paths, URLs, or glob patterns to refresh
198
+
199
+ **Returns:** Self for chaining
200
+
201
+ **Examples:**
202
+ ```python
203
+ # Refresh a single file
204
+ kb.refresh("./docs/api.md")
205
+
206
+ # Refresh multiple files
207
+ kb.refresh(["./docs/*.pdf", "./README.md"])
208
+
209
+ # Refresh after editing
210
+ with open("./docs/guide.md", "w") as f:
211
+ f.write("Updated content...")
212
+ kb.refresh("./docs/guide.md")
213
+ ```
214
+
215
+ ##### `clear() -> None`
216
+
217
+ Clear all data from the knowledge base.
218
+
219
+ **Examples:**
220
+ ```python
221
+ kb.clear()
222
+ print(kb.count()) # 0
223
+ ```
224
+
225
+ ## Data Types
226
+
227
+ ### `Answer`
228
+
229
+ Result from a query with answer text and citations.
230
+
231
+ **Attributes:**
232
+ - `text: str` - The generated answer
233
+ - `citations: List[Citation]` - Source citations
234
+ - `query: str` - Original query
235
+
236
+ **Methods:**
237
+ - `__str__()` - Returns answer text
238
+ - `__repr__()` - Returns detailed representation
239
+
240
+ **Examples:**
241
+ ```python
242
+ answer = kb.ask("What is RAG?")
243
+
244
+ print(answer.text) # The answer
245
+ print(answer.query) # "What is RAG?"
246
+ print(len(answer.citations)) # Number of citations
247
+
248
+ # String representation
249
+ print(answer) # Same as answer.text
250
+ print(repr(answer)) # Answer(text='...', citations=3)
251
+ ```
252
+
253
+ ### `Citation`
254
+
255
+ A single source citation with relevance score.
256
+
257
+ **Attributes:**
258
+ - `source: str` - Source file path or URL
259
+ - `chunk: str` - The actual text chunk
260
+ - `score: float` - Relevance score (0-1, higher is better)
261
+ - `metadata: Dict[str, Any]` - Additional metadata
262
+
263
+ **Properties:**
264
+ - `preview: str` - Preview of chunk (first 100 chars)
265
+
266
+ **Examples:**
267
+ ```python
268
+ for citation in answer.citations:
269
+ print(f"Source: {citation.source}")
270
+ print(f"Score: {citation.score:.2%}")
271
+ print(f"Preview: {citation.preview}")
272
+ print(f"Metadata: {citation.metadata}")
273
+ ```
274
+
275
+ ## Supported File Formats
276
+
277
+ Ragi uses [markitdown](https://github.com/microsoft/markitdown) for document conversion and supports:
278
+
279
+ ### Documents
280
+ - PDF (`.pdf`)
281
+ - Microsoft Word (`.docx`, `.doc`)
282
+ - Microsoft PowerPoint (`.pptx`, `.ppt`)
283
+ - Microsoft Excel (`.xlsx`, `.xls`)
284
+
285
+ ### Text
286
+ - Markdown (`.md`)
287
+ - Plain text (`.txt`)
288
+ - Source code (`.py`, `.js`, `.java`, `.cpp`, etc.)
289
+ - HTML (`.html`)
290
+
291
+ ### Data
292
+ - JSON (`.json`)
293
+ - XML (`.xml`)
294
+ - CSV (`.csv`)
295
+
296
+ ### Media
297
+ - Images (`.png`, `.jpg`, `.jpeg`, `.gif`) - with OCR
298
+ - Audio (`.mp3`, `.wav`) - with transcription
299
+
300
+ ### Web
301
+ - URLs (converted to markdown)
302
+
303
+ ### Archives
304
+ - ZIP files (`.zip`)
305
+
306
+ ### E-books
307
+ - EPUB (`.epub`)
308
+
309
+ ## Metadata Fields
310
+
311
+ ### Automatic Metadata
312
+
313
+ Automatically extracted for all documents:
314
+ - `filename` - File name
315
+ - `file_type` - File extension without dot
316
+ - `file_path` - Absolute file path
317
+
318
+ For URLs:
319
+ - `url` - The URL
320
+ - `source_type` - Always "url"
321
+
322
+ ### Custom Metadata
323
+
324
+ Add custom metadata when loading (planned feature, not yet available):
325
+ ```python
326
+ # This is a planned feature
327
+ kb.add("./docs/api.pdf", metadata={"category": "api", "version": "v2"})
328
+ ```
329
+
330
+ Filter by custom metadata:
331
+ ```python
332
+ answer = kb.filter(category="api").ask("How does it work?")
333
+ ```
334
+
335
+ ## Error Handling
336
+
337
+ ### Common Exceptions
338
+
339
+ ```python
340
+ # Invalid source
341
+ try:
342
+ kb = Ragi("/nonexistent/path")
343
+ except ValueError as e:
344
+ print(f"Error: {e}")
345
+
346
+ # Missing API key
347
+ try:
348
+ kb = Ragi("./docs")
349
+ except RuntimeError as e:
350
+ print(f"Error: {e}")
351
+
352
+ # Embedding generation failed
353
+ try:
354
+ answer = kb.ask("question")
355
+ except RuntimeError as e:
356
+ print(f"Error: {e}")
357
+ ```
358
+
359
+ ## Environment Variables
360
+
361
+ - `LLM_BASE_URL` - LLM API base URL (default: `http://localhost:11434/v1`)
362
+ - `LLM_API_KEY` - LLM API key (default: `not-needed`)
363
+
364
+ For Ollama (default, free local models):
365
+ ```bash
366
+ # No environment variables needed!
367
+ # Just make sure Ollama is running: ollama serve
368
+ ```
369
+
370
+ For OpenAI or other providers:
371
+ ```bash
372
+ export LLM_BASE_URL="https://api.openai.com/v1"
373
+ export LLM_API_KEY="sk-..."
374
+ ```
375
+
376
+ Or in a `.env` file:
377
+ ```
378
+ LLM_BASE_URL=https://api.openai.com/v1
379
+ LLM_API_KEY=sk-...
380
+ ```
381
+
382
+ ## Best Practices
383
+
384
+ ### Chunking
385
+ - Use smaller chunks (256-512) for precise retrieval
386
+ - Use larger chunks (1024+) when more context is needed
387
+ - Increase overlap (100-200) for better continuity
388
+
389
+ ### Embeddings
390
+ - Use `sentence-transformers/all-MiniLM-L6-v2` for free, fast embeddings (recommended for getting started)
391
+ - Use `nvidia/llama-embed-nemotron-8b` for higher quality (requires HuggingFace auth)
392
+ - Use any sentence-transformers model from HuggingFace
393
+
394
+ ### LLM Selection
395
+ - Use `llama3.2` via Ollama for free local inference (default)
396
+ - Use `mistral` via Ollama for fast responses
397
+ - Use OpenAI-compatible APIs for cloud-based models (configure via `config` dict)
398
+
399
+ ### Performance
400
+ - Persist data to disk to avoid re-processing:
401
+ ```python
402
+ kb = Ragi("./docs", persist_dir=".kb")
403
+ ```
404
+ - Batch document additions:
405
+ ```python
406
+ kb.add(["doc1.pdf", "doc2.pdf", "doc3.pdf"])
407
+ ```
408
+ - Use appropriate `top_k` values (5-10 for most cases)
409
+
410
+ ### Filtering
411
+ - Use metadata filters to narrow search space
412
+ - Combine filters for precise targeting:
413
+ ```python
414
+ kb.filter(type="api", version="v2").ask("...")
415
+ ```
416
+
417
+ ## Type Hints
418
+
419
+ Ragi is fully typed. Example:
420
+
421
+ ```python
422
+ from typing import List
423
+ from ragi import Ragi, Answer, Citation
424
+
425
+ kb: Ragi = Ragi("./docs")
426
+ answer: Answer = kb.ask("What is this?")
427
+ citations: List[Citation] = answer.citations
428
+ ```
@@ -0,0 +1,35 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [0.1.0] - 2025-01-10
9
+
10
+ ### Added
11
+ - Initial release of Ragi
12
+ - Zero-config RAG with built-in vector store (LanceDB)
13
+ - Universal document support (PDF, Word, Excel, Markdown, Code, URLs, Images, Audio)
14
+ - Auto-chunking with markdown-aware splitting
15
+ - Local embeddings via sentence-transformers (nvidia/llama-embed-nemotron-8b)
16
+ - Remote embeddings via OpenAI-compatible APIs
17
+ - Local LLM via Ollama (llama3.2)
18
+ - OpenAI-compatible LLM support
19
+ - Smart citations with relevance scores
20
+ - Metadata filtering
21
+ - Auto-updates with background workers
22
+ - Change detection for files (mtime + hash) and URLs (HTTP HEAD)
23
+ - Concurrent query support
24
+ - Single unified config dict
25
+ - Examples: quickstart, ollama, code_qa, multi_format, embedding_options, update_documents
26
+ - Comprehensive API documentation
27
+
28
+ ### Features
29
+ - **Simple Setup** - Works with free local models out of the box
30
+ - **All Formats** - PDF, Word, Excel, Markdown, Code, URLs, Images, Audio
31
+ - **Auto-Updates** - Background refresh, queries never blocked
32
+ - **Smart Citations** - Every answer includes ranked source citations
33
+ - **OpenAI Compatible** - Drop-in support for any OpenAI-compatible API
34
+
35
+ [0.1.0]: https://github.com/hemanth/ragi/releases/tag/v0.1.0
piragi-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Ragi Contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
piragi-0.1.0/Makefile ADDED
@@ -0,0 +1,50 @@
1
+ .PHONY: install test test-cov format lint type-check clean build publish help
2
+
3
+ help:
4
+ @echo "Ragi Development Commands"
5
+ @echo "========================="
6
+ @echo "install - Install package in development mode"
7
+ @echo "test - Run tests"
8
+ @echo "test-cov - Run tests with coverage"
9
+ @echo "format - Format code with black"
10
+ @echo "lint - Lint code with ruff"
11
+ @echo "type-check - Check types with mypy"
12
+ @echo "clean - Remove build artifacts"
13
+ @echo "build - Build package"
14
+ @echo "publish - Publish to PyPI"
15
+
16
+ install:
17
+ pip install -e ".[dev]"
18
+
19
+ test:
20
+ pytest
21
+
22
+ test-cov:
23
+ pytest --cov=ragi --cov-report=term-missing --cov-report=html
24
+
25
+ format:
26
+ black src/ tests/ examples/
27
+
28
+ lint:
29
+ ruff check src/ tests/ examples/
30
+
31
+ type-check:
32
+ mypy src/
33
+
34
+ clean:
35
+ rm -rf build/
36
+ rm -rf dist/
37
+ rm -rf *.egg-info
38
+ rm -rf .pytest_cache
39
+ rm -rf .coverage
40
+ rm -rf htmlcov/
41
+ rm -rf .mypy_cache
42
+ rm -rf .ruff_cache
43
+ find . -type d -name __pycache__ -exec rm -rf {} +
44
+ find . -type f -name "*.pyc" -delete
45
+
46
+ build: clean
47
+ python -m build
48
+
49
+ publish: build
50
+ python -m twine upload dist/*