PyPI - piragi - Versions diffs - 0.1.0__tar.gz - Mend

piragi 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

piragi-0.1.0/.env.example +24 -0
piragi-0.1.0/.gitignore +53 -0
piragi-0.1.0/API.md +428 -0
piragi-0.1.0/CHANGELOG.md +35 -0
piragi-0.1.0/LICENSE +21 -0
piragi-0.1.0/Makefile +50 -0
piragi-0.1.0/PKG-INFO +149 -0
piragi-0.1.0/README.md +111 -0
piragi-0.1.0/examples/README.md +77 -0
piragi-0.1.0/examples/async_auto_update.py +309 -0
piragi-0.1.0/examples/async_update_demo.py +78 -0
piragi-0.1.0/examples/auto_update_detection.py +266 -0
piragi-0.1.0/examples/code_qa.py +138 -0
piragi-0.1.0/examples/embedding_options.py +151 -0
piragi-0.1.0/examples/multi_format.py +115 -0
piragi-0.1.0/examples/ollama_example.py +191 -0
piragi-0.1.0/examples/quickstart.py +79 -0
piragi-0.1.0/examples/test_ollama_mock.py +42 -0
piragi-0.1.0/examples/update_documents.py +163 -0
piragi-0.1.0/pyproject.toml +77 -0
piragi-0.1.0/src/ragi/__init__.py +28 -0
piragi-0.1.0/src/ragi/async_updater.py +345 -0
piragi-0.1.0/src/ragi/change_detection.py +211 -0
piragi-0.1.0/src/ragi/chunking.py +150 -0
piragi-0.1.0/src/ragi/core.py +318 -0
piragi-0.1.0/src/ragi/embeddings.py +150 -0
piragi-0.1.0/src/ragi/loader.py +131 -0
piragi-0.1.0/src/ragi/retrieval.py +125 -0
piragi-0.1.0/src/ragi/store.py +177 -0
piragi-0.1.0/src/ragi/types.py +54 -0
piragi-0.1.0/tests/__init__.py +1 -0
piragi-0.1.0/tests/conftest.py +87 -0
piragi-0.1.0/tests/test_chunking.py +86 -0
piragi-0.1.0/tests/test_core.py +245 -0
piragi-0.1.0/tests/test_loader.py +67 -0
piragi-0.1.0/tests/test_types.py +114 -0

piragi-0.1.0/.env.example ADDED Viewed

@@ -0,0 +1,24 @@
+# LLM Configuration (Optional - defaults to Ollama on localhost)
+# For Ollama (default):
+LLM_BASE_URL=http://localhost:11434/v1
+LLM_API_KEY=not-needed
+# For OpenAI:
+# LLM_BASE_URL=https://api.openai.com/v1
+# LLM_API_KEY=sk-your-openai-key-here
+# For other OpenAI-compatible APIs (e.g., LM Studio, vLLM, etc.):
+# LLM_BASE_URL=http://localhost:1234/v1
+# LLM_API_KEY=your-api-key-or-not-needed
+# Embedding Configuration (Optional - defaults to local sentence-transformers)
+# For local models (default - no API needed):
+# Uses sentence-transformers library, no base_url needed
+# For OpenAI embeddings:
+# EMBEDDING_BASE_URL=https://api.openai.com/v1
+# EMBEDDING_API_KEY=sk-your-openai-key-here
+# For other OpenAI-compatible embedding APIs:
+# EMBEDDING_BASE_URL=http://localhost:8080/v1
+# EMBEDDING_API_KEY=your-api-key-or-not-needed

piragi-0.1.0/.gitignore ADDED Viewed

@@ -0,0 +1,53 @@
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+# Virtual environments
+venv/
+venv_*/
+env/
+ENV/
+.venv
+# IDEs
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+# Testing
+.pytest_cache/
+.coverage
+htmlcov/
+.tox/
+# Environment
+.env
+.env.local
+# LanceDB
+*.lance
+lancedb/
+# OS
+.DS_Store
+Thumbs.db

piragi-0.1.0/API.md ADDED Viewed

@@ -0,0 +1,428 @@
+# API Reference
+Complete API documentation for Ragi.
+## Main Class
+### `Ragi`
+The main interface for creating and querying RAG systems.
+```python
+from ragi import Ragi
+```
+#### Constructor
+```python
+Ragi(
+    sources: Union[str, List[str], None] = None,
+    persist_dir: str = ".ragi",
+    config: Optional[Dict[str, Any]] = None,
+)
+```
+**Parameters:**
+- `sources` - File paths, URLs, or glob patterns to load initially
+- `persist_dir` - Directory to persist vector database (default: `.ragi`)
+- `config` - Optional configuration dict with nested sections:
+  - `llm` - LLM configuration:
+    - `model` - Model name (default: `llama3.2`)
+    - `api_key` - API key (default: env `LLM_API_KEY` or `"not-needed"`)
+    - `base_url` - API base URL (default: env `LLM_BASE_URL` or `"http://localhost:11434/v1"`)
+  - `embedding` - Embedding configuration:
+    - `model` - Model name (default: `nvidia/llama-embed-nemotron-8b`)
+    - `device` - Device to use for local models (default: auto-detect)
+    - `base_url` - API base URL for remote embeddings (optional)
+    - `api_key` - API key for remote embeddings (optional, defaults to env `EMBEDDING_API_KEY`)
+  - `chunk` - Chunking configuration:
+    - `size` - Target chunk size in tokens (default: 512)
+    - `overlap` - Number of tokens to overlap (default: 50)
+**Examples:**
+```python
+# Basic initialization (uses free local models)
+kb = Ragi("./docs")
+# With public embedding model
+kb = Ragi("./docs", config={
+    "embedding": {"model": "sentence-transformers/all-MiniLM-L6-v2"}
+})
+# Custom Ollama model
+kb = Ragi("./docs", config={
+    "llm": {"model": "mistral"},
+    "embedding": {"model": "sentence-transformers/all-MiniLM-L6-v2"}
+})
+# With OpenAI-compatible API (LLM only, local embeddings)
+kb = Ragi("./docs", config={
+    "llm": {
+        "model": "gpt-4o-mini",
+        "api_key": "sk-...",
+        "base_url": "https://api.openai.com/v1"
+    },
+    "embedding": {"model": "sentence-transformers/all-MiniLM-L6-v2"}
+})
+# With OpenAI for both LLM and embeddings
+kb = Ragi("./docs", config={
+    "llm": {
+        "model": "gpt-4o-mini",
+        "api_key": "sk-...",
+        "base_url": "https://api.openai.com/v1"
+    },
+    "embedding": {
+        "model": "text-embedding-3-small",
+        "base_url": "https://api.openai.com/v1",
+        "api_key": "sk-..."
+    }
+})
+# Custom chunking
+kb = Ragi("./docs", config={
+    "chunk": {"size": 1024, "overlap": 100},
+    "embedding": {"model": "sentence-transformers/all-MiniLM-L6-v2"}
+})
+# Empty initialization (add documents later)
+kb = Ragi(persist_dir=".my_kb")
+kb.add("./docs")
+```
+#### Methods
+##### `add(sources: Union[str, List[str]]) -> Ragi`
+Add documents to the knowledge base.
+**Parameters:**
+- `sources` - File paths, URLs, or glob patterns
+**Returns:** Self for chaining
+**Examples:**
+```python
+# Single file
+kb.add("./README.md")
+# Multiple files
+kb.add(["./docs/*.pdf", "./src/**/*.py"])
+# Chaining
+kb.add("./docs").add("./src")
+# URLs
+kb.add("https://example.com/guide")
+```
+##### `ask(query: str, top_k: int = 5, system_prompt: Optional[str] = None) -> Answer`
+Ask a question and get an answer with citations.
+**Parameters:**
+- `query` - Question to ask
+- `top_k` - Number of relevant chunks to retrieve (default: 5)
+- `system_prompt` - Custom system prompt for answer generation
+**Returns:** `Answer` object with text and citations
+**Examples:**
+```python
+answer = kb.ask("How do I install this?")
+print(answer.text)
+# More context
+answer = kb.ask("How does auth work?", top_k=10)
+# Custom prompt
+prompt = "Answer concisely with code examples when relevant."
+answer = kb.ask("Show me usage examples", system_prompt=prompt)
+```
+##### `__call__(query: str, top_k: int = 5) -> Answer`
+Callable shorthand for `ask()`.
+**Parameters:**
+- `query` - Question to ask
+- `top_k` - Number of relevant chunks to retrieve
+**Returns:** `Answer` object
+**Examples:**
+```python
+# These are equivalent:
+answer = kb.ask("What is this?")
+answer = kb("What is this?")
+```
+##### `filter(**kwargs) -> Ragi`
+Filter documents by metadata for the next query.
+**Parameters:**
+- `**kwargs` - Metadata key-value pairs to filter by
+**Returns:** Self for chaining
+**Examples:**
+```python
+# Filter by file type
+answer = kb.filter(file_type="pdf").ask("What's in the PDFs?")
+# Filter by custom metadata
+answer = kb.filter(category="api", version="v2").ask("How does it work?")
+# Multiple filters
+answer = kb.filter(author="Alice", topic="security").ask("Security guidelines?")
+```
+##### `count() -> int`
+Return the number of chunks in the knowledge base.
+**Returns:** Number of chunks
+**Examples:**
+```python
+print(f"Knowledge base contains {kb.count()} chunks")
+```
+##### `refresh(sources: Union[str, List[str]]) -> Ragi`
+Refresh specific sources by deleting old chunks and re-adding. Useful when documents have been updated.
+**Parameters:**
+- `sources` - File paths, URLs, or glob patterns to refresh
+**Returns:** Self for chaining
+**Examples:**
+```python
+# Refresh a single file
+kb.refresh("./docs/api.md")
+# Refresh multiple files
+kb.refresh(["./docs/*.pdf", "./README.md"])
+# Refresh after editing
+with open("./docs/guide.md", "w") as f:
+    f.write("Updated content...")
+kb.refresh("./docs/guide.md")
+```
+##### `clear() -> None`
+Clear all data from the knowledge base.
+**Examples:**
+```python
+kb.clear()
+print(kb.count())  # 0
+```
+## Data Types
+### `Answer`
+Result from a query with answer text and citations.
+**Attributes:**
+- `text: str` - The generated answer
+- `citations: List[Citation]` - Source citations
+- `query: str` - Original query
+**Methods:**
+- `__str__()` - Returns answer text
+- `__repr__()` - Returns detailed representation
+**Examples:**
+```python
+answer = kb.ask("What is RAG?")
+print(answer.text)              # The answer
+print(answer.query)             # "What is RAG?"
+print(len(answer.citations))    # Number of citations
+# String representation
+print(answer)                   # Same as answer.text
+print(repr(answer))             # Answer(text='...', citations=3)
+```
+### `Citation`
+A single source citation with relevance score.
+**Attributes:**
+- `source: str` - Source file path or URL
+- `chunk: str` - The actual text chunk
+- `score: float` - Relevance score (0-1, higher is better)
+- `metadata: Dict[str, Any]` - Additional metadata
+**Properties:**
+- `preview: str` - Preview of chunk (first 100 chars)
+**Examples:**
+```python
+for citation in answer.citations:
+    print(f"Source: {citation.source}")
+    print(f"Score: {citation.score:.2%}")
+    print(f"Preview: {citation.preview}")
+    print(f"Metadata: {citation.metadata}")
+```
+## Supported File Formats
+Ragi uses [markitdown](https://github.com/microsoft/markitdown) for document conversion and supports:
+### Documents
+- PDF (`.pdf`)
+- Microsoft Word (`.docx`, `.doc`)
+- Microsoft PowerPoint (`.pptx`, `.ppt`)
+- Microsoft Excel (`.xlsx`, `.xls`)
+### Text
+- Markdown (`.md`)
+- Plain text (`.txt`)
+- Source code (`.py`, `.js`, `.java`, `.cpp`, etc.)
+- HTML (`.html`)
+### Data
+- JSON (`.json`)
+- XML (`.xml`)
+- CSV (`.csv`)
+### Media
+- Images (`.png`, `.jpg`, `.jpeg`, `.gif`) - with OCR
+- Audio (`.mp3`, `.wav`) - with transcription
+### Web
+- URLs (converted to markdown)
+### Archives
+- ZIP files (`.zip`)
+### E-books
+- EPub (`.epub`)
+## Metadata Fields
+### Automatic Metadata
+Automatically extracted for all documents:
+- `filename` - File name
+- `file_type` - File extension without dot
+- `file_path` - Absolute file path
+For URLs:
+- `url` - The URL
+- `source_type` - Always "url"
+### Custom Metadata
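+Adding custom metadata when loading is a planned feature; the documented `add()` signature currently accepts only `sources`. The intended usage is: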
+```python
+# This is a planned feature
+kb.add("./docs/api.pdf", metadata={"category": "api", "version": "v2"})
+```
+Filter by custom metadata:
+```python
+answer = kb.filter(category="api").ask("How does it work?")
+```
+## Error Handling
+### Common Exceptions
+```python
+# Invalid source
+try:
+    kb = Ragi("/nonexistent/path")
+except ValueError as e:
+    print(f"Error: {e}")
+# Missing API key
+try:
+    kb = Ragi("./docs")
+except RuntimeError as e:
+    print(f"Error: {e}")
+# Embedding generation failed
+try:
+    answer = kb.ask("question")
+except RuntimeError as e:
+    print(f"Error: {e}")
+```
+## Environment Variables
+- `LLM_BASE_URL` - LLM API base URL (default: `http://localhost:11434/v1`)
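+- `LLM_API_KEY` - LLM API key (default: `not-needed`)
+- `EMBEDDING_API_KEY` - API key for remote embeddings (optional; used when `embedding.api_key` is not set in `config`)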
+For Ollama (default, free local models):
+```bash
+# No environment variables needed!
+# Just make sure Ollama is running: ollama serve
+```
+For OpenAI or other providers:
+```bash
+export LLM_BASE_URL="https://api.openai.com/v1"
+export LLM_API_KEY="sk-..."
+```
+Or in `.env` file:
+```
+LLM_BASE_URL=https://api.openai.com/v1
+LLM_API_KEY=sk-...
+```
+## Best Practices
+### Chunking
+- Use smaller chunks (256-512) for precise retrieval
+- Use larger chunks (1024+) when more context is needed
+- Increase overlap (100-200) for better continuity
+### Embeddings
+- Use `sentence-transformers/all-MiniLM-L6-v2` for free, fast embeddings (recommended for getting started)
+- Use `nvidia/llama-embed-nemotron-8b` for higher quality (requires HuggingFace auth)
+- Use any sentence-transformers model from HuggingFace
+### LLM Selection
+- Use `llama3.2` via Ollama for free local inference (default)
+- Use `mistral` via Ollama for fast responses
+- Use OpenAI-compatible APIs for cloud-based models (configure via `config` dict)
+### Performance
+- Persist data to disk to avoid re-processing:
+  ```python
+  kb = Ragi("./docs", persist_dir=".kb")
+  ```
+- Batch document additions:
+  ```python
+  kb.add(["doc1.pdf", "doc2.pdf", "doc3.pdf"])
+  ```
+- Use appropriate `top_k` values (5-10 for most cases)
+### Filtering
+- Use metadata filters to narrow search space
+- Combine filters for precise targeting:
+  ```python
+  kb.filter(type="api", version="v2").ask("...")
+  ```
+## Type Hints
+Ragi is fully typed. Example:
+```python
+from typing import List
+from ragi import Ragi, Answer, Citation
+kb: Ragi = Ragi("./docs")
+answer: Answer = kb.ask("What is this?")
+citations: List[Citation] = answer.citations
+```

piragi-0.1.0/CHANGELOG.md ADDED Viewed

@@ -0,0 +1,35 @@
+# Changelog
+All notable changes to this project will be documented in this file.
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+## [0.1.0] - 2025-01-10
+### Added
+- Initial release of Ragi
+- Zero-config RAG with built-in vector store (LanceDB)
+- Universal document support (PDF, Word, Excel, Markdown, Code, URLs, Images, Audio)
+- Auto-chunking with markdown-aware splitting
+- Local embeddings via sentence-transformers (nvidia/llama-embed-nemotron-8b)
+- Remote embeddings via OpenAI-compatible APIs
+- Local LLM via Ollama (llama3.2)
+- OpenAI-compatible LLM support
+- Smart citations with relevance scores
+- Metadata filtering
+- Auto-updates with background workers
+- Change detection for files (mtime + hash) and URLs (HTTP HEAD)
+- Concurrent query support
+- Single unified config dict
+- Examples: quickstart, ollama, code_qa, multi_format, embedding_options, update_documents
+- Comprehensive API documentation
+### Features
+- **Simple Setup** - Works with free local models out of the box
+- **All Formats** - PDF, Word, Excel, Markdown, Code, URLs, Images, Audio
+- **Auto-Updates** - Background refresh, queries never blocked
+- **Smart Citations** - Every answer includes ranked source citations
+- **OpenAI Compatible** - Drop-in support for any OpenAI-compatible API
+[0.1.0]: https://github.com/hemanth/ragi/releases/tag/v0.1.0

piragi-0.1.0/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2024 Ragi Contributors
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

piragi-0.1.0/Makefile ADDED Viewed

@@ -0,0 +1,50 @@
+# Development task runner for the Ragi package.
+# Every target below names a command, not a file it produces, so each one is
+# declared phony: a file or directory with the same name can then never make
+# the target look up to date and cause its recipe to be skipped.
+# `test-cov` is included here as well; it has a recipe and is listed in the
+# help text just like the other targets.
+.PHONY: install test test-cov format lint type-check clean build publish help
+# Print a summary of the available targets. This is the first rule in the
+# file, so a bare `make` runs it.
+help:
+	@echo "Ragi Development Commands"
+	@echo "========================="
+	@echo "install      - Install package in development mode"
+	@echo "test         - Run tests"
+	@echo "test-cov     - Run tests with coverage"
+	@echo "format       - Format code with black"
+	@echo "lint         - Lint code with ruff"
+	@echo "type-check   - Check types with mypy"
+	@echo "clean        - Remove build artifacts"
+	@echo "build        - Build package"
+	@echo "publish      - Publish to PyPI"
+# Editable install of the project together with its `dev` extra.
+install:
+	pip install -e ".[dev]"
+# Run the test suite with pytest's default discovery and options.
+test:
+	pytest
+# Run the test suite while measuring coverage of the `ragi` package; reports
+# missing lines on the terminal and also writes an HTML report.
+test-cov:
+	pytest --cov=ragi --cov-report=term-missing --cov-report=html
+# Reformat sources, tests and examples in place with black.
+format:
+	black src/ tests/ examples/
+# Lint sources, tests and examples with ruff (check only).
+lint:
+	ruff check src/ tests/ examples/
+# Static type check of the package sources only.
+type-check:
+	mypy src/
+# Remove build output, tool caches and Python bytecode.
+clean:
+	rm -rf build/
+	rm -rf dist/
+	rm -rf *.egg-info
+	rm -rf .pytest_cache
+	rm -rf .coverage
+	rm -rf htmlcov/
+	rm -rf .mypy_cache
+	rm -rf .ruff_cache
+	find . -type d -name __pycache__ -exec rm -rf {} +
+	find . -type f -name "*.pyc" -delete
+# Build the distribution archives. Depends on `clean` so that dist/ only ever
+# contains artifacts from this build.
+build: clean
+	python -m build
+# Upload everything in dist/ with twine. Depends on `build` (and therefore on
+# `clean`), so only freshly built artifacts are uploaded.
+publish: build
+	python -m twine upload dist/*