ragwire 1.0.0__tar.gz
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- ragwire-1.0.0/LICENSE +21 -0
- ragwire-1.0.0/PKG-INFO +233 -0
- ragwire-1.0.0/README.md +169 -0
- ragwire-1.0.0/assets/ragwire.png +0 -0
- ragwire-1.0.0/pyproject.toml +150 -0
- ragwire-1.0.0/ragwire/__init__.py +73 -0
- ragwire-1.0.0/ragwire/core/__init__.py +6 -0
- ragwire-1.0.0/ragwire/core/config.py +113 -0
- ragwire-1.0.0/ragwire/core/pipeline.py +403 -0
- ragwire-1.0.0/ragwire/embeddings/__init__.py +5 -0
- ragwire-1.0.0/ragwire/embeddings/factory.py +159 -0
- ragwire-1.0.0/ragwire/loaders/__init__.py +5 -0
- ragwire-1.0.0/ragwire/loaders/markitdown_loader.py +169 -0
- ragwire-1.0.0/ragwire/metadata/__init__.py +6 -0
- ragwire-1.0.0/ragwire/metadata/extractor.py +171 -0
- ragwire-1.0.0/ragwire/metadata/schema.py +80 -0
- ragwire-1.0.0/ragwire/processing/__init__.py +14 -0
- ragwire-1.0.0/ragwire/processing/hashing.py +116 -0
- ragwire-1.0.0/ragwire/processing/splitter.py +137 -0
- ragwire-1.0.0/ragwire/py.typed +0 -0
- ragwire-1.0.0/ragwire/retriever/__init__.py +5 -0
- ragwire-1.0.0/ragwire/retriever/hybrid.py +172 -0
- ragwire-1.0.0/ragwire/utils/__init__.py +5 -0
- ragwire-1.0.0/ragwire/utils/logging.py +146 -0
- ragwire-1.0.0/ragwire/vectorstores/__init__.py +5 -0
- ragwire-1.0.0/ragwire/vectorstores/qdrant_store.py +261 -0
- ragwire-1.0.0/ragwire.egg-info/PKG-INFO +233 -0
- ragwire-1.0.0/ragwire.egg-info/SOURCES.txt +31 -0
- ragwire-1.0.0/ragwire.egg-info/dependency_links.txt +1 -0
- ragwire-1.0.0/ragwire.egg-info/requires.txt +43 -0
- ragwire-1.0.0/ragwire.egg-info/top_level.txt +1 -0
- ragwire-1.0.0/setup.cfg +4 -0
- ragwire-1.0.0/tests/test_imports.py +71 -0
ragwire-1.0.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 KGP Talkie Private Limited
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
ragwire-1.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ragwire
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: RAGWire — Production-grade RAG toolkit for document ingestion and retrieval with hybrid search support
|
|
5
|
+
Author-email: KGP Talkie Private Limited <udemy@kgptalkie.com>
|
|
6
|
+
Maintainer-email: KGP Talkie Private Limited <udemy@kgptalkie.com>
|
|
7
|
+
License-Expression: MIT
|
|
8
|
+
Project-URL: Homepage, https://kgptalkie.com
|
|
9
|
+
Project-URL: Documentation, https://github.com/laxmimerit/ragwire#readme
|
|
10
|
+
Project-URL: Repository, https://github.com/laxmimerit/ragwire.git
|
|
11
|
+
Project-URL: Issues, https://github.com/laxmimerit/ragwire/issues
|
|
12
|
+
Project-URL: YouTube, https://youtube.com/kgptalkie
|
|
13
|
+
Keywords: rag,retrieval,vector-database,qdrant,embeddings,hybrid-search,nlp,document-processing
|
|
14
|
+
Classifier: Development Status :: 4 - Beta
|
|
15
|
+
Classifier: Intended Audience :: Developers
|
|
16
|
+
Classifier: Intended Audience :: Science/Research
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
22
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
23
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
24
|
+
Requires-Python: >=3.9
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
License-File: LICENSE
|
|
27
|
+
Requires-Dist: langchain>=0.1.0
|
|
28
|
+
Requires-Dist: langchain-core>=0.1.0
|
|
29
|
+
Requires-Dist: langchain-community>=0.0.0
|
|
30
|
+
Requires-Dist: langchain-text-splitters>=0.0.1
|
|
31
|
+
Requires-Dist: qdrant-client>=1.6.0
|
|
32
|
+
Requires-Dist: langchain-qdrant>=0.1.0
|
|
33
|
+
Requires-Dist: markitdown[pdf]>=0.0.1
|
|
34
|
+
Requires-Dist: pyyaml>=6.0
|
|
35
|
+
Requires-Dist: python-dotenv>=1.0.0
|
|
36
|
+
Requires-Dist: pydantic>=2.0.0
|
|
37
|
+
Requires-Dist: tqdm>=4.66.0
|
|
38
|
+
Provides-Extra: openai
|
|
39
|
+
Requires-Dist: langchain-openai>=0.0.0; extra == "openai"
|
|
40
|
+
Provides-Extra: huggingface
|
|
41
|
+
Requires-Dist: langchain-huggingface>=0.0.0; extra == "huggingface"
|
|
42
|
+
Provides-Extra: ollama
|
|
43
|
+
Requires-Dist: langchain-ollama>=0.0.0; extra == "ollama"
|
|
44
|
+
Provides-Extra: google
|
|
45
|
+
Requires-Dist: langchain-google-genai>=0.0.0; extra == "google"
|
|
46
|
+
Provides-Extra: fastembed
|
|
47
|
+
Requires-Dist: fastembed>=0.2.0; extra == "fastembed"
|
|
48
|
+
Provides-Extra: dev
|
|
49
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
50
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
51
|
+
Requires-Dist: black>=23.0.0; extra == "dev"
|
|
52
|
+
Requires-Dist: isort>=5.0.0; extra == "dev"
|
|
53
|
+
Requires-Dist: flake8>=6.0.0; extra == "dev"
|
|
54
|
+
Requires-Dist: mypy>=1.0.0; extra == "dev"
|
|
55
|
+
Requires-Dist: pre-commit>=3.0.0; extra == "dev"
|
|
56
|
+
Provides-Extra: all
|
|
57
|
+
Requires-Dist: ragwire[openai]; extra == "all"
|
|
58
|
+
Requires-Dist: ragwire[huggingface]; extra == "all"
|
|
59
|
+
Requires-Dist: ragwire[ollama]; extra == "all"
|
|
60
|
+
Requires-Dist: ragwire[google]; extra == "all"
|
|
61
|
+
Requires-Dist: ragwire[fastembed]; extra == "all"
|
|
62
|
+
Requires-Dist: ragwire[dev]; extra == "all"
|
|
63
|
+
Dynamic: license-file
|
|
64
|
+
|
|
65
|
+
<p align="center">
|
|
66
|
+
<img src="assets/ragwire.png" alt="RAGWire logo" width="120"/>
|
|
67
|
+
</p>
|
|
68
|
+
|
|
69
|
+
<h1 align="center">RAGWire</h1>
|
|
70
|
+
<p align="center">Production-grade RAG toolkit for document ingestion and retrieval</p>
|
|
71
|
+
|
|
72
|
+
<p align="center">
|
|
73
|
+
<a href="https://pypi.org/project/ragwire"><img src="https://img.shields.io/pypi/v/ragwire" alt="PyPI"/></a>
|
|
74
|
+
<a href="https://github.com/laxmimerit/ragwire/blob/main/LICENSE"><img src="https://img.shields.io/badge/license-MIT-green" alt="License"/></a>
|
|
75
|
+
<a href="https://youtube.com/kgptalkie"><img src="https://img.shields.io/badge/YouTube-KGP%20Talkie-red" alt="YouTube"/></a>
|
|
76
|
+
</p>
|
|
77
|
+
|
|
78
|
+
---
|
|
79
|
+
|
|
80
|
+
## Features
|
|
81
|
+
|
|
82
|
+
- **Document Loading** — PDF, DOCX, XLSX, PPTX and more via MarkItDown
|
|
83
|
+
- **LLM Metadata Extraction** — extracts company, doc type, fiscal period using your LLM
|
|
84
|
+
- **Smart Text Splitting** — markdown-aware and recursive chunking strategies
|
|
85
|
+
- **Multiple Embedding Providers** — Ollama, OpenAI, HuggingFace, Google, FastEmbed
|
|
86
|
+
- **Qdrant Vector Store** — dense, sparse, and hybrid search
|
|
87
|
+
- **Advanced Retrieval** — similarity, MMR, and hybrid search
|
|
88
|
+
- **SHA256 Deduplication** — at both file and chunk level
|
|
89
|
+
|
|
90
|
+
## Installation
|
|
91
|
+
|
|
92
|
+
```bash
|
|
93
|
+
pip install ragwire
|
|
94
|
+
|
|
95
|
+
# With Ollama support (local, no API key)
|
|
96
|
+
pip install "ragwire[ollama]"
|
|
97
|
+
|
|
98
|
+
# With all providers (also installs the dev tools)
|
|
99
|
+
pip install "ragwire[all]"
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
## Quick Start
|
|
103
|
+
|
|
104
|
+
```python
|
|
105
|
+
from ragwire import RAGPipeline
|
|
106
|
+
|
|
107
|
+
pipeline = RAGPipeline("config.yaml")
|
|
108
|
+
|
|
109
|
+
# Ingest documents
|
|
110
|
+
stats = pipeline.ingest_documents(["data/Apple_10k_2025.pdf"])
|
|
111
|
+
print(f"Chunks created: {stats['chunks_created']}")
|
|
112
|
+
|
|
113
|
+
# Retrieve
|
|
114
|
+
results = pipeline.retrieve("What is Apple's total revenue?", top_k=5)
|
|
115
|
+
for doc in results:
|
|
116
|
+
    print(doc.metadata.get("company_name"), doc.page_content[:200])
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
## Configuration
|
|
120
|
+
|
|
121
|
+
Copy `config.example.yaml` to `config.yaml` and edit:
|
|
122
|
+
|
|
123
|
+
```yaml
|
|
124
|
+
embeddings:
|
|
125
|
+
provider: "ollama"
|
|
126
|
+
model: "qwen3-embedding:0.6b"
|
|
127
|
+
base_url: "http://localhost:11434"
|
|
128
|
+
|
|
129
|
+
llm:
|
|
130
|
+
provider: "ollama"
|
|
131
|
+
model: "qwen3.5:9b"
|
|
132
|
+
temperature: 0.0
|
|
133
|
+
num_ctx: 16384
|
|
134
|
+
|
|
135
|
+
vectorstore:
|
|
136
|
+
url: "http://localhost:6333"
|
|
137
|
+
collection_name: "my_docs"
|
|
138
|
+
use_sparse: true
|
|
139
|
+
|
|
140
|
+
retriever:
|
|
141
|
+
search_type: "hybrid"
|
|
142
|
+
top_k: 5
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
## Embedding Providers
|
|
146
|
+
|
|
147
|
+
```yaml
|
|
148
|
+
# Ollama (local)
|
|
149
|
+
embeddings:
|
|
150
|
+
provider: "ollama"
|
|
151
|
+
model: "qwen3-embedding:0.6b"
|
|
152
|
+
|
|
153
|
+
# OpenAI
|
|
154
|
+
embeddings:
|
|
155
|
+
provider: "openai"
|
|
156
|
+
model: "text-embedding-3-small"
|
|
157
|
+
|
|
158
|
+
# HuggingFace (local)
|
|
159
|
+
embeddings:
|
|
160
|
+
provider: "huggingface"
|
|
161
|
+
model_name: "sentence-transformers/all-MiniLM-L6-v2"
|
|
162
|
+
|
|
163
|
+
# Google
|
|
164
|
+
embeddings:
|
|
165
|
+
provider: "google"
|
|
166
|
+
model: "models/embedding-001"
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
## Component Usage
|
|
170
|
+
|
|
171
|
+
```python
|
|
172
|
+
from ragwire import (
|
|
173
|
+
MarkItDownLoader,
|
|
174
|
+
get_splitter,
|
|
175
|
+
get_markdown_splitter,
|
|
176
|
+
get_embedding,
|
|
177
|
+
QdrantStore,
|
|
178
|
+
MetadataExtractor,
|
|
179
|
+
hybrid_search,
|
|
180
|
+
mmr_search,
|
|
181
|
+
)
|
|
182
|
+
|
|
183
|
+
# Load a document
|
|
184
|
+
loader = MarkItDownLoader()
|
|
185
|
+
result = loader.load("document.pdf")
|
|
186
|
+
|
|
187
|
+
# Split text
|
|
188
|
+
splitter = get_markdown_splitter(chunk_size=10000, chunk_overlap=2000)
|
|
189
|
+
chunks = splitter.split_text(result["text_content"])
|
|
190
|
+
|
|
191
|
+
# Embeddings
|
|
192
|
+
embedding = get_embedding({"provider": "ollama", "model": "qwen3-embedding:0.6b"})
|
|
193
|
+
|
|
194
|
+
# Vector store
|
|
195
|
+
store = QdrantStore(config={"url": "http://localhost:6333"}, embedding=embedding)
|
|
196
|
+
store.set_collection("my_collection")
|
|
197
|
+
vectorstore = store.get_store()
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
## Architecture
|
|
201
|
+
|
|
202
|
+
```
|
|
203
|
+
ragwire/
|
|
204
|
+
├── core/ # Config loader + RAGPipeline orchestrator
|
|
205
|
+
├── loaders/ # MarkItDown document converter
|
|
206
|
+
├── processing/ # Text splitters + SHA256 hashing
|
|
207
|
+
├── metadata/ # Pydantic schema + LLM extractor
|
|
208
|
+
├── embeddings/ # Multi-provider embedding factory
|
|
209
|
+
├── vectorstores/ # Qdrant wrapper with hybrid search
|
|
210
|
+
├── retriever/ # Similarity, MMR, hybrid retrieval
|
|
211
|
+
└── utils/ # Logging
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
## Troubleshooting
|
|
215
|
+
|
|
216
|
+
| Error | Fix |
|
|
217
|
+
|-------|-----|
|
|
218
|
+
| Qdrant connection refused | `docker run -p 6333:6333 qdrant/qdrant` |
|
|
219
|
+
| `markitdown[pdf]` missing | `pip install "markitdown[pdf]"` |
|
|
220
|
+
| Ollama model not found | `ollama pull <model-name>` |
|
|
221
|
+
| `fastembed` missing | `pip install "ragwire[fastembed]"` (needed for hybrid search) |
|
|
222
|
+
| Embedding dimension mismatch | Set `force_recreate: true` in config once, then back to `false` |
|
|
223
|
+
|
|
224
|
+
## License
|
|
225
|
+
|
|
226
|
+
MIT © 2026 [KGP Talkie Private Limited](https://kgptalkie.com)
|
|
227
|
+
|
|
228
|
+
## Links
|
|
229
|
+
|
|
230
|
+
- Website: [kgptalkie.com](https://kgptalkie.com)
|
|
231
|
+
- YouTube: [youtube.com/kgptalkie](https://youtube.com/kgptalkie)
|
|
232
|
+
- GitHub: [github.com/laxmimerit/ragwire](https://github.com/laxmimerit/ragwire)
|
|
233
|
+
- Email: udemy@kgptalkie.com
|
ragwire-1.0.0/README.md
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
<p align="center">
|
|
2
|
+
<img src="assets/ragwire.png" alt="RAGWire logo" width="120"/>
|
|
3
|
+
</p>
|
|
4
|
+
|
|
5
|
+
<h1 align="center">RAGWire</h1>
|
|
6
|
+
<p align="center">Production-grade RAG toolkit for document ingestion and retrieval</p>
|
|
7
|
+
|
|
8
|
+
<p align="center">
|
|
9
|
+
<a href="https://pypi.org/project/ragwire"><img src="https://img.shields.io/pypi/v/ragwire" alt="PyPI"/></a>
|
|
10
|
+
<a href="https://github.com/laxmimerit/ragwire/blob/main/LICENSE"><img src="https://img.shields.io/badge/license-MIT-green" alt="License"/></a>
|
|
11
|
+
<a href="https://youtube.com/kgptalkie"><img src="https://img.shields.io/badge/YouTube-KGP%20Talkie-red" alt="YouTube"/></a>
|
|
12
|
+
</p>
|
|
13
|
+
|
|
14
|
+
---
|
|
15
|
+
|
|
16
|
+
## Features
|
|
17
|
+
|
|
18
|
+
- **Document Loading** — PDF, DOCX, XLSX, PPTX and more via MarkItDown
|
|
19
|
+
- **LLM Metadata Extraction** — extracts company, doc type, fiscal period using your LLM
|
|
20
|
+
- **Smart Text Splitting** — markdown-aware and recursive chunking strategies
|
|
21
|
+
- **Multiple Embedding Providers** — Ollama, OpenAI, HuggingFace, Google, FastEmbed
|
|
22
|
+
- **Qdrant Vector Store** — dense, sparse, and hybrid search
|
|
23
|
+
- **Advanced Retrieval** — similarity, MMR, and hybrid search
|
|
24
|
+
- **SHA256 Deduplication** — at both file and chunk level
|
|
25
|
+
|
|
26
|
+
## Installation
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
pip install ragwire
|
|
30
|
+
|
|
31
|
+
# With Ollama support (local, no API key)
|
|
32
|
+
pip install "ragwire[ollama]"
|
|
33
|
+
|
|
34
|
+
# With all providers (also installs the dev tools)
|
|
35
|
+
pip install "ragwire[all]"
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Quick Start
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
from ragwire import RAGPipeline
|
|
42
|
+
|
|
43
|
+
pipeline = RAGPipeline("config.yaml")
|
|
44
|
+
|
|
45
|
+
# Ingest documents
|
|
46
|
+
stats = pipeline.ingest_documents(["data/Apple_10k_2025.pdf"])
|
|
47
|
+
print(f"Chunks created: {stats['chunks_created']}")
|
|
48
|
+
|
|
49
|
+
# Retrieve
|
|
50
|
+
results = pipeline.retrieve("What is Apple's total revenue?", top_k=5)
|
|
51
|
+
for doc in results:
|
|
52
|
+
    print(doc.metadata.get("company_name"), doc.page_content[:200])
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
## Configuration
|
|
56
|
+
|
|
57
|
+
Copy `config.example.yaml` to `config.yaml` and edit:
|
|
58
|
+
|
|
59
|
+
```yaml
|
|
60
|
+
embeddings:
|
|
61
|
+
provider: "ollama"
|
|
62
|
+
model: "qwen3-embedding:0.6b"
|
|
63
|
+
base_url: "http://localhost:11434"
|
|
64
|
+
|
|
65
|
+
llm:
|
|
66
|
+
provider: "ollama"
|
|
67
|
+
model: "qwen3.5:9b"
|
|
68
|
+
temperature: 0.0
|
|
69
|
+
num_ctx: 16384
|
|
70
|
+
|
|
71
|
+
vectorstore:
|
|
72
|
+
url: "http://localhost:6333"
|
|
73
|
+
collection_name: "my_docs"
|
|
74
|
+
use_sparse: true
|
|
75
|
+
|
|
76
|
+
retriever:
|
|
77
|
+
search_type: "hybrid"
|
|
78
|
+
top_k: 5
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
## Embedding Providers
|
|
82
|
+
|
|
83
|
+
```yaml
|
|
84
|
+
# Ollama (local)
|
|
85
|
+
embeddings:
|
|
86
|
+
provider: "ollama"
|
|
87
|
+
model: "qwen3-embedding:0.6b"
|
|
88
|
+
|
|
89
|
+
# OpenAI
|
|
90
|
+
embeddings:
|
|
91
|
+
provider: "openai"
|
|
92
|
+
model: "text-embedding-3-small"
|
|
93
|
+
|
|
94
|
+
# HuggingFace (local)
|
|
95
|
+
embeddings:
|
|
96
|
+
provider: "huggingface"
|
|
97
|
+
model_name: "sentence-transformers/all-MiniLM-L6-v2"
|
|
98
|
+
|
|
99
|
+
# Google
|
|
100
|
+
embeddings:
|
|
101
|
+
provider: "google"
|
|
102
|
+
model: "models/embedding-001"
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
## Component Usage
|
|
106
|
+
|
|
107
|
+
```python
|
|
108
|
+
from ragwire import (
|
|
109
|
+
MarkItDownLoader,
|
|
110
|
+
get_splitter,
|
|
111
|
+
get_markdown_splitter,
|
|
112
|
+
get_embedding,
|
|
113
|
+
QdrantStore,
|
|
114
|
+
MetadataExtractor,
|
|
115
|
+
hybrid_search,
|
|
116
|
+
mmr_search,
|
|
117
|
+
)
|
|
118
|
+
|
|
119
|
+
# Load a document
|
|
120
|
+
loader = MarkItDownLoader()
|
|
121
|
+
result = loader.load("document.pdf")
|
|
122
|
+
|
|
123
|
+
# Split text
|
|
124
|
+
splitter = get_markdown_splitter(chunk_size=10000, chunk_overlap=2000)
|
|
125
|
+
chunks = splitter.split_text(result["text_content"])
|
|
126
|
+
|
|
127
|
+
# Embeddings
|
|
128
|
+
embedding = get_embedding({"provider": "ollama", "model": "qwen3-embedding:0.6b"})
|
|
129
|
+
|
|
130
|
+
# Vector store
|
|
131
|
+
store = QdrantStore(config={"url": "http://localhost:6333"}, embedding=embedding)
|
|
132
|
+
store.set_collection("my_collection")
|
|
133
|
+
vectorstore = store.get_store()
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
## Architecture
|
|
137
|
+
|
|
138
|
+
```
|
|
139
|
+
ragwire/
|
|
140
|
+
├── core/ # Config loader + RAGPipeline orchestrator
|
|
141
|
+
├── loaders/ # MarkItDown document converter
|
|
142
|
+
├── processing/ # Text splitters + SHA256 hashing
|
|
143
|
+
├── metadata/ # Pydantic schema + LLM extractor
|
|
144
|
+
├── embeddings/ # Multi-provider embedding factory
|
|
145
|
+
├── vectorstores/ # Qdrant wrapper with hybrid search
|
|
146
|
+
├── retriever/ # Similarity, MMR, hybrid retrieval
|
|
147
|
+
└── utils/ # Logging
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
## Troubleshooting
|
|
151
|
+
|
|
152
|
+
| Error | Fix |
|
|
153
|
+
|-------|-----|
|
|
154
|
+
| Qdrant connection refused | `docker run -p 6333:6333 qdrant/qdrant` |
|
|
155
|
+
| `markitdown[pdf]` missing | `pip install "markitdown[pdf]"` |
|
|
156
|
+
| Ollama model not found | `ollama pull <model-name>` |
|
|
157
|
+
| `fastembed` missing | `pip install "ragwire[fastembed]"` (needed for hybrid search) |
|
|
158
|
+
| Embedding dimension mismatch | Set `force_recreate: true` in config once, then back to `false` |
|
|
159
|
+
|
|
160
|
+
## License
|
|
161
|
+
|
|
162
|
+
MIT © 2026 [KGP Talkie Private Limited](https://kgptalkie.com)
|
|
163
|
+
|
|
164
|
+
## Links
|
|
165
|
+
|
|
166
|
+
- Website: [kgptalkie.com](https://kgptalkie.com)
|
|
167
|
+
- YouTube: [youtube.com/kgptalkie](https://youtube.com/kgptalkie)
|
|
168
|
+
- GitHub: [github.com/laxmimerit/ragwire](https://github.com/laxmimerit/ragwire)
|
|
169
|
+
- Email: udemy@kgptalkie.com
|
|
Binary file
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68.0.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "ragwire"
|
|
7
|
+
version = "1.0.0"
|
|
8
|
+
description = "RAGWire — Production-grade RAG toolkit for document ingestion and retrieval with hybrid search support"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = "MIT"
|
|
11
|
+
authors = [
|
|
12
|
+
{name = "KGP Talkie Private Limited", email = "udemy@kgptalkie.com"}
|
|
13
|
+
]
|
|
14
|
+
maintainers = [
|
|
15
|
+
{name = "KGP Talkie Private Limited", email = "udemy@kgptalkie.com"}
|
|
16
|
+
]
|
|
17
|
+
keywords = [
|
|
18
|
+
"rag",
|
|
19
|
+
"retrieval",
|
|
20
|
+
"vector-database",
|
|
21
|
+
"qdrant",
|
|
22
|
+
"embeddings",
|
|
23
|
+
"hybrid-search",
|
|
24
|
+
"nlp",
|
|
25
|
+
"document-processing"
|
|
26
|
+
]
|
|
27
|
+
classifiers = [
|
|
28
|
+
"Development Status :: 4 - Beta",
|
|
29
|
+
"Intended Audience :: Developers",
|
|
30
|
+
"Intended Audience :: Science/Research",
|
|
31
|
+
"Programming Language :: Python :: 3",
|
|
32
|
+
"Programming Language :: Python :: 3.9",
|
|
33
|
+
"Programming Language :: Python :: 3.10",
|
|
34
|
+
"Programming Language :: Python :: 3.11",
|
|
35
|
+
"Programming Language :: Python :: 3.12",
|
|
36
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
37
|
+
"Topic :: Text Processing :: Linguistic",
|
|
38
|
+
]
|
|
39
|
+
requires-python = ">=3.9"
|
|
40
|
+
dependencies = [
|
|
41
|
+
# Core dependencies
|
|
42
|
+
"langchain>=0.1.0",
|
|
43
|
+
"langchain-core>=0.1.0",
|
|
44
|
+
"langchain-community>=0.0.0",
|
|
45
|
+
"langchain-text-splitters>=0.0.1",
|
|
46
|
+
|
|
47
|
+
# Vector database
|
|
48
|
+
"qdrant-client>=1.6.0",
|
|
49
|
+
"langchain-qdrant>=0.1.0",
|
|
50
|
+
|
|
51
|
+
# Document processing
|
|
52
|
+
"markitdown[pdf]>=0.0.1",
|
|
53
|
+
|
|
54
|
+
# Configuration
|
|
55
|
+
"pyyaml>=6.0",
|
|
56
|
+
"python-dotenv>=1.0.0",
|
|
57
|
+
|
|
58
|
+
# Data validation
|
|
59
|
+
"pydantic>=2.0.0",
|
|
60
|
+
|
|
61
|
+
# Progress bars
|
|
62
|
+
"tqdm>=4.66.0",
|
|
63
|
+
]
|
|
64
|
+
|
|
65
|
+
[project.optional-dependencies]
|
|
66
|
+
# OpenAI embeddings
|
|
67
|
+
openai = [
|
|
68
|
+
"langchain-openai>=0.0.0",
|
|
69
|
+
]
|
|
70
|
+
|
|
71
|
+
# HuggingFace embeddings
|
|
72
|
+
huggingface = [
|
|
73
|
+
"langchain-huggingface>=0.0.0",
|
|
74
|
+
]
|
|
75
|
+
|
|
76
|
+
# Ollama embeddings
|
|
77
|
+
ollama = [
|
|
78
|
+
"langchain-ollama>=0.0.0",
|
|
79
|
+
]
|
|
80
|
+
|
|
81
|
+
# Google/Gemini embeddings and LLM
|
|
82
|
+
google = [
|
|
83
|
+
"langchain-google-genai>=0.0.0",
|
|
84
|
+
]
|
|
85
|
+
|
|
86
|
+
# FastEmbed for sparse vectors
|
|
87
|
+
fastembed = [
|
|
88
|
+
"fastembed>=0.2.0",
|
|
89
|
+
]
|
|
90
|
+
|
|
91
|
+
# Development dependencies
|
|
92
|
+
dev = [
|
|
93
|
+
"pytest>=7.0.0",
|
|
94
|
+
"pytest-cov>=4.0.0",
|
|
95
|
+
"black>=23.0.0",
|
|
96
|
+
"isort>=5.0.0",
|
|
97
|
+
"flake8>=6.0.0",
|
|
98
|
+
"mypy>=1.0.0",
|
|
99
|
+
"pre-commit>=3.0.0",
|
|
100
|
+
]
|
|
101
|
+
|
|
102
|
+
# All dependencies
|
|
103
|
+
all = [
|
|
104
|
+
"ragwire[openai]",
|
|
105
|
+
"ragwire[huggingface]",
|
|
106
|
+
"ragwire[ollama]",
|
|
107
|
+
"ragwire[google]",
|
|
108
|
+
"ragwire[fastembed]",
|
|
109
|
+
"ragwire[dev]",
|
|
110
|
+
]
|
|
111
|
+
|
|
112
|
+
[project.urls]
|
|
113
|
+
Homepage = "https://kgptalkie.com"
|
|
114
|
+
Documentation = "https://github.com/laxmimerit/ragwire#readme"
|
|
115
|
+
Repository = "https://github.com/laxmimerit/ragwire.git"
|
|
116
|
+
Issues = "https://github.com/laxmimerit/ragwire/issues"
|
|
117
|
+
YouTube = "https://youtube.com/kgptalkie"
|
|
118
|
+
|
|
119
|
+
[tool.setuptools.packages.find]
|
|
120
|
+
where = ["."]
|
|
121
|
+
include = ["ragwire*"]
|
|
122
|
+
|
|
123
|
+
[tool.setuptools.package-data]
|
|
124
|
+
ragwire = ["py.typed"]
|
|
125
|
+
|
|
126
|
+
[tool.setuptools.data-files]
|
|
127
|
+
"assets" = ["assets/ragwire.png"]
|
|
128
|
+
|
|
129
|
+
[tool.black]
|
|
130
|
+
line-length = 88
|
|
131
|
+
target-version = ['py39', 'py310', 'py311', 'py312']
|
|
132
|
+
include = '\.pyi?$'
|
|
133
|
+
|
|
134
|
+
[tool.isort]
|
|
135
|
+
profile = "black"
|
|
136
|
+
line_length = 88
|
|
137
|
+
multi_line_output = 3
|
|
138
|
+
|
|
139
|
+
[tool.mypy]
|
|
140
|
+
python_version = "3.9"
|
|
141
|
+
warn_return_any = true
|
|
142
|
+
warn_unused_configs = true
|
|
143
|
+
disallow_untyped_defs = false
|
|
144
|
+
|
|
145
|
+
[tool.pytest.ini_options]
|
|
146
|
+
testpaths = ["tests"]
|
|
147
|
+
python_files = "test_*.py"
|
|
148
|
+
python_classes = "Test*"
|
|
149
|
+
python_functions = "test_*"
|
|
150
|
+
addopts = "-v --cov=ragwire --cov-report=term-missing"
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
"""
|
|
2
|
+
RAGWire — Production-grade RAG toolkit for document ingestion and retrieval.
|
|
3
|
+
|
|
4
|
+
A clean, installable Python toolkit providing:
|
|
5
|
+
- Document loading and conversion (PDF, DOCX, XLSX, etc.)
|
|
6
|
+
- Text splitting and chunking
|
|
7
|
+
- LLM-based metadata extraction
|
|
8
|
+
- Multiple embedding provider support (OpenAI, HuggingFace, Ollama, Google, FastEmbed)
|
|
9
|
+
- Qdrant vector store with hybrid search
|
|
10
|
+
- Advanced retrieval strategies (similarity, MMR, hybrid)
|
|
11
|
+
|
|
12
|
+
Example:
|
|
13
|
+
>>> from ragwire import RAGPipeline
|
|
14
|
+
>>>
|
|
15
|
+
>>> pipeline = RAGPipeline("config.yaml")
|
|
16
|
+
>>> stats = pipeline.ingest_documents(["doc.pdf"])
|
|
17
|
+
>>> results = pipeline.retrieve("What is the revenue?")
|
|
18
|
+
>>> for doc in results:
|
|
19
|
+
...     print(doc.page_content)
|
|
20
|
+
|
|
21
|
+
Author: KGP Talkie Private Limited
|
|
22
|
+
License: MIT
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from importlib.metadata import version, PackageNotFoundError
|
|
26
|
+
|
|
27
|
+
try:
|
|
28
|
+
__version__ = version("ragwire")
|
|
29
|
+
except PackageNotFoundError:
|
|
30
|
+
__version__ = "1.0.0"
|
|
31
|
+
|
|
32
|
+
__author__ = "KGP Talkie Private Limited"
|
|
33
|
+
|
|
34
|
+
from .core.config import Config
|
|
35
|
+
from .core.pipeline import RAGPipeline
|
|
36
|
+
from .metadata.schema import DocumentMetadata
|
|
37
|
+
from .metadata.extractor import MetadataExtractor
|
|
38
|
+
from .loaders.markitdown_loader import MarkItDownLoader
|
|
39
|
+
from .processing.splitter import get_splitter, get_markdown_splitter, get_code_splitter
|
|
40
|
+
from .processing.hashing import sha256_text, sha256_file_from_path, sha256_chunk
|
|
41
|
+
from .embeddings.factory import get_embedding
|
|
42
|
+
from .vectorstores.qdrant_store import QdrantStore
|
|
43
|
+
from .retriever.hybrid import get_retriever, hybrid_search, mmr_search
|
|
44
|
+
from .utils.logging import setup_logging, get_logger
|
|
45
|
+
|
|
46
|
+
__all__ = [
|
|
47
|
+
# Core
|
|
48
|
+
"Config",
|
|
49
|
+
"RAGPipeline",
|
|
50
|
+
# Metadata
|
|
51
|
+
"DocumentMetadata",
|
|
52
|
+
"MetadataExtractor",
|
|
53
|
+
# Loaders
|
|
54
|
+
"MarkItDownLoader",
|
|
55
|
+
# Processing
|
|
56
|
+
"get_splitter",
|
|
57
|
+
"get_markdown_splitter",
|
|
58
|
+
"get_code_splitter",
|
|
59
|
+
"sha256_text",
|
|
60
|
+
"sha256_file_from_path",
|
|
61
|
+
"sha256_chunk",
|
|
62
|
+
# Embeddings
|
|
63
|
+
"get_embedding",
|
|
64
|
+
# Vector Stores
|
|
65
|
+
"QdrantStore",
|
|
66
|
+
# Retrieval
|
|
67
|
+
"get_retriever",
|
|
68
|
+
"hybrid_search",
|
|
69
|
+
"mmr_search",
|
|
70
|
+
# Utilities
|
|
71
|
+
"setup_logging",
|
|
72
|
+
"get_logger",
|
|
73
|
+
]
|