White-Walker 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- white_walker-0.1.0/LICENSE +21 -0
- white_walker-0.1.0/PKG-INFO +174 -0
- white_walker-0.1.0/README.md +140 -0
- white_walker-0.1.0/White_Walker.egg-info/PKG-INFO +174 -0
- white_walker-0.1.0/White_Walker.egg-info/SOURCES.txt +30 -0
- white_walker-0.1.0/White_Walker.egg-info/dependency_links.txt +1 -0
- white_walker-0.1.0/White_Walker.egg-info/requires.txt +8 -0
- white_walker-0.1.0/White_Walker.egg-info/top_level.txt +1 -0
- white_walker-0.1.0/pyproject.toml +53 -0
- white_walker-0.1.0/setup.cfg +4 -0
- white_walker-0.1.0/white_walker/__init__.py +9 -0
- white_walker-0.1.0/white_walker/client.py +162 -0
- white_walker-0.1.0/white_walker/config.yaml +9 -0
- white_walker-0.1.0/white_walker/ingest/__init__.py +2 -0
- white_walker-0.1.0/white_walker/ingest/md_ingestor.py +288 -0
- white_walker-0.1.0/white_walker/ingest/pdf_ingestor.py +1109 -0
- white_walker-0.1.0/white_walker/llm/__init__.py +1 -0
- white_walker-0.1.0/white_walker/llm/adapter.py +186 -0
- white_walker-0.1.0/white_walker/retrieval/__init__.py +2 -0
- white_walker-0.1.0/white_walker/retrieval/agent_retriever.py +85 -0
- white_walker-0.1.0/white_walker/retrieval/prompts.py +54 -0
- white_walker-0.1.0/white_walker/retrieval/retrieve.py +112 -0
- white_walker-0.1.0/white_walker/store/__init__.py +3 -0
- white_walker-0.1.0/white_walker/store/base_store.py +36 -0
- white_walker-0.1.0/white_walker/store/local_store.py +114 -0
- white_walker-0.1.0/white_walker/store/supabase_store.py +102 -0
- white_walker-0.1.0/white_walker/utils.py +669 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 NUHASHROXME
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: White-Walker
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Multimodal Agentic RAG with hierarchical tree indexing — index PDFs & Markdown into navigable tree structures, persist in Supabase, and query with LLM-powered retrieval.
|
|
5
|
+
Author: NUHASHROXME
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/YashNuhash/White-Walker
|
|
8
|
+
Project-URL: Repository, https://github.com/YashNuhash/White-Walker
|
|
9
|
+
Project-URL: Issues, https://github.com/YashNuhash/White-Walker/issues
|
|
10
|
+
Keywords: RAG,multimodal,agentic,LLM,retrieval,tree-indexing,supabase,nvidia,pdf,arxiv
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
21
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
22
|
+
Requires-Python: >=3.9
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
License-File: LICENSE
|
|
25
|
+
Requires-Dist: supabase>=2.0
|
|
26
|
+
Requires-Dist: PyPDF2>=3.0
|
|
27
|
+
Requires-Dist: pdfplumber>=0.9
|
|
28
|
+
Requires-Dist: python-dotenv
|
|
29
|
+
Requires-Dist: httpx
|
|
30
|
+
Requires-Dist: requests
|
|
31
|
+
Requires-Dist: pymupdf
|
|
32
|
+
Requires-Dist: pyyaml
|
|
33
|
+
Dynamic: license-file
|
|
34
|
+
|
|
35
|
+
# 🧊 White Walker
|
|
36
|
+
|
|
37
|
+
[PyPI version](https://pypi.org/project/white-walker/)
|
|
38
|
+
[Python versions](https://pypi.org/project/white-walker/)
|
|
39
|
+
[License: MIT](LICENSE)
|
|
40
|
+
|
|
41
|
+
**Multimodal Agentic RAG** — Index PDFs & Markdown into hierarchical tree structures, persist in Supabase, and query with LLM-powered retrieval.
|
|
42
|
+
|
|
43
|
+
White Walker transforms documents into navigable tree indexes using LLM-based structural analysis, stores them in Supabase (or locally), and answers questions by intelligently traversing the tree to find relevant sections.
|
|
44
|
+
|
|
45
|
+
---
|
|
46
|
+
|
|
47
|
+
## ✨ Features
|
|
48
|
+
|
|
49
|
+
- 🌲 **Hierarchical Tree Indexing** — Automatically detects document structure (TOC, sections, subsections) and builds a navigable tree
|
|
50
|
+
- 🗄️ **Supabase Persistence** — Trees, raw pages, and metadata stored in PostgreSQL via Supabase
|
|
51
|
+
- 🤖 **Agentic Retrieval** — LLM navigates the tree to find relevant sections, then synthesizes answers with citations
|
|
52
|
+
- 📄 **PDF & Markdown Support** — Works with academic papers, technical docs, and any structured document
|
|
53
|
+
- ⚡ **Rate-Limited API** — Built-in token-bucket rate limiter for NVIDIA NIM API (38 RPM, configurable)
|
|
54
|
+
- 🔌 **Pluggable LLM Backend** — Default: NVIDIA NIM API, easily extensible
|
|
55
|
+
|
|
56
|
+
---
|
|
57
|
+
|
|
58
|
+
## 📦 Installation
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
pip install white-walker
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
---
|
|
65
|
+
|
|
66
|
+
## 🚀 Quick Start
|
|
67
|
+
|
|
68
|
+
### 1. Set Environment Variables
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
# .env file
|
|
72
|
+
SUPABASE_URL=https://your-project.supabase.co
|
|
73
|
+
SUPABASE_KEY=your-anon-key
|
|
74
|
+
NVIDIA_API_KEY=your-nvidia-nim-api-key
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### 2. Set Up Supabase Tables
|
|
78
|
+
|
|
79
|
+
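Run the SQL in `setup_supabase.sql` in your Supabase SQL editor to create the required tables (`rag_documents`, `rag_raw_pages`, `rag_tree_nodes`). Note that `setup_supabase.sql` is not bundled with the installed package; look for it in the [project repository](https://github.com/YashNuhash/White-Walker).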
|
|
80
|
+
|
|
81
|
+
### 3. Index & Query
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
from white_walker import WhiteWalkerClient
|
|
85
|
+
|
|
86
|
+
client = WhiteWalkerClient()
|
|
87
|
+
|
|
88
|
+
# Index a PDF
|
|
89
|
+
doc_id = client.index("paper.pdf")
|
|
90
|
+
|
|
91
|
+
# Query the indexed document
|
|
92
|
+
result = client.query(doc_id, "What is the main contribution of this paper?")
|
|
93
|
+
|
|
94
|
+
print(result["answer"])
|
|
95
|
+
print(result["citations"])
|
|
96
|
+
print(result["confidence"]) # "high", "medium", or "low"
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
---
|
|
100
|
+
|
|
101
|
+
## 🏗️ Architecture
|
|
102
|
+
|
|
103
|
+
```
|
|
104
|
+
PDF / Markdown
|
|
105
|
+
│
|
|
106
|
+
▼
|
|
107
|
+
┌─────────────────┐
|
|
108
|
+
│ PDF Ingestor │ Extracts text, detects TOC, builds hierarchical tree
|
|
109
|
+
└────────┬────────┘
|
|
110
|
+
│
|
|
111
|
+
▼
|
|
112
|
+
┌─────────────────┐
|
|
113
|
+
│ Tree Indexer │ LLM-based section detection, title verification,
|
|
114
|
+
│ │ page-number alignment, node summarization
|
|
115
|
+
└────────┬────────┘
|
|
116
|
+
│
|
|
117
|
+
▼
|
|
118
|
+
┌─────────────────┐
|
|
119
|
+
│ Supabase Store │ Persists tree nodes, raw pages, document metadata
|
|
120
|
+
└────────┬────────┘
|
|
121
|
+
│
|
|
122
|
+
▼
|
|
123
|
+
┌─────────────────┐
|
|
124
|
+
│ Agent Retriever │ LLM navigates tree → finds relevant nodes →
|
|
125
|
+
│ │ fetches page content → synthesizes cited answer
|
|
126
|
+
└─────────────────┘
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
---
|
|
130
|
+
|
|
131
|
+
## ⚙️ Configuration
|
|
132
|
+
|
|
133
|
+
White Walker uses `white_walker/config.yaml` for defaults:
|
|
134
|
+
|
|
135
|
+
| Parameter | Default | Description |
|
|
136
|
+
|---|---|---|
|
|
137
|
+
| `model` | `moonshotai/kimi-k2.6` | NVIDIA NIM model for indexing & retrieval |
|
|
138
|
+
| `toc_check_page_num` | `20` | Max pages to scan for table of contents |
|
|
139
|
+
| `max_page_num_each_node` | `10` | Max pages per tree node |
|
|
140
|
+
| `max_token_num_each_node` | `20000` | Max tokens per tree node |
|
|
141
|
+
|
|
142
|
+
Override at initialization:
|
|
143
|
+
|
|
144
|
+
```python
|
|
145
|
+
client = WhiteWalkerClient(model="meta/llama-3.1-70b-instruct")
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
---
|
|
149
|
+
|
|
150
|
+
## 🔑 Environment Variables
|
|
151
|
+
|
|
152
|
+
| Variable | Required | Description |
|
|
153
|
+
|---|---|---|
|
|
154
|
+
| `NVIDIA_API_KEY` | ✅ | NVIDIA NIM API key for LLM calls |
|
|
155
|
+
| `SUPABASE_URL` | Optional | Supabase project URL (falls back to local storage) |
|
|
156
|
+
| `SUPABASE_KEY` | Optional | Supabase anon/service key |
|
|
157
|
+
|
|
158
|
+
---
|
|
159
|
+
|
|
160
|
+
## 📊 Evaluation
|
|
161
|
+
|
|
162
|
+
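White Walker has a RAGAS-based evaluation pipeline for benchmarking against multimodal agentic RAG datasets. See `evaluate_pipeline.py` for details; this script is not bundled with the installed package, so look for it in the [project repository](https://github.com/YashNuhash/White-Walker).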
|
|
163
|
+
|
|
164
|
+
---
|
|
165
|
+
|
|
166
|
+
## 📄 License
|
|
167
|
+
|
|
168
|
+
MIT License — see [LICENSE](LICENSE) for details.
|
|
169
|
+
|
|
170
|
+
---
|
|
171
|
+
|
|
172
|
+
## 🙏 Acknowledgments
|
|
173
|
+
|
|
174
|
+
Built on the foundation of [VectifyAI/PageIndex](https://github.com/VectifyAI/PageIndex), re-architected for local processing, Supabase persistence, and pluggable LLM backends.
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
# 🧊 White Walker
|
|
2
|
+
|
|
3
|
+
[PyPI version](https://pypi.org/project/white-walker/)
|
|
4
|
+
[Python versions](https://pypi.org/project/white-walker/)
|
|
5
|
+
[License: MIT](LICENSE)
|
|
6
|
+
|
|
7
|
+
**Multimodal Agentic RAG** — Index PDFs & Markdown into hierarchical tree structures, persist in Supabase, and query with LLM-powered retrieval.
|
|
8
|
+
|
|
9
|
+
White Walker transforms documents into navigable tree indexes using LLM-based structural analysis, stores them in Supabase (or locally), and answers questions by intelligently traversing the tree to find relevant sections.
|
|
10
|
+
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
## ✨ Features
|
|
14
|
+
|
|
15
|
+
- 🌲 **Hierarchical Tree Indexing** — Automatically detects document structure (TOC, sections, subsections) and builds a navigable tree
|
|
16
|
+
- 🗄️ **Supabase Persistence** — Trees, raw pages, and metadata stored in PostgreSQL via Supabase
|
|
17
|
+
- 🤖 **Agentic Retrieval** — LLM navigates the tree to find relevant sections, then synthesizes answers with citations
|
|
18
|
+
- 📄 **PDF & Markdown Support** — Works with academic papers, technical docs, and any structured document
|
|
19
|
+
- ⚡ **Rate-Limited API** — Built-in token-bucket rate limiter for NVIDIA NIM API (38 RPM, configurable)
|
|
20
|
+
- 🔌 **Pluggable LLM Backend** — Default: NVIDIA NIM API, easily extensible
|
|
21
|
+
|
|
22
|
+
---
|
|
23
|
+
|
|
24
|
+
## 📦 Installation
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
pip install white-walker
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
---
|
|
31
|
+
|
|
32
|
+
## 🚀 Quick Start
|
|
33
|
+
|
|
34
|
+
### 1. Set Environment Variables
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
# .env file
|
|
38
|
+
SUPABASE_URL=https://your-project.supabase.co
|
|
39
|
+
SUPABASE_KEY=your-anon-key
|
|
40
|
+
NVIDIA_API_KEY=your-nvidia-nim-api-key
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
### 2. Set Up Supabase Tables
|
|
44
|
+
|
|
45
|
+
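Run the SQL in `setup_supabase.sql` in your Supabase SQL editor to create the required tables (`rag_documents`, `rag_raw_pages`, `rag_tree_nodes`). Note that `setup_supabase.sql` is not bundled with the installed package; look for it in the [project repository](https://github.com/YashNuhash/White-Walker).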
|
|
46
|
+
|
|
47
|
+
### 3. Index & Query
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
from white_walker import WhiteWalkerClient
|
|
51
|
+
|
|
52
|
+
client = WhiteWalkerClient()
|
|
53
|
+
|
|
54
|
+
# Index a PDF
|
|
55
|
+
doc_id = client.index("paper.pdf")
|
|
56
|
+
|
|
57
|
+
# Query the indexed document
|
|
58
|
+
result = client.query(doc_id, "What is the main contribution of this paper?")
|
|
59
|
+
|
|
60
|
+
print(result["answer"])
|
|
61
|
+
print(result["citations"])
|
|
62
|
+
print(result["confidence"]) # "high", "medium", or "low"
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
---
|
|
66
|
+
|
|
67
|
+
## 🏗️ Architecture
|
|
68
|
+
|
|
69
|
+
```
|
|
70
|
+
PDF / Markdown
|
|
71
|
+
│
|
|
72
|
+
▼
|
|
73
|
+
┌─────────────────┐
|
|
74
|
+
│ PDF Ingestor │ Extracts text, detects TOC, builds hierarchical tree
|
|
75
|
+
└────────┬────────┘
|
|
76
|
+
│
|
|
77
|
+
▼
|
|
78
|
+
┌─────────────────┐
|
|
79
|
+
│ Tree Indexer │ LLM-based section detection, title verification,
|
|
80
|
+
│ │ page-number alignment, node summarization
|
|
81
|
+
└────────┬────────┘
|
|
82
|
+
│
|
|
83
|
+
▼
|
|
84
|
+
┌─────────────────┐
|
|
85
|
+
│ Supabase Store │ Persists tree nodes, raw pages, document metadata
|
|
86
|
+
└────────┬────────┘
|
|
87
|
+
│
|
|
88
|
+
▼
|
|
89
|
+
┌─────────────────┐
|
|
90
|
+
│ Agent Retriever │ LLM navigates tree → finds relevant nodes →
|
|
91
|
+
│ │ fetches page content → synthesizes cited answer
|
|
92
|
+
└─────────────────┘
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
---
|
|
96
|
+
|
|
97
|
+
## ⚙️ Configuration
|
|
98
|
+
|
|
99
|
+
White Walker uses `white_walker/config.yaml` for defaults:
|
|
100
|
+
|
|
101
|
+
| Parameter | Default | Description |
|
|
102
|
+
|---|---|---|
|
|
103
|
+
| `model` | `moonshotai/kimi-k2.6` | NVIDIA NIM model for indexing & retrieval |
|
|
104
|
+
| `toc_check_page_num` | `20` | Max pages to scan for table of contents |
|
|
105
|
+
| `max_page_num_each_node` | `10` | Max pages per tree node |
|
|
106
|
+
| `max_token_num_each_node` | `20000` | Max tokens per tree node |
|
|
107
|
+
|
|
108
|
+
Override at initialization:
|
|
109
|
+
|
|
110
|
+
```python
|
|
111
|
+
client = WhiteWalkerClient(model="meta/llama-3.1-70b-instruct")
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
---
|
|
115
|
+
|
|
116
|
+
## 🔑 Environment Variables
|
|
117
|
+
|
|
118
|
+
| Variable | Required | Description |
|
|
119
|
+
|---|---|---|
|
|
120
|
+
| `NVIDIA_API_KEY` | ✅ | NVIDIA NIM API key for LLM calls |
|
|
121
|
+
| `SUPABASE_URL` | Optional | Supabase project URL (falls back to local storage) |
|
|
122
|
+
| `SUPABASE_KEY` | Optional | Supabase anon/service key |
|
|
123
|
+
|
|
124
|
+
---
|
|
125
|
+
|
|
126
|
+
## 📊 Evaluation
|
|
127
|
+
|
|
128
|
+
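White Walker has a RAGAS-based evaluation pipeline for benchmarking against multimodal agentic RAG datasets. See `evaluate_pipeline.py` for details; this script is not bundled with the installed package, so look for it in the [project repository](https://github.com/YashNuhash/White-Walker).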
|
|
129
|
+
|
|
130
|
+
---
|
|
131
|
+
|
|
132
|
+
## 📄 License
|
|
133
|
+
|
|
134
|
+
MIT License — see [LICENSE](LICENSE) for details.
|
|
135
|
+
|
|
136
|
+
---
|
|
137
|
+
|
|
138
|
+
## 🙏 Acknowledgments
|
|
139
|
+
|
|
140
|
+
Built on the foundation of [VectifyAI/PageIndex](https://github.com/VectifyAI/PageIndex), re-architected for local processing, Supabase persistence, and pluggable LLM backends.
|
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: White-Walker
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Multimodal Agentic RAG with hierarchical tree indexing — index PDFs & Markdown into navigable tree structures, persist in Supabase, and query with LLM-powered retrieval.
|
|
5
|
+
Author: NUHASHROXME
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/YashNuhash/White-Walker
|
|
8
|
+
Project-URL: Repository, https://github.com/YashNuhash/White-Walker
|
|
9
|
+
Project-URL: Issues, https://github.com/YashNuhash/White-Walker/issues
|
|
10
|
+
Keywords: RAG,multimodal,agentic,LLM,retrieval,tree-indexing,supabase,nvidia,pdf,arxiv
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
21
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
22
|
+
Requires-Python: >=3.9
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
License-File: LICENSE
|
|
25
|
+
Requires-Dist: supabase>=2.0
|
|
26
|
+
Requires-Dist: PyPDF2>=3.0
|
|
27
|
+
Requires-Dist: pdfplumber>=0.9
|
|
28
|
+
Requires-Dist: python-dotenv
|
|
29
|
+
Requires-Dist: httpx
|
|
30
|
+
Requires-Dist: requests
|
|
31
|
+
Requires-Dist: pymupdf
|
|
32
|
+
Requires-Dist: pyyaml
|
|
33
|
+
Dynamic: license-file
|
|
34
|
+
|
|
35
|
+
# 🧊 White Walker
|
|
36
|
+
|
|
37
|
+
[PyPI version](https://pypi.org/project/white-walker/)
|
|
38
|
+
[Python versions](https://pypi.org/project/white-walker/)
|
|
39
|
+
[License: MIT](LICENSE)
|
|
40
|
+
|
|
41
|
+
**Multimodal Agentic RAG** — Index PDFs & Markdown into hierarchical tree structures, persist in Supabase, and query with LLM-powered retrieval.
|
|
42
|
+
|
|
43
|
+
White Walker transforms documents into navigable tree indexes using LLM-based structural analysis, stores them in Supabase (or locally), and answers questions by intelligently traversing the tree to find relevant sections.
|
|
44
|
+
|
|
45
|
+
---
|
|
46
|
+
|
|
47
|
+
## ✨ Features
|
|
48
|
+
|
|
49
|
+
- 🌲 **Hierarchical Tree Indexing** — Automatically detects document structure (TOC, sections, subsections) and builds a navigable tree
|
|
50
|
+
- 🗄️ **Supabase Persistence** — Trees, raw pages, and metadata stored in PostgreSQL via Supabase
|
|
51
|
+
- 🤖 **Agentic Retrieval** — LLM navigates the tree to find relevant sections, then synthesizes answers with citations
|
|
52
|
+
- 📄 **PDF & Markdown Support** — Works with academic papers, technical docs, and any structured document
|
|
53
|
+
- ⚡ **Rate-Limited API** — Built-in token-bucket rate limiter for NVIDIA NIM API (38 RPM, configurable)
|
|
54
|
+
- 🔌 **Pluggable LLM Backend** — Default: NVIDIA NIM API, easily extensible
|
|
55
|
+
|
|
56
|
+
---
|
|
57
|
+
|
|
58
|
+
## 📦 Installation
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
pip install white-walker
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
---
|
|
65
|
+
|
|
66
|
+
## 🚀 Quick Start
|
|
67
|
+
|
|
68
|
+
### 1. Set Environment Variables
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
# .env file
|
|
72
|
+
SUPABASE_URL=https://your-project.supabase.co
|
|
73
|
+
SUPABASE_KEY=your-anon-key
|
|
74
|
+
NVIDIA_API_KEY=your-nvidia-nim-api-key
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### 2. Set Up Supabase Tables
|
|
78
|
+
|
|
79
|
+
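Run the SQL in `setup_supabase.sql` in your Supabase SQL editor to create the required tables (`rag_documents`, `rag_raw_pages`, `rag_tree_nodes`). Note that `setup_supabase.sql` is not bundled with the installed package; look for it in the [project repository](https://github.com/YashNuhash/White-Walker).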
|
|
80
|
+
|
|
81
|
+
### 3. Index & Query
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
from white_walker import WhiteWalkerClient
|
|
85
|
+
|
|
86
|
+
client = WhiteWalkerClient()
|
|
87
|
+
|
|
88
|
+
# Index a PDF
|
|
89
|
+
doc_id = client.index("paper.pdf")
|
|
90
|
+
|
|
91
|
+
# Query the indexed document
|
|
92
|
+
result = client.query(doc_id, "What is the main contribution of this paper?")
|
|
93
|
+
|
|
94
|
+
print(result["answer"])
|
|
95
|
+
print(result["citations"])
|
|
96
|
+
print(result["confidence"]) # "high", "medium", or "low"
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
---
|
|
100
|
+
|
|
101
|
+
## 🏗️ Architecture
|
|
102
|
+
|
|
103
|
+
```
|
|
104
|
+
PDF / Markdown
|
|
105
|
+
│
|
|
106
|
+
▼
|
|
107
|
+
┌─────────────────┐
|
|
108
|
+
│ PDF Ingestor │ Extracts text, detects TOC, builds hierarchical tree
|
|
109
|
+
└────────┬────────┘
|
|
110
|
+
│
|
|
111
|
+
▼
|
|
112
|
+
┌─────────────────┐
|
|
113
|
+
│ Tree Indexer │ LLM-based section detection, title verification,
|
|
114
|
+
│ │ page-number alignment, node summarization
|
|
115
|
+
└────────┬────────┘
|
|
116
|
+
│
|
|
117
|
+
▼
|
|
118
|
+
┌─────────────────┐
|
|
119
|
+
│ Supabase Store │ Persists tree nodes, raw pages, document metadata
|
|
120
|
+
└────────┬────────┘
|
|
121
|
+
│
|
|
122
|
+
▼
|
|
123
|
+
┌─────────────────┐
|
|
124
|
+
│ Agent Retriever │ LLM navigates tree → finds relevant nodes →
|
|
125
|
+
│ │ fetches page content → synthesizes cited answer
|
|
126
|
+
└─────────────────┘
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
---
|
|
130
|
+
|
|
131
|
+
## ⚙️ Configuration
|
|
132
|
+
|
|
133
|
+
White Walker uses `white_walker/config.yaml` for defaults:
|
|
134
|
+
|
|
135
|
+
| Parameter | Default | Description |
|
|
136
|
+
|---|---|---|
|
|
137
|
+
| `model` | `moonshotai/kimi-k2.6` | NVIDIA NIM model for indexing & retrieval |
|
|
138
|
+
| `toc_check_page_num` | `20` | Max pages to scan for table of contents |
|
|
139
|
+
| `max_page_num_each_node` | `10` | Max pages per tree node |
|
|
140
|
+
| `max_token_num_each_node` | `20000` | Max tokens per tree node |
|
|
141
|
+
|
|
142
|
+
Override at initialization:
|
|
143
|
+
|
|
144
|
+
```python
|
|
145
|
+
client = WhiteWalkerClient(model="meta/llama-3.1-70b-instruct")
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
---
|
|
149
|
+
|
|
150
|
+
## 🔑 Environment Variables
|
|
151
|
+
|
|
152
|
+
| Variable | Required | Description |
|
|
153
|
+
|---|---|---|
|
|
154
|
+
| `NVIDIA_API_KEY` | ✅ | NVIDIA NIM API key for LLM calls |
|
|
155
|
+
| `SUPABASE_URL` | Optional | Supabase project URL (falls back to local storage) |
|
|
156
|
+
| `SUPABASE_KEY` | Optional | Supabase anon/service key |
|
|
157
|
+
|
|
158
|
+
---
|
|
159
|
+
|
|
160
|
+
## 📊 Evaluation
|
|
161
|
+
|
|
162
|
+
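White Walker has a RAGAS-based evaluation pipeline for benchmarking against multimodal agentic RAG datasets. See `evaluate_pipeline.py` for details; this script is not bundled with the installed package, so look for it in the [project repository](https://github.com/YashNuhash/White-Walker).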
|
|
163
|
+
|
|
164
|
+
---
|
|
165
|
+
|
|
166
|
+
## 📄 License
|
|
167
|
+
|
|
168
|
+
MIT License — see [LICENSE](LICENSE) for details.
|
|
169
|
+
|
|
170
|
+
---
|
|
171
|
+
|
|
172
|
+
## 🙏 Acknowledgments
|
|
173
|
+
|
|
174
|
+
Built on the foundation of [VectifyAI/PageIndex](https://github.com/VectifyAI/PageIndex), re-architected for local processing, Supabase persistence, and pluggable LLM backends.
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
White_Walker.egg-info/PKG-INFO
|
|
5
|
+
White_Walker.egg-info/SOURCES.txt
|
|
6
|
+
White_Walker.egg-info/dependency_links.txt
|
|
7
|
+
White_Walker.egg-info/requires.txt
|
|
8
|
+
White_Walker.egg-info/top_level.txt
|
|
9
|
+
white_walker/__init__.py
|
|
10
|
+
white_walker/client.py
|
|
11
|
+
white_walker/config.yaml
|
|
12
|
+
white_walker/utils.py
|
|
13
|
+
white_walker.egg-info/PKG-INFO
|
|
14
|
+
white_walker.egg-info/SOURCES.txt
|
|
15
|
+
white_walker.egg-info/dependency_links.txt
|
|
16
|
+
white_walker.egg-info/requires.txt
|
|
17
|
+
white_walker.egg-info/top_level.txt
|
|
18
|
+
white_walker/ingest/__init__.py
|
|
19
|
+
white_walker/ingest/md_ingestor.py
|
|
20
|
+
white_walker/ingest/pdf_ingestor.py
|
|
21
|
+
white_walker/llm/__init__.py
|
|
22
|
+
white_walker/llm/adapter.py
|
|
23
|
+
white_walker/retrieval/__init__.py
|
|
24
|
+
white_walker/retrieval/agent_retriever.py
|
|
25
|
+
white_walker/retrieval/prompts.py
|
|
26
|
+
white_walker/retrieval/retrieve.py
|
|
27
|
+
white_walker/store/__init__.py
|
|
28
|
+
white_walker/store/base_store.py
|
|
29
|
+
white_walker/store/local_store.py
|
|
30
|
+
white_walker/store/supabase_store.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
white_walker
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "White-Walker"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Multimodal Agentic RAG with hierarchical tree indexing — index PDFs & Markdown into navigable tree structures, persist in Supabase, and query with LLM-powered retrieval."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = {text = "MIT"}
|
|
11
|
+
requires-python = ">=3.9"
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "NUHASHROXME"},
|
|
14
|
+
]
|
|
15
|
+
keywords = [
|
|
16
|
+
"RAG", "multimodal", "agentic", "LLM", "retrieval",
|
|
17
|
+
"tree-indexing", "supabase", "nvidia", "pdf", "arxiv",
|
|
18
|
+
]
|
|
19
|
+
classifiers = [
|
|
20
|
+
"Development Status :: 3 - Alpha",
|
|
21
|
+
"Intended Audience :: Developers",
|
|
22
|
+
"Intended Audience :: Science/Research",
|
|
23
|
+
"License :: OSI Approved :: MIT License",
|
|
24
|
+
"Programming Language :: Python :: 3",
|
|
25
|
+
"Programming Language :: Python :: 3.9",
|
|
26
|
+
"Programming Language :: Python :: 3.10",
|
|
27
|
+
"Programming Language :: Python :: 3.11",
|
|
28
|
+
"Programming Language :: Python :: 3.12",
|
|
29
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
30
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
31
|
+
]
|
|
32
|
+
dependencies = [
|
|
33
|
+
"supabase>=2.0",
|
|
34
|
+
"PyPDF2>=3.0",
|
|
35
|
+
"pdfplumber>=0.9",
|
|
36
|
+
"python-dotenv",
|
|
37
|
+
"httpx",
|
|
38
|
+
"requests",
|
|
39
|
+
"pymupdf",
|
|
40
|
+
"pyyaml",
|
|
41
|
+
]
|
|
42
|
+
|
|
43
|
+
[project.urls]
|
|
44
|
+
Homepage = "https://github.com/YashNuhash/White-Walker"
|
|
45
|
+
Repository = "https://github.com/YashNuhash/White-Walker"
|
|
46
|
+
Issues = "https://github.com/YashNuhash/White-Walker/issues"
|
|
47
|
+
|
|
48
|
+
[tool.setuptools.packages.find]
|
|
49
|
+
where = ["."]
|
|
50
|
+
include = ["white_walker*"]
|
|
51
|
+
|
|
52
|
+
[tool.setuptools.package-data]
|
|
53
|
+
white_walker = ["config.yaml"]
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
from .client import WhiteWalkerClient
|
|
2
|
+
from .store.supabase_store import SupabaseTreeStore
|
|
3
|
+
from .store.local_store import LocalStore
|
|
4
|
+
from .ingest.pdf_ingestor import white_walker_index
|
|
5
|
+
from .ingest.md_ingestor import md_to_tree
|
|
6
|
+
from .llm.adapter import register_local_model
|
|
7
|
+
|
|
8
|
+
__version__ = "0.1.0"
|
|
9
|
+
|