treedex 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. treedex-0.1.0/.github/workflows/benchmark.yml +65 -0
  2. treedex-0.1.0/.github/workflows/publish.yml +29 -0
  3. treedex-0.1.0/.github/workflows/tests.yml +34 -0
  4. treedex-0.1.0/.gitignore +39 -0
  5. treedex-0.1.0/LICENSE +21 -0
  6. treedex-0.1.0/PKG-INFO +417 -0
  7. treedex-0.1.0/README.md +363 -0
  8. treedex-0.1.0/assets/architecture.svg +133 -0
  9. treedex-0.1.0/assets/benchmarks.svg +78 -0
  10. treedex-0.1.0/assets/comparison.svg +180 -0
  11. treedex-0.1.0/assets/how-treedex-works.svg +116 -0
  12. treedex-0.1.0/assets/llm-providers.svg +127 -0
  13. treedex-0.1.0/assets/treedex-vs-vectordb.svg +105 -0
  14. treedex-0.1.0/benchmarks/compare_vectordb.py +454 -0
  15. treedex-0.1.0/benchmarks/generate_svg.py +440 -0
  16. treedex-0.1.0/benchmarks/run_benchmark.py +362 -0
  17. treedex-0.1.0/examples/custom_llm.py +85 -0
  18. treedex-0.1.0/examples/multi_provider.py +47 -0
  19. treedex-0.1.0/examples/my_index.json +207 -0
  20. treedex-0.1.0/examples/quickstart.py +56 -0
  21. treedex-0.1.0/examples/save_load.py +50 -0
  22. treedex-0.1.0/pyproject.toml +67 -0
  23. treedex-0.1.0/requirements.txt +2 -0
  24. treedex-0.1.0/tests/__init__.py +0 -0
  25. treedex-0.1.0/tests/test_core.py +184 -0
  26. treedex-0.1.0/tests/test_llm_backends.py +187 -0
  27. treedex-0.1.0/tests/test_loaders.py +97 -0
  28. treedex-0.1.0/tests/test_pdf_parser.py +73 -0
  29. treedex-0.1.0/tests/test_tree_builder.py +151 -0
  30. treedex-0.1.0/tests/test_tree_utils.py +157 -0
  31. treedex-0.1.0/treedex/__init__.py +60 -0
  32. treedex-0.1.0/treedex/core.py +247 -0
  33. treedex-0.1.0/treedex/llm_backends.py +626 -0
  34. treedex-0.1.0/treedex/loaders.py +128 -0
  35. treedex-0.1.0/treedex/pdf_parser.py +91 -0
  36. treedex-0.1.0/treedex/prompts.py +62 -0
  37. treedex-0.1.0/treedex/tree_builder.py +116 -0
  38. treedex-0.1.0/treedex/tree_utils.py +152 -0
  39. treedex-0.1.0/treedex_demo.ipynb +200 -0
@@ -0,0 +1,65 @@
1
+ name: Benchmark
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ permissions:
10
+ contents: write
11
+
12
+ jobs:
13
+ benchmark:
14
+ runs-on: ubuntu-latest
15
+
16
+ steps:
17
+ - uses: actions/checkout@v4
18
+
19
+ - name: Set up Python
20
+ uses: actions/setup-python@v5
21
+ with:
22
+ python-version: "3.12"
23
+
24
+ - name: Install dependencies
25
+ run: |
26
+ pip install --upgrade pip
27
+ pip install -e ".[dev]"
28
+ pip install chromadb
29
+
30
+ - name: Run TreeDex benchmarks
31
+ run: python benchmarks/run_benchmark.py --json benchmark_results.json
32
+
33
+ - name: Run comparison benchmark (TreeDex vs ChromaDB vs Naive)
34
+ run: python benchmarks/compare_vectordb.py --json comparison_results.json
35
+
36
+ - name: Generate benchmark SVG
37
+ run: python benchmarks/generate_svg.py benchmark_results.json assets/benchmarks.svg
38
+
39
+ - name: Generate comparison SVG
40
+ run: python benchmarks/generate_svg.py --comparison comparison_results.json assets/comparison.svg
41
+
42
+ - name: Display results
43
+ run: |
44
+ echo "=== TreeDex Benchmark ==="
45
+ python -m json.tool benchmark_results.json
46
+ echo ""
47
+ echo "=== Comparison ==="
48
+ python -m json.tool comparison_results.json
49
+
50
+ - name: Commit updated SVGs
51
+ if: github.ref == 'refs/heads/main' && github.event_name == 'push'
52
+ run: |
53
+ git config user.name "github-actions[bot]"
54
+ git config user.email "github-actions[bot]@users.noreply.github.com"
55
+ git add assets/benchmarks.svg assets/comparison.svg
56
+ git diff --cached --quiet || git commit -m "Update benchmark SVGs [auto-generated]"
57
+ git push
58
+
59
+ - name: Upload results
60
+ uses: actions/upload-artifact@v4
61
+ with:
62
+ name: benchmark-results
63
+ path: |
64
+ benchmark_results.json
65
+ comparison_results.json
@@ -0,0 +1,29 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+
7
+ permissions:
8
+ id-token: write
9
+
10
+ jobs:
11
+ publish:
12
+ runs-on: ubuntu-latest
13
+
14
+ steps:
15
+ - uses: actions/checkout@v4
16
+
17
+ - name: Set up Python
18
+ uses: actions/setup-python@v5
19
+ with:
20
+ python-version: "3.12"
21
+
22
+ - name: Install build tools
23
+ run: pip install --upgrade build
24
+
25
+ - name: Build package
26
+ run: python -m build
27
+
28
+ - name: Publish to PyPI
29
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,34 @@
1
+ name: Tests
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ test:
11
+ runs-on: ubuntu-latest
12
+ strategy:
13
+ matrix:
14
+ python-version: ["3.10", "3.11", "3.12", "3.13"]
15
+
16
+ steps:
17
+ - uses: actions/checkout@v4
18
+
19
+ - name: Set up Python ${{ matrix.python-version }}
20
+ uses: actions/setup-python@v5
21
+ with:
22
+ python-version: ${{ matrix.python-version }}
23
+
24
+ - name: Install dependencies
25
+ run: |
26
+ python -m pip install --upgrade pip
27
+ pip install -e ".[dev]"
28
+
29
+ - name: Run tests
30
+ run: pytest --tb=short -q
31
+
32
+ - name: Run tests with coverage
33
+ if: matrix.python-version == '3.12'
34
+ run: pytest --cov=treedex --cov-report=term-missing
@@ -0,0 +1,39 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.egg-info/
6
+ dist/
7
+ build/
8
+ *.egg
9
+
10
+ # Virtual environments
11
+ .venv/
12
+ venv/
13
+ ENV/
14
+
15
+ # IDE
16
+ .vscode/
17
+ .idea/
18
+ *.swp
19
+ *.swo
20
+ *~
21
+
22
+ # Environment
23
+ .env
24
+ .env.local
25
+
26
+ # OS
27
+ .DS_Store
28
+ Thumbs.db
29
+
30
+ # Project
31
+ *.pdf
32
+ db/
33
+ results/
34
+
35
+ # Jupyter
36
+ .ipynb_checkpoints/
37
+
38
+ # Claude
39
+ .claude/
treedex-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Mithun Gowda B
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
treedex-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,417 @@
1
+ Metadata-Version: 2.4
2
+ Name: treedex
3
+ Version: 0.1.0
4
+ Summary: Tree-based, vectorless document RAG framework. Connect any LLM via URL/API key.
5
+ Project-URL: Homepage, https://github.com/mithun50/TreeDex
6
+ Project-URL: Repository, https://github.com/mithun50/TreeDex
7
+ Project-URL: Issues, https://github.com/mithun50/TreeDex/issues
8
+ Author-email: Mithun Gowda B <mithungowda.b7411@gmail.com>
9
+ License-Expression: MIT
10
+ License-File: LICENSE
11
+ Keywords: ai,document,llm,pdf,rag,retrieval,tree,vectorless
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Programming Language :: Python :: 3.13
20
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
21
+ Classifier: Topic :: Text Processing :: Indexing
22
+ Requires-Python: >=3.10
23
+ Requires-Dist: pymupdf>=1.26.0
24
+ Requires-Dist: tiktoken>=0.11.0
25
+ Provides-Extra: all
26
+ Requires-Dist: anthropic>=0.40.0; extra == 'all'
27
+ Requires-Dist: cohere>=5.0.0; extra == 'all'
28
+ Requires-Dist: google-generativeai>=0.8.0; extra == 'all'
29
+ Requires-Dist: groq>=0.11.0; extra == 'all'
30
+ Requires-Dist: litellm>=1.0.0; extra == 'all'
31
+ Requires-Dist: mistralai>=1.0.0; extra == 'all'
32
+ Requires-Dist: openai>=1.0.0; extra == 'all'
33
+ Requires-Dist: python-docx>=1.0.0; extra == 'all'
34
+ Provides-Extra: claude
35
+ Requires-Dist: anthropic>=0.40.0; extra == 'claude'
36
+ Provides-Extra: cohere
37
+ Requires-Dist: cohere>=5.0.0; extra == 'cohere'
38
+ Provides-Extra: dev
39
+ Requires-Dist: pytest-cov>=5.0.0; extra == 'dev'
40
+ Requires-Dist: pytest>=8.0.0; extra == 'dev'
41
+ Provides-Extra: docx
42
+ Requires-Dist: python-docx>=1.0.0; extra == 'docx'
43
+ Provides-Extra: gemini
44
+ Requires-Dist: google-generativeai>=0.8.0; extra == 'gemini'
45
+ Provides-Extra: groq
46
+ Requires-Dist: groq>=0.11.0; extra == 'groq'
47
+ Provides-Extra: litellm
48
+ Requires-Dist: litellm>=1.0.0; extra == 'litellm'
49
+ Provides-Extra: mistral
50
+ Requires-Dist: mistralai>=1.0.0; extra == 'mistral'
51
+ Provides-Extra: openai
52
+ Requires-Dist: openai>=1.0.0; extra == 'openai'
53
+ Description-Content-Type: text/markdown
54
+
55
+ # TreeDex
56
+
57
+ **Tree-based, vectorless document RAG framework.**
58
+
59
+ Index any document into a navigable tree structure, then retrieve relevant sections using **any LLM**. No vector databases, no embeddings — just structured tree retrieval.
60
+
61
+ [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mithun50/TreeDex/blob/main/treedex_demo.ipynb)
62
+ [![PyPI](https://img.shields.io/pypi/v/treedex)](https://pypi.org/project/treedex/)
63
+ [![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](LICENSE)
64
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://python.org)
65
+
66
+ ---
67
+
68
+ ## How It Works
69
+
70
+ <p align="center">
71
+ <img src="assets/how-treedex-works.svg" alt="How TreeDex Works" width="800"/>
72
+ </p>
73
+
74
+ 1. **Load** — Extract pages from any supported format
75
+ 2. **Index** — LLM analyzes page groups and extracts hierarchical structure
76
+ 3. **Build** — Flat sections become a tree with page ranges and embedded text
77
+ 4. **Query** — LLM selects relevant tree nodes for your question
78
+ 5. **Return** — Get context text, source pages, and reasoning
79
+
80
+ ### Why TreeDex instead of a Vector DB?
81
+
82
+ <p align="center">
83
+ <img src="assets/treedex-vs-vectordb.svg" alt="TreeDex vs Vector DB" width="800"/>
84
+ </p>
85
+
86
+ ---
87
+
88
+ ## Supported LLM Providers
89
+
90
+ <p align="center">
91
+ <img src="assets/llm-providers.svg" alt="LLM Providers" width="800"/>
92
+ </p>
93
+
94
+ TreeDex works with **every major AI provider** out of the box. Pick what works for you:
95
+
96
+ ### One-liner backends (zero config)
97
+
98
+ | Backend | Provider | Default Model | Dependencies |
99
+ |---------|----------|---------------|-------------|
100
+ | `GeminiLLM` | Google | gemini-2.0-flash | `google-generativeai` |
101
+ | `OpenAILLM` | OpenAI | gpt-4o | `openai` |
102
+ | `ClaudeLLM` | Anthropic | claude-sonnet-4-20250514 | `anthropic` |
103
+ | `MistralLLM` | Mistral AI | mistral-large-latest | `mistralai` |
104
+ | `CohereLLM` | Cohere | command-r-plus | `cohere` |
105
+ | `GroqLLM` | Groq | llama-3.3-70b-versatile | **None (stdlib)** |
106
+ | `TogetherLLM` | Together AI | Llama-3-70b-chat-hf | **None (stdlib)** |
107
+ | `FireworksLLM` | Fireworks | llama-v3p1-70b-instruct | **None (stdlib)** |
108
+ | `OpenRouterLLM` | OpenRouter | claude-sonnet-4 | **None (stdlib)** |
109
+ | `DeepSeekLLM` | DeepSeek | deepseek-chat | **None (stdlib)** |
110
+ | `CerebrasLLM` | Cerebras | llama-3.3-70b | **None (stdlib)** |
111
+ | `SambanovaLLM` | SambaNova | Llama-3.1-70B-Instruct | **None (stdlib)** |
112
+ | `HuggingFaceLLM` | HuggingFace | Mistral-7B-Instruct | **None (stdlib)** |
113
+ | `OllamaLLM` | Ollama (local) | llama3 | **None (stdlib)** |
114
+
115
+ ### Universal backends
116
+
117
+ | Backend | Use case | Dependencies |
118
+ |---------|----------|-------------|
119
+ | `OpenAICompatibleLLM` | **Any** OpenAI-compatible endpoint (URL + key) | **None (stdlib)** |
120
+ | `LiteLLM` | 100+ providers via litellm library | `litellm` |
121
+ | `FunctionLLM` | Wrap any `callable(str) -> str` | **None** |
122
+ | `BaseLLM` | Subclass to build your own | **None** |
123
+
124
+ ---
125
+
126
+ ## Quick Start
127
+
128
+ ### Install
129
+
130
+ ```bash
131
+ # pip
132
+ pip install treedex
133
+
134
+ # uv (faster)
135
+ uv pip install treedex
136
+
137
+ # With optional LLM SDK
138
+ pip install treedex[gemini] # Google Gemini
139
+ pip install treedex[openai] # OpenAI
140
+ pip install treedex[claude] # Anthropic Claude
141
+ pip install treedex[mistral] # Mistral AI
142
+ pip install treedex[cohere] # Cohere
143
+ pip install treedex[litellm] # LiteLLM (100+ providers)
144
+ pip install treedex[all] # Everything
145
+
146
+ # From source
147
+ pip install git+https://github.com/mithun50/TreeDex.git
148
+
149
+ # Development
150
+ git clone https://github.com/mithun50/TreeDex.git
151
+ cd TreeDex
152
+ pip install -e ".[dev]"
153
+ ```
154
+
155
+ ### Pick your LLM and go
156
+
157
+ ```python
158
+ from treedex import TreeDex
159
+
160
+ # --- Google Gemini ---
161
+ from treedex import GeminiLLM
162
+ llm = GeminiLLM(api_key="YOUR_KEY")
163
+
164
+ # --- OpenAI ---
165
+ from treedex import OpenAILLM
166
+ llm = OpenAILLM(api_key="sk-...")
167
+
168
+ # --- Claude ---
169
+ from treedex import ClaudeLLM
170
+ llm = ClaudeLLM(api_key="sk-ant-...")
171
+
172
+ # --- Groq (free, fast) ---
173
+ from treedex import GroqLLM
174
+ llm = GroqLLM(api_key="gsk_...")
175
+
176
+ # --- Together AI ---
177
+ from treedex import TogetherLLM
178
+ llm = TogetherLLM(api_key="...")
179
+
180
+ # --- DeepSeek ---
181
+ from treedex import DeepSeekLLM
182
+ llm = DeepSeekLLM(api_key="...")
183
+
184
+ # --- Fireworks ---
185
+ from treedex import FireworksLLM
186
+ llm = FireworksLLM(api_key="...")
187
+
188
+ # --- OpenRouter (access any model) ---
189
+ from treedex import OpenRouterLLM
190
+ llm = OpenRouterLLM(api_key="...", model="anthropic/claude-sonnet-4")
191
+
192
+ # --- Cerebras ---
193
+ from treedex import CerebrasLLM
194
+ llm = CerebrasLLM(api_key="...")
195
+
196
+ # --- SambaNova ---
197
+ from treedex import SambanovaLLM
198
+ llm = SambanovaLLM(api_key="...")
199
+
200
+ # --- Mistral AI ---
201
+ from treedex import MistralLLM
202
+ llm = MistralLLM(api_key="...") # pip install mistralai
203
+
204
+ # --- Cohere ---
205
+ from treedex import CohereLLM
206
+ llm = CohereLLM(api_key="...") # pip install cohere
207
+
208
+ # --- HuggingFace ---
209
+ from treedex import HuggingFaceLLM
210
+ llm = HuggingFaceLLM(api_key="hf_...", model="mistralai/Mistral-7B-Instruct-v0.3")
211
+
212
+ # --- Local Ollama ---
213
+ from treedex import OllamaLLM
214
+ llm = OllamaLLM(model="llama3")
215
+
216
+ # Index and query (same for ALL providers)
217
+ index = TreeDex.from_file("document.pdf", llm=llm)
218
+ result = index.query("What is the main argument?")
219
+ print(result.context)
220
+ print(result.pages_str) # "pages 5-8, 12-15"
221
+ ```
222
+
223
+ ### Any OpenAI-compatible endpoint
224
+
225
+ ```python
226
+ from treedex import OpenAICompatibleLLM
227
+
228
+ # Works with ANY service that speaks OpenAI format
229
+ llm = OpenAICompatibleLLM(
230
+ base_url="https://your-provider.com/v1",
231
+ api_key="...",
232
+ model="model-name"
233
+ )
234
+ ```
235
+
236
+ ### 100+ providers via LiteLLM
237
+
238
+ ```python
239
+ from treedex import LiteLLM
240
+
241
+ # pip install litellm
242
+ llm = LiteLLM("gpt-4o") # OpenAI
243
+ llm = LiteLLM("anthropic/claude-sonnet-4-20250514") # Claude
244
+ llm = LiteLLM("groq/llama-3.3-70b-versatile") # Groq
245
+ llm = LiteLLM("together_ai/meta-llama/Llama-3-70b-chat-hf")  # Together
246
+ llm = LiteLLM("bedrock/anthropic.claude-3-sonnet") # AWS Bedrock
247
+ llm = LiteLLM("vertex_ai/gemini-pro") # Google Vertex
248
+ llm = LiteLLM("azure/gpt-4o") # Azure OpenAI
249
+ ```
250
+
251
+ ### Wrap any function
252
+
253
+ ```python
254
+ from treedex import FunctionLLM
255
+
256
+ # Wrap any callable(str) -> str
257
+ llm = FunctionLLM(lambda prompt: my_custom_api(prompt))
258
+
259
+ # Or a named function
260
+ def call_my_model(prompt: str) -> str:
261
+ return requests.post(url, json={"prompt": prompt}).json()["text"]
262
+
263
+ llm = FunctionLLM(call_my_model)
264
+ ```
265
+
266
+ ### Build your own backend
267
+
268
+ ```python
269
+ from treedex import BaseLLM
270
+
271
+ class MyLLM(BaseLLM):
272
+ def generate(self, prompt: str) -> str:
273
+ # Your logic here — call any API, local model, etc.
274
+ return my_api_call(prompt)
275
+
276
+ llm = MyLLM()
277
+ index = TreeDex.from_file("doc.pdf", llm=llm)
278
+ ```
279
+
280
+ ### Swap LLM at query time
281
+
282
+ ```python
283
+ # Build index with one LLM
284
+ index = TreeDex.from_file("doc.pdf", llm=gemini_llm)
285
+
286
+ # Query with a different one — same index, different brain
287
+ result = index.query("...", llm=groq_llm)
288
+ ```
289
+
290
+ ---
291
+
292
+ ## Supported Document Formats
293
+
294
+ | Format | Loader | Extra Dependencies |
295
+ |--------|--------|--------------------|
296
+ | PDF | `PDFLoader` | `pymupdf` |
297
+ | TXT / MD | `TextLoader` | None |
298
+ | HTML | `HTMLLoader` | None (stdlib) |
299
+ | DOCX | `DOCXLoader` | `python-docx` |
300
+
301
+ Use `auto_loader(path)` for automatic format detection, or pass a specific loader:
302
+
303
+ ```python
304
+ from treedex import TreeDex, TextLoader
305
+
306
+ index = TreeDex.from_file("notes.txt", llm=llm, loader=TextLoader())
307
+ ```
308
+
309
+ ---
310
+
311
+ ## API Reference
312
+
313
+ ### `TreeDex`
314
+
315
+ | Method | Description |
316
+ |--------|------------|
317
+ | `TreeDex.from_file(path, llm, ...)` | Build index from a file |
318
+ | `TreeDex.from_pages(pages, llm, ...)` | Build from pre-extracted pages |
319
+ | `TreeDex.from_tree(tree, pages, llm?)` | Create from existing tree |
320
+ | `index.query(question, llm?)` | Retrieve relevant sections |
321
+ | `index.save(path)` | Save index to JSON |
322
+ | `TreeDex.load(path, llm?)` | Load index from JSON |
323
+ | `index.show_tree()` | Print tree structure |
324
+ | `index.stats()` | Get index statistics |
325
+ | `index.find_large_sections(...)` | Find oversized nodes |
326
+
327
+ ### `QueryResult`
328
+
329
+ | Property | Type | Description |
330
+ |----------|------|-------------|
331
+ | `.context` | `str` | Concatenated text from relevant sections |
332
+ | `.node_ids` | `list[str]` | IDs of selected tree nodes |
333
+ | `.page_ranges` | `list[tuple]` | `[(start, end), ...]` page ranges |
334
+ | `.pages_str` | `str` | Human-readable: `"pages 5-8, 12-15"` |
335
+ | `.reasoning` | `str` | LLM's explanation for selection |
336
+
337
+ ### LLM Backends
338
+
339
+ | Backend | Needs SDK? | One-liner |
340
+ |---------|-----------|-----------|
341
+ | `GeminiLLM(api_key)` | Yes | `GeminiLLM("key")` |
342
+ | `OpenAILLM(api_key)` | Yes | `OpenAILLM("sk-...")` |
343
+ | `ClaudeLLM(api_key)` | Yes | `ClaudeLLM("sk-ant-...")` |
344
+ | `MistralLLM(api_key)` | Yes | `MistralLLM("key")` |
345
+ | `CohereLLM(api_key)` | Yes | `CohereLLM("key")` |
346
+ | `GroqLLM(api_key)` | No | `GroqLLM("gsk_...")` |
347
+ | `TogetherLLM(api_key)` | No | `TogetherLLM("key")` |
348
+ | `FireworksLLM(api_key)` | No | `FireworksLLM("key")` |
349
+ | `OpenRouterLLM(api_key)` | No | `OpenRouterLLM("key")` |
350
+ | `DeepSeekLLM(api_key)` | No | `DeepSeekLLM("key")` |
351
+ | `CerebrasLLM(api_key)` | No | `CerebrasLLM("key")` |
352
+ | `SambanovaLLM(api_key)` | No | `SambanovaLLM("key")` |
353
+ | `HuggingFaceLLM(api_key)` | No | `HuggingFaceLLM("hf_...")` |
354
+ | `OllamaLLM(model)` | No | `OllamaLLM("llama3")` |
355
+ | `LiteLLM(model)` | Yes | `LiteLLM("gpt-4o")` |
356
+ | `FunctionLLM(fn)` | No | `FunctionLLM(my_fn)` |
357
+ | `OpenAICompatibleLLM(url, model)` | No | Any endpoint |
358
+ | `BaseLLM` (subclass) | No | Your own logic |
359
+
360
+ ---
361
+
362
+ ## Benchmarks
363
+
364
+ ### TreeDex vs Vector DB vs Naive Chunking
365
+
366
+ <p align="center">
367
+ <img src="assets/comparison.svg" alt="Comparison Benchmark" width="800"/>
368
+ </p>
369
+
370
+ Real benchmark run on a single document (NCERT Electromagnetic Waves, 14 pages, 10 queries). All three methods retrieve from the same content — only the indexing and retrieval approach differs. **The charts are regenerated by CI on every push to `main`.**
371
+
372
+ ### TreeDex Stats
373
+
374
+ <p align="center">
375
+ <img src="assets/benchmarks.svg" alt="Benchmarks" width="800"/>
376
+ </p>
377
+
378
+ | Feature | TreeDex | Vector RAG | Naive Chunking |
379
+ |---------|---------|------------|----------------|
380
+ | **Page Attribution** | Exact source pages | Approximate | None |
381
+ | **Structure Preserved** | Full tree hierarchy | None | None |
382
+ | **Index Format** | Human-readable JSON | Opaque vectors | Text chunks |
383
+ | **Embedding Model** | Not needed | Required | Not needed |
384
+ | **Infrastructure** | None (JSON file) | Vector DB required | None |
385
+ | **Core Dependencies** | 2 (pymupdf, tiktoken) | 5-8+ | 2-5 |
386
+
387
+ > Run your own: `python benchmarks/run_benchmark.py --help` or `python benchmarks/compare_vectordb.py --help`
388
+
389
+ ---
390
+
391
+ ## Architecture
392
+
393
+ <p align="center">
394
+ <img src="assets/architecture.svg" alt="Architecture" width="800"/>
395
+ </p>
396
+
397
+ ## Running Tests
398
+
399
+ ```bash
400
+ # Install dev dependencies
401
+ pip install -e ".[dev]"
402
+
403
+ # Run all tests
404
+ pytest
405
+
406
+ # With coverage
407
+ pytest --cov=treedex
408
+
409
+ # Run specific test file
410
+ pytest tests/test_core.py -v
411
+ ```
412
+
413
+ ---
414
+
415
+ ## License
416
+
417
+ MIT License — Mithun Gowda B