navidoc 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- navidoc-0.1.0/PKG-INFO +126 -0
- navidoc-0.1.0/README.md +115 -0
- navidoc-0.1.0/pyproject.toml +17 -0
- navidoc-0.1.0/setup.cfg +4 -0
- navidoc-0.1.0/src/navidoc/__init__.py +4 -0
- navidoc-0.1.0/src/navidoc/core.py +208 -0
- navidoc-0.1.0/src/navidoc/index/__init__.py +1 -0
- navidoc-0.1.0/src/navidoc/index/page.py +20 -0
- navidoc-0.1.0/src/navidoc/index/tree.py +27 -0
- navidoc-0.1.0/src/navidoc/parsers/__init__.py +1 -0
- navidoc-0.1.0/src/navidoc/parsers/docx.py +52 -0
- navidoc-0.1.0/src/navidoc/parsers/markdown.py +45 -0
- navidoc-0.1.0/src/navidoc/parsers/pdf.py +73 -0
- navidoc-0.1.0/src/navidoc/parsers/pptx.py +27 -0
- navidoc-0.1.0/src/navidoc.egg-info/PKG-INFO +126 -0
- navidoc-0.1.0/src/navidoc.egg-info/SOURCES.txt +18 -0
- navidoc-0.1.0/src/navidoc.egg-info/dependency_links.txt +1 -0
- navidoc-0.1.0/src/navidoc.egg-info/requires.txt +4 -0
- navidoc-0.1.0/src/navidoc.egg-info/top_level.txt +1 -0
- navidoc-0.1.0/tests/test_parsers.py +22 -0
navidoc-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: navidoc
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A lightweight, completely local, zero-API, tree-based RAG framework
|
|
5
|
+
Requires-Python: >=3.11
|
|
6
|
+
Description-Content-Type: text/markdown
|
|
7
|
+
Requires-Dist: ollama>=0.6.2
|
|
8
|
+
Requires-Dist: pypdf>=6.11.0
|
|
9
|
+
Requires-Dist: python-docx>=1.2.0
|
|
10
|
+
Requires-Dist: python-pptx>=1.0.2
|
|
11
|
+
|
|
12
|
+
# πΊοΈ NaviDoc
|
|
13
|
+
|
|
14
|
+
NaviDoc is a lightweight, **completely local, zero-API, tree-based RAG framework** designed to navigate document structures intelligently. Instead of blindly chopping your files into vector chunks, NaviDoc maps your documents into a logical structural tree hierarchy and uses local LLMs to precisely steer and navigate to answers.
|
|
15
|
+
|
|
16
|
+
---
|
|
17
|
+
|
|
18
|
+
## β¨ Features
|
|
19
|
+
|
|
20
|
+
* **π 100% Private & Offline:** Your documents never leave your machine. Zero cloud APIs, zero telemetry.
|
|
21
|
+
* **π³ Tree-Based Navigation:** Mimics human navigation by following document structures (headers, font sizes) instead of standard proximity vector chunks.
|
|
22
|
+
* **β‘ High Precision:** Pinpoints specific structural sections, avoiding context contamination or context blowouts.
|
|
23
|
+
* **π Multi-Format Support**: Supports Markdown, PDF (with font-size analysis), DOCX (with style detection), and PPTX.
|
|
24
|
+
* **πΎ Index Persistence**: Save your indexed tree structures to JSON and reload them instantly.
|
|
25
|
+
* **π¬ Chat SDK**: Maintain conversation history with your documents SDK-style.
|
|
26
|
+
|
|
27
|
+
---
|
|
28
|
+
|
|
29
|
+
## π Getting Started
|
|
30
|
+
|
|
31
|
+
### 1. Prerequisites
|
|
32
|
+
|
|
33
|
+
NaviDoc requires **Ollama** to host your local LLM engine.
|
|
34
|
+
|
|
35
|
+
1. Download and install Ollama from [ollama.com](https://ollama.com).
|
|
36
|
+
2. Pull a smart, small model (we recommend `phi3` or `llama3`):
|
|
37
|
+
```bash
|
|
38
|
+
ollama pull phi3
|
|
39
|
+
```
|
|
40
|
+
3. Ensure the Ollama service is running in the background before running NaviDoc.
|
|
41
|
+
|
|
42
|
+
### 2. Installation
|
|
43
|
+
|
|
44
|
+
Install NaviDoc via pip:
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
pip install navidoc
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
Or using `uv`:
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
uv add navidoc
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
---
|
|
57
|
+
|
|
58
|
+
## π‘ Usage Examples
|
|
59
|
+
|
|
60
|
+
### π One-off Query
|
|
61
|
+
```python
|
|
62
|
+
from navidoc import NaviDoc
|
|
63
|
+
|
|
64
|
+
# Initialize (defaults to phi3 or NAVIDOC_MODEL_NAME env var)
|
|
65
|
+
engine = NaviDoc()
|
|
66
|
+
|
|
67
|
+
# Ingest and structurally index any local document
|
|
68
|
+
status = engine.ingest("your_document.pdf")
|
|
69
|
+
print(status)
|
|
70
|
+
|
|
71
|
+
# Query your document offline
|
|
72
|
+
response = engine.query("What are the exact system requirements?")
|
|
73
|
+
print(response)
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
### π¬ Multi-turn Chat (SDK Style)
|
|
77
|
+
```python
|
|
78
|
+
from navidoc import NaviDoc
|
|
79
|
+
|
|
80
|
+
engine = NaviDoc()
|
|
81
|
+
engine.ingest("manual.docx")
|
|
82
|
+
|
|
83
|
+
# First turn
|
|
84
|
+
print(engine.chat("How do I install the battery?"))
|
|
85
|
+
|
|
86
|
+
# Second turn (remembers context and history!)
|
|
87
|
+
print(engine.chat("Where can I buy a replacement?"))
|
|
88
|
+
|
|
89
|
+
# Clear history if needed
|
|
90
|
+
engine.clear_history()
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
### πΎ Save & Fast Load Index
|
|
94
|
+
Avoid re-parsing large documents by saving the tree index.
|
|
95
|
+
```python
|
|
96
|
+
from navidoc import NaviDoc
|
|
97
|
+
|
|
98
|
+
engine = NaviDoc()
|
|
99
|
+
|
|
100
|
+
# First time: Parse and Save
|
|
101
|
+
engine.ingest("massive_report.pdf")
|
|
102
|
+
engine.save_index("storage/indices/massive_report.json")
|
|
103
|
+
|
|
104
|
+
# Second time: Instant Load in milliseconds
|
|
105
|
+
engine.load_index("storage/indices/massive_report.json")
|
|
106
|
+
response = engine.query("What is the revenue?")
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
---
|
|
110
|
+
|
|
111
|
+
## βοΈ Configuration
|
|
112
|
+
|
|
113
|
+
### Environment Variables
|
|
114
|
+
You can configure NaviDoc without changing your code by setting environment variables:
|
|
115
|
+
|
|
116
|
+
* `NAVIDOC_MODEL_NAME`: Set the default Ollama model to use (Default: `phi3`).
|
|
117
|
+
|
|
118
|
+
**How to change it:**
|
|
119
|
+
* **Windows (PowerShell)**: `$env:NAVIDOC_MODEL_NAME="llama3"`
|
|
120
|
+
* **Linux/Mac**: `export NAVIDOC_MODEL_NAME="llama3"`
|
|
121
|
+
|
|
122
|
+
---
|
|
123
|
+
|
|
124
|
+
## π License
|
|
125
|
+
|
|
126
|
+
NaviDoc is open-source software distributed completely free under the **[MIT License](LICENSE)**.
|
navidoc-0.1.0/README.md
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
# πΊοΈ NaviDoc
|
|
2
|
+
|
|
3
|
+
NaviDoc is a lightweight, **completely local, zero-API, tree-based RAG framework** designed to navigate document structures intelligently. Instead of blindly chopping your files into vector chunks, NaviDoc maps your documents into a logical structural tree hierarchy and uses local LLMs to precisely steer and navigate to answers.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## β¨ Features
|
|
8
|
+
|
|
9
|
+
* **π 100% Private & Offline:** Your documents never leave your machine. Zero cloud APIs, zero telemetry.
|
|
10
|
+
* **π³ Tree-Based Navigation:** Mimics human navigation by following document structures (headers, font sizes) instead of standard proximity vector chunks.
|
|
11
|
+
* **β‘ High Precision:** Pinpoints specific structural sections, avoiding context contamination or context blowouts.
|
|
12
|
+
* **π Multi-Format Support**: Supports Markdown, PDF (with font-size analysis), DOCX (with style detection), and PPTX.
|
|
13
|
+
* **πΎ Index Persistence**: Save your indexed tree structures to JSON and reload them instantly.
|
|
14
|
+
* **π¬ Chat SDK**: Maintain conversation history with your documents SDK-style.
|
|
15
|
+
|
|
16
|
+
---
|
|
17
|
+
|
|
18
|
+
## π Getting Started
|
|
19
|
+
|
|
20
|
+
### 1. Prerequisites
|
|
21
|
+
|
|
22
|
+
NaviDoc requires **Ollama** to host your local LLM engine.
|
|
23
|
+
|
|
24
|
+
1. Download and install Ollama from [ollama.com](https://ollama.com).
|
|
25
|
+
2. Pull a smart, small model (we recommend `phi3` or `llama3`):
|
|
26
|
+
```bash
|
|
27
|
+
ollama pull phi3
|
|
28
|
+
```
|
|
29
|
+
3. Ensure the Ollama service is running in the background before running NaviDoc.
|
|
30
|
+
|
|
31
|
+
### 2. Installation
|
|
32
|
+
|
|
33
|
+
Install NaviDoc via pip:
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
pip install navidoc
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
Or using `uv`:
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
uv add navidoc
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
---
|
|
46
|
+
|
|
47
|
+
## π‘ Usage Examples
|
|
48
|
+
|
|
49
|
+
### π One-off Query
|
|
50
|
+
```python
|
|
51
|
+
from navidoc import NaviDoc
|
|
52
|
+
|
|
53
|
+
# Initialize (defaults to phi3 or NAVIDOC_MODEL_NAME env var)
|
|
54
|
+
engine = NaviDoc()
|
|
55
|
+
|
|
56
|
+
# Ingest and structurally index any local document
|
|
57
|
+
status = engine.ingest("your_document.pdf")
|
|
58
|
+
print(status)
|
|
59
|
+
|
|
60
|
+
# Query your document offline
|
|
61
|
+
response = engine.query("What are the exact system requirements?")
|
|
62
|
+
print(response)
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
### π¬ Multi-turn Chat (SDK Style)
|
|
66
|
+
```python
|
|
67
|
+
from navidoc import NaviDoc
|
|
68
|
+
|
|
69
|
+
engine = NaviDoc()
|
|
70
|
+
engine.ingest("manual.docx")
|
|
71
|
+
|
|
72
|
+
# First turn
|
|
73
|
+
print(engine.chat("How do I install the battery?"))
|
|
74
|
+
|
|
75
|
+
# Second turn (remembers context and history!)
|
|
76
|
+
print(engine.chat("Where can I buy a replacement?"))
|
|
77
|
+
|
|
78
|
+
# Clear history if needed
|
|
79
|
+
engine.clear_history()
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
### πΎ Save & Fast Load Index
|
|
83
|
+
Avoid re-parsing large documents by saving the tree index.
|
|
84
|
+
```python
|
|
85
|
+
from navidoc import NaviDoc
|
|
86
|
+
|
|
87
|
+
engine = NaviDoc()
|
|
88
|
+
|
|
89
|
+
# First time: Parse and Save
|
|
90
|
+
engine.ingest("massive_report.pdf")
|
|
91
|
+
engine.save_index("storage/indices/massive_report.json")
|
|
92
|
+
|
|
93
|
+
# Second time: Instant Load in milliseconds
|
|
94
|
+
engine.load_index("storage/indices/massive_report.json")
|
|
95
|
+
response = engine.query("What is the revenue?")
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
---
|
|
99
|
+
|
|
100
|
+
## βοΈ Configuration
|
|
101
|
+
|
|
102
|
+
### Environment Variables
|
|
103
|
+
You can configure NaviDoc without changing your code by setting environment variables:
|
|
104
|
+
|
|
105
|
+
* `NAVIDOC_MODEL_NAME`: Set the default Ollama model to use (Default: `phi3`).
|
|
106
|
+
|
|
107
|
+
**How to change it:**
|
|
108
|
+
* **Windows (PowerShell)**: `$env:NAVIDOC_MODEL_NAME="llama3"`
|
|
109
|
+
* **Linux/Mac**: `export NAVIDOC_MODEL_NAME="llama3"`
|
|
110
|
+
|
|
111
|
+
---
|
|
112
|
+
|
|
113
|
+
## π License
|
|
114
|
+
|
|
115
|
+
NaviDoc is open-source software distributed completely free under the **[MIT License](LICENSE)**.
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "navidoc"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "A lightweight, completely local, zero-API, tree-based RAG framework"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.11"
|
|
7
|
+
dependencies = [
|
|
8
|
+
"ollama>=0.6.2",
|
|
9
|
+
"pypdf>=6.11.0",
|
|
10
|
+
"python-docx>=1.2.0",
|
|
11
|
+
"python-pptx>=1.0.2",
|
|
12
|
+
]
|
|
13
|
+
|
|
14
|
+
[dependency-groups]
|
|
15
|
+
dev = [
|
|
16
|
+
"pytest>=9.0.3",
|
|
17
|
+
]
|
navidoc-0.1.0/setup.cfg
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import json
|
|
3
|
+
from typing import Optional, Dict, Any, List
|
|
4
|
+
import ollama
|
|
5
|
+
|
|
6
|
+
from .parsers.markdown import MarkdownParser
|
|
7
|
+
from .parsers.pdf import PdfParser
|
|
8
|
+
from .parsers.docx import DocxParser
|
|
9
|
+
from .parsers.pptx import PptxParser
|
|
10
|
+
from .index.tree import TreeIndex
|
|
11
|
+
from .index.page import PageIndex
|
|
12
|
+
|
|
13
|
+
class NaviDoc:
    """Local, tree-based RAG engine backed by an Ollama-hosted LLM.

    Markdown, PDF and DOCX documents are parsed into a hierarchical
    TreeIndex; PPTX decks become a flat PageIndex. Queries are answered
    by steering the local model down the tree and prompting it with only
    the selected section's content, so the documents never leave the
    machine.
    """

    def __init__(self, model: Optional[str] = None):
        """
        Initialize NaviDoc SDK.

        Priority for model selection:
        1. Explicitly passed `model` argument.
        2. `NAVIDOC_MODEL_NAME` environment variable.
        3. Fallback default 'phi3'.
        """
        self.model = model or os.getenv("NAVIDOC_MODEL_NAME", "phi3")
        self.index = None        # TreeIndex or PageIndex once a document is ingested
        self.index_type = None   # "tree" or "page"
        self.history: List[Dict[str, str]] = []  # Chat history (user/assistant turns)

        print(f"NaviDoc SDK initialized with model: {self.model}")

    def ingest(self, file_path: str) -> str:
        """Ingest a document and create the appropriate index.

        Errors are reported as human-readable status strings rather than
        raised, preserving the original SDK contract.
        """
        if not os.path.exists(file_path):
            return f"Error: File {file_path} not found."

        ext = os.path.splitext(file_path)[1].lower()

        # Extension -> (parser class, status label). All three formats
        # produce a tree index; PPTX is handled separately below because
        # it produces a page index instead.
        tree_parsers = {
            '.md': (MarkdownParser, "Markdown"),
            '.pdf': (PdfParser, "PDF (Tree)"),
            '.docx': (DocxParser, "DOCX (Tree)"),
        }

        if ext in tree_parsers:
            parser_cls, label = tree_parsers[ext]
            tree_data = parser_cls().parse(file_path)
            self.index = TreeIndex()
            self.index.load_tree(tree_data)
            self.index_type = "tree"
            return f"Successfully ingested {label}: {file_path}"

        if ext == '.pptx':
            pptx_data = PptxParser().parse(file_path)
            self.index = PageIndex()
            self.index.load_pages(pptx_data["pages"])
            self.index_type = "page"
            return f"Successfully ingested PPTX: {file_path}"

        return f"Unsupported file format: {ext}"

    def ingest_markdown(self, file_path: str) -> str:
        """Explicit method for markdown as requested in README."""
        return self.ingest(file_path)

    def save_index(self, file_path: str):
        """Save the current index to a JSON file for fast reloading.

        Raises:
            ValueError: if no document has been ingested yet.
        """
        if not self.index:
            raise ValueError("No index to save. Ingest a document first.")

        data = {
            "index_type": self.index_type,
            "tree": self.index.tree if self.index_type == "tree" else None,
            "pages": self.index.pages if self.index_type == "page" else None
        }

        # abspath guards against a bare filename whose dirname would be "".
        os.makedirs(os.path.dirname(os.path.abspath(file_path)), exist_ok=True)

        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        print(f"Index successfully saved to {file_path}")

    def load_index(self, file_path: str):
        """Load a previously saved index from a JSON file.

        Raises:
            FileNotFoundError: if `file_path` does not exist.
            ValueError: if the file holds an unrecognized index type.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"Index file not found: {file_path}")

        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        self.index_type = data["index_type"]
        if self.index_type == "tree":
            self.index = TreeIndex()
            self.index.load_tree(data["tree"])
        elif self.index_type == "page":
            self.index = PageIndex()
            self.index.load_pages(data["pages"])
        else:
            # Previously an unknown type silently kept the stale index while
            # still printing success; fail loudly on corrupt files instead.
            raise ValueError(f"Unknown index type in {file_path}: {self.index_type!r}")
        print(f"Index successfully loaded from {file_path}")

    def _relevant_content(self, prompt: str) -> str:
        """Fetch the context for `prompt` from the active index.

        Tree indices are navigated with the LLM; page indices currently
        return all of their text.
        """
        if self.index_type == "tree":
            return self._navigate_tree(prompt, self.index.tree)
        return self.index.get_all_text()

    def _navigate_tree(self, query: str, node: Dict[str, Any]) -> str:
        """Recursively navigate the tree using the local LLM.

        At each level the model is shown the child section titles and asked
        to pick one; descent stops at a leaf node or when the model replies
        NONE (or something that matches no title).
        """
        if not node.get("children"):
            return node.get("content", "")

        headers = [child["title"] for child in node["children"]]

        prompt = f"""
Given the query: "{query}"
And the following document sections:
{", ".join([f"'{h}'" for h in headers])}

Which section is most likely to contain the answer?
Reply ONLY with the exact section title from the list above. If none seem relevant, reply 'NONE'.
"""
        try:
            response = ollama.generate(model=self.model, prompt=prompt)
            # Models often wrap the title in quotes; strip them before matching.
            chosen_header = response['response'].strip().strip("'").strip('"')

            if chosen_header == 'NONE':
                return node.get("content", "Section not found.")

            for child in node["children"]:
                if child["title"] == chosen_header:
                    return self._navigate_tree(query, child)

            # The model answered with something that is not a known title.
            return node.get("content", "Navigation path lost.")

        except Exception as e:
            return f"Navigation error: {str(e)}"

    def query(self, prompt: str) -> str:
        """One-off query without maintaining history."""
        if not self.index:
            return "No document ingested yet."

        relevant_content = self._relevant_content(prompt)

        full_prompt = f"""
Answer the user's question based ONLY on this specific context found during navigation:
{relevant_content}

Question: {prompt}
Answer:
"""
        try:
            response = ollama.generate(model=self.model, prompt=full_prompt)
            return response['response']
        except Exception as e:
            return f"Error calling Ollama: {str(e)}"

    def chat(self, prompt: str) -> str:
        """Chat with the document, maintaining conversation history."""
        if not self.index:
            return "No document ingested yet."

        # Get context for the latest query
        relevant_content = self._relevant_content(prompt)

        # Format history for Ollama
        history_str = ""
        for turn in self.history:
            history_str += f"User: {turn['user']}\nAssistant: {turn['assistant']}\n"

        full_prompt = f"""
You are having a conversation about a document.
Answer the user's latest question based ONLY on the context provided below.
Maintain continuity with the conversation history.

Context:
{relevant_content}

History:
{history_str}

User: {prompt}
Assistant:
"""
        try:
            response = ollama.generate(model=self.model, prompt=full_prompt)
            reply = response['response']

            # Save to history
            self.history.append({"user": prompt, "assistant": reply})

            return reply
        except Exception as e:
            return f"Error calling Ollama: {str(e)}"

    def clear_history(self):
        """Clear the chat history."""
        self.history = []
        print("Chat history cleared.")
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Index package
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
from typing import Dict, Any, List
|
|
2
|
+
|
|
3
|
+
class PageIndex:
    """Flat, page-oriented index for slide-style documents (e.g. PPTX)."""

    def __init__(self):
        # Each entry is a dict such as {"page_number": int, "content": str}.
        self.pages = []

    def load_pages(self, pages_data: List[Dict[str, Any]]):
        """Replace the current pages with `pages_data`."""
        self.pages = pages_data

    def search(self, query: str) -> str:
        """
        Search the pages for relevant content.
        """
        # Skeleton: real retrieval is not implemented yet.
        return f"PageIndex search for: {query}"

    def get_all_text(self) -> str:
        """Concatenate the text of every page, newline-separated."""
        chunks = (page.get("content", "") for page in self.pages)
        return "\n".join(chunks)
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
from typing import Dict, Any, List
|
|
2
|
+
|
|
3
|
+
class TreeIndex:
    """Hierarchical index over nested {"title", "level", "content", "children"} nodes."""

    def __init__(self):
        self.tree = {}

    def load_tree(self, tree_data: Dict[str, Any]):
        """Replace the current tree with `tree_data`."""
        self.tree = tree_data

    def search(self, query: str) -> str:
        """
        Search the tree for relevant content.
        In a real implementation, this would use an LLM to navigate.
        """
        # Skeleton: real navigation lives in the NaviDoc engine.
        return f"TreeIndex search for: {query}"

    def get_all_text(self) -> str:
        """Concatenate every node's content in pre-order (depth-first)."""
        pieces: List[str] = []
        pending = [self.tree]
        while pending:
            node = pending.pop()
            pieces.append(node.get("content", ""))
            # Reversed so the left-most child is visited first,
            # matching a recursive pre-order traversal.
            pending.extend(reversed(node.get("children", [])))
        return "".join(pieces)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Parsers package
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
from docx import Document
|
|
2
|
+
from typing import Dict, Any, List
|
|
3
|
+
|
|
4
|
+
class DocxParser:
    """Parse DOCX files into a nested section tree keyed on Heading styles."""

    def __init__(self):
        pass

    def parse(self, file_path: str) -> Dict[str, Any]:
        """Parse a DOCX file into a tree structure based on Heading styles."""
        document = Document(file_path)

        root = {"title": "Root", "level": 0, "content": "", "children": []}
        # Stack of currently-open sections; the top receives body text.
        open_sections = [root]

        for paragraph in document.paragraphs:
            stripped = paragraph.text.strip()
            if not stripped:
                continue

            style_name = paragraph.style.name if paragraph.style else ""

            if not style_name.startswith('Heading'):
                # Ordinary body text: attach to the most recent section.
                if open_sections:
                    open_sections[-1]["content"] += stripped + "\n"
                continue

            # Extract depth from e.g. "Heading 1" -> 1; fall back to 1
            # for unexpected style names.
            try:
                depth = int(style_name.split()[-1])
            except (ValueError, IndexError):
                depth = 1

            section = {
                "title": stripped,
                "level": depth,
                "content": "",
                "children": []
            }

            # Close any sections at this depth or deeper before attaching.
            while open_sections and open_sections[-1]["level"] >= depth:
                open_sections.pop()

            parent = open_sections[-1] if open_sections else root
            parent["children"].append(section)
            open_sections.append(section)

        return root
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from typing import List, Dict, Any
|
|
3
|
+
|
|
4
|
+
class MarkdownParser:
    """Parse Markdown files into a nested section tree keyed on '#' headers."""

    # One or more '#' characters, whitespace, then the header title.
    _HEADER_RE = re.compile(r'^(#+)\s+(.*)')

    def __init__(self):
        pass

    def parse(self, file_path: str) -> Dict[str, Any]:
        """Parse a markdown file into a tree structure based on headers."""
        with open(file_path, 'r', encoding='utf-8') as handle:
            lines = handle.readlines()

        root = {"title": "Root", "level": 0, "content": "", "children": []}
        # Stack of currently-open sections; the top receives body text.
        open_sections = [root]

        for line in lines:
            header = self._HEADER_RE.match(line)

            if header is None:
                # Non-header lines accumulate verbatim on the current section.
                if open_sections:
                    open_sections[-1]["content"] += line
                continue

            depth = len(header.group(1))
            node = {
                "title": header.group(2).strip(),
                "level": depth,
                "content": "",
                "children": []
            }

            # Close any sections at this depth or deeper before attaching.
            while open_sections and open_sections[-1]["level"] >= depth:
                open_sections.pop()

            parent = open_sections[-1] if open_sections else root
            parent["children"].append(node)
            open_sections.append(node)

        return root
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
from pypdf import PdfReader
|
|
2
|
+
from typing import Dict, Any, List
|
|
3
|
+
|
|
4
|
+
class PdfParser:
    """Parse PDFs into a nested section tree using font-size heuristics.

    Two passes over the document: the first tallies how many characters
    appear at each font size, the second rebuilds the text as a tree whose
    headings are the (up to three) font sizes noticeably larger than the
    dominant body-text size.
    """

    def __init__(self):
        pass

    def parse(self, file_path: str) -> Dict[str, Any]:
        """Parse a PDF file into a tree structure based on font sizes.

        Returns a nested dict with keys "title", "level", "content" and
        "children", rooted at a synthetic "Root" node.
        """
        reader = PdfReader(file_path)

        # 1. Analyze font sizes to find body and headings
        # Maps font size -> total number of characters rendered at that size.
        font_sizes = {}

        # pypdf text-visitor callback; only `text` and `font_size` are used here.
        # NOTE(review): pypdf can pass font_size=None for some content — a None
        # key here would crash the `s > body_size + 1.0` comparison below.
        # Confirm against the pinned pypdf version.
        def size_visitor(text, cm, tm, font_dict, font_size):
            if text.strip():
                font_sizes[font_size] = font_sizes.get(font_size, 0) + len(text)

        for page in reader.pages:
            page.extract_text(visitor_text=size_visitor)

        if not font_sizes:
            return {"title": "Root", "level": 0, "content": "No text found", "children": []}

        # Body text is the most common font size
        body_size = max(font_sizes, key=font_sizes.get)

        # Headings are larger than body text
        # (the +1.0 margin avoids treating slight size jitter as a heading)
        headings = [s for s in font_sizes.keys() if s > body_size + 1.0]
        headings.sort(reverse=True)

        # Map font sizes to heading levels (up to 3 levels)
        # Largest size becomes level 1, next level 2, etc.
        heading_map = {}
        for i, size in enumerate(headings[:3]):
            heading_map[size] = i + 1

        # 2. Build the tree
        root = {"title": "Root", "level": 0, "content": "", "children": []}
        # Stack of currently-open sections; stack[-1] receives body text.
        stack = [root]

        # Second pass: heading-sized text opens a new node, everything else
        # accumulates as content on the current section.
        def build_visitor(text, cm, tm, font_dict, font_size):
            t = text.strip()
            if not t:
                return

            level = heading_map.get(font_size)
            if level:
                node = {
                    "title": t,
                    "level": level,
                    "content": "",
                    "children": []
                }

                # Pop stack until we find the parent
                while stack and stack[-1]["level"] >= level:
                    stack.pop()

                if stack:
                    stack[-1]["children"].append(node)
                    stack.append(node)
                else:
                    root["children"].append(node)
                    stack.append(node)
            else:
                # Add text to the current node
                if stack:
                    stack[-1]["content"] += text + " "

        for page in reader.pages:
            page.extract_text(visitor_text=build_visitor)

        return root
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
from pptx import Presentation
|
|
2
|
+
from typing import Dict, Any, List
|
|
3
|
+
|
|
4
|
+
class PptxParser:
    """Parse PPTX decks into a flat list of per-slide text pages."""

    def __init__(self):
        pass

    def parse(self, file_path: str) -> Dict[str, Any]:
        """Parse a PPTX file."""
        deck = Presentation(file_path)

        pages = []
        for slide_number, slide in enumerate(deck.slides, start=1):
            # Only shapes with a text frame expose a `.text` attribute.
            fragments = [shape.text for shape in slide.shapes if hasattr(shape, "text")]
            pages.append({
                "page_number": slide_number,
                "content": "\n".join(fragments)
            })

        return {
            "title": file_path,
            "type": "pptx",
            "pages": pages
        }
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: navidoc
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A lightweight, completely local, zero-API, tree-based RAG framework
|
|
5
|
+
Requires-Python: >=3.11
|
|
6
|
+
Description-Content-Type: text/markdown
|
|
7
|
+
Requires-Dist: ollama>=0.6.2
|
|
8
|
+
Requires-Dist: pypdf>=6.11.0
|
|
9
|
+
Requires-Dist: python-docx>=1.2.0
|
|
10
|
+
Requires-Dist: python-pptx>=1.0.2
|
|
11
|
+
|
|
12
|
+
# πΊοΈ NaviDoc
|
|
13
|
+
|
|
14
|
+
NaviDoc is a lightweight, **completely local, zero-API, tree-based RAG framework** designed to navigate document structures intelligently. Instead of blindly chopping your files into vector chunks, NaviDoc maps your documents into a logical structural tree hierarchy and uses local LLMs to precisely steer and navigate to answers.
|
|
15
|
+
|
|
16
|
+
---
|
|
17
|
+
|
|
18
|
+
## β¨ Features
|
|
19
|
+
|
|
20
|
+
* **π 100% Private & Offline:** Your documents never leave your machine. Zero cloud APIs, zero telemetry.
|
|
21
|
+
* **π³ Tree-Based Navigation:** Mimics human navigation by following document structures (headers, font sizes) instead of standard proximity vector chunks.
|
|
22
|
+
* **β‘ High Precision:** Pinpoints specific structural sections, avoiding context contamination or context blowouts.
|
|
23
|
+
* **π Multi-Format Support**: Supports Markdown, PDF (with font-size analysis), DOCX (with style detection), and PPTX.
|
|
24
|
+
* **πΎ Index Persistence**: Save your indexed tree structures to JSON and reload them instantly.
|
|
25
|
+
* **π¬ Chat SDK**: Maintain conversation history with your documents SDK-style.
|
|
26
|
+
|
|
27
|
+
---
|
|
28
|
+
|
|
29
|
+
## π Getting Started
|
|
30
|
+
|
|
31
|
+
### 1. Prerequisites
|
|
32
|
+
|
|
33
|
+
NaviDoc requires **Ollama** to host your local LLM engine.
|
|
34
|
+
|
|
35
|
+
1. Download and install Ollama from [ollama.com](https://ollama.com).
|
|
36
|
+
2. Pull a smart, small model (we recommend `phi3` or `llama3`):
|
|
37
|
+
```bash
|
|
38
|
+
ollama pull phi3
|
|
39
|
+
```
|
|
40
|
+
3. Ensure the Ollama service is running in the background before running NaviDoc.
|
|
41
|
+
|
|
42
|
+
### 2. Installation
|
|
43
|
+
|
|
44
|
+
Install NaviDoc via pip:
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
pip install navidoc
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
Or using `uv`:
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
uv add navidoc
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
---
|
|
57
|
+
|
|
58
|
+
## π‘ Usage Examples
|
|
59
|
+
|
|
60
|
+
### π One-off Query
|
|
61
|
+
```python
|
|
62
|
+
from navidoc import NaviDoc
|
|
63
|
+
|
|
64
|
+
# Initialize (defaults to phi3 or NAVIDOC_MODEL_NAME env var)
|
|
65
|
+
engine = NaviDoc()
|
|
66
|
+
|
|
67
|
+
# Ingest and structurally index any local document
|
|
68
|
+
status = engine.ingest("your_document.pdf")
|
|
69
|
+
print(status)
|
|
70
|
+
|
|
71
|
+
# Query your document offline
|
|
72
|
+
response = engine.query("What are the exact system requirements?")
|
|
73
|
+
print(response)
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
### π¬ Multi-turn Chat (SDK Style)
|
|
77
|
+
```python
|
|
78
|
+
from navidoc import NaviDoc
|
|
79
|
+
|
|
80
|
+
engine = NaviDoc()
|
|
81
|
+
engine.ingest("manual.docx")
|
|
82
|
+
|
|
83
|
+
# First turn
|
|
84
|
+
print(engine.chat("How do I install the battery?"))
|
|
85
|
+
|
|
86
|
+
# Second turn (remembers context and history!)
|
|
87
|
+
print(engine.chat("Where can I buy a replacement?"))
|
|
88
|
+
|
|
89
|
+
# Clear history if needed
|
|
90
|
+
engine.clear_history()
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
### πΎ Save & Fast Load Index
|
|
94
|
+
Avoid re-parsing large documents by saving the tree index.
|
|
95
|
+
```python
|
|
96
|
+
from navidoc import NaviDoc
|
|
97
|
+
|
|
98
|
+
engine = NaviDoc()
|
|
99
|
+
|
|
100
|
+
# First time: Parse and Save
|
|
101
|
+
engine.ingest("massive_report.pdf")
|
|
102
|
+
engine.save_index("storage/indices/massive_report.json")
|
|
103
|
+
|
|
104
|
+
# Second time: Instant Load in milliseconds
|
|
105
|
+
engine.load_index("storage/indices/massive_report.json")
|
|
106
|
+
response = engine.query("What is the revenue?")
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
---
|
|
110
|
+
|
|
111
|
+
## βοΈ Configuration
|
|
112
|
+
|
|
113
|
+
### Environment Variables
|
|
114
|
+
You can configure NaviDoc without changing your code by setting environment variables:
|
|
115
|
+
|
|
116
|
+
* `NAVIDOC_MODEL_NAME`: Set the default Ollama model to use (Default: `phi3`).
|
|
117
|
+
|
|
118
|
+
**How to change it:**
|
|
119
|
+
* **Windows (PowerShell)**: `$env:NAVIDOC_MODEL_NAME="llama3"`
|
|
120
|
+
* **Linux/Mac**: `export NAVIDOC_MODEL_NAME="llama3"`
|
|
121
|
+
|
|
122
|
+
---
|
|
123
|
+
|
|
124
|
+
## π License
|
|
125
|
+
|
|
126
|
+
NaviDoc is open-source software distributed completely free under the **[MIT License](LICENSE)**.
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
src/navidoc/__init__.py
|
|
4
|
+
src/navidoc/core.py
|
|
5
|
+
src/navidoc.egg-info/PKG-INFO
|
|
6
|
+
src/navidoc.egg-info/SOURCES.txt
|
|
7
|
+
src/navidoc.egg-info/dependency_links.txt
|
|
8
|
+
src/navidoc.egg-info/requires.txt
|
|
9
|
+
src/navidoc.egg-info/top_level.txt
|
|
10
|
+
src/navidoc/index/__init__.py
|
|
11
|
+
src/navidoc/index/page.py
|
|
12
|
+
src/navidoc/index/tree.py
|
|
13
|
+
src/navidoc/parsers/__init__.py
|
|
14
|
+
src/navidoc/parsers/docx.py
|
|
15
|
+
src/navidoc/parsers/markdown.py
|
|
16
|
+
src/navidoc/parsers/pdf.py
|
|
17
|
+
src/navidoc/parsers/pptx.py
|
|
18
|
+
tests/test_parsers.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
navidoc
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import pytest
|
|
3
|
+
from navidoc.parsers.markdown import MarkdownParser
|
|
4
|
+
|
|
5
|
+
def test_markdown_parser():
    """MarkdownParser should nest '## ' sections under the preceding '# ' section."""
    import tempfile

    # Use a throwaway directory instead of writing into the repository's
    # storage/ folder, so the test leaves no artifacts behind.
    with tempfile.TemporaryDirectory() as tmp_dir:
        md_path = os.path.join(tmp_dir, "test.md")
        with open(md_path, 'w', encoding='utf-8') as f:
            f.write("# Header 1\nContent 1\n## Header 2\nContent 2")

        parser = MarkdownParser()
        tree = parser.parse(md_path)

    assert tree["title"] == "Root"
    assert len(tree["children"]) == 1
    top = tree["children"][0]
    assert top["title"] == "Header 1"
    assert len(top["children"]) == 1
    assert top["children"][0]["title"] == "Header 2"
    # Body lines attach to the nearest enclosing header.
    assert top["content"] == "Content 1\n"
    assert top["children"][0]["content"] == "Content 2"
|