hie-rag 0.2.2__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hie_rag/hie_rag.py +1 -1
- hie_rag/utils.py +12 -22
- hie_rag-0.2.4.dist-info/METADATA +147 -0
- hie_rag-0.2.4.dist-info/RECORD +13 -0
- hie_rag/split_and_process.py +0 -17
- hie_rag-0.2.2.dist-info/METADATA +0 -48
- hie_rag-0.2.2.dist-info/RECORD +0 -14
- {hie_rag-0.2.2.dist-info → hie_rag-0.2.4.dist-info}/WHEEL +0 -0
- {hie_rag-0.2.2.dist-info → hie_rag-0.2.4.dist-info}/licenses/LICENSE +0 -0
- {hie_rag-0.2.2.dist-info → hie_rag-0.2.4.dist-info}/top_level.txt +0 -0
hie_rag/hie_rag.py
CHANGED
@@ -16,7 +16,7 @@ class HieRag:
|
|
16
16
|
def process_and_save_index_stream(self, file_name: str, uploaded_file: bytes, min_chunk_size, max_chunk_size):
|
17
17
|
yield {"status": "🔍 Extracting text..."}
|
18
18
|
print(f"Extracting text from {file_name}")
|
19
|
-
extracted_text = self.utils.extract_text(uploaded_file)
|
19
|
+
extracted_text = self.utils.extract_text(file_name=file_name, uploaded_bytes=uploaded_file)
|
20
20
|
|
21
21
|
yield {"status": "✂️ Splitting into chunks..."}
|
22
22
|
print(f"Splitting text into chunks with min size {min_chunk_size} and max size {max_chunk_size}")
|
hie_rag/utils.py
CHANGED
@@ -16,36 +16,26 @@ class Utils:
|
|
16
16
|
# self.client = OpenAI(api_key=api_key)
|
17
17
|
self.client = AiClient(base_url=base_url)
|
18
18
|
|
19
|
-
def extract_text(self,
|
19
|
+
def extract_text(self, uploaded_bytes: bytes, file_name: str):
|
20
20
|
"""Extract text from an uploaded file using MarkItDown."""
|
21
|
-
# md = MarkItDown(llm_client=self.client, llm_model="gpt-4o")
|
22
21
|
md = MarkItDown()
|
23
22
|
|
24
|
-
#
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
suffix = os.path.splitext(filename)[-1] if filename else ".bin"
|
32
|
-
else:
|
33
|
-
raise TypeError("Unsupported file type: must be bytes or file-like object")
|
34
|
-
|
35
|
-
# Write to temp file for MarkItDown to process
|
36
|
-
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
|
37
|
-
temp_file_path = temp_file.name
|
38
|
-
temp_file.write(file_bytes)
|
23
|
+
# derive a real suffix from the filename
|
24
|
+
suffix = os.path.splitext(file_name)[1].lower() or ".txt"
|
25
|
+
|
26
|
+
# write to temp file
|
27
|
+
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
|
28
|
+
tmp.write(uploaded_bytes)
|
29
|
+
tmp_path = tmp.name
|
39
30
|
|
40
31
|
try:
|
41
|
-
# Redirect stderr to suppress native print warnings like "CropBox missing"
|
42
32
|
with contextlib.redirect_stderr(io.StringIO()):
|
43
|
-
|
33
|
+
result = md.convert(tmp_path)
|
44
34
|
finally:
|
45
|
-
|
46
|
-
os.remove(temp_file_path)
|
35
|
+
os.remove(tmp_path)
|
47
36
|
|
48
|
-
return
|
37
|
+
# depending on MarkItDown version this may return a str or an object
|
38
|
+
return getattr(result, "text_content", result)
|
49
39
|
|
50
40
|
def count_tokens(self, text: str, encoding="cl100k_base") -> int:
|
51
41
|
"""Count tokens in text using tiktoken"""
|
@@ -0,0 +1,147 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: hie_rag
|
3
|
+
Version: 0.2.4
|
4
|
+
Summary: A hierarchical RAG framework for chunks retrieval.
|
5
|
+
License: Apache License
|
6
|
+
Version 2.0, January 2004
|
7
|
+
http://www.apache.org/licenses/
|
8
|
+
|
9
|
+
Copyright [2025] [Chih-Han Chung]
|
10
|
+
|
11
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
12
|
+
you may not use this file except in compliance with the License.
|
13
|
+
You may obtain a copy of the License at
|
14
|
+
|
15
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
16
|
+
|
17
|
+
Unless required by applicable law or agreed to in writing, software
|
18
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
19
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
20
|
+
See the License for the specific language governing permissions and
|
21
|
+
limitations under the License.
|
22
|
+
Description-Content-Type: text/markdown
|
23
|
+
License-File: LICENSE
|
24
|
+
Requires-Dist: python-dotenv==1.0.1
|
25
|
+
Requires-Dist: werkzeug==3.1.3
|
26
|
+
Requires-Dist: numpy
|
27
|
+
Requires-Dist: markitdown==0.0.1a3
|
28
|
+
Requires-Dist: openai==1.66.3
|
29
|
+
Requires-Dist: scikit-learn
|
30
|
+
Requires-Dist: tiktoken==0.8.0
|
31
|
+
Requires-Dist: langchain==0.3.13
|
32
|
+
Requires-Dist: langchain-ollama==0.3.3
|
33
|
+
Requires-Dist: chromadb==0.6.2
|
34
|
+
Dynamic: license-file
|
35
|
+
|
36
|
+
# 📚 HieRAG – Hierarchical Retrieval-Augmented Generation
|
37
|
+
|
38
|
+
`hie_rag` is a modular, extensible Python package designed for **Hierarchical Retrieval-Augmented Generation (Hie-RAG)**. It enables you to extract, split, embed, summarize, and query documents using both chunk- and tree-level semantics, all backed by a vector database.
|
39
|
+
|
40
|
+
---
|
41
|
+
|
42
|
+
## ✅ Features
|
43
|
+
|
44
|
+
- PDF/DOCX/XLSX/CSV/PPT ingestion and intelligent semantic splitting
|
45
|
+
- Hierarchical summarization tree building
|
46
|
+
- Embedding-based similarity search
|
47
|
+
- Vector DB indexing and querying (e.g., Chroma)
|
48
|
+
- Full streaming interface for frontend integration
|
49
|
+
|
50
|
+
---
|
51
|
+
|
52
|
+
## 📦 Components Used
|
53
|
+
|
54
|
+
| Module | Role |
|
55
|
+
| ----------- | -------------------------------------------------------------- |
|
56
|
+
| `HieRAG` | Main interface for processing, querying, and managing indexes. |
|
57
|
+
| `Split` | Split raw text into chunks |
|
58
|
+
| `Process` | Adds metadata and embeddings to chunks |
|
59
|
+
| `TreeIndex` | Builds tree-based hierarchical summaries |
|
60
|
+
| `Utils` | Text extraction and token handling |
|
61
|
+
| `Vectordb` | Stores and queries summaries/chunks |
|
62
|
+
| `AiClient` | Handles embedding API (e.g., OpenAI, HuggingFace, Ollama) |
|
63
|
+
|
64
|
+
---
|
65
|
+
|
66
|
+
## 🛠 Installation
|
67
|
+
|
68
|
+
```bash
|
69
|
+
pip install hie-rag
|
70
|
+
```
|
71
|
+
|
72
|
+
## ⏯︎ How to Use
|
73
|
+
|
74
|
+
### Initialize HieRAG
|
75
|
+
|
76
|
+
```python
|
77
|
+
from hie_rag import HieRag
|
78
|
+
|
79
|
+
hierag = HieRag(base_url="http://localhost:11434")
|
80
|
+
```
|
81
|
+
|
82
|
+
> [!NOTE]
|
83
|
+
> Ensure you have set up an AI server. You should have a chat model and an embedding model running.
|
84
|
+
|
85
|
+
### Process and Index a File
|
86
|
+
|
87
|
+
```python
|
88
|
+
with open("sample.pdf", "rb") as f:
|
89
|
+
file_bytes = f.read()
|
90
|
+
|
91
|
+
for status in hierag.process_and_save_index_stream(
|
92
|
+
file_name="sample.pdf",
|
93
|
+
uploaded_file=file_bytes,
|
94
|
+
min_chunk_size=300,
|
95
|
+
max_chunk_size=500
|
96
|
+
):
|
97
|
+
print(status)
|
98
|
+
```
|
99
|
+
|
100
|
+
> ```JSON
|
101
|
+
> {
|
102
|
+
> "status": "✅ Done",
|
103
|
+
> "file_id": "abc123",
|
104
|
+
> "summary_count": 5,
|
105
|
+
> "chunk_count": 22
|
106
|
+
> }
|
107
|
+
> ```
|
108
|
+
|
109
|
+
### Query the Summaries or Chunks
|
110
|
+
|
111
|
+
#### Query Summaries by text:
|
112
|
+
|
113
|
+
```python
|
114
|
+
results = hierag.query_summaries_by_text("What is the contract duration?")
|
115
|
+
```
|
116
|
+
|
117
|
+
#### Query Chunks by text:
|
118
|
+
|
119
|
+
```python
|
120
|
+
results = hierag.query_chunks_by_text("Explain clause 3.4", file_id="abc123")
|
121
|
+
```
|
122
|
+
|
123
|
+
### List & Manage Indexed Files
|
124
|
+
|
125
|
+
#### List All Indexed Files
|
126
|
+
|
127
|
+
```python
|
128
|
+
hierag.list_summaries()
|
129
|
+
```
|
130
|
+
|
131
|
+
#### View Chunks of a File
|
132
|
+
|
133
|
+
```python
|
134
|
+
hierag.list_chunks(file_id="abc123")
|
135
|
+
```
|
136
|
+
|
137
|
+
#### Delete a File Index
|
138
|
+
|
139
|
+
```python
|
140
|
+
hierag.delete_index(file_id="abc123")
|
141
|
+
```
|
142
|
+
|
143
|
+
#### Get the Summary of a File
|
144
|
+
|
145
|
+
```python
|
146
|
+
hierag.get_summary(file_id="abc123")
|
147
|
+
```
|
@@ -0,0 +1,13 @@
|
|
1
|
+
hie_rag/__init__.py,sha256=p2glSTkCqGvMlcivcuKBStFh2C5adojaC9aGmF6nbhY,358
|
2
|
+
hie_rag/ai_client.py,sha256=VbGQ0e3vZNn8W2YoR15Vvq2r-MUs-TBRNLGiImT4QxU,1000
|
3
|
+
hie_rag/hie_rag.py,sha256=fprBRIKlbuDqSdBoeupFie-f8i1gkCz0VUPB5lB0mXc,2777
|
4
|
+
hie_rag/process.py,sha256=Z4qpNmxSsxUJgnqJtw8cYWJTS6SxhRR7F7eX_akyVCU,2427
|
5
|
+
hie_rag/split.py,sha256=gEQVt57xWruT5e1psgSOnwuBrQngzri3S4H6ZvKzsw4,5082
|
6
|
+
hie_rag/tree_index.py,sha256=iTa25ohMv5O0HYc5JtzIzVAIhNdVklYiAIJvqyE8sbM,2722
|
7
|
+
hie_rag/utils.py,sha256=_4TGiHuJ-Xo4JEEEdOjp4d1zxw6dNVsxROcom-vr7uU,4059
|
8
|
+
hie_rag/vectordb.py,sha256=iI73ujrONjDaHU66RNdHnD2PZWSppnjm0isIHPJEGAY,11068
|
9
|
+
hie_rag-0.2.4.dist-info/licenses/LICENSE,sha256=IwAxruLb1UG8F0KZtfnV6MJq10FRAxWM-XOTWkWsJt4,632
|
10
|
+
hie_rag-0.2.4.dist-info/METADATA,sha256=-4V6CrFey8UwqnSdWGnM6WmhBRAIRhWNYPI07cX3hpY,3988
|
11
|
+
hie_rag-0.2.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
12
|
+
hie_rag-0.2.4.dist-info/top_level.txt,sha256=tN2S3VpMUl6oLWL9sN4xIh4o2na_zjnW8rHiwPFf0T8,8
|
13
|
+
hie_rag-0.2.4.dist-info/RECORD,,
|
hie_rag/split_and_process.py
DELETED
@@ -1,17 +0,0 @@
|
|
1
|
-
from hie_rag.process import Process
|
2
|
-
from hie_rag.split import Split
|
3
|
-
from hie_rag.utils import Utils
|
4
|
-
|
5
|
-
|
6
|
-
class SplitAndProcess:
|
7
|
-
def __init__(self, base_url: str):
|
8
|
-
self.split = Split(base_url=base_url)
|
9
|
-
self.utils = Utils(base_url=base_url)
|
10
|
-
self.process = Process(base_url=base_url)
|
11
|
-
|
12
|
-
def split_and_process(self, uploaded_file):
|
13
|
-
extracted_text = self.utils.extract_text(uploaded_file)
|
14
|
-
result_split = self.split.split(extracted_text)
|
15
|
-
result_process = self.process.process_chunks(result_split)
|
16
|
-
|
17
|
-
return result_process
|
hie_rag-0.2.2.dist-info/METADATA
DELETED
@@ -1,48 +0,0 @@
|
|
1
|
-
Metadata-Version: 2.4
|
2
|
-
Name: hie_rag
|
3
|
-
Version: 0.2.2
|
4
|
-
Summary: A hierarchical RAG framework for chunks retrieval.
|
5
|
-
License: Apache License
|
6
|
-
Version 2.0, January 2004
|
7
|
-
http://www.apache.org/licenses/
|
8
|
-
|
9
|
-
Copyright [2025] [Chih-Han Chung]
|
10
|
-
|
11
|
-
Licensed under the Apache License, Version 2.0 (the "License");
|
12
|
-
you may not use this file except in compliance with the License.
|
13
|
-
You may obtain a copy of the License at
|
14
|
-
|
15
|
-
http://www.apache.org/licenses/LICENSE-2.0
|
16
|
-
|
17
|
-
Unless required by applicable law or agreed to in writing, software
|
18
|
-
distributed under the License is distributed on an "AS IS" BASIS,
|
19
|
-
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
20
|
-
See the License for the specific language governing permissions and
|
21
|
-
limitations under the License.
|
22
|
-
Description-Content-Type: text/markdown
|
23
|
-
License-File: LICENSE
|
24
|
-
Requires-Dist: python-dotenv==1.0.1
|
25
|
-
Requires-Dist: werkzeug==3.1.3
|
26
|
-
Requires-Dist: numpy==2.2.3
|
27
|
-
Requires-Dist: markitdown==0.0.1a3
|
28
|
-
Requires-Dist: openai==1.66.3
|
29
|
-
Requires-Dist: scikit-learn
|
30
|
-
Requires-Dist: tiktoken==0.8.0
|
31
|
-
Requires-Dist: langchain==0.3.13
|
32
|
-
Requires-Dist: langchain-ollama==0.3.3
|
33
|
-
Requires-Dist: chromadb==0.6.2
|
34
|
-
Dynamic: license-file
|
35
|
-
|
36
|
-
# Hie-RAG
|
37
|
-
|
38
|
-
Hie-RAG is a Hierarchical Retrieval-Augmented Generation framework for question answering.
|
39
|
-
|
40
|
-
# Introduction
|
41
|
-
|
42
|
-
It is designed to improve the retrieval quality and generation quality of the RAG model by introducing a hierarchical structure. The framework consists of two main parts: a higher-level summary retriever and a lower-level document retriever.
|
43
|
-
|
44
|
-
# Installation
|
45
|
-
|
46
|
-
```bash
|
47
|
-
pip install hie-rag
|
48
|
-
```
|
hie_rag-0.2.2.dist-info/RECORD
DELETED
@@ -1,14 +0,0 @@
|
|
1
|
-
hie_rag/__init__.py,sha256=p2glSTkCqGvMlcivcuKBStFh2C5adojaC9aGmF6nbhY,358
|
2
|
-
hie_rag/ai_client.py,sha256=VbGQ0e3vZNn8W2YoR15Vvq2r-MUs-TBRNLGiImT4QxU,1000
|
3
|
-
hie_rag/hie_rag.py,sha256=Nl_1WZM9IWhpNyZMvPzsae_u_xaCWEwrJgorZV-hp20,2741
|
4
|
-
hie_rag/process.py,sha256=Z4qpNmxSsxUJgnqJtw8cYWJTS6SxhRR7F7eX_akyVCU,2427
|
5
|
-
hie_rag/split.py,sha256=gEQVt57xWruT5e1psgSOnwuBrQngzri3S4H6ZvKzsw4,5082
|
6
|
-
hie_rag/split_and_process.py,sha256=PkFlnOF7nW4Zs47JTsGF4AY9VDOXz1AtxG9Die8_mQk,572
|
7
|
-
hie_rag/tree_index.py,sha256=iTa25ohMv5O0HYc5JtzIzVAIhNdVklYiAIJvqyE8sbM,2722
|
8
|
-
hie_rag/utils.py,sha256=GwGiQj-zc8-U9UXOFHTKkjHWHx8YTYquR27gsXJgzCE,4687
|
9
|
-
hie_rag/vectordb.py,sha256=iI73ujrONjDaHU66RNdHnD2PZWSppnjm0isIHPJEGAY,11068
|
10
|
-
hie_rag-0.2.2.dist-info/licenses/LICENSE,sha256=IwAxruLb1UG8F0KZtfnV6MJq10FRAxWM-XOTWkWsJt4,632
|
11
|
-
hie_rag-0.2.2.dist-info/METADATA,sha256=3vTI_zyvJxOOq8VrrchOAn0a7m8hwQPISnlholFi3u0,1698
|
12
|
-
hie_rag-0.2.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
13
|
-
hie_rag-0.2.2.dist-info/top_level.txt,sha256=tN2S3VpMUl6oLWL9sN4xIh4o2na_zjnW8rHiwPFf0T8,8
|
14
|
-
hie_rag-0.2.2.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|