hie-rag 0.2.2__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hie_rag/hie_rag.py CHANGED
@@ -16,7 +16,7 @@ class HieRag:
16
16
  def process_and_save_index_stream(self, file_name: str, uploaded_file: bytes, min_chunk_size, max_chunk_size):
17
17
  yield {"status": "🔍 Extracting text..."}
18
18
  print(f"Extracting text from {file_name}")
19
- extracted_text = self.utils.extract_text(uploaded_file)
19
+ extracted_text = self.utils.extract_text(file_name=file_name, uploaded_bytes=uploaded_file)
20
20
 
21
21
  yield {"status": "✂️ Splitting into chunks..."}
22
22
  print(f"Splitting text into chunks with min size {min_chunk_size} and max size {max_chunk_size}")
hie_rag/utils.py CHANGED
@@ -16,36 +16,26 @@ class Utils:
16
16
  # self.client = OpenAI(api_key=api_key)
17
17
  self.client = AiClient(base_url=base_url)
18
18
 
19
- def extract_text(self, uploaded_file: bytes):
19
+ def extract_text(self, uploaded_bytes: bytes, file_name: str):
20
20
  """Extract text from an uploaded file using MarkItDown."""
21
- # md = MarkItDown(llm_client=self.client, llm_model="gpt-4o")
22
21
  md = MarkItDown()
23
22
 
24
- # Accept both raw bytes and file-like objects with `.read()`
25
- if isinstance(uploaded_file, bytes):
26
- file_bytes = uploaded_file
27
- suffix = ".bin" # fallback generic extension
28
- elif hasattr(uploaded_file, "read"):
29
- file_bytes = uploaded_file.read()
30
- filename = getattr(uploaded_file, "name", None) or getattr(uploaded_file, "filename", None)
31
- suffix = os.path.splitext(filename)[-1] if filename else ".bin"
32
- else:
33
- raise TypeError("Unsupported file type: must be bytes or file-like object")
34
-
35
- # Write to temp file for MarkItDown to process
36
- with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
37
- temp_file_path = temp_file.name
38
- temp_file.write(file_bytes)
23
+ # derive a real suffix from the filename
24
+ suffix = os.path.splitext(file_name)[1].lower() or ".txt"
25
+
26
+ # write to temp file
27
+ with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
28
+ tmp.write(uploaded_bytes)
29
+ tmp_path = tmp.name
39
30
 
40
31
  try:
41
- # Redirect stderr to suppress native print warnings like "CropBox missing"
42
32
  with contextlib.redirect_stderr(io.StringIO()):
43
- extracted_text = md.convert(temp_file_path)
33
+ result = md.convert(tmp_path)
44
34
  finally:
45
- # Clean up the temporary file
46
- os.remove(temp_file_path)
35
+ os.remove(tmp_path)
47
36
 
48
- return extracted_text.text_content
37
+ # depending on MarkItDown version this may return a str or an object
38
+ return getattr(result, "text_content", result)
49
39
 
50
40
  def count_tokens(self, text: str, encoding="cl100k_base") -> int:
51
41
  """Count tokens in text using tiktoken"""
@@ -0,0 +1,147 @@
1
+ Metadata-Version: 2.4
2
+ Name: hie_rag
3
+ Version: 0.2.4
4
+ Summary: A hierarchical RAG framework for chunks retrieval.
5
+ License: Apache License
6
+ Version 2.0, January 2004
7
+ http://www.apache.org/licenses/
8
+
9
+ Copyright [2025] [Chih-Han Chung]
10
+
11
+ Licensed under the Apache License, Version 2.0 (the "License");
12
+ you may not use this file except in compliance with the License.
13
+ You may obtain a copy of the License at
14
+
15
+ http://www.apache.org/licenses/LICENSE-2.0
16
+
17
+ Unless required by applicable law or agreed to in writing, software
18
+ distributed under the License is distributed on an "AS IS" BASIS,
19
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20
+ See the License for the specific language governing permissions and
21
+ limitations under the License.
22
+ Description-Content-Type: text/markdown
23
+ License-File: LICENSE
24
+ Requires-Dist: python-dotenv==1.0.1
25
+ Requires-Dist: werkzeug==3.1.3
26
+ Requires-Dist: numpy
27
+ Requires-Dist: markitdown==0.0.1a3
28
+ Requires-Dist: openai==1.66.3
29
+ Requires-Dist: scikit-learn
30
+ Requires-Dist: tiktoken==0.8.0
31
+ Requires-Dist: langchain==0.3.13
32
+ Requires-Dist: langchain-ollama==0.3.3
33
+ Requires-Dist: chromadb==0.6.2
34
+ Dynamic: license-file
35
+
36
+ # 📚 HieRAG – Hierarchical Retrieval-Augmented Generation
37
+
38
+ `hie_rag` is a modular, extensible Python package designed for **Hierarchical Retrieval-Augmented Generation (Hie-RAG)**. It enables you to extract, split, embed, summarize, and query documents using both chunk- and tree-level semantics, all backed by a vector database.
39
+
40
+ ---
41
+
42
+ ## ✅ Features
43
+
44
+ - PDF/DOCX/XLSX/CSV/PPT ingestion and intelligent semantic splitting
45
+ - Hierarchical summarization tree building
46
+ - Embedding-based similarity search
47
+ - Vector DB indexing and querying (e.g., Qdrant)
48
+ - Full streaming interface for frontend integration
49
+
50
+ ---
51
+
52
+ ## 📦 Components Used
53
+
54
+ | Module | Role |
55
+ | ----------- | -------------------------------------------------------------- |
56
+ | `HieRAG` | Main interface for processing, querying, and managing indexes. |
57
+ | `Split` | Split raw text into chunks |
58
+ | `Process` | Adds metadata and embeddings to chunks |
59
+ | `TreeIndex` | Builds tree-based hierarchical summaries |
60
+ | `Utils` | Text extraction and token handling |
61
+ | `Vectordb` | Stores and queries summaries/chunks |
62
+ | `AiClient` | Handles embedding API (e.g., OpenAI, HuggingFace, Ollama) |
63
+
64
+ ---
65
+
66
+ ## 🛠 Installation
67
+
68
+ ```bash
69
+ pip install hie-rag
70
+ ```
71
+
72
+ ## ⏯︎ How to Use
73
+
74
+ ### Initialize HieRAG
75
+
76
+ ```python
77
+ from hie_rag import HieRag
78
+
79
+ hierag = HieRag(base_url="http://localhost:11434")
80
+ ```
81
+
82
+ > [!NOTE]
83
+ > Ensure you have set up an AI server. You should have a chat model and an embedding model running.
84
+
85
+ ### Process and Index a File
86
+
87
+ ```python
88
+ with open("sample.pdf", "rb") as f:
89
+ file_bytes = f.read()
90
+
91
+ for status in hierag.process_and_save_index_stream(
92
+ file_name="sample.pdf",
93
+ uploaded_file=file_bytes,
94
+ min_chunk_size=300,
95
+ max_chunk_size=500
96
+ ):
97
+ print(status)
98
+ ```
99
+
100
+ > ```JSON
101
+ > {
102
+ > "status": "✅ Done",
103
+ > "file_id": "abc123",
104
+ > "summary_count": 5,
105
+ > "chunk_count": 22
106
+ > }
107
+ > ```
108
+
109
+ ### Query the Summaries or Chunks
110
+
111
+ #### Query Summaries by text:
112
+
113
+ ```python
114
+ results = hierag.query_summaries_by_text("What is the contract duration?")
115
+ ```
116
+
117
+ #### Query Chunks by text:
118
+
119
+ ```python
120
+ results = hierag.query_chunks_by_text("Explain clause 3.4", file_id="abc123")
121
+ ```
122
+
123
+ ### List & Manage Indexed Files
124
+
125
+ #### List All Indexed Files
126
+
127
+ ```python
128
+ hierag.list_summaries()
129
+ ```
130
+
131
+ #### View Chunks of a File
132
+
133
+ ```python
134
+ hierag.list_chunks(file_id="abc123")
135
+ ```
136
+
137
+ #### Delete a File Index
138
+
139
+ ```python
140
+ hierag.delete_index(file_id="abc123")
141
+ ```
142
+
143
+ #### Get the Summary of a File
144
+
145
+ ```python
146
+ hierag.get_summary(file_id="abc123")
147
+ ```
@@ -0,0 +1,13 @@
1
+ hie_rag/__init__.py,sha256=p2glSTkCqGvMlcivcuKBStFh2C5adojaC9aGmF6nbhY,358
2
+ hie_rag/ai_client.py,sha256=VbGQ0e3vZNn8W2YoR15Vvq2r-MUs-TBRNLGiImT4QxU,1000
3
+ hie_rag/hie_rag.py,sha256=fprBRIKlbuDqSdBoeupFie-f8i1gkCz0VUPB5lB0mXc,2777
4
+ hie_rag/process.py,sha256=Z4qpNmxSsxUJgnqJtw8cYWJTS6SxhRR7F7eX_akyVCU,2427
5
+ hie_rag/split.py,sha256=gEQVt57xWruT5e1psgSOnwuBrQngzri3S4H6ZvKzsw4,5082
6
+ hie_rag/tree_index.py,sha256=iTa25ohMv5O0HYc5JtzIzVAIhNdVklYiAIJvqyE8sbM,2722
7
+ hie_rag/utils.py,sha256=_4TGiHuJ-Xo4JEEEdOjp4d1zxw6dNVsxROcom-vr7uU,4059
8
+ hie_rag/vectordb.py,sha256=iI73ujrONjDaHU66RNdHnD2PZWSppnjm0isIHPJEGAY,11068
9
+ hie_rag-0.2.4.dist-info/licenses/LICENSE,sha256=IwAxruLb1UG8F0KZtfnV6MJq10FRAxWM-XOTWkWsJt4,632
10
+ hie_rag-0.2.4.dist-info/METADATA,sha256=-4V6CrFey8UwqnSdWGnM6WmhBRAIRhWNYPI07cX3hpY,3988
11
+ hie_rag-0.2.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
12
+ hie_rag-0.2.4.dist-info/top_level.txt,sha256=tN2S3VpMUl6oLWL9sN4xIh4o2na_zjnW8rHiwPFf0T8,8
13
+ hie_rag-0.2.4.dist-info/RECORD,,
@@ -1,17 +0,0 @@
1
- from hie_rag.process import Process
2
- from hie_rag.split import Split
3
- from hie_rag.utils import Utils
4
-
5
-
6
- class SplitAndProcess:
7
- def __init__(self, base_url: str):
8
- self.split = Split(base_url=base_url)
9
- self.utils = Utils(base_url=base_url)
10
- self.process = Process(base_url=base_url)
11
-
12
- def split_and_process(self, uploaded_file):
13
- extracted_text = self.utils.extract_text(uploaded_file)
14
- result_split = self.split.split(extracted_text)
15
- result_process = self.process.process_chunks(result_split)
16
-
17
- return result_process
@@ -1,48 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: hie_rag
3
- Version: 0.2.2
4
- Summary: A hierarchical RAG framework for chunks retrieval.
5
- License: Apache License
6
- Version 2.0, January 2004
7
- http://www.apache.org/licenses/
8
-
9
- Copyright [2025] [Chih-Han Chung]
10
-
11
- Licensed under the Apache License, Version 2.0 (the "License");
12
- you may not use this file except in compliance with the License.
13
- You may obtain a copy of the License at
14
-
15
- http://www.apache.org/licenses/LICENSE-2.0
16
-
17
- Unless required by applicable law or agreed to in writing, software
18
- distributed under the License is distributed on an "AS IS" BASIS,
19
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20
- See the License for the specific language governing permissions and
21
- limitations under the License.
22
- Description-Content-Type: text/markdown
23
- License-File: LICENSE
24
- Requires-Dist: python-dotenv==1.0.1
25
- Requires-Dist: werkzeug==3.1.3
26
- Requires-Dist: numpy==2.2.3
27
- Requires-Dist: markitdown==0.0.1a3
28
- Requires-Dist: openai==1.66.3
29
- Requires-Dist: scikit-learn
30
- Requires-Dist: tiktoken==0.8.0
31
- Requires-Dist: langchain==0.3.13
32
- Requires-Dist: langchain-ollama==0.3.3
33
- Requires-Dist: chromadb==0.6.2
34
- Dynamic: license-file
35
-
36
- # Hie-RAG
37
-
38
- Hie-RAG is a Hierarchical Retrieval-Augmented Generation framework for question answering.
39
-
40
- # Introduction
41
-
42
- It is designed to improve the retrieval quality and generation quality of the RAG model by introducing a hierarchical structure. The framework consists of two main parts: a higher-level summary retriever and a lower-level document retriever.
43
-
44
- # Installation
45
-
46
- ```bash
47
- pip install hie-rag
48
- ```
@@ -1,14 +0,0 @@
1
- hie_rag/__init__.py,sha256=p2glSTkCqGvMlcivcuKBStFh2C5adojaC9aGmF6nbhY,358
2
- hie_rag/ai_client.py,sha256=VbGQ0e3vZNn8W2YoR15Vvq2r-MUs-TBRNLGiImT4QxU,1000
3
- hie_rag/hie_rag.py,sha256=Nl_1WZM9IWhpNyZMvPzsae_u_xaCWEwrJgorZV-hp20,2741
4
- hie_rag/process.py,sha256=Z4qpNmxSsxUJgnqJtw8cYWJTS6SxhRR7F7eX_akyVCU,2427
5
- hie_rag/split.py,sha256=gEQVt57xWruT5e1psgSOnwuBrQngzri3S4H6ZvKzsw4,5082
6
- hie_rag/split_and_process.py,sha256=PkFlnOF7nW4Zs47JTsGF4AY9VDOXz1AtxG9Die8_mQk,572
7
- hie_rag/tree_index.py,sha256=iTa25ohMv5O0HYc5JtzIzVAIhNdVklYiAIJvqyE8sbM,2722
8
- hie_rag/utils.py,sha256=GwGiQj-zc8-U9UXOFHTKkjHWHx8YTYquR27gsXJgzCE,4687
9
- hie_rag/vectordb.py,sha256=iI73ujrONjDaHU66RNdHnD2PZWSppnjm0isIHPJEGAY,11068
10
- hie_rag-0.2.2.dist-info/licenses/LICENSE,sha256=IwAxruLb1UG8F0KZtfnV6MJq10FRAxWM-XOTWkWsJt4,632
11
- hie_rag-0.2.2.dist-info/METADATA,sha256=3vTI_zyvJxOOq8VrrchOAn0a7m8hwQPISnlholFi3u0,1698
12
- hie_rag-0.2.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
13
- hie_rag-0.2.2.dist-info/top_level.txt,sha256=tN2S3VpMUl6oLWL9sN4xIh4o2na_zjnW8rHiwPFf0T8,8
14
- hie_rag-0.2.2.dist-info/RECORD,,