hie-rag 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hie_rag/hie_rag.py +5 -0
- hie_rag/process.py +2 -2
- hie_rag/split.py +86 -77
- hie_rag/tree_index.py +4 -4
- hie_rag/utils.py +51 -3
- {hie_rag-0.2.0.dist-info → hie_rag-0.2.2.dist-info}/METADATA +1 -1
- hie_rag-0.2.2.dist-info/RECORD +14 -0
- {hie_rag-0.2.0.dist-info → hie_rag-0.2.2.dist-info}/WHEEL +1 -1
- hie_rag-0.2.0.dist-info/RECORD +0 -14
- {hie_rag-0.2.0.dist-info → hie_rag-0.2.2.dist-info}/licenses/LICENSE +0 -0
- {hie_rag-0.2.0.dist-info → hie_rag-0.2.2.dist-info}/top_level.txt +0 -0
hie_rag/hie_rag.py
CHANGED
@@ -15,18 +15,23 @@ class HieRag:
 
     def process_and_save_index_stream(self, file_name: str, uploaded_file: bytes, min_chunk_size, max_chunk_size):
         yield {"status": "🔍 Extracting text..."}
+        print(f"Extracting text from {file_name}")
         extracted_text = self.utils.extract_text(uploaded_file)
 
         yield {"status": "✂️ Splitting into chunks..."}
+        print(f"Splitting text into chunks with min size {min_chunk_size} and max size {max_chunk_size}")
         result_split = self.split.split(extracted_text, min_chunk_size=min_chunk_size, max_chunk_size=max_chunk_size)
 
         yield {"status": "🧠 Processing chunks..."}
+        print(f"Processing {len(result_split)} chunks")
         result_process = self.process.process_chunks(result_split)
 
         yield {"status": "🌲 Building tree index..."}
+        print(f"Building tree index with {len(result_process)} chunks")
         tree_index = self.tree_index.tree_index(file_name = file_name, chunk_metadata=result_process)
 
         yield {"status": "💾 Saving to vector DB..."}
+        print(f"Saving tree index with {len(tree_index.get('chunks', []))} chunks to vector DB")
         save_result = self.vector_db.save_index(tree_index)
 
         file_id = save_result.get("file_id", "unknown")
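The five new print calls trace each stage of the streaming pipeline to stdout. A minimal consumer sketch of process_and_save_index_stream (the HieRag import path and constructor arguments here are assumptions, not shown in this diff):

from hie_rag.hie_rag import HieRag

rag = HieRag(base_url="http://localhost:11434")  # hypothetical constructor arguments

with open("paper.pdf", "rb") as f:
    data = f.read()

# Each yielded dict carries a human-readable status for progress reporting
for update in rag.process_and_save_index_stream(
    file_name="paper.pdf",
    uploaded_file=data,
    min_chunk_size=300,
    max_chunk_size=500,
):
    print(update.get("status", update))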
hie_rag/process.py
CHANGED
@@ -9,8 +9,8 @@ from .utils import Utils
 
 
 class Process:
-    def __init__(self, base_url=None):
-        self.client = ChatOllama(model=
+    def __init__(self, base_url=None, model="llama3.2:latest"):
+        self.client = ChatOllama(model=model)
         self.utils = Utils(base_url=base_url)
 
     def _generate_metadata(self, chunk: str) -> Dict:
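Process now exposes the chat model as a keyword while keeping the old behavior as the default. A sketch (the base_url value and the alternative model tag are assumptions):

from hie_rag.process import Process

p_default = Process(base_url="http://localhost:11434")  # uses "llama3.2:latest"
p_custom = Process(base_url="http://localhost:11434", model="llama3.1:8b")  # hypothetical Ollama tag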
hie_rag/split.py
CHANGED
@@ -1,4 +1,6 @@
-
+import re
+from collections import deque
+from typing import List, Tuple
 
 from .utils import Utils
 
@@ -10,6 +12,20 @@ class Split:
         """
         self.utils = Utils(base_url=base_url)
 
+    def _custom_split(self, text: str):
+        stripped = text.strip()
+        # Use blank lines as the paragraph split points
+        raw_paragraphs = re.split(r'\n\s*\n+', stripped)
+
+        result = []
+        for para in raw_paragraphs:
+            # Turn every newline inside a paragraph into a space
+            single_line = para.replace('\r\n', ' ').replace('\r', ' ').replace('\n', ' ')
+            cleaned = single_line.strip()
+            if cleaned:
+                result.append(cleaned)
+        return result
+
     def _split_large_chunk(self, paragraphs: List[str], embeddings: List[List[float]]) -> (List[str], List[str]):
         """
         Splits 'paragraphs' by finding the least similar boundary using 'embeddings'
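To see what the new _custom_split produces, here is the same logic restated as a standalone function with a hypothetical input:

import re

def custom_split(text: str) -> list:
    # Blank lines delimit paragraphs; newlines inside a paragraph become spaces
    paragraphs = re.split(r'\n\s*\n+', text.strip())
    cleaned = [p.replace('\r\n', ' ').replace('\r', ' ').replace('\n', ' ').strip()
               for p in paragraphs]
    return [p for p in cleaned if p]

sample = "First line\nof paragraph one.\n\n\nParagraph two."
print(custom_split(sample))
# -> ['First line of paragraph one.', 'Paragraph two.']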
@@ -25,8 +41,9 @@
             return paragraphs, []
 
         # Find the least similar consecutive boundary
-
-
+        window_size = 3
+        split_index = self.utils.get_windowed_least_similar(embeddings, window_size=window_size)
+
         left_part = paragraphs[:split_index + 1]
         right_part = paragraphs[split_index + 1:]
         return left_part, right_part
@@ -37,84 +54,76 @@
         min_chunk_size: int = 300,
         max_chunk_size: int = 500
     ) -> List[str]:
-
-
-
-
-
-
-        if not paragraphs:
+
+        # 1) Build a deque of triples, so we never mutate three separate lists:
+        # paras = [p.strip() for p in extracted_text.split("\n\n") if p.strip()]
+        paras = self._custom_split(extracted_text)
+
+        if not paras:
             return []
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                else:
-                    # This paragraph alone might exceed max_chunk_size -> handle as you see fit
+        tokens = [self.utils.count_tokens(p) for p in paras]
+        embs = self.utils.list_embeddings(paras)
+        D: deque[Tuple[str,List[float],int]] = deque(
+            zip(paras, embs, tokens)
+        )
+
+        final_chunks: List[str] = []
+
+        # 2) As long as there’s anything left in D, build one chunk at a time:
+        while D:
+            cur_paras: List[str] = []
+            cur_embs: List[List[float]] = []
+            cur_tokens: List[int] = []
+            total_tokens = 0
+
+            # 2a) Guarantee we hit at least min_chunk_size
+            while D and total_tokens < min_chunk_size:
+                p, e, t = D.popleft()
+                # if even this one p would bust max, you might choose to take it alone
+                if total_tokens + t > max_chunk_size and total_tokens > 0:
+                    # push it back for the next round
+                    D.appendleft((p,e,t))
                     break
-
-
-
-
-
+                cur_paras.append(p)
+                cur_embs .append(e)
+                cur_tokens.append(t)
+                total_tokens += t
+
+            # if we ran out before min and have something -> emit it
+            if total_tokens < min_chunk_size and not D:
+                final_chunks.append(" ".join(cur_paras))
                 break
 
-            #
-            while
-
-
-
-
-
-
+            # 2b) Greedily fill until just under max_chunk_size
+            while D and total_tokens + D[0][2] <= max_chunk_size:
+                p, e, t = D.popleft()
+                cur_paras.append(p)
+                cur_embs .append(e)
+                cur_tokens.append(t)
+                total_tokens += t
+
+            # 3) Now we have between min and max tokens: split at the least-similar boundary
+            if cur_paras:
+                left, right = self._split_large_chunk(cur_paras, cur_embs)
+
+                # Count tokens in “left” to see if it meets min_chunk_size
+                left_token_count = sum(self.utils.count_tokens(p) for p in left)
+
+                if left_token_count >= min_chunk_size:
+                    # If left is big enough, emit it
+                    final_chunks.append(" ".join(left))
+
+                    # Push “right” (the remainder) back onto D for subsequent chunks
+                    for rp, re, rt in reversed(list(zip(
+                        cur_paras[len(left):],
+                        cur_embs [len(left):],
+                        cur_tokens[len(left):]
+                    ))):
+                        D.appendleft((rp, re, rt))
                 else:
-
-
-
-            if chunk_paragraphs:
-                # 3) Split at the "least similar" boundary
-                left_part, right_part = self._split_large_chunk(
-                    chunk_paragraphs, chunk_embeddings
-                )
-
-                # We'll figure out how many paragraphs ended up in the left part
-                used_count = len(left_part)
-                leftover_count = len(right_part)
-
-                # Store left side
-                final_chunks.append(" ".join(left_part))
-
-                # If there's leftover, reinsert it into the main lists
-                if leftover_count > 0:
-                    # Slices for leftover
-                    leftover_embeddings = chunk_embeddings[used_count:]
-                    leftover_tokens = chunk_tokens[used_count:]
-
-                    # Re-insert them at index=idx
-                    paragraphs[idx:idx] = right_part
-                    paragraphs_embeddings[idx:idx] = leftover_embeddings
-                    paragraphs_tokens[idx:idx] = leftover_tokens
-
-                    # Recompute n, in case the paragraphs list has grown
-                    n = len(paragraphs)
+                    # If “left” is too small, just emit the entire cur_paras as one chunk
+                    final_chunks.append(" ".join(cur_paras))
+                    # (We do NOT push anything back, because cur_paras is fully consumed.)
 
-        return final_chunks
+        return final_chunks
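The rewritten split() drains a deque in three steps per chunk: take paragraphs until min_chunk_size is reached, greedily top up toward max_chunk_size, then cut back to the weakest semantic boundary and requeue the remainder. A usage sketch (the base_url and input text are assumptions; embeddings come from the configured endpoint):

from hie_rag.split import Split

splitter = Split(base_url="http://localhost:11434")  # assumed local endpoint

text = "Intro paragraph...\n\nMethod details...\n\nMore details...\n\nConclusion..."
chunks = splitter.split(text, min_chunk_size=300, max_chunk_size=500)
# Each chunk is a single string that lands between min and max tokens where
# possible, cut at the least-similar paragraph boundary.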
hie_rag/tree_index.py
CHANGED
@@ -10,8 +10,8 @@ from .utils import Utils
 
 
 class TreeIndex:
-    def __init__(self, base_url: str):
-        self.client = ChatOllama(model=
+    def __init__(self, base_url: str, model="llama3.2:latest"):
+        self.client = ChatOllama(model=model)
         self.utils = Utils(base_url=base_url)
 
     def _convert_to_string(self, chunk_metadata: dict) -> str:
@@ -32,8 +32,8 @@ class TreeIndex:
 
         NOTE:
         1. 請輸出繁體中文
-        2. The summary should be concise
-        3. The summary should be
+        2. The summary should be concise with details and better than the individual summaries.
+        3. The summary should be long enough to cover all the main points of the text.
 
         Summaries:
         {summaries}
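TreeIndex gains the same optional model keyword as Process, again defaulting to "llama3.2:latest". A sketch (values are assumptions):

from hie_rag.tree_index import TreeIndex

ti = TreeIndex(base_url="http://localhost:11434", model="llama3.2:latest")  # explicit default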
hie_rag/utils.py
CHANGED
@@ -12,7 +12,7 @@ from .ai_client import AiClient
 
 
 class Utils:
-    def __init__(self, base_url
+    def __init__(self, base_url: str):
         # self.client = OpenAI(api_key=api_key)
         self.client = AiClient(base_url=base_url)
 
@@ -24,13 +24,16 @@ class Utils:
         # Accept both raw bytes and file-like objects with `.read()`
         if isinstance(uploaded_file, bytes):
             file_bytes = uploaded_file
+            suffix = ".bin"  # fallback generic extension
         elif hasattr(uploaded_file, "read"):
             file_bytes = uploaded_file.read()
+            filename = getattr(uploaded_file, "name", None) or getattr(uploaded_file, "filename", None)
+            suffix = os.path.splitext(filename)[-1] if filename else ".bin"
         else:
             raise TypeError("Unsupported file type: must be bytes or file-like object")
 
         # Write to temp file for MarkItDown to process
-        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
+        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
             temp_file_path = temp_file.name
             temp_file.write(file_bytes)
 
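Because the temp file now keeps the original extension, MarkItDown can choose the right converter for it. A sketch of both input paths (the Utils endpoint is an assumption):

import io
from hie_rag.utils import Utils

utils = Utils(base_url="http://localhost:11434")  # assumed endpoint

buf = io.BytesIO(b"%PDF-1.4 ...")  # hypothetical PDF bytes
buf.name = "report.pdf"  # BytesIO has no .name by default; set it so the suffix can be inferred
text_from_file = utils.extract_text(buf)  # temp file gets a ".pdf" suffix

text_from_bytes = utils.extract_text(b"raw bytes")  # raw bytes fall back to the ".bin" suffix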
@@ -71,4 +74,49 @@ class Utils:
         # Find the index where consecutive similarity is lowest
         split_index = np.argmin(consecutive_similarities)
 
-        return split_index
+        return split_index
+
+    def get_windowed_least_similar(
+        self,
+        embeddings: list,
+        window_size: int = 3
+    ) -> int:
+        """
+        Slide a window across the embeddings: for each candidate split position i
+        (0 <= i < len-1), average the up-to-window_size vectors in [max(0, i-window_size+1) .. i]
+        and the up-to-window_size vectors in [i+1 .. min(len, i+window_size)], take the
+        cosine similarity of the two averages, and return the i with the lowest similarity.
+        """
+        if len(embeddings) < 2:
+            # Nothing to split
+            return 0
+
+        # Convert the list of lists into a numpy array (shape: [n_sentences, dim_emb])
+        embs = np.array(embeddings)
+        n = embs.shape[0]
+
+        best_index = 0
+        lowest_sim = float('inf')
+
+        for i in range(n - 1):
+            # Left window: from pre_start up to i (inclusive)
+            pre_start = max(0, i - window_size + 1)
+            pre_group = embs[pre_start : i + 1]  # shape: (<=window_size, dim)
+
+            # Right window: from i+1 up to post_end-1
+            post_end = min(n, i + 1 + window_size)
+            post_group = embs[i + 1 : post_end]  # shape: (<=window_size, dim)
+
+            # Average each window
+            # (summing with np.sum(...) would also work, but averaging is more common and keeps scales consistent)
+            pre_avg = np.mean(pre_group, axis=0).reshape(1, -1)  # shape: (1, dim)
+            post_avg = np.mean(post_group, axis=0).reshape(1, -1)  # shape: (1, dim)
+
+            # Cosine similarity between the two window averages
+            sim = float(cosine_similarity(pre_avg, post_avg)[0][0])
+
+            if sim < lowest_sim:
+                lowest_sim = sim
+                best_index = i
+
+        return best_index
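get_windowed_least_similar smooths out single-sentence noise by averaging up to window_size embeddings on each side of every candidate boundary. A self-contained toy run of the same windowed logic (the vectors are illustrative):

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def windowed_least_similar(embs: np.ndarray, window_size: int = 3) -> int:
    best_index, lowest_sim = 0, float("inf")
    for i in range(len(embs) - 1):
        pre = embs[max(0, i - window_size + 1): i + 1].mean(axis=0, keepdims=True)
        post = embs[i + 1: i + 1 + window_size].mean(axis=0, keepdims=True)
        sim = float(cosine_similarity(pre, post)[0][0])
        if sim < lowest_sim:
            lowest_sim, best_index = sim, i
    return best_index

# Two "topics": the first two vectors point one way, the last two another
embs = np.array([[1.0, 0.0], [0.9, 0.1], [0.0, 1.0], [0.1, 0.9]])
print(windowed_least_similar(embs))  # -> 1, the boundary between the two topics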
hie_rag-0.2.2.dist-info/RECORD
ADDED
@@ -0,0 +1,14 @@
+hie_rag/__init__.py,sha256=p2glSTkCqGvMlcivcuKBStFh2C5adojaC9aGmF6nbhY,358
+hie_rag/ai_client.py,sha256=VbGQ0e3vZNn8W2YoR15Vvq2r-MUs-TBRNLGiImT4QxU,1000
+hie_rag/hie_rag.py,sha256=Nl_1WZM9IWhpNyZMvPzsae_u_xaCWEwrJgorZV-hp20,2741
+hie_rag/process.py,sha256=Z4qpNmxSsxUJgnqJtw8cYWJTS6SxhRR7F7eX_akyVCU,2427
+hie_rag/split.py,sha256=gEQVt57xWruT5e1psgSOnwuBrQngzri3S4H6ZvKzsw4,5082
+hie_rag/split_and_process.py,sha256=PkFlnOF7nW4Zs47JTsGF4AY9VDOXz1AtxG9Die8_mQk,572
+hie_rag/tree_index.py,sha256=iTa25ohMv5O0HYc5JtzIzVAIhNdVklYiAIJvqyE8sbM,2722
+hie_rag/utils.py,sha256=GwGiQj-zc8-U9UXOFHTKkjHWHx8YTYquR27gsXJgzCE,4687
+hie_rag/vectordb.py,sha256=iI73ujrONjDaHU66RNdHnD2PZWSppnjm0isIHPJEGAY,11068
+hie_rag-0.2.2.dist-info/licenses/LICENSE,sha256=IwAxruLb1UG8F0KZtfnV6MJq10FRAxWM-XOTWkWsJt4,632
+hie_rag-0.2.2.dist-info/METADATA,sha256=3vTI_zyvJxOOq8VrrchOAn0a7m8hwQPISnlholFi3u0,1698
+hie_rag-0.2.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+hie_rag-0.2.2.dist-info/top_level.txt,sha256=tN2S3VpMUl6oLWL9sN4xIh4o2na_zjnW8rHiwPFf0T8,8
+hie_rag-0.2.2.dist-info/RECORD,,
hie_rag-0.2.0.dist-info/RECORD
DELETED
@@ -1,14 +0,0 @@
-hie_rag/__init__.py,sha256=p2glSTkCqGvMlcivcuKBStFh2C5adojaC9aGmF6nbhY,358
-hie_rag/ai_client.py,sha256=VbGQ0e3vZNn8W2YoR15Vvq2r-MUs-TBRNLGiImT4QxU,1000
-hie_rag/hie_rag.py,sha256=KB44QBz3tE0Eq_FJw9pvKynCfjyAuulaMFYKk6bzjug,2359
-hie_rag/process.py,sha256=D_vMnF84ingLb4_KoC77uLQXSa6FwEpR30RGukG2H9U,2414
-hie_rag/split.py,sha256=My7QQ_pPiJD0TvwRzm2MgonMMA79-r3Vifwp1xLWX4I,4905
-hie_rag/split_and_process.py,sha256=PkFlnOF7nW4Zs47JTsGF4AY9VDOXz1AtxG9Die8_mQk,572
-hie_rag/tree_index.py,sha256=TuRi9-M2aiD46ciS-iwIJYDc9nXq7i7mwxwVbMXk5Lo,2668
-hie_rag/utils.py,sha256=F5bqx147yT37z080MPWPrwzOa0tGEAWmvNFgjXpe4ZA,2729
-hie_rag/vectordb.py,sha256=iI73ujrONjDaHU66RNdHnD2PZWSppnjm0isIHPJEGAY,11068
-hie_rag-0.2.0.dist-info/licenses/LICENSE,sha256=IwAxruLb1UG8F0KZtfnV6MJq10FRAxWM-XOTWkWsJt4,632
-hie_rag-0.2.0.dist-info/METADATA,sha256=Oym7z46OyhT_Gp7unhX1rsYlFQi9UuOBU5VRsko1m_A,1698
-hie_rag-0.2.0.dist-info/WHEEL,sha256=zaaOINJESkSfm_4HQVc5ssNzHCPXhJm0kEUakpsEHaU,91
-hie_rag-0.2.0.dist-info/top_level.txt,sha256=tN2S3VpMUl6oLWL9sN4xIh4o2na_zjnW8rHiwPFf0T8,8
-hie_rag-0.2.0.dist-info/RECORD,,
{hie_rag-0.2.0.dist-info → hie_rag-0.2.2.dist-info}/licenses/LICENSE
File without changes
{hie_rag-0.2.0.dist-info → hie_rag-0.2.2.dist-info}/top_level.txt
File without changes