hie-rag 0.1.3__tar.gz → 0.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. {hie_rag-0.1.3 → hie_rag-0.2.2}/PKG-INFO +2 -2
  2. hie_rag-0.2.2/hie_rag/ai_client.py +26 -0
  3. {hie_rag-0.1.3 → hie_rag-0.2.2}/hie_rag/hie_rag.py +11 -6
  4. {hie_rag-0.1.3 → hie_rag-0.2.2}/hie_rag/process.py +4 -4
  5. hie_rag-0.2.2/hie_rag/split.py +129 -0
  6. {hie_rag-0.1.3 → hie_rag-0.2.2}/hie_rag/split_and_process.py +4 -4
  7. {hie_rag-0.1.3 → hie_rag-0.2.2}/hie_rag/tree_index.py +6 -6
  8. hie_rag-0.2.2/hie_rag/utils.py +122 -0
  9. {hie_rag-0.1.3 → hie_rag-0.2.2}/hie_rag/vectordb.py +2 -2
  10. {hie_rag-0.1.3 → hie_rag-0.2.2}/hie_rag.egg-info/PKG-INFO +2 -2
  11. {hie_rag-0.1.3 → hie_rag-0.2.2}/hie_rag.egg-info/SOURCES.txt +1 -1
  12. {hie_rag-0.1.3 → hie_rag-0.2.2}/hie_rag.egg-info/requires.txt +1 -1
  13. {hie_rag-0.1.3 → hie_rag-0.2.2}/pyproject.toml +2 -2
  14. {hie_rag-0.1.3 → hie_rag-0.2.2}/test/test-process.py +9 -5
  15. {hie_rag-0.1.3 → hie_rag-0.2.2}/test/test-split.py +9 -5
  16. hie_rag-0.2.2/test/test-utils.py +56 -0
  17. {hie_rag-0.1.3 → hie_rag-0.2.2}/test/test_split_and_process.py +7 -2
  18. hie_rag-0.1.3/hie_rag/app.py +0 -77
  19. hie_rag-0.1.3/hie_rag/split.py +0 -120
  20. hie_rag-0.1.3/hie_rag/utils.py +0 -74
  21. hie_rag-0.1.3/test/test-utils.py +0 -50
  22. {hie_rag-0.1.3 → hie_rag-0.2.2}/LICENSE +0 -0
  23. {hie_rag-0.1.3 → hie_rag-0.2.2}/README.md +0 -0
  24. {hie_rag-0.1.3 → hie_rag-0.2.2}/hie_rag/__init__.py +0 -0
  25. {hie_rag-0.1.3 → hie_rag-0.2.2}/hie_rag.egg-info/dependency_links.txt +0 -0
  26. {hie_rag-0.1.3 → hie_rag-0.2.2}/hie_rag.egg-info/top_level.txt +0 -0
  27. {hie_rag-0.1.3 → hie_rag-0.2.2}/setup.cfg +0 -0
  28. {hie_rag-0.1.3 → hie_rag-0.2.2}/test/test-vectordb.py +0 -0
  29. {hie_rag-0.1.3 → hie_rag-0.2.2}/test/test.py +0 -0
{hie_rag-0.1.3 → hie_rag-0.2.2}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: hie_rag
- Version: 0.1.3
+ Version: 0.2.2
  Summary: A hierarchical RAG framework for chunks retrieval.
  License: Apache License
  Version 2.0, January 2004
@@ -29,7 +29,7 @@ Requires-Dist: openai==1.66.3
  Requires-Dist: scikit-learn
  Requires-Dist: tiktoken==0.8.0
  Requires-Dist: langchain==0.3.13
- Requires-Dist: langchain-openai==0.2.13
+ Requires-Dist: langchain-ollama==0.3.3
  Requires-Dist: chromadb==0.6.2
  Dynamic: license-file

hie_rag-0.2.2/hie_rag/ai_client.py (new file)

@@ -0,0 +1,26 @@
+ import requests
+
+
+ class AiClient:
+     def __init__(self, base_url="http://localhost:11434"):
+         self.base_url = base_url
+         self.headers = {"Content-Type": "application/json"}
+
+     def get_embedding(self, text: str, model="nomic-embed-text") -> list:
+         url = f"{self.base_url}/api/embeddings"
+         payload = {
+             "model": model,
+             "prompt": text
+         }
+         response = requests.post(url, json=payload, headers=self.headers, timeout=60)
+         response.raise_for_status()
+         data = response.json()
+
+         # Extract embedding, adapt if your API response structure differs
+         embedding = data.get("embedding") or (data.get("data") and data["data"][0].get("embedding"))
+         if embedding is None:
+             raise ValueError("Embedding not found in Ollama response")
+         return embedding
+
+     def list_embeddings(self, texts: list, model="nomic-embed-text") -> list:
+         return [self.get_embedding(text, model=model) for text in texts]
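The new AiClient drops the OpenAI SDK and calls Ollama's /api/embeddings endpoint directly over HTTP. A minimal usage sketch (not part of the package; it assumes a local Ollama server on the default port with the nomic-embed-text model already pulled):

    from hie_rag.ai_client import AiClient

    client = AiClient(base_url="http://localhost:11434")
    vector = client.get_embedding("hello world")                      # one embedding vector
    vectors = client.list_embeddings(["first text", "second text"])   # one vector per input
    print(len(vector), len(vectors))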
{hie_rag-0.1.3 → hie_rag-0.2.2}/hie_rag/hie_rag.py

@@ -6,27 +6,32 @@ from hie_rag.vectordb import Vectordb


  class HieRag:
-     def __init__(self, api_key=None, path="./db", collection_name="db_collection"):
-         self.split = Split(api_key=api_key)
-         self.utils = Utils(api_key=api_key)
-         self.tree_index = TreeIndex(api_key=api_key)
-         self.process = Process(api_key=api_key)
-         self.vector_db = Vectordb(path=path, api_key=api_key, collection_name=collection_name)
+     def __init__(self, base_url, path="./db", collection_name="db_collection"):
+         self.split = Split(base_url=base_url)
+         self.utils = Utils(base_url=base_url)
+         self.tree_index = TreeIndex(base_url=base_url)
+         self.process = Process(base_url=base_url)
+         self.vector_db = Vectordb(path=path, base_url=base_url, collection_name=collection_name)

      def process_and_save_index_stream(self, file_name: str, uploaded_file: bytes, min_chunk_size, max_chunk_size):
          yield {"status": "🔍 Extracting text..."}
+         print(f"Extracting text from {file_name}")
          extracted_text = self.utils.extract_text(uploaded_file)

          yield {"status": "✂️ Splitting into chunks..."}
+         print(f"Splitting text into chunks with min size {min_chunk_size} and max size {max_chunk_size}")
          result_split = self.split.split(extracted_text, min_chunk_size=min_chunk_size, max_chunk_size=max_chunk_size)

          yield {"status": "🧠 Processing chunks..."}
+         print(f"Processing {len(result_split)} chunks")
          result_process = self.process.process_chunks(result_split)

          yield {"status": "🌲 Building tree index..."}
+         print(f"Building tree index with {len(result_process)} chunks")
          tree_index = self.tree_index.tree_index(file_name = file_name, chunk_metadata=result_process)

          yield {"status": "💾 Saving to vector DB..."}
+         print(f"Saving tree index with {len(tree_index.get('chunks', []))} chunks to vector DB")
          save_result = self.vector_db.save_index(tree_index)

          file_id = save_result.get("file_id", "unknown")
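HieRag now wires every component to a base_url for the local model server instead of an OpenAI api_key, and the streaming pipeline prints progress in addition to yielding status dicts. A hedged call-site sketch (file name, db path, and chunk sizes are illustrative placeholders, not values from the package):

    from hie_rag.hie_rag import HieRag

    rag = HieRag(base_url="http://localhost:11434", path="./db", collection_name="db_collection")
    with open("report.pdf", "rb") as f:
        for status in rag.process_and_save_index_stream(
            file_name="report.pdf",
            uploaded_file=f.read(),
            min_chunk_size=300,
            max_chunk_size=500,
        ):
            print(status)  # e.g. {"status": "🔍 Extracting text..."}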
{hie_rag-0.1.3 → hie_rag-0.2.2}/hie_rag/process.py

@@ -1,7 +1,7 @@
  from typing import Dict, List

  from langchain_core.prompts import PromptTemplate
- from langchain_openai import ChatOpenAI
+ from langchain_ollama import ChatOllama
  from pydantic import Field
  from typing_extensions import TypedDict

@@ -9,9 +9,9 @@ from .utils import Utils


  class Process:
-     def __init__(self, api_key=None):
-         self.client = ChatOpenAI(temperature=0, model="gpt-4o", api_key=api_key)
-         self.utils = Utils(api_key=api_key)
+     def __init__(self, base_url=None, model="llama3.2:latest"):
+         self.client = ChatOllama(model=model)
+         self.utils = Utils(base_url=base_url)

      def _generate_metadata(self, chunk: str) -> Dict:
          """Generate metadata for a chunk using LangChain"""
hie_rag-0.2.2/hie_rag/split.py (new file)

@@ -0,0 +1,129 @@
+ import re
+ from collections import deque
+ from typing import List, Tuple
+
+ from .utils import Utils
+
+
+ class Split:
+     def __init__(self, base_url: str = None):
+         """
+         Initializes the Split object with default or user-defined thresholds.
+         """
+         self.utils = Utils(base_url=base_url)
+
+     def _custom_split(self, text: str):
+         stripped = text.strip()
+         # Use blank lines as paragraph split points
+         raw_paragraphs = re.split(r'\n\s*\n+', stripped)
+
+         result = []
+         for para in raw_paragraphs:
+             # Replace every line break inside the paragraph with a space
+             single_line = para.replace('\r\n', ' ').replace('\r', ' ').replace('\n', ' ')
+             cleaned = single_line.strip()
+             if cleaned:
+                 result.append(cleaned)
+         return result
+
+     def _split_large_chunk(self, paragraphs: List[str], embeddings: List[List[float]]) -> (List[str], List[str]):
+         """
+         Splits 'paragraphs' by finding the least similar boundary using 'embeddings'
+         (which are precomputed for these paragraphs only). Returns (left_part, right_part).
+         """
+         # If there are 0 or 1 paragraphs, no need to split
+         if len(paragraphs) < 2:
+             return paragraphs, []
+
+         # We'll assume 'embeddings' is already the same length as 'paragraphs'.
+         if len(embeddings) < 2:
+             # Can't compute consecutive similarities with fewer than 2 embeddings
+             return paragraphs, []
+
+         # Find the least similar consecutive boundary
+         window_size = 3
+         split_index = self.utils.get_windowed_least_similar(embeddings, window_size=window_size)
+
+         left_part = paragraphs[:split_index + 1]
+         right_part = paragraphs[split_index + 1:]
+         return left_part, right_part
+
+     def split(
+         self,
+         extracted_text: str,
+         min_chunk_size: int = 300,
+         max_chunk_size: int = 500
+     ) -> List[str]:
+
+         # 1) Build a deque of triples, so we never mutate three separate lists:
+         # paras = [p.strip() for p in extracted_text.split("\n\n") if p.strip()]
+         paras = self._custom_split(extracted_text)
+
+         if not paras:
+             return []
+
+         tokens = [self.utils.count_tokens(p) for p in paras]
+         embs = self.utils.list_embeddings(paras)
+         D: deque[Tuple[str,List[float],int]] = deque(
+             zip(paras, embs, tokens)
+         )
+
+         final_chunks: List[str] = []
+
+         # 2) As long as there's anything left in D, build one chunk at a time:
+         while D:
+             cur_paras: List[str] = []
+             cur_embs: List[List[float]] = []
+             cur_tokens: List[int] = []
+             total_tokens = 0
+
+             # 2a) Guarantee we hit at least min_chunk_size
+             while D and total_tokens < min_chunk_size:
+                 p, e, t = D.popleft()
+                 # if even this one p would bust max, you might choose to take it alone
+                 if total_tokens + t > max_chunk_size and total_tokens > 0:
+                     # push it back for the next round
+                     D.appendleft((p,e,t))
+                     break
+                 cur_paras.append(p)
+                 cur_embs .append(e)
+                 cur_tokens.append(t)
+                 total_tokens += t
+
+             # if we ran out before min and have something -> emit it
+             if total_tokens < min_chunk_size and not D:
+                 final_chunks.append(" ".join(cur_paras))
+                 break
+
+             # 2b) Greedily fill until just under max_chunk_size
+             while D and total_tokens + D[0][2] <= max_chunk_size:
+                 p, e, t = D.popleft()
+                 cur_paras.append(p)
+                 cur_embs .append(e)
+                 cur_tokens.append(t)
+                 total_tokens += t
+
+             # 3) Now we have between min and max tokens: split at the least-similar boundary
+             if cur_paras:
+                 left, right = self._split_large_chunk(cur_paras, cur_embs)
+
+                 # Count tokens in "left" to see if it meets min_chunk_size
+                 left_token_count = sum(self.utils.count_tokens(p) for p in left)
+
+                 if left_token_count >= min_chunk_size:
+                     # If left is big enough, emit it
+                     final_chunks.append(" ".join(left))
+
+                     # Push "right" (the remainder) back onto D for subsequent chunks
+                     for rp, re, rt in reversed(list(zip(
+                         cur_paras[len(left):],
+                         cur_embs [len(left):],
+                         cur_tokens[len(left):]
+                     ))):
+                         D.appendleft((rp, re, rt))
+                 else:
+                     # If "left" is too small, just emit the entire cur_paras as one chunk
+                     final_chunks.append(" ".join(cur_paras))
+                     # (We do NOT push anything back, because cur_paras is fully consumed.)
+
+         return final_chunks
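The rewritten splitter works over a deque of (paragraph, embedding, token_count) triples: it pops paragraphs until the chunk reaches min_chunk_size, keeps adding while it stays under max_chunk_size, then cuts at the windowed least-similar boundary and pushes the remainder back for the next round. A usage sketch with hand-made input (BASE_URL is assumed to point at an Ollama server that serves the embeddings; the chunk sizes here are arbitrary):

    from hie_rag.split import Split

    long_text = (
        "First paragraph about topic A.\n\n"
        "Second paragraph, still about topic A.\n\n"
        "A new paragraph that switches to topic B."
    )
    splitter = Split(base_url="http://localhost:11434")
    chunks = splitter.split(long_text, min_chunk_size=10, max_chunk_size=40)
    for i, chunk in enumerate(chunks):
        print(i, chunk)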
{hie_rag-0.1.3 → hie_rag-0.2.2}/hie_rag/split_and_process.py

@@ -4,10 +4,10 @@ from hie_rag.utils import Utils


  class SplitAndProcess:
-     def __init__(self, api_key=None):
-         self.split = Split(api_key=api_key)
-         self.utils = Utils(api_key=api_key)
-         self.process = Process(api_key=api_key)
+     def __init__(self, base_url: str):
+         self.split = Split(base_url=base_url)
+         self.utils = Utils(base_url=base_url)
+         self.process = Process(base_url=base_url)

      def split_and_process(self, uploaded_file):
          extracted_text = self.utils.extract_text(uploaded_file)
{hie_rag-0.1.3 → hie_rag-0.2.2}/hie_rag/tree_index.py

@@ -2,7 +2,7 @@ import json
  from typing import List

  from langchain_core.prompts import PromptTemplate
- from langchain_openai import ChatOpenAI
+ from langchain_ollama import ChatOllama
  from pydantic import Field
  from typing_extensions import TypedDict

@@ -10,9 +10,9 @@ from .utils import Utils


  class TreeIndex:
-     def __init__(self, api_key: str):
-         self.client = ChatOpenAI(temperature=0, model="gpt-4o", api_key=api_key)
-         self.utils = Utils(api_key=api_key)
+     def __init__(self, base_url: str, model="llama3.2:latest"):
+         self.client = ChatOllama(model=model)
+         self.utils = Utils(base_url=base_url)

      def _convert_to_string(self, chunk_metadata: dict) -> str:
          """
@@ -32,8 +32,8 @@ class TreeIndex:

      NOTE:
      1. 請輸出繁體中文
-     2. The summary should be concise and capture the main points of the text.
-     3. The summary should be around 5-8 sentences long.
+     2. The summary should be concise with details and better than the individual summaries.
+     3. The summary should be long enough to cover all the main points of the text.

      Summaries:
      {summaries}
hie_rag-0.2.2/hie_rag/utils.py (new file)

@@ -0,0 +1,122 @@
+ import contextlib
+ import io
+ import os
+ import tempfile
+
+ import numpy as np
+ import tiktoken
+ from markitdown import MarkItDown
+ from sklearn.metrics.pairwise import cosine_similarity
+
+ from .ai_client import AiClient
+
+
+ class Utils:
+     def __init__(self, base_url: str):
+         # self.client = OpenAI(api_key=api_key)
+         self.client = AiClient(base_url=base_url)
+
+     def extract_text(self, uploaded_file: bytes):
+         """Extract text from an uploaded file using MarkItDown."""
+         # md = MarkItDown(llm_client=self.client, llm_model="gpt-4o")
+         md = MarkItDown()
+
+         # Accept both raw bytes and file-like objects with `.read()`
+         if isinstance(uploaded_file, bytes):
+             file_bytes = uploaded_file
+             suffix = ".bin"  # fallback generic extension
+         elif hasattr(uploaded_file, "read"):
+             file_bytes = uploaded_file.read()
+             filename = getattr(uploaded_file, "name", None) or getattr(uploaded_file, "filename", None)
+             suffix = os.path.splitext(filename)[-1] if filename else ".bin"
+         else:
+             raise TypeError("Unsupported file type: must be bytes or file-like object")
+
+         # Write to temp file for MarkItDown to process
+         with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
+             temp_file_path = temp_file.name
+             temp_file.write(file_bytes)
+
+         try:
+             # Redirect stderr to suppress native print warnings like "CropBox missing"
+             with contextlib.redirect_stderr(io.StringIO()):
+                 extracted_text = md.convert(temp_file_path)
+         finally:
+             # Clean up the temporary file
+             os.remove(temp_file_path)
+
+         return extracted_text.text_content
+
+     def count_tokens(self, text: str, encoding="cl100k_base") -> int:
+         """Count tokens in text using tiktoken"""
+         tokenizer = tiktoken.get_encoding(encoding)
+         return len(tokenizer.encode(text))
+
+     def get_embedding(self, text: str, model="nomic-embed-text") -> list:
+         if not self.client:
+             raise RuntimeError("No embedding client configured")
+         return self.client.get_embedding(text, model=model)
+
+     def list_embeddings(self, chunks: list, model="nomic-embed-text") -> list:
+         if not self.client:
+             raise RuntimeError("No embedding client configured")
+         return self.client.list_embeddings(chunks, model=model)
+
+     def get_consecutive_least_similar(self, embeddings: list) -> int:
+         """Find the index where consecutive similarity is lowest"""
+         cs = cosine_similarity(embeddings)
+
+         # Get similarities between consecutive sentences only
+         consecutive_similarities = []
+         for i in range(len(cs) - 1):
+             consecutive_similarities.append(cs[i][i + 1])
+
+         # Find the index where consecutive similarity is lowest
+         split_index = np.argmin(consecutive_similarities)
+
+         return split_index
+
+     def get_windowed_least_similar(
+         self,
+         embeddings: list,
+         window_size: int = 3
+     ) -> int:
+         """
+         Slide a window over the embeddings: for every candidate split position i (0 <= i < len-1),
+         average the window_size sentences in [max(0, i-window_size+1) .. i] and the
+         window_size sentences in [i+1 .. min(len, i+window_size)], compare the two averaged
+         vectors with cosine similarity, and return the i with the lowest similarity.
+         """
+         if len(embeddings) < 2:
+             # Nothing to split
+             return 0
+
+         # Convert the list of lists into a numpy array (shape: [n_sentences, dim_emb])
+         embs = np.array(embeddings)
+         n = embs.shape[0]
+
+         best_index = 0
+         lowest_sim = float('inf')
+
+         for i in range(n - 1):
+             # Left window: from pre_start up to i (inclusive)
+             pre_start = max(0, i - window_size + 1)
+             pre_group = embs[pre_start : i + 1]  # shape: (<=window_size, dim)
+
+             # Right window: from i+1 up to post_end-1
+             post_end = min(n, i + 1 + window_size)
+             post_group = embs[i + 1 : post_end]  # shape: (<=window_size, dim)
+
+             # Compute the mean vector of each window
+             # (a sum, np.sum(...), would also work, but the mean is more common and keeps the scale consistent)
+             pre_avg = np.mean(pre_group, axis=0).reshape(1, -1)  # shape: (1, dim)
+             post_avg = np.mean(post_group, axis=0).reshape(1, -1)  # shape: (1, dim)
+
+             # Compute cosine similarity between the two averaged vectors
+             sim = float(cosine_similarity(pre_avg, post_avg)[0][0])
+
+             if sim < lowest_sim:
+                 lowest_sim = sim
+                 best_index = i
+
+         return best_index
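get_windowed_least_similar averages up to window_size embeddings on each side of every candidate boundary and returns the index whose two window averages are least similar. A toy check with hand-picked 2-D vectors (not from the package's tests; the embedding server is never contacted for this call):

    from hie_rag.utils import Utils

    utils = Utils(base_url="http://localhost:11434")
    embeddings = [
        [1.0, 0.0], [0.9, 0.1], [0.8, 0.2],  # first topic
        [0.0, 1.0], [0.1, 0.9],              # second topic
    ]
    print(utils.get_windowed_least_similar(embeddings, window_size=3))  # -> 2, the topic boundary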
{hie_rag-0.1.3 → hie_rag-0.2.2}/hie_rag/vectordb.py

@@ -7,9 +7,9 @@ from .utils import Utils


  class Vectordb():
-     def __init__(self, path, api_key, collection_name):
+     def __init__(self, path, base_url, collection_name):
          self.client = chromadb.PersistentClient(path = path)
-         self.utils = Utils(api_key=api_key)
+         self.utils = Utils(base_url=base_url)
          self.collection = self.client.get_or_create_collection(collection_name)

      def _convert_numpy(self, obj):
{hie_rag-0.1.3 → hie_rag-0.2.2}/hie_rag.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: hie_rag
- Version: 0.1.3
+ Version: 0.2.2
  Summary: A hierarchical RAG framework for chunks retrieval.
  License: Apache License
  Version 2.0, January 2004
@@ -29,7 +29,7 @@ Requires-Dist: openai==1.66.3
  Requires-Dist: scikit-learn
  Requires-Dist: tiktoken==0.8.0
  Requires-Dist: langchain==0.3.13
- Requires-Dist: langchain-openai==0.2.13
+ Requires-Dist: langchain-ollama==0.3.3
  Requires-Dist: chromadb==0.6.2
  Dynamic: license-file

{hie_rag-0.1.3 → hie_rag-0.2.2}/hie_rag.egg-info/SOURCES.txt

@@ -2,7 +2,7 @@ LICENSE
  README.md
  pyproject.toml
  hie_rag/__init__.py
- hie_rag/app.py
+ hie_rag/ai_client.py
  hie_rag/hie_rag.py
  hie_rag/process.py
  hie_rag/split.py
{hie_rag-0.1.3 → hie_rag-0.2.2}/hie_rag.egg-info/requires.txt

@@ -6,5 +6,5 @@ openai==1.66.3
  scikit-learn
  tiktoken==0.8.0
  langchain==0.3.13
- langchain-openai==0.2.13
+ langchain-ollama==0.3.3
  chromadb==0.6.2
{hie_rag-0.1.3 → hie_rag-0.2.2}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "hie_rag"
- version = "0.1.3"
+ version = "0.2.2"
  description = "A hierarchical RAG framework for chunks retrieval."
  license = { file = "LICENSE" }
  dependencies = [
@@ -16,7 +16,7 @@ dependencies = [
      "scikit-learn",
      "tiktoken==0.8.0",
      "langchain==0.3.13",
-     "langchain-openai==0.2.13",
+     "langchain-ollama==0.3.3",
      "chromadb==0.6.2"
  ]
  readme = "README.md"
{hie_rag-0.1.3 → hie_rag-0.2.2}/test/test-process.py

@@ -1,15 +1,19 @@
  import os
+ import sys
+
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

  from dotenv import load_dotenv
- from hie_rag.app import Split
+
  from hie_rag.process import Process
+ from hie_rag.split import Split
  from hie_rag.utils import Utils

  load_dotenv()

- split = Split(api_key=os.getenv("OPENAI_API_KEY"), min_chunk_size=200, max_chunk_size=500)
- utils = Utils(api_key=os.getenv("OPENAI_API_KEY"))
- process = Process(api_key=os.getenv("OPENAI_API_KEY"))
+ split = Split(base_url=os.getenv('BASE_URL'))
+ utils = Utils(base_url=os.getenv('BASE_URL'))
+ process = Process(base_url=os.getenv('BASE_URL'))

  with open("test.pdf", "rb") as uploaded_file:
      extracted_text = utils.extract_text(uploaded_file)
@@ -20,7 +24,7 @@ result_split = split.split(extracted_text)
  result_process = process.process_chunks(result_split)

  # Write results to the text file
- with open("test-process-result", "w", encoding="utf-8") as file:
+ with open("test-process-result-new", "w", encoding="utf-8") as file:
      file.write("Processed Chunks:\n")
      file.write(str(result_process) + "\n")

{hie_rag-0.1.3 → hie_rag-0.2.2}/test/test-split.py

@@ -1,13 +1,17 @@
  import os
+ import sys
+
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

  from dotenv import load_dotenv
- from hie_rag.app import Split
+
+ from hie_rag.split import Split
  from hie_rag.utils import Utils

  load_dotenv()

- split = Split(api_key=os.getenv("OPENAI_API_KEY"), min_chunk_size=200, max_chunk_size=500)
- utils = Utils(api_key=os.getenv("OPENAI_API_KEY"))
+ split = Split(base_url=os.getenv("BASE_URL"))
+ utils = Utils(base_url=os.getenv("BASE_URL"))

  with open("test.pdf", "rb") as uploaded_file:
      extracted_text = utils.extract_text(uploaded_file)
@@ -16,10 +20,10 @@ with open("test.pdf", "rb") as uploaded_file:
  extracted_text = extracted_text[:1000]

  # Split the extracted text
- result_split = split.split(extracted_text)
+ result_split = split.split(extracted_text, min_chunk_size=300, max_chunk_size=500)

  # Write results to the text file
- with open("test-split-result", "w", encoding="utf-8") as file:
+ with open("test-split-result-new", "w", encoding="utf-8") as file:
      file.write("Splitted Text:\n")
      file.write(str(result_split) + "\n")
      file.write("Length of the Splitted Text:\n")
hie_rag-0.2.2/test/test-utils.py (new file)

@@ -0,0 +1,56 @@
+ import os
+ import sys
+
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+
+ from dotenv import load_dotenv
+
+ from hie_rag.utils import Utils
+
+ load_dotenv()
+
+ utils = Utils(base_url=os.getenv("BASE_URL"))
+
+ with open("test2.pdf", "rb") as uploaded_file:
+     extracted_text = utils.extract_text(uploaded_file)
+
+ # Count tokens for the first 100 words
+ # result_count_tokens = utils.count_tokens(extracted_text[:100])
+ result_count_tokens = utils.count_tokens(extracted_text)
+ print(f"Token count: {result_count_tokens}")
+
+ # # Get embeddings for the text slices
+ # result_list_embeddings = utils.list_embeddings([
+ #     extracted_text[:100],
+ #     extracted_text[100:200],
+ #     extracted_text[200:300]
+ # ])
+
+ # # Get the embedding for the first 100 words
+ # result_get_embedding = utils.get_embedding(extracted_text[:100])
+
+ # # Find the index of least similar consecutive embeddings
+ # result_get_consecutive_least_similar = utils.get_consecutive_least_similar(result_list_embeddings)
+
+ # # Write results to the text file
+ # with open("test-utils-result-new", "w", encoding="utf-8") as file:
+ #     file.write("Extracted Text:\n")
+ #     file.write(extracted_text + "\n\n")
+ #     file.write("====================================\n\n")
+
+ #     file.write("Count of Tokens (First 100 words):\n")
+ #     file.write(str(result_count_tokens) + "\n\n")
+ #     file.write("====================================\n\n")
+
+ #     file.write("List of Embeddings:\n")
+ #     file.write(str(result_list_embeddings) + "\n\n")
+ #     file.write("====================================\n\n")
+
+ #     file.write("Embedding of First 100 words:\n")
+ #     file.write(str(result_get_embedding) + "\n\n")
+ #     file.write("====================================\n\n")
+
+ #     file.write("Index of Least Similar Consecutive Embeddings:\n")
+ #     file.write(str(result_get_consecutive_least_similar) + "\n")
+
+ # print("Results written to a txt file.")
{hie_rag-0.1.3 → hie_rag-0.2.2}/test/test_split_and_process.py

@@ -1,16 +1,21 @@
  import os
+ import sys
+
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+

  from dotenv import load_dotenv
+
  from hie_rag import SplitAndProcess

  load_dotenv()

- split_and_process = SplitAndProcess(api_key=os.getenv("OPENAI_API_KEY"))
+ split_and_process = SplitAndProcess(base_url=os.getenv("BASE_URL"))

  with open("test.pdf", "rb") as uploaded_file:
      result_process = split_and_process.split_and_process(uploaded_file)

- with open("test-split-and-process-result", "w", encoding="utf-8") as file:
+ with open("test-split-and-process-result-new", "w", encoding="utf-8") as file:
      file.write("Split and Processed Text:\n")
      file.write(str(result_process) + "\n")

hie_rag-0.1.3/hie_rag/app.py (deleted)

@@ -1,77 +0,0 @@
- # import json
- # import os
-
- # from .generate import Generate
- # from .process import Process
- # from .split import Split
- # from .tree_index import TreeIndex
- # from .utils import Utils
- # from .vectordb import Vectordb
-
-
- # # Function to handle data
- # def handle_data(data):
- #     """
- #     Processes incoming data and returns a response.
- #     """
- #     try:
- #         # This is the logic that used to be in the /api/data route
- #         return {"received": data}
- #     except Exception as e:
- #         return {"error": str(e)}
-
- # # Function to handle file upload and processing
- # def handle_file_upload(uploaded_file, access_token):
- #     """
- #     Processes the uploaded file and extracts its text.
- #     """
- #     try:
- #         utils = Utils(api_key=access_token)
- #         process = Process(api_key=access_token)
- #         split = Split(api_key=access_token)
- #         tree_index = TreeIndex(api_key=access_token)
-
- #         if uploaded_file is None:
- #             return {"error": "No file selected for uploading"}
-
- #         filename = uploaded_file.filename
- #         extracted_text = utils.extract_text(uploaded_file)
- #         final_chunk_list = split.split(extracted_text)
- #         processed_chunks = process.process_chunks(final_chunk_list)
- #         data = tree_index.output_index(processed_chunks)
-
- #         return {"filename": filename, "data": data}
- #     except Exception as e:
- #         return {"error": str(e)}
-
- # # Function to handle generation logic
- # def handle_generation(file, access_token):
- #     """
- #     Handles the file for generation and returns generated data.
- #     """
- #     try:
- #         data = json.load(file)
-
- #         if "chunks" not in data:
- #             return {"error": "Missing 'chunks' in data"}
-
- #         path = os.getenv("INDEX_PATH")
- #         vectordb = Vectordb(path=path, api_key=access_token)
- #         generate = Generate(api_key=access_token)
-
- #         save_index_result = vectordb.save_index(data)
- #         generated_full_data = []
-
- #         for i in data["chunks"]:
- #             original_chunk = i["original_chunk"]
- #             query_result = vectordb.query_by_text(original_chunk, n_results=3)
- #             possible_reference = query_result["metadatas"][0][1]["summary"] + "\n" + query_result["metadatas"][0][2]["summary"]
-
- #             data_gen = generate.generate(original_chunk, possible_reference)
- #             generated_full_data.extend(data_gen["dataset"])
-
- #         return {"data": generated_full_data}
- #     except json.JSONDecodeError:
- #         return {"error": "Invalid JSON file format"}
- #     except Exception as e:
- #         return {"error": str(e)}
hie_rag-0.1.3/hie_rag/split.py (deleted)

@@ -1,120 +0,0 @@
- from typing import List
-
- from .utils import Utils
-
-
- class Split:
-     def __init__(self, api_key: str = None):
-         """
-         Initializes the Split object with default or user-defined thresholds.
-         """
-         self.utils = Utils(api_key=api_key)
-
-     def _split_large_chunk(self, paragraphs: List[str], embeddings: List[List[float]]) -> (List[str], List[str]):
-         """
-         Splits 'paragraphs' by finding the least similar boundary using 'embeddings'
-         (which are precomputed for these paragraphs only). Returns (left_part, right_part).
-         """
-         # If there are 0 or 1 paragraphs, no need to split
-         if len(paragraphs) < 2:
-             return paragraphs, []
-
-         # We'll assume 'embeddings' is already the same length as 'paragraphs'.
-         if len(embeddings) < 2:
-             # Can't compute consecutive similarities with fewer than 2 embeddings
-             return paragraphs, []
-
-         # Find the least similar consecutive boundary
-         split_index = self.utils.get_consecutive_least_similar(embeddings)
-
-         left_part = paragraphs[:split_index + 1]
-         right_part = paragraphs[split_index + 1:]
-         return left_part, right_part
-
-     def split(
-         self,
-         extracted_text: str,
-         min_chunk_size: int = 4000,
-         max_chunk_size: int = 7000
-     ) -> List[str]:
-         """
-         Splits the input text into chunks of token-size between [min_chunk_size, max_chunk_size].
-         Once a chunk is in that range, we find the "least similar" boundary, store the left side,
-         and re-insert the right side for further splitting.
-         """
-         paragraphs = [p.strip() for p in extracted_text.split("\n\n") if p.strip()]
-         if not paragraphs:
-             return []
-
-         # Precompute once
-         paragraphs_tokens = [self.utils.count_tokens(p) for p in paragraphs]
-         paragraphs_embeddings = self.utils.list_embeddings(paragraphs)
-
-         final_chunks = []
-         idx = 0
-         n = len(paragraphs)
-
-         while idx < n:
-             chunk_paragraphs = []
-             chunk_embeddings = []
-             chunk_tokens = []  # Keep track of tokens in this chunk
-             current_tokens = 0
-
-             # 1) Accumulate until we at least exceed min_chunk_size or run out
-             while idx < n and current_tokens < min_chunk_size:
-                 if current_tokens + paragraphs_tokens[idx] <= max_chunk_size:
-                     chunk_paragraphs.append(paragraphs[idx])
-                     chunk_embeddings.append(paragraphs_embeddings[idx])
-                     chunk_tokens.append(paragraphs_tokens[idx])
-                     current_tokens += paragraphs_tokens[idx]
-                     idx += 1
-                 else:
-                     # This paragraph alone might exceed max_chunk_size -> handle as you see fit
-                     break
-
-             # If we haven't hit min_chunk_size but are out of paragraphs, store remainder and quit
-             if current_tokens < min_chunk_size and idx >= n:
-                 if chunk_paragraphs:
-                     final_chunks.append(" ".join(chunk_paragraphs))
-                 break
-
-             # 2) Keep adding while we're under max_chunk_size
-             while idx < n:
-                 if current_tokens + paragraphs_tokens[idx] <= max_chunk_size:
-                     chunk_paragraphs.append(paragraphs[idx])
-                     chunk_embeddings.append(paragraphs_embeddings[idx])
-                     chunk_tokens.append(paragraphs_tokens[idx])
-                     current_tokens += paragraphs_tokens[idx]
-                     idx += 1
-                 else:
-                     break
-
-             # Now we have between min_chunk_size and max_chunk_size tokens in 'chunk_paragraphs'
-             if chunk_paragraphs:
-                 # 3) Split at the "least similar" boundary
-                 left_part, right_part = self._split_large_chunk(
-                     chunk_paragraphs, chunk_embeddings
-                 )
-
-                 # We'll figure out how many paragraphs ended up in the left part
-                 used_count = len(left_part)
-                 leftover_count = len(right_part)
-
-                 # Store left side
-                 final_chunks.append(" ".join(left_part))
-
-                 # If there's leftover, reinsert it into the main lists
-                 if leftover_count > 0:
-                     # Slices for leftover
-                     leftover_embeddings = chunk_embeddings[used_count:]
-                     leftover_tokens = chunk_tokens[used_count:]
-
-                     # Re-insert them at index=idx
-                     paragraphs[idx:idx] = right_part
-                     paragraphs_embeddings[idx:idx] = leftover_embeddings
-                     paragraphs_tokens[idx:idx] = leftover_tokens
-
-                     # Recompute n, in case the paragraphs list has grown
-                     n = len(paragraphs)
-
-         return final_chunks
hie_rag-0.1.3/hie_rag/utils.py (deleted)

@@ -1,74 +0,0 @@
- import contextlib
- import io
- import os
- import tempfile
-
- import numpy as np
- import tiktoken
- from markitdown import MarkItDown
- from openai import OpenAI
- from sklearn.metrics.pairwise import cosine_similarity
-
-
- class Utils:
-     def __init__(self, api_key=None):
-         self.client = OpenAI(api_key=api_key)
-
-     def extract_text(self, uploaded_file: bytes):
-         """Extract text from an uploaded file using MarkItDown."""
-         md = MarkItDown(llm_client=self.client, llm_model="gpt-4o")
-
-         # Accept both raw bytes and file-like objects with `.read()`
-         if isinstance(uploaded_file, bytes):
-             file_bytes = uploaded_file
-         elif hasattr(uploaded_file, "read"):
-             file_bytes = uploaded_file.read()
-         else:
-             raise TypeError("Unsupported file type: must be bytes or file-like object")
-
-         # Write to temp file for MarkItDown to process
-         with tempfile.NamedTemporaryFile(delete=False) as temp_file:
-             temp_file_path = temp_file.name
-             temp_file.write(file_bytes)
-
-         try:
-             # Redirect stderr to suppress native print warnings like "CropBox missing"
-             with contextlib.redirect_stderr(io.StringIO()):
-                 extracted_text = md.convert(temp_file_path)
-         finally:
-             # Clean up the temporary file
-             os.remove(temp_file_path)
-
-         return extracted_text.text_content
-
-     def count_tokens(self, text: str, encoding="cl100k_base") -> int:
-         """Count tokens in text using tiktoken"""
-         tokenizer = tiktoken.get_encoding(encoding)
-         return len(tokenizer.encode(text))
-
-     def list_embeddings(self, chunks: list, model="text-embedding-3-small") -> list:
-         """Get embeddings for a list of text chunks"""
-         embeddings = []
-         for chunk in chunks:
-             response = self.client.embeddings.create(input=chunk, model=model)
-             embeddings.append(response.data[0].embedding)
-         return embeddings
-
-     def get_embedding(self, text: str, model="text-embedding-3-small") -> list:
-         """Get embedding for a text"""
-         response = self.client.embeddings.create(input=text, model=model)
-         return response.data[0].embedding
-
-     def get_consecutive_least_similar(self, embeddings: list) -> int:
-         """Find the index where consecutive similarity is lowest"""
-         cs = cosine_similarity(embeddings)
-
-         # Get similarities between consecutive sentences only
-         consecutive_similarities = []
-         for i in range(len(cs) - 1):
-             consecutive_similarities.append(cs[i][i + 1])
-
-         # Find the index where consecutive similarity is lowest
-         split_index = np.argmin(consecutive_similarities)
-
-         return split_index
hie_rag-0.1.3/test/test-utils.py (deleted)

@@ -1,50 +0,0 @@
- import os
-
- from dotenv import load_dotenv
- from hie_rag.utils import Utils
-
- load_dotenv()
-
- utils = Utils(api_key=os.getenv("OPENAI_API_KEY"))
-
- with open("test.pdf", "rb") as uploaded_file:
-     extracted_text = utils.extract_text(uploaded_file)
-
- # Count tokens for the first 100 words
- result_count_tokens = utils.count_tokens(extracted_text[:100])
-
- # Get embeddings for the text slices
- result_list_embeddings = utils.list_embeddings([
-     extracted_text[:100],
-     extracted_text[100:200],
-     extracted_text[200:300]
- ])
-
- # Get the embedding for the first 100 words
- result_get_embedding = utils.get_embedding(extracted_text[:100])
-
- # Find the index of least similar consecutive embeddings
- result_get_consecutive_least_similar = utils.get_consecutive_least_similar(result_list_embeddings)
-
- # Write results to the text file
- with open("test-utils-result", "w", encoding="utf-8") as file:
-     file.write("Extracted Text:\n")
-     file.write(extracted_text + "\n\n")
-     file.write("====================================\n\n")
-
-     file.write("Count of Tokens (First 100 words):\n")
-     file.write(str(result_count_tokens) + "\n\n")
-     file.write("====================================\n\n")
-
-     file.write("List of Embeddings:\n")
-     file.write(str(result_list_embeddings) + "\n\n")
-     file.write("====================================\n\n")
-
-     file.write("Embedding of First 100 words:\n")
-     file.write(str(result_get_embedding) + "\n\n")
-     file.write("====================================\n\n")
-
-     file.write("Index of Least Similar Consecutive Embeddings:\n")
-     file.write(str(result_get_consecutive_least_similar) + "\n")
-
- print("Results written to a txt file.")