hie-rag 0.1.3__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26) hide show
  1. {hie_rag-0.1.3 → hie_rag-0.2.0}/PKG-INFO +2 -2
  2. hie_rag-0.2.0/hie_rag/ai_client.py +26 -0
  3. {hie_rag-0.1.3 → hie_rag-0.2.0}/hie_rag/hie_rag.py +6 -6
  4. {hie_rag-0.1.3 → hie_rag-0.2.0}/hie_rag/process.py +4 -4
  5. {hie_rag-0.1.3 → hie_rag-0.2.0}/hie_rag/split.py +4 -4
  6. {hie_rag-0.1.3 → hie_rag-0.2.0}/hie_rag/split_and_process.py +4 -4
  7. {hie_rag-0.1.3 → hie_rag-0.2.0}/hie_rag/tree_index.py +4 -4
  8. {hie_rag-0.1.3 → hie_rag-0.2.0}/hie_rag/utils.py +15 -15
  9. {hie_rag-0.1.3 → hie_rag-0.2.0}/hie_rag/vectordb.py +2 -2
  10. {hie_rag-0.1.3 → hie_rag-0.2.0}/hie_rag.egg-info/PKG-INFO +2 -2
  11. {hie_rag-0.1.3 → hie_rag-0.2.0}/hie_rag.egg-info/SOURCES.txt +1 -1
  12. {hie_rag-0.1.3 → hie_rag-0.2.0}/hie_rag.egg-info/requires.txt +1 -1
  13. {hie_rag-0.1.3 → hie_rag-0.2.0}/pyproject.toml +2 -2
  14. {hie_rag-0.1.3 → hie_rag-0.2.0}/test/test-process.py +9 -5
  15. {hie_rag-0.1.3 → hie_rag-0.2.0}/test/test-split.py +9 -5
  16. {hie_rag-0.1.3 → hie_rag-0.2.0}/test/test-utils.py +6 -2
  17. {hie_rag-0.1.3 → hie_rag-0.2.0}/test/test_split_and_process.py +7 -2
  18. hie_rag-0.1.3/hie_rag/app.py +0 -77
  19. {hie_rag-0.1.3 → hie_rag-0.2.0}/LICENSE +0 -0
  20. {hie_rag-0.1.3 → hie_rag-0.2.0}/README.md +0 -0
  21. {hie_rag-0.1.3 → hie_rag-0.2.0}/hie_rag/__init__.py +0 -0
  22. {hie_rag-0.1.3 → hie_rag-0.2.0}/hie_rag.egg-info/dependency_links.txt +0 -0
  23. {hie_rag-0.1.3 → hie_rag-0.2.0}/hie_rag.egg-info/top_level.txt +0 -0
  24. {hie_rag-0.1.3 → hie_rag-0.2.0}/setup.cfg +0 -0
  25. {hie_rag-0.1.3 → hie_rag-0.2.0}/test/test-vectordb.py +0 -0
  26. {hie_rag-0.1.3 → hie_rag-0.2.0}/test/test.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hie_rag
3
- Version: 0.1.3
3
+ Version: 0.2.0
4
4
  Summary: A hierarchical RAG framework for chunks retrieval.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
@@ -29,7 +29,7 @@ Requires-Dist: openai==1.66.3
29
29
  Requires-Dist: scikit-learn
30
30
  Requires-Dist: tiktoken==0.8.0
31
31
  Requires-Dist: langchain==0.3.13
32
- Requires-Dist: langchain-openai==0.2.13
32
+ Requires-Dist: langchain-ollama==0.3.3
33
33
  Requires-Dist: chromadb==0.6.2
34
34
  Dynamic: license-file
35
35
 
@@ -0,0 +1,26 @@
1
+ import requests
2
+
3
+
4
+ class AiClient:
5
+ def __init__(self, base_url="http://localhost:11434"):
6
+ self.base_url = base_url
7
+ self.headers = {"Content-Type": "application/json"}
8
+
9
+ def get_embedding(self, text: str, model="nomic-embed-text") -> list:
10
+ url = f"{self.base_url}/api/embeddings"
11
+ payload = {
12
+ "model": model,
13
+ "prompt": text
14
+ }
15
+ response = requests.post(url, json=payload, headers=self.headers, timeout=60)
16
+ response.raise_for_status()
17
+ data = response.json()
18
+
19
+ # Extract embedding, adapt if your API response structure differs
20
+ embedding = data.get("embedding") or (data.get("data") and data["data"][0].get("embedding"))
21
+ if embedding is None:
22
+ raise ValueError("Embedding not found in Ollama response")
23
+ return embedding
24
+
25
+ def list_embeddings(self, texts: list, model="nomic-embed-text") -> list:
26
+ return [self.get_embedding(text, model=model) for text in texts]
@@ -6,12 +6,12 @@ from hie_rag.vectordb import Vectordb
6
6
 
7
7
 
8
8
  class HieRag:
9
- def __init__(self, api_key=None, path="./db", collection_name="db_collection"):
10
- self.split = Split(api_key=api_key)
11
- self.utils = Utils(api_key=api_key)
12
- self.tree_index = TreeIndex(api_key=api_key)
13
- self.process = Process(api_key=api_key)
14
- self.vector_db = Vectordb(path=path, api_key=api_key, collection_name=collection_name)
9
+ def __init__(self, base_url, path="./db", collection_name="db_collection"):
10
+ self.split = Split(base_url=base_url)
11
+ self.utils = Utils(base_url=base_url)
12
+ self.tree_index = TreeIndex(base_url=base_url)
13
+ self.process = Process(base_url=base_url)
14
+ self.vector_db = Vectordb(path=path, base_url=base_url, collection_name=collection_name)
15
15
 
16
16
  def process_and_save_index_stream(self, file_name: str, uploaded_file: bytes, min_chunk_size, max_chunk_size):
17
17
  yield {"status": "🔍 Extracting text..."}
@@ -1,7 +1,7 @@
1
1
  from typing import Dict, List
2
2
 
3
3
  from langchain_core.prompts import PromptTemplate
4
- from langchain_openai import ChatOpenAI
4
+ from langchain_ollama import ChatOllama
5
5
  from pydantic import Field
6
6
  from typing_extensions import TypedDict
7
7
 
@@ -9,9 +9,9 @@ from .utils import Utils
9
9
 
10
10
 
11
11
  class Process:
12
- def __init__(self, api_key=None):
13
- self.client = ChatOpenAI(temperature=0, model="gpt-4o", api_key=api_key)
14
- self.utils = Utils(api_key=api_key)
12
+ def __init__(self, base_url=None):
13
+ self.client = ChatOllama(model="llama3.2:latest")
14
+ self.utils = Utils(base_url=base_url)
15
15
 
16
16
  def _generate_metadata(self, chunk: str) -> Dict:
17
17
  """Generate metadata for a chunk using LangChain"""
@@ -4,11 +4,11 @@ from .utils import Utils
4
4
 
5
5
 
6
6
  class Split:
7
- def __init__(self, api_key: str = None):
7
+ def __init__(self, base_url: str = None):
8
8
  """
9
9
  Initializes the Split object with default or user-defined thresholds.
10
10
  """
11
- self.utils = Utils(api_key=api_key)
11
+ self.utils = Utils(base_url=base_url)
12
12
 
13
13
  def _split_large_chunk(self, paragraphs: List[str], embeddings: List[List[float]]) -> (List[str], List[str]):
14
14
  """
@@ -34,8 +34,8 @@ class Split:
34
34
  def split(
35
35
  self,
36
36
  extracted_text: str,
37
- min_chunk_size: int = 4000,
38
- max_chunk_size: int = 7000
37
+ min_chunk_size: int = 300,
38
+ max_chunk_size: int = 500
39
39
  ) -> List[str]:
40
40
  """
41
41
  Splits the input text into chunks of token-size between [min_chunk_size, max_chunk_size].
@@ -4,10 +4,10 @@ from hie_rag.utils import Utils
4
4
 
5
5
 
6
6
  class SplitAndProcess:
7
- def __init__(self, api_key=None):
8
- self.split = Split(api_key=api_key)
9
- self.utils = Utils(api_key=api_key)
10
- self.process = Process(api_key=api_key)
7
+ def __init__(self, base_url: str):
8
+ self.split = Split(base_url=base_url)
9
+ self.utils = Utils(base_url=base_url)
10
+ self.process = Process(base_url=base_url)
11
11
 
12
12
  def split_and_process(self, uploaded_file):
13
13
  extracted_text = self.utils.extract_text(uploaded_file)
@@ -2,7 +2,7 @@ import json
2
2
  from typing import List
3
3
 
4
4
  from langchain_core.prompts import PromptTemplate
5
- from langchain_openai import ChatOpenAI
5
+ from langchain_ollama import ChatOllama
6
6
  from pydantic import Field
7
7
  from typing_extensions import TypedDict
8
8
 
@@ -10,9 +10,9 @@ from .utils import Utils
10
10
 
11
11
 
12
12
  class TreeIndex:
13
- def __init__(self, api_key: str):
14
- self.client = ChatOpenAI(temperature=0, model="gpt-4o", api_key=api_key)
15
- self.utils = Utils(api_key=api_key)
13
+ def __init__(self, base_url: str):
14
+ self.client = ChatOllama(model="llama3.2:latest")
15
+ self.utils = Utils(base_url=base_url)
16
16
 
17
17
  def _convert_to_string(self, chunk_metadata: dict) -> str:
18
18
  """
@@ -6,17 +6,20 @@ import tempfile
6
6
  import numpy as np
7
7
  import tiktoken
8
8
  from markitdown import MarkItDown
9
- from openai import OpenAI
10
9
  from sklearn.metrics.pairwise import cosine_similarity
11
10
 
11
+ from .ai_client import AiClient
12
+
12
13
 
13
14
  class Utils:
14
- def __init__(self, api_key=None):
15
- self.client = OpenAI(api_key=api_key)
15
+ def __init__(self, base_url=None):
16
+ # self.client = OpenAI(api_key=api_key)
17
+ self.client = AiClient(base_url=base_url)
16
18
 
17
19
  def extract_text(self, uploaded_file: bytes):
18
20
  """Extract text from an uploaded file using MarkItDown."""
19
- md = MarkItDown(llm_client=self.client, llm_model="gpt-4o")
21
+ # md = MarkItDown(llm_client=self.client, llm_model="gpt-4o")
22
+ md = MarkItDown()
20
23
 
21
24
  # Accept both raw bytes and file-like objects with `.read()`
22
25
  if isinstance(uploaded_file, bytes):
@@ -46,18 +49,15 @@ class Utils:
46
49
  tokenizer = tiktoken.get_encoding(encoding)
47
50
  return len(tokenizer.encode(text))
48
51
 
49
- def list_embeddings(self, chunks: list, model="text-embedding-3-small") -> list:
50
- """Get embeddings for a list of text chunks"""
51
- embeddings = []
52
- for chunk in chunks:
53
- response = self.client.embeddings.create(input=chunk, model=model)
54
- embeddings.append(response.data[0].embedding)
55
- return embeddings
52
+ def get_embedding(self, text: str, model="nomic-embed-text") -> list:
53
+ if not self.client:
54
+ raise RuntimeError("No embedding client configured")
55
+ return self.client.get_embedding(text, model=model)
56
56
 
57
- def get_embedding(self, text: str, model="text-embedding-3-small") -> list:
58
- """Get embedding for a text"""
59
- response = self.client.embeddings.create(input=text, model=model)
60
- return response.data[0].embedding
57
+ def list_embeddings(self, chunks: list, model="nomic-embed-text") -> list:
58
+ if not self.client:
59
+ raise RuntimeError("No embedding client configured")
60
+ return self.client.list_embeddings(chunks, model=model)
61
61
 
62
62
  def get_consecutive_least_similar(self, embeddings: list) -> int:
63
63
  """Find the index where consecutive similarity is lowest"""
@@ -7,9 +7,9 @@ from .utils import Utils
7
7
 
8
8
 
9
9
  class Vectordb():
10
- def __init__(self, path, api_key, collection_name):
10
+ def __init__(self, path, base_url, collection_name):
11
11
  self.client = chromadb.PersistentClient(path = path)
12
- self.utils = Utils(api_key=api_key)
12
+ self.utils = Utils(base_url=base_url)
13
13
  self.collection = self.client.get_or_create_collection(collection_name)
14
14
 
15
15
  def _convert_numpy(self, obj):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hie_rag
3
- Version: 0.1.3
3
+ Version: 0.2.0
4
4
  Summary: A hierarchical RAG framework for chunks retrieval.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
@@ -29,7 +29,7 @@ Requires-Dist: openai==1.66.3
29
29
  Requires-Dist: scikit-learn
30
30
  Requires-Dist: tiktoken==0.8.0
31
31
  Requires-Dist: langchain==0.3.13
32
- Requires-Dist: langchain-openai==0.2.13
32
+ Requires-Dist: langchain-ollama==0.3.3
33
33
  Requires-Dist: chromadb==0.6.2
34
34
  Dynamic: license-file
35
35
 
@@ -2,7 +2,7 @@ LICENSE
2
2
  README.md
3
3
  pyproject.toml
4
4
  hie_rag/__init__.py
5
- hie_rag/app.py
5
+ hie_rag/ai_client.py
6
6
  hie_rag/hie_rag.py
7
7
  hie_rag/process.py
8
8
  hie_rag/split.py
@@ -6,5 +6,5 @@ openai==1.66.3
6
6
  scikit-learn
7
7
  tiktoken==0.8.0
8
8
  langchain==0.3.13
9
- langchain-openai==0.2.13
9
+ langchain-ollama==0.3.3
10
10
  chromadb==0.6.2
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "hie_rag"
7
- version = "0.1.3"
7
+ version = "0.2.0"
8
8
  description = "A hierarchical RAG framework for chunks retrieval."
9
9
  license = { file = "LICENSE" }
10
10
  dependencies = [
@@ -16,7 +16,7 @@ dependencies = [
16
16
  "scikit-learn",
17
17
  "tiktoken==0.8.0",
18
18
  "langchain==0.3.13",
19
- "langchain-openai==0.2.13",
19
+ "langchain-ollama==0.3.3",
20
20
  "chromadb==0.6.2"
21
21
  ]
22
22
  readme = "README.md"
@@ -1,15 +1,19 @@
1
1
  import os
2
+ import sys
3
+
4
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
2
5
 
3
6
  from dotenv import load_dotenv
4
- from hie_rag.app import Split
7
+
5
8
  from hie_rag.process import Process
9
+ from hie_rag.split import Split
6
10
  from hie_rag.utils import Utils
7
11
 
8
12
  load_dotenv()
9
13
 
10
- split = Split(api_key=os.getenv("OPENAI_API_KEY"), min_chunk_size=200, max_chunk_size=500)
11
- utils = Utils(api_key=os.getenv("OPENAI_API_KEY"))
12
- process = Process(api_key=os.getenv("OPENAI_API_KEY"))
14
+ split = Split(base_url=os.getenv('BASE_URL'))
15
+ utils = Utils(base_url=os.getenv('BASE_URL'))
16
+ process = Process(base_url=os.getenv('BASE_URL'))
13
17
 
14
18
  with open("test.pdf", "rb") as uploaded_file:
15
19
  extracted_text = utils.extract_text(uploaded_file)
@@ -20,7 +24,7 @@ result_split = split.split(extracted_text)
20
24
  result_process = process.process_chunks(result_split)
21
25
 
22
26
  # Write results to the text file
23
- with open("test-process-result", "w", encoding="utf-8") as file:
27
+ with open("test-process-result-new", "w", encoding="utf-8") as file:
24
28
  file.write("Processed Chunks:\n")
25
29
  file.write(str(result_process) + "\n")
26
30
 
@@ -1,13 +1,17 @@
1
1
  import os
2
+ import sys
3
+
4
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
2
5
 
3
6
  from dotenv import load_dotenv
4
- from hie_rag.app import Split
7
+
8
+ from hie_rag.split import Split
5
9
  from hie_rag.utils import Utils
6
10
 
7
11
  load_dotenv()
8
12
 
9
- split = Split(api_key=os.getenv("OPENAI_API_KEY"), min_chunk_size=200, max_chunk_size=500)
10
- utils = Utils(api_key=os.getenv("OPENAI_API_KEY"))
13
+ split = Split(base_url=os.getenv("BASE_URL"))
14
+ utils = Utils(base_url=os.getenv("BASE_URL"))
11
15
 
12
16
  with open("test.pdf", "rb") as uploaded_file:
13
17
  extracted_text = utils.extract_text(uploaded_file)
@@ -16,10 +20,10 @@ with open("test.pdf", "rb") as uploaded_file:
16
20
  extracted_text = extracted_text[:1000]
17
21
 
18
22
  # Split the extracted text
19
- result_split = split.split(extracted_text)
23
+ result_split = split.split(extracted_text, min_chunk_size=300, max_chunk_size=500)
20
24
 
21
25
  # Write results to the text file
22
- with open("test-split-result", "w", encoding="utf-8") as file:
26
+ with open("test-split-result-new", "w", encoding="utf-8") as file:
23
27
  file.write("Splitted Text:\n")
24
28
  file.write(str(result_split) + "\n")
25
29
  file.write("Length of the Splitted Text:\n")
@@ -1,11 +1,15 @@
1
1
  import os
2
+ import sys
3
+
4
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
2
5
 
3
6
  from dotenv import load_dotenv
7
+
4
8
  from hie_rag.utils import Utils
5
9
 
6
10
  load_dotenv()
7
11
 
8
- utils = Utils(api_key=os.getenv("OPENAI_API_KEY"))
12
+ utils = Utils(base_url=os.getenv("BASE_URL"))
9
13
 
10
14
  with open("test.pdf", "rb") as uploaded_file:
11
15
  extracted_text = utils.extract_text(uploaded_file)
@@ -27,7 +31,7 @@ result_get_embedding = utils.get_embedding(extracted_text[:100])
27
31
  result_get_consecutive_least_similar = utils.get_consecutive_least_similar(result_list_embeddings)
28
32
 
29
33
  # Write results to the text file
30
- with open("test-utils-result", "w", encoding="utf-8") as file:
34
+ with open("test-utils-result-new", "w", encoding="utf-8") as file:
31
35
  file.write("Extracted Text:\n")
32
36
  file.write(extracted_text + "\n\n")
33
37
  file.write("====================================\n\n")
@@ -1,16 +1,21 @@
1
1
  import os
2
+ import sys
3
+
4
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
5
+
2
6
 
3
7
  from dotenv import load_dotenv
8
+
4
9
  from hie_rag import SplitAndProcess
5
10
 
6
11
  load_dotenv()
7
12
 
8
- split_and_process = SplitAndProcess(api_key=os.getenv("OPENAI_API_KEY"))
13
+ split_and_process = SplitAndProcess(base_url=os.getenv("BASE_URL"))
9
14
 
10
15
  with open("test.pdf", "rb") as uploaded_file:
11
16
  result_process = split_and_process.split_and_process(uploaded_file)
12
17
 
13
- with open("test-split-and-process-result", "w", encoding="utf-8") as file:
18
+ with open("test-split-and-process-result-new", "w", encoding="utf-8") as file:
14
19
  file.write("Split and Processed Text:\n")
15
20
  file.write(str(result_process) + "\n")
16
21
 
@@ -1,77 +0,0 @@
1
- # import json
2
- # import os
3
-
4
- # from .generate import Generate
5
- # from .process import Process
6
- # from .split import Split
7
- # from .tree_index import TreeIndex
8
- # from .utils import Utils
9
- # from .vectordb import Vectordb
10
-
11
-
12
- # # Function to handle data
13
- # def handle_data(data):
14
- # """
15
- # Processes incoming data and returns a response.
16
- # """
17
- # try:
18
- # # This is the logic that used to be in the /api/data route
19
- # return {"received": data}
20
- # except Exception as e:
21
- # return {"error": str(e)}
22
-
23
- # # Function to handle file upload and processing
24
- # def handle_file_upload(uploaded_file, access_token):
25
- # """
26
- # Processes the uploaded file and extracts its text.
27
- # """
28
- # try:
29
- # utils = Utils(api_key=access_token)
30
- # process = Process(api_key=access_token)
31
- # split = Split(api_key=access_token)
32
- # tree_index = TreeIndex(api_key=access_token)
33
-
34
- # if uploaded_file is None:
35
- # return {"error": "No file selected for uploading"}
36
-
37
- # filename = uploaded_file.filename
38
- # extracted_text = utils.extract_text(uploaded_file)
39
- # final_chunk_list = split.split(extracted_text)
40
- # processed_chunks = process.process_chunks(final_chunk_list)
41
- # data = tree_index.output_index(processed_chunks)
42
-
43
- # return {"filename": filename, "data": data}
44
- # except Exception as e:
45
- # return {"error": str(e)}
46
-
47
- # # Function to handle generation logic
48
- # def handle_generation(file, access_token):
49
- # """
50
- # Handles the file for generation and returns generated data.
51
- # """
52
- # try:
53
- # data = json.load(file)
54
-
55
- # if "chunks" not in data:
56
- # return {"error": "Missing 'chunks' in data"}
57
-
58
- # path = os.getenv("INDEX_PATH")
59
- # vectordb = Vectordb(path=path, api_key=access_token)
60
- # generate = Generate(api_key=access_token)
61
-
62
- # save_index_result = vectordb.save_index(data)
63
- # generated_full_data = []
64
-
65
- # for i in data["chunks"]:
66
- # original_chunk = i["original_chunk"]
67
- # query_result = vectordb.query_by_text(original_chunk, n_results=3)
68
- # possible_reference = query_result["metadatas"][0][1]["summary"] + "\n" + query_result["metadatas"][0][2]["summary"]
69
-
70
- # data_gen = generate.generate(original_chunk, possible_reference)
71
- # generated_full_data.extend(data_gen["dataset"])
72
-
73
- # return {"data": generated_full_data}
74
- # except json.JSONDecodeError:
75
- # return {"error": "Invalid JSON file format"}
76
- # except Exception as e:
77
- # return {"error": str(e)}
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes