hie-rag 0.1.2__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
hie_rag/__init__.py CHANGED
@@ -1,5 +1,4 @@
  # __init__.py
- from .generate import Generate
  from .hie_rag import HieRag
  from .process import Process
  from .split import Split
@@ -7,3 +6,13 @@ from .split_and_process import SplitAndProcess
  from .tree_index import TreeIndex
  from .utils import Utils
  from .vectordb import Vectordb
+
+ __all__ = [
+     "HieRag",
+     "Process",
+     "Split",
+     "SplitAndProcess",
+     "TreeIndex",
+     "Utils",
+     "Vectordb",
+ ]
hie_rag/ai_client.py ADDED
@@ -0,0 +1,26 @@
+ import requests
+
+
+ class AiClient:
+     def __init__(self, base_url="http://localhost:11434"):
+         self.base_url = base_url
+         self.headers = {"Content-Type": "application/json"}
+
+     def get_embedding(self, text: str, model="nomic-embed-text") -> list:
+         url = f"{self.base_url}/api/embeddings"
+         payload = {
+             "model": model,
+             "prompt": text
+         }
+         response = requests.post(url, json=payload, headers=self.headers, timeout=60)
+         response.raise_for_status()
+         data = response.json()
+
+         # Extract embedding, adapt if your API response structure differs
+         embedding = data.get("embedding") or (data.get("data") and data["data"][0].get("embedding"))
+         if embedding is None:
+             raise ValueError("Embedding not found in Ollama response")
+         return embedding
+
+     def list_embeddings(self, texts: list, model="nomic-embed-text") -> list:
+         return [self.get_embedding(text, model=model) for text in texts]
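The new AiClient wraps Ollama's /api/embeddings endpoint. A minimal usage sketch, assuming a local Ollama server on the default port with the nomic-embed-text model already pulled (both are assumptions, not part of this diff):

    from hie_rag.ai_client import AiClient

    # Assumes `ollama serve` is running and `ollama pull nomic-embed-text` has been done
    client = AiClient(base_url="http://localhost:11434")

    # One HTTP POST per call; raises ValueError if no embedding comes back
    vector = client.get_embedding("hello world")
    print(len(vector))  # dimensionality depends on the embedding model

    # Batch helper: issues one request per text
    vectors = client.list_embeddings(["first text", "second text"])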
hie_rag/hie_rag.py CHANGED
@@ -6,12 +6,12 @@ from hie_rag.vectordb import Vectordb


  class HieRag:
-     def __init__(self, api_key=None, path="./db", collection_name="db_collection"):
-         self.split = Split(api_key=api_key)
-         self.utils = Utils(api_key=api_key)
-         self.tree_index = TreeIndex(api_key=api_key)
-         self.process = Process(api_key=api_key)
-         self.vector_db = Vectordb(path=path, api_key=api_key, collection_name=collection_name)
+     def __init__(self, base_url, path="./db", collection_name="db_collection"):
+         self.split = Split(base_url=base_url)
+         self.utils = Utils(base_url=base_url)
+         self.tree_index = TreeIndex(base_url=base_url)
+         self.process = Process(base_url=base_url)
+         self.vector_db = Vectordb(path=path, base_url=base_url, collection_name=collection_name)

      def process_and_save_index_stream(self, file_name: str, uploaded_file: bytes, min_chunk_size, max_chunk_size):
          yield {"status": "🔍 Extracting text..."}
@@ -38,7 +38,6 @@ class HieRag:
              "chunk_count": len(tree_index.get("chunks", [])),
          }

-
      def get_summary(self, file_id):
          return self.vector_db.get_summary(file_id)

@@ -55,35 +54,4 @@ class HieRag:
          return self.vector_db.query_summaries_by_text(query_text, n_results=n_results)

      def query_chunks_by_text(self, query_text: str, file_id: str, n_results=5):
-         return self.vector_db.query_chunks_by_text(query_text, file_id=file_id, n_results=n_results)
-
-     # def query(self, query_text: str, n_results=5):
-     #     """
-     #     This n_result is for the chunks.
-     #     """
-     #     print("The summary is querying...")
-     #     query_summary_result = self.vector_db.query_summaries_by_text(query_text)
-     #     if not query_summary_result["metadatas"]:
-     #         return "No results found"
-
-     #     file_id = query_summary_result["metadatas"][0][0]["file_id"]
-     #     summary = query_summary_result["metadatas"][0][0]["summary"]
-     #     keywords = query_summary_result["metadatas"][0][0]["keywords"]
-
-     #     print("The chunks are querying...")
-     #     query_chunks_result = self.vector_db.query_chunks_by_text(query_text, file_id=file_id, n_results=n_results)
-
-     #     if not query_chunks_result["metadatas"]:
-     #         return "No results found"
-
-     #     chunks = query_chunks_result["metadatas"][0]
-
-     #     data = {
-     #         "file_id": file_id,
-     #         "summary": summary,
-     #         "keywords": keywords,
-     #         "chunks": chunks
-     #     }
-
-     #     return data
-
+         return self.vector_db.query_chunks_by_text(query_text, file_id=file_id, n_results=n_results)
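For callers upgrading from 0.1.2, the breaking change is the constructor: base_url (an Ollama endpoint) replaces api_key, and base_url has no default. A migration sketch with illustrative values (the file name and URL are assumptions):

    from hie_rag import HieRag

    # 0.1.2: hie = HieRag(api_key="...", path="./db", collection_name="db_collection")
    # 0.2.0:
    hie = HieRag(base_url="http://localhost:11434", path="./db", collection_name="db_collection")

    # The streaming indexer yields status dicts as it works
    with open("example.pdf", "rb") as f:  # hypothetical input file
        for status in hie.process_and_save_index_stream(
            "example.pdf", f.read(), min_chunk_size=300, max_chunk_size=500
        ):
            print(status)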
hie_rag/process.py CHANGED
@@ -1,7 +1,7 @@
  from typing import Dict, List

  from langchain_core.prompts import PromptTemplate
- from langchain_openai import ChatOpenAI
+ from langchain_ollama import ChatOllama
  from pydantic import Field
  from typing_extensions import TypedDict

@@ -9,9 +9,9 @@ from .utils import Utils


  class Process:
-     def __init__(self, api_key=None):
-         self.client = ChatOpenAI(temperature=0, model="gpt-4o", api_key=api_key)
-         self.utils = Utils(api_key=api_key)
+     def __init__(self, base_url=None):
+         self.client = ChatOllama(model="llama3.2:latest")
+         self.utils = Utils(base_url=base_url)

      def _generate_metadata(self, chunk: str) -> Dict:
          """Generate metadata for a chunk using LangChain"""
hie_rag/split.py CHANGED
@@ -4,11 +4,11 @@ from .utils import Utils


  class Split:
-     def __init__(self, api_key: str = None):
+     def __init__(self, base_url: str = None):
          """
          Initializes the Split object with default or user-defined thresholds.
          """
-         self.utils = Utils(api_key=api_key)
+         self.utils = Utils(base_url=base_url)

      def _split_large_chunk(self, paragraphs: List[str], embeddings: List[List[float]]) -> (List[str], List[str]):
          """
@@ -34,8 +34,8 @@ class Split:
      def split(
          self,
          extracted_text: str,
-         min_chunk_size: int = 4000,
-         max_chunk_size: int = 7000
+         min_chunk_size: int = 300,
+         max_chunk_size: int = 500
      ) -> List[str]:
          """
          Splits the input text into chunks of token-size between [min_chunk_size, max_chunk_size].
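The default chunk bounds drop from 4000–7000 to 300–500; per the docstring these are token counts, so 0.2.0 produces chunks roughly an order of magnitude smaller, sized for local embedding models. A sketch of checking that invariant, assuming the cl100k_base encoding (the encoding actually used by count_tokens is not shown in this diff):

    import tiktoken

    tokenizer = tiktoken.get_encoding("cl100k_base")  # assumption: real default not visible here

    def token_count(text: str) -> int:
        return len(tokenizer.encode(text))

    # With the 0.2.0 defaults, every returned chunk should satisfy:
    # 300 <= token_count(chunk) <= 500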
hie_rag/split_and_process.py CHANGED
@@ -4,10 +4,10 @@ from hie_rag.utils import Utils


  class SplitAndProcess:
-     def __init__(self, api_key=None):
-         self.split = Split(api_key=api_key)
-         self.utils = Utils(api_key=api_key)
-         self.process = Process(api_key=api_key)
+     def __init__(self, base_url: str):
+         self.split = Split(base_url=base_url)
+         self.utils = Utils(base_url=base_url)
+         self.process = Process(base_url=base_url)

      def split_and_process(self, uploaded_file):
          extracted_text = self.utils.extract_text(uploaded_file)
hie_rag/tree_index.py CHANGED
@@ -2,7 +2,7 @@ import json
  from typing import List

  from langchain_core.prompts import PromptTemplate
- from langchain_openai import ChatOpenAI
+ from langchain_ollama import ChatOllama
  from pydantic import Field
  from typing_extensions import TypedDict

@@ -10,9 +10,9 @@ from .utils import Utils


  class TreeIndex:
-     def __init__(self, api_key: str):
-         self.client = ChatOpenAI(temperature=0, model="gpt-4o", api_key=api_key)
-         self.utils = Utils(api_key=api_key)
+     def __init__(self, base_url: str):
+         self.client = ChatOllama(model="llama3.2:latest")
+         self.utils = Utils(base_url=base_url)

      def _convert_to_string(self, chunk_metadata: dict) -> str:
          """
hie_rag/utils.py CHANGED
@@ -6,17 +6,20 @@ import tempfile
  import numpy as np
  import tiktoken
  from markitdown import MarkItDown
- from openai import OpenAI
  from sklearn.metrics.pairwise import cosine_similarity

+ from .ai_client import AiClient
+

  class Utils:
-     def __init__(self, api_key=None):
-         self.client = OpenAI(api_key=api_key)
+     def __init__(self, base_url=None):
+         # self.client = OpenAI(api_key=api_key)
+         self.client = AiClient(base_url=base_url)

      def extract_text(self, uploaded_file: bytes):
          """Extract text from an uploaded file using MarkItDown."""
-         md = MarkItDown(llm_client=self.client, llm_model="gpt-4o")
+         # md = MarkItDown(llm_client=self.client, llm_model="gpt-4o")
+         md = MarkItDown()

          # Accept both raw bytes and file-like objects with `.read()`
          if isinstance(uploaded_file, bytes):
@@ -46,18 +49,15 @@ class Utils:
          tokenizer = tiktoken.get_encoding(encoding)
          return len(tokenizer.encode(text))

-     def list_embeddings(self, chunks: list, model="text-embedding-3-small") -> list:
-         """Get embeddings for a list of text chunks"""
-         embeddings = []
-         for chunk in chunks:
-             response = self.client.embeddings.create(input=chunk, model=model)
-             embeddings.append(response.data[0].embedding)
-         return embeddings
+     def get_embedding(self, text: str, model="nomic-embed-text") -> list:
+         if not self.client:
+             raise RuntimeError("No embedding client configured")
+         return self.client.get_embedding(text, model=model)

-     def get_embedding(self, text: str, model="text-embedding-3-small") -> list:
-         """Get embedding for a text"""
-         response = self.client.embeddings.create(input=text, model=model)
-         return response.data[0].embedding
+     def list_embeddings(self, chunks: list, model="nomic-embed-text") -> list:
+         if not self.client:
+             raise RuntimeError("No embedding client configured")
+         return self.client.list_embeddings(chunks, model=model)

      def get_consecutive_least_similar(self, embeddings: list) -> int:
          """Find the index where consecutive similarity is lowest"""
hie_rag/vectordb.py CHANGED
@@ -7,9 +7,9 @@ from .utils import Utils


  class Vectordb():
-     def __init__(self, path, api_key, collection_name):
+     def __init__(self, path, base_url, collection_name):
          self.client = chromadb.PersistentClient(path = path)
-         self.utils = Utils(api_key=api_key)
+         self.utils = Utils(base_url=base_url)
          self.collection = self.client.get_or_create_collection(collection_name)

      def _convert_numpy(self, obj):
hie_rag-0.1.2.dist-info/METADATA → hie_rag-0.2.0.dist-info/METADATA RENAMED
@@ -1,7 +1,7 @@
  Metadata-Version: 2.4
  Name: hie_rag
- Version: 0.1.2
- Summary: A simple hierarchical RAG model
+ Version: 0.2.0
+ Summary: A hierarchical RAG framework for chunks retrieval.
  License: Apache License
  Version 2.0, January 2004
  http://www.apache.org/licenses/
@@ -29,7 +29,7 @@ Requires-Dist: openai==1.66.3
  Requires-Dist: scikit-learn
  Requires-Dist: tiktoken==0.8.0
  Requires-Dist: langchain==0.3.13
- Requires-Dist: langchain-openai==0.2.13
+ Requires-Dist: langchain-ollama==0.3.3
  Requires-Dist: chromadb==0.6.2
  Dynamic: license-file
hie_rag-0.2.0.dist-info/RECORD ADDED
@@ -0,0 +1,14 @@
+ hie_rag/__init__.py,sha256=p2glSTkCqGvMlcivcuKBStFh2C5adojaC9aGmF6nbhY,358
+ hie_rag/ai_client.py,sha256=VbGQ0e3vZNn8W2YoR15Vvq2r-MUs-TBRNLGiImT4QxU,1000
+ hie_rag/hie_rag.py,sha256=KB44QBz3tE0Eq_FJw9pvKynCfjyAuulaMFYKk6bzjug,2359
+ hie_rag/process.py,sha256=D_vMnF84ingLb4_KoC77uLQXSa6FwEpR30RGukG2H9U,2414
+ hie_rag/split.py,sha256=My7QQ_pPiJD0TvwRzm2MgonMMA79-r3Vifwp1xLWX4I,4905
+ hie_rag/split_and_process.py,sha256=PkFlnOF7nW4Zs47JTsGF4AY9VDOXz1AtxG9Die8_mQk,572
+ hie_rag/tree_index.py,sha256=TuRi9-M2aiD46ciS-iwIJYDc9nXq7i7mwxwVbMXk5Lo,2668
+ hie_rag/utils.py,sha256=F5bqx147yT37z080MPWPrwzOa0tGEAWmvNFgjXpe4ZA,2729
+ hie_rag/vectordb.py,sha256=iI73ujrONjDaHU66RNdHnD2PZWSppnjm0isIHPJEGAY,11068
+ hie_rag-0.2.0.dist-info/licenses/LICENSE,sha256=IwAxruLb1UG8F0KZtfnV6MJq10FRAxWM-XOTWkWsJt4,632
+ hie_rag-0.2.0.dist-info/METADATA,sha256=Oym7z46OyhT_Gp7unhX1rsYlFQi9UuOBU5VRsko1m_A,1698
+ hie_rag-0.2.0.dist-info/WHEEL,sha256=zaaOINJESkSfm_4HQVc5ssNzHCPXhJm0kEUakpsEHaU,91
+ hie_rag-0.2.0.dist-info/top_level.txt,sha256=tN2S3VpMUl6oLWL9sN4xIh4o2na_zjnW8rHiwPFf0T8,8
+ hie_rag-0.2.0.dist-info/RECORD,,
hie_rag-0.1.2.dist-info/WHEEL → hie_rag-0.2.0.dist-info/WHEEL RENAMED
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (80.7.1)
+ Generator: setuptools (80.8.0)
  Root-Is-Purelib: true
  Tag: py3-none-any
hie_rag/app.py DELETED
@@ -1,77 +0,0 @@
- # import json
- # import os
-
- # from .generate import Generate
- # from .process import Process
- # from .split import Split
- # from .tree_index import TreeIndex
- # from .utils import Utils
- # from .vectordb import Vectordb
-
-
- # # Function to handle data
- # def handle_data(data):
- #     """
- #     Processes incoming data and returns a response.
- #     """
- #     try:
- #         # This is the logic that used to be in the /api/data route
- #         return {"received": data}
- #     except Exception as e:
- #         return {"error": str(e)}
-
- # # Function to handle file upload and processing
- # def handle_file_upload(uploaded_file, access_token):
- #     """
- #     Processes the uploaded file and extracts its text.
- #     """
- #     try:
- #         utils = Utils(api_key=access_token)
- #         process = Process(api_key=access_token)
- #         split = Split(api_key=access_token)
- #         tree_index = TreeIndex(api_key=access_token)
-
- #         if uploaded_file is None:
- #             return {"error": "No file selected for uploading"}
-
- #         filename = uploaded_file.filename
- #         extracted_text = utils.extract_text(uploaded_file)
- #         final_chunk_list = split.split(extracted_text)
- #         processed_chunks = process.process_chunks(final_chunk_list)
- #         data = tree_index.output_index(processed_chunks)
-
- #         return {"filename": filename, "data": data}
- #     except Exception as e:
- #         return {"error": str(e)}
-
- # # Function to handle generation logic
- # def handle_generation(file, access_token):
- #     """
- #     Handles the file for generation and returns generated data.
- #     """
- #     try:
- #         data = json.load(file)
-
- #         if "chunks" not in data:
- #             return {"error": "Missing 'chunks' in data"}
-
- #         path = os.getenv("INDEX_PATH")
- #         vectordb = Vectordb(path=path, api_key=access_token)
- #         generate = Generate(api_key=access_token)
-
- #         save_index_result = vectordb.save_index(data)
- #         generated_full_data = []
-
- #         for i in data["chunks"]:
- #             original_chunk = i["original_chunk"]
- #             query_result = vectordb.query_by_text(original_chunk, n_results=3)
- #             possible_reference = query_result["metadatas"][0][1]["summary"] + "\n" + query_result["metadatas"][0][2]["summary"]
-
- #             data_gen = generate.generate(original_chunk, possible_reference)
- #             generated_full_data.extend(data_gen["dataset"])
-
- #         return {"data": generated_full_data}
- #     except json.JSONDecodeError:
- #         return {"error": "Invalid JSON file format"}
- #     except Exception as e:
- #         return {"error": str(e)}
hie_rag/generate.py DELETED
@@ -1,61 +0,0 @@
- from typing import Dict, List
-
- from langchain_core.prompts import PromptTemplate
- from langchain_openai import ChatOpenAI
- from pydantic import Field
- from typing_extensions import TypedDict
-
-
- class Generate:
-     def __init__(self, api_key: str):
-         self.client = ChatOpenAI(temperature=0, model="gpt-4o", api_key=api_key)
-
-     def generate(self, content: str, possible_reference: str) -> Dict:
-         """Generate data for finetuning"""
-         prompt = PromptTemplate(
-             template="""
-             You are a data generator responsible for producing datasets used to fine-tune models.
-             Your job is to read the content below and generate a series of instructions a human might give, together with the corresponding detailed responses.
-             An instruction may be a question, a request to organize, or a directive to summarize the content.
-
-             Notes:
-             1. Output in Traditional Chinese.
-             2. Only generate instructions and responses that relate to the content.
-             3. If you are unsure what the content is about, consult the Possible Reference to help you understand it.
-             4. The Possible Reference is only a source that may help your understanding.
-             5. Do not fabricate answers; if you genuinely do not know, do not write something made up.
-
-             Content:
-             {content}
-
-             Possible Reference:
-             {possible_reference}
-
-             """,
-             input_variables=["content", "possible_reference"],
-         )
-         class InstructionResponse(TypedDict):
-             instruction: str = Field(
-                 description="An instruction that a human might provide based on the content.",
-             )
-             response: str = Field(
-                 description="The corresponding response to the instruction.",
-             )
-             used_reference: bool = Field(
-                 description="Indicates whether the possible reference was used to generate this pair. True if the Possible Reference is relevant and useful, False otherwise.",
-             )
-             reference_usage: str = Field(
-                 description="Explanation of how the reference was used, if it was used.",
-             )
-
-         class Dataset(TypedDict):
-             dataset: List[InstructionResponse]
-             content_analysis: str = Field(
-                 description="Brief analysis of whether and how the reference helped with understanding the content.",
-             )
-
-         model = self.client
-         llm_with_tool = model.with_structured_output(Dataset)
-         chain = prompt | llm_with_tool
-
-         return chain.invoke({"content": content, "possible_reference": possible_reference})
hie_rag-0.1.2.dist-info/RECORD DELETED
@@ -1,15 +0,0 @@
- hie_rag/__init__.py,sha256=mPaUHUxcZlu5RtGcY0NqZmUaeogsG-9vMboTCEP_yoU,264
- hie_rag/app.py,sha256=jZkGEIXhYL2mY3KhixXFqvkOn8r0Cdav3EZxlChvKDA,2636
- hie_rag/generate.py,sha256=qNRiRQMUWPZenhPRvtBzkrA7LBkXIQXnHlA0ICiunI4,2656
- hie_rag/hie_rag.py,sha256=rpQfbcPVYaeA2RCEcOxlwovYPsBYRzFYlfO9WP4piSo,3442
- hie_rag/process.py,sha256=JaL8i1IZckeeaHsNSYiUIlYRsRRB73E9QqLCSh09JHA,2434
- hie_rag/split.py,sha256=st_bZ4UaKUOXbxUIDobfG1IsW5vC9rHeyo4LXprfKrk,4904
- hie_rag/split_and_process.py,sha256=eRMiBYBZWUo3ljFasZGAOSP_6_adiwBD094DZJfVQDk,565
- hie_rag/tree_index.py,sha256=5rCoCCO14KLFvRzeOGB08mAnd6d3p7dl4h4jGQqF13A,2688
- hie_rag/utils.py,sha256=cxYLNch5CVgnpuD3ScVoJMP8Kp0_Ni3grF5tV1_sCOM,2769
- hie_rag/vectordb.py,sha256=UVdAinxUDhDqwbFbeXaLVdzN6uC4nu5l7rWi600d8BU,11065
- hie_rag-0.1.2.dist-info/licenses/LICENSE,sha256=IwAxruLb1UG8F0KZtfnV6MJq10FRAxWM-XOTWkWsJt4,632
- hie_rag-0.1.2.dist-info/METADATA,sha256=utsAOnIm1VW1r66Va8-plF5jWQVPDDWmES_PTwac3No,1680
- hie_rag-0.1.2.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
- hie_rag-0.1.2.dist-info/top_level.txt,sha256=tN2S3VpMUl6oLWL9sN4xIh4o2na_zjnW8rHiwPFf0T8,8
- hie_rag-0.1.2.dist-info/RECORD,,