hie-rag 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hie_rag/__init__.py +10 -1
- hie_rag/hie_rag.py +1 -33
- {hie_rag-0.1.2.dist-info → hie_rag-0.1.3.dist-info}/METADATA +2 -2
- {hie_rag-0.1.2.dist-info → hie_rag-0.1.3.dist-info}/RECORD +7 -8
- hie_rag/generate.py +0 -61
- {hie_rag-0.1.2.dist-info → hie_rag-0.1.3.dist-info}/WHEEL +0 -0
- {hie_rag-0.1.2.dist-info → hie_rag-0.1.3.dist-info}/licenses/LICENSE +0 -0
- {hie_rag-0.1.2.dist-info → hie_rag-0.1.3.dist-info}/top_level.txt +0 -0
hie_rag/__init__.py
CHANGED
@@ -1,5 +1,4 @@
|
|
1
1
|
# __init__.py
|
2
|
-
from .generate import Generate
|
3
2
|
from .hie_rag import HieRag
|
4
3
|
from .process import Process
|
5
4
|
from .split import Split
|
@@ -7,3 +6,13 @@ from .split_and_process import SplitAndProcess
|
|
7
6
|
from .tree_index import TreeIndex
|
8
7
|
from .utils import Utils
|
9
8
|
from .vectordb import Vectordb
|
9
|
+
|
10
|
+
__all__ = [
|
11
|
+
"HieRag",
|
12
|
+
"Process",
|
13
|
+
"Split",
|
14
|
+
"SplitAndProcess",
|
15
|
+
"TreeIndex",
|
16
|
+
"Utils",
|
17
|
+
"Vectordb",
|
18
|
+
]
|
hie_rag/hie_rag.py
CHANGED
@@ -38,7 +38,6 @@ class HieRag:
|
|
38
38
|
"chunk_count": len(tree_index.get("chunks", [])),
|
39
39
|
}
|
40
40
|
|
41
|
-
|
42
41
|
def get_summary(self, file_id):
|
43
42
|
return self.vector_db.get_summary(file_id)
|
44
43
|
|
@@ -55,35 +54,4 @@ class HieRag:
|
|
55
54
|
return self.vector_db.query_summaries_by_text(query_text, n_results=n_results)
|
56
55
|
|
57
56
|
def query_chunks_by_text(self, query_text: str, file_id: str, n_results=5):
|
58
|
-
return self.vector_db.query_chunks_by_text(query_text, file_id=file_id, n_results=n_results)
|
59
|
-
|
60
|
-
# def query(self, query_text: str, n_results=5):
|
61
|
-
# """
|
62
|
-
# This n_result is for the chunks.
|
63
|
-
# """
|
64
|
-
# print("The summary is querying...")
|
65
|
-
# query_summary_result = self.vector_db.query_summaries_by_text(query_text)
|
66
|
-
# if not query_summary_result["metadatas"]:
|
67
|
-
# return "No results found"
|
68
|
-
|
69
|
-
# file_id = query_summary_result["metadatas"][0][0]["file_id"]
|
70
|
-
# summary = query_summary_result["metadatas"][0][0]["summary"]
|
71
|
-
# keywords = query_summary_result["metadatas"][0][0]["keywords"]
|
72
|
-
|
73
|
-
# print("The chunks are querying...")
|
74
|
-
# query_chunks_result = self.vector_db.query_chunks_by_text(query_text, file_id=file_id, n_results=n_results)
|
75
|
-
|
76
|
-
# if not query_chunks_result["metadatas"]:
|
77
|
-
# return "No results found"
|
78
|
-
|
79
|
-
# chunks = query_chunks_result["metadatas"][0]
|
80
|
-
|
81
|
-
# data = {
|
82
|
-
# "file_id": file_id,
|
83
|
-
# "summary": summary,
|
84
|
-
# "keywords": keywords,
|
85
|
-
# "chunks": chunks
|
86
|
-
# }
|
87
|
-
|
88
|
-
# return data
|
89
|
-
|
57
|
+
return self.vector_db.query_chunks_by_text(query_text, file_id=file_id, n_results=n_results)
|
@@ -1,7 +1,7 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: hie_rag
|
3
|
-
Version: 0.1.2
|
4
|
-
Summary: A
|
3
|
+
Version: 0.1.3
|
4
|
+
Summary: A hierarchical RAG framework for chunks retrieval.
|
5
5
|
License: Apache License
|
6
6
|
Version 2.0, January 2004
|
7
7
|
http://www.apache.org/licenses/
|
@@ -1,15 +1,14 @@
|
|
1
|
-
hie_rag/__init__.py,sha256=
|
1
|
+
hie_rag/__init__.py,sha256=p2glSTkCqGvMlcivcuKBStFh2C5adojaC9aGmF6nbhY,358
|
2
2
|
hie_rag/app.py,sha256=jZkGEIXhYL2mY3KhixXFqvkOn8r0Cdav3EZxlChvKDA,2636
|
3
|
-
hie_rag/
|
4
|
-
hie_rag/hie_rag.py,sha256=rpQfbcPVYaeA2RCEcOxlwovYPsBYRzFYlfO9WP4piSo,3442
|
3
|
+
hie_rag/hie_rag.py,sha256=h5EcGcxbcGm6-jB3lr_EIuZ-0wEQFJTF1xffzQKDJUI,2353
|
5
4
|
hie_rag/process.py,sha256=JaL8i1IZckeeaHsNSYiUIlYRsRRB73E9QqLCSh09JHA,2434
|
6
5
|
hie_rag/split.py,sha256=st_bZ4UaKUOXbxUIDobfG1IsW5vC9rHeyo4LXprfKrk,4904
|
7
6
|
hie_rag/split_and_process.py,sha256=eRMiBYBZWUo3ljFasZGAOSP_6_adiwBD094DZJfVQDk,565
|
8
7
|
hie_rag/tree_index.py,sha256=5rCoCCO14KLFvRzeOGB08mAnd6d3p7dl4h4jGQqF13A,2688
|
9
8
|
hie_rag/utils.py,sha256=cxYLNch5CVgnpuD3ScVoJMP8Kp0_Ni3grF5tV1_sCOM,2769
|
10
9
|
hie_rag/vectordb.py,sha256=UVdAinxUDhDqwbFbeXaLVdzN6uC4nu5l7rWi600d8BU,11065
|
11
|
-
hie_rag-0.1.
|
12
|
-
hie_rag-0.1.
|
13
|
-
hie_rag-0.1.
|
14
|
-
hie_rag-0.1.
|
15
|
-
hie_rag-0.1.
|
10
|
+
hie_rag-0.1.3.dist-info/licenses/LICENSE,sha256=IwAxruLb1UG8F0KZtfnV6MJq10FRAxWM-XOTWkWsJt4,632
|
11
|
+
hie_rag-0.1.3.dist-info/METADATA,sha256=U_s4BPalfUt8xQWqj1mHNJJC7IEPZuXSmeVaHOBNhn4,1699
|
12
|
+
hie_rag-0.1.3.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
|
13
|
+
hie_rag-0.1.3.dist-info/top_level.txt,sha256=tN2S3VpMUl6oLWL9sN4xIh4o2na_zjnW8rHiwPFf0T8,8
|
14
|
+
hie_rag-0.1.3.dist-info/RECORD,,
|
hie_rag/generate.py
DELETED
@@ -1,61 +0,0 @@
|
|
1
|
-
from typing import Dict, List
|
2
|
-
|
3
|
-
from langchain_core.prompts import PromptTemplate
|
4
|
-
from langchain_openai import ChatOpenAI
|
5
|
-
from pydantic import Field
|
6
|
-
from typing_extensions import TypedDict
|
7
|
-
|
8
|
-
|
9
|
-
class Generate:
|
10
|
-
def __init__(self, api_key: str):
|
11
|
-
self.client = ChatOpenAI(temperature=0, model="gpt-4o", api_key=api_key)
|
12
|
-
|
13
|
-
def generate(self, content: str, possible_reference: str) -> Dict:
|
14
|
-
"""Generate data for finetuning"""
|
15
|
-
prompt = PromptTemplate(
|
16
|
-
template="""
|
17
|
-
你是一個資料生成器,負責生成用於微調模型的資料集。
|
18
|
-
你的工作是閱讀以下內容,並生成一系列人類可能給出的指令(instruction)以及對應的詳細回應(response)。
|
19
|
-
instruction 可能會是一個問題、請求整理或者整理內容等指令。
|
20
|
-
|
21
|
-
注意事項:
|
22
|
-
1. 請輸出繁體中文。
|
23
|
-
2. 請務必只生成與內容相關的指令與回應。
|
24
|
-
3. 如果不確定內容在講什麼,可以參考「可能參考資料(Possible Reference)」來幫助理解。
|
25
|
-
4.「可能參考資料」只是可能幫助你理解的參考來源。
|
26
|
-
5. 不要捏造答案,如果真的不知道,就不要亂寫。
|
27
|
-
|
28
|
-
Content:
|
29
|
-
{content}
|
30
|
-
|
31
|
-
Possible Reference:
|
32
|
-
{possible_reference}
|
33
|
-
|
34
|
-
""",
|
35
|
-
input_variables=["content", "possible_reference"],
|
36
|
-
)
|
37
|
-
class InstructionResponse(TypedDict):
|
38
|
-
instruction: str = Field(
|
39
|
-
description="An instruction that a human might provide based on the content.",
|
40
|
-
)
|
41
|
-
response: str = Field(
|
42
|
-
description="The corresponding response to the instruction.",
|
43
|
-
)
|
44
|
-
used_reference: bool = Field(
|
45
|
-
description="Indicates whether the possible reference was used to generate this pair. True if the Possible Reference is relavent and useful, False otherwise.",
|
46
|
-
)
|
47
|
-
reference_usage: str = Field(
|
48
|
-
description="Explanation of how the reference was used, if it was used.",
|
49
|
-
)
|
50
|
-
|
51
|
-
class Dataset(TypedDict):
|
52
|
-
dataset: List[InstructionResponse]
|
53
|
-
content_analysis: str = Field(
|
54
|
-
description="Brief analysis of whether and how the reference helped with understanding the content.",
|
55
|
-
)
|
56
|
-
|
57
|
-
model = self.client
|
58
|
-
llm_with_tool = model.with_structured_output(Dataset)
|
59
|
-
chain = prompt | llm_with_tool
|
60
|
-
|
61
|
-
return chain.invoke({"content": content, "possible_reference": possible_reference})
|
File without changes
|
File without changes
|
File without changes
|