langroid 0.33.4__py3-none-any.whl → 0.33.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langroid/__init__.py +106 -0
- langroid/agent/__init__.py +41 -0
- langroid/agent/base.py +1983 -0
- langroid/agent/batch.py +398 -0
- langroid/agent/callbacks/__init__.py +0 -0
- langroid/agent/callbacks/chainlit.py +598 -0
- langroid/agent/chat_agent.py +1899 -0
- langroid/agent/chat_document.py +454 -0
- langroid/agent/openai_assistant.py +882 -0
- langroid/agent/special/__init__.py +59 -0
- langroid/agent/special/arangodb/__init__.py +0 -0
- langroid/agent/special/arangodb/arangodb_agent.py +656 -0
- langroid/agent/special/arangodb/system_messages.py +186 -0
- langroid/agent/special/arangodb/tools.py +107 -0
- langroid/agent/special/arangodb/utils.py +36 -0
- langroid/agent/special/doc_chat_agent.py +1466 -0
- langroid/agent/special/lance_doc_chat_agent.py +262 -0
- langroid/agent/special/lance_rag/__init__.py +9 -0
- langroid/agent/special/lance_rag/critic_agent.py +198 -0
- langroid/agent/special/lance_rag/lance_rag_task.py +82 -0
- langroid/agent/special/lance_rag/query_planner_agent.py +260 -0
- langroid/agent/special/lance_tools.py +61 -0
- langroid/agent/special/neo4j/__init__.py +0 -0
- langroid/agent/special/neo4j/csv_kg_chat.py +174 -0
- langroid/agent/special/neo4j/neo4j_chat_agent.py +433 -0
- langroid/agent/special/neo4j/system_messages.py +120 -0
- langroid/agent/special/neo4j/tools.py +32 -0
- langroid/agent/special/relevance_extractor_agent.py +127 -0
- langroid/agent/special/retriever_agent.py +56 -0
- langroid/agent/special/sql/__init__.py +17 -0
- langroid/agent/special/sql/sql_chat_agent.py +654 -0
- langroid/agent/special/sql/utils/__init__.py +21 -0
- langroid/agent/special/sql/utils/description_extractors.py +190 -0
- langroid/agent/special/sql/utils/populate_metadata.py +85 -0
- langroid/agent/special/sql/utils/system_message.py +35 -0
- langroid/agent/special/sql/utils/tools.py +64 -0
- langroid/agent/special/table_chat_agent.py +263 -0
- langroid/agent/task.py +2095 -0
- langroid/agent/tool_message.py +393 -0
- langroid/agent/tools/__init__.py +38 -0
- langroid/agent/tools/duckduckgo_search_tool.py +50 -0
- langroid/agent/tools/file_tools.py +234 -0
- langroid/agent/tools/google_search_tool.py +39 -0
- langroid/agent/tools/metaphor_search_tool.py +68 -0
- langroid/agent/tools/orchestration.py +303 -0
- langroid/agent/tools/recipient_tool.py +235 -0
- langroid/agent/tools/retrieval_tool.py +32 -0
- langroid/agent/tools/rewind_tool.py +137 -0
- langroid/agent/tools/segment_extract_tool.py +41 -0
- langroid/agent/xml_tool_message.py +382 -0
- langroid/cachedb/__init__.py +17 -0
- langroid/cachedb/base.py +58 -0
- langroid/cachedb/momento_cachedb.py +108 -0
- langroid/cachedb/redis_cachedb.py +153 -0
- langroid/embedding_models/__init__.py +39 -0
- langroid/embedding_models/base.py +74 -0
- langroid/embedding_models/models.py +461 -0
- langroid/embedding_models/protoc/__init__.py +0 -0
- langroid/embedding_models/protoc/embeddings.proto +19 -0
- langroid/embedding_models/protoc/embeddings_pb2.py +33 -0
- langroid/embedding_models/protoc/embeddings_pb2.pyi +50 -0
- langroid/embedding_models/protoc/embeddings_pb2_grpc.py +79 -0
- langroid/embedding_models/remote_embeds.py +153 -0
- langroid/exceptions.py +71 -0
- langroid/language_models/__init__.py +53 -0
- langroid/language_models/azure_openai.py +153 -0
- langroid/language_models/base.py +678 -0
- langroid/language_models/config.py +18 -0
- langroid/language_models/mock_lm.py +124 -0
- langroid/language_models/openai_gpt.py +1964 -0
- langroid/language_models/prompt_formatter/__init__.py +16 -0
- langroid/language_models/prompt_formatter/base.py +40 -0
- langroid/language_models/prompt_formatter/hf_formatter.py +132 -0
- langroid/language_models/prompt_formatter/llama2_formatter.py +75 -0
- langroid/language_models/utils.py +151 -0
- langroid/mytypes.py +84 -0
- langroid/parsing/__init__.py +52 -0
- langroid/parsing/agent_chats.py +38 -0
- langroid/parsing/code_parser.py +121 -0
- langroid/parsing/document_parser.py +718 -0
- langroid/parsing/para_sentence_split.py +62 -0
- langroid/parsing/parse_json.py +155 -0
- langroid/parsing/parser.py +313 -0
- langroid/parsing/repo_loader.py +790 -0
- langroid/parsing/routing.py +36 -0
- langroid/parsing/search.py +275 -0
- langroid/parsing/spider.py +102 -0
- langroid/parsing/table_loader.py +94 -0
- langroid/parsing/url_loader.py +111 -0
- langroid/parsing/urls.py +273 -0
- langroid/parsing/utils.py +373 -0
- langroid/parsing/web_search.py +156 -0
- langroid/prompts/__init__.py +9 -0
- langroid/prompts/dialog.py +17 -0
- langroid/prompts/prompts_config.py +5 -0
- langroid/prompts/templates.py +141 -0
- langroid/pydantic_v1/__init__.py +10 -0
- langroid/pydantic_v1/main.py +4 -0
- langroid/utils/__init__.py +19 -0
- langroid/utils/algorithms/__init__.py +3 -0
- langroid/utils/algorithms/graph.py +103 -0
- langroid/utils/configuration.py +98 -0
- langroid/utils/constants.py +30 -0
- langroid/utils/git_utils.py +252 -0
- langroid/utils/globals.py +49 -0
- langroid/utils/logging.py +135 -0
- langroid/utils/object_registry.py +66 -0
- langroid/utils/output/__init__.py +20 -0
- langroid/utils/output/citations.py +41 -0
- langroid/utils/output/printing.py +99 -0
- langroid/utils/output/status.py +40 -0
- langroid/utils/pandas_utils.py +30 -0
- langroid/utils/pydantic_utils.py +602 -0
- langroid/utils/system.py +286 -0
- langroid/utils/types.py +93 -0
- langroid/vector_store/__init__.py +50 -0
- langroid/vector_store/base.py +359 -0
- langroid/vector_store/chromadb.py +214 -0
- langroid/vector_store/lancedb.py +406 -0
- langroid/vector_store/meilisearch.py +299 -0
- langroid/vector_store/momento.py +278 -0
- langroid/vector_store/qdrantdb.py +468 -0
- {langroid-0.33.4.dist-info → langroid-0.33.7.dist-info}/METADATA +95 -94
- langroid-0.33.7.dist-info/RECORD +127 -0
- {langroid-0.33.4.dist-info → langroid-0.33.7.dist-info}/WHEEL +1 -1
- langroid-0.33.4.dist-info/RECORD +0 -7
- langroid-0.33.4.dist-info/entry_points.txt +0 -4
- pyproject.toml +0 -356
- {langroid-0.33.4.dist-info → langroid-0.33.7.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,260 @@
|
|
1
|
+
"""
|
2
|
+
LanceQueryPlanAgent is a ChatAgent created with a specific document schema.
|
3
|
+
Given a QUERY, the LLM constructs a Query Plan consisting of:
|
4
|
+
- filter condition if needed (or empty string if no filter is needed)
|
5
|
+
- query - a possibly rephrased query that can be used to match the `content` field
|
6
|
+
- dataframe_calc - a Pandas-dataframe calculation/aggregation string, possibly empty
|
7
|
+
- original_query - the original query for reference
|
8
|
+
|
9
|
+
This agent has access to two tools:
|
10
|
+
- QueryPlanTool, which is used to generate the Query Plan, and the handler of
|
11
|
+
this tool simply passes it on to the RAG agent named in config.doc_agent_name.
|
12
|
+
- QueryPlanFeedbackTool, which is used to handle feedback on the Query Plan and
|
13
|
+
Result from the RAG agent. The QueryPlanFeedbackTool is used by
|
14
|
+
the QueryPlanCritic, who inserts feedback into the `feedback` field
|
15
|
+
"""
|
16
|
+
|
17
|
+
import logging
|
18
|
+
from typing import Optional
|
19
|
+
|
20
|
+
from langroid.agent.chat_agent import ChatAgent, ChatAgentConfig
|
21
|
+
from langroid.agent.chat_document import ChatDocument
|
22
|
+
from langroid.agent.special.lance_tools import (
|
23
|
+
AnswerTool,
|
24
|
+
QueryPlan,
|
25
|
+
QueryPlanAnswerTool,
|
26
|
+
QueryPlanFeedbackTool,
|
27
|
+
QueryPlanTool,
|
28
|
+
)
|
29
|
+
from langroid.agent.tools.orchestration import AgentDoneTool, ForwardTool
|
30
|
+
from langroid.utils.constants import NO_ANSWER
|
31
|
+
|
32
|
+
logger = logging.getLogger(__name__)
|
33
|
+
|
34
|
+
|
35
|
+
class LanceQueryPlanAgentConfig(ChatAgentConfig):
    # Settings for the query-planning agent defined in this module.
    # (Documented with `#` comments rather than a docstring so that the
    # generated model schema/description is left untouched.)

    name: str = "LancePlanner"
    # NOTE(review): not read in this module; presumably the name of the agent
    # that critiques query plans -- confirm against the task wiring.
    critic_name: str = "QueryPlanCritic"
    # name of the agent that validated query plans are forwarded to
    doc_agent_name: str = "LanceRAG"
    # text substituted for the {doc_schema} placeholder in `system_message`
    doc_schema: str = ""
    use_tools = False
    max_retries: int = 5  # max number of retries for query plan
    use_functions_api = True

    system_message = """
    You will receive a QUERY, to be answered based on an EXTREMELY LARGE collection
    of documents you DO NOT have access to, but your ASSISTANT does.
    You only know that these documents have a special `content` field
    and additional FILTERABLE fields in the SCHEMA below, along with the
    SAMPLE VALUES for each field, and the DTYPE in PANDAS TERMINOLOGY.

    {doc_schema}

    Based on the QUERY and the above SCHEMA, your task is to determine a QUERY PLAN,
    consisting of:
    - a PANDAS-TYPE FILTER (can be empty string) that would help the ASSISTANT to
    answer the query.
    Remember the FILTER can refer to ANY fields in the above SCHEMA
    EXCEPT the `content` field of the documents.
    ONLY USE A FILTER IF EXPLICITLY MENTIONED IN THE QUERY.
    TO get good results, for STRING MATCHES, consider using LIKE instead of =, e.g.
    "CEO LIKE '%Jobs%'" instead of "CEO = 'Steve Jobs'"
    YOUR FILTER MUST BE A PANDAS-TYPE FILTER, respecting the shown DTYPES.
    - a possibly REPHRASED QUERY (CANNOT BE EMPTY) to be answerable given the FILTER.
    Keep in mind that the ASSISTANT does NOT know anything about the FILTER fields,
    so the REPHRASED QUERY should NOT mention ANY FILTER fields.
    The assistant will answer based on documents whose CONTENTS match the QUERY,
    possibly REPHRASED.
    !!!!****THE REPHRASED QUERY SHOULD NEVER BE EMPTY****!!!
    - an OPTIONAL SINGLE-LINE Pandas-dataframe calculation/aggregation string
    that can be used to calculate the answer to the original query,
    e.g. "df["rating"].mean()",
    or "df.groupby("director").mean()["rating"]",
    or EMPTY string if no calc is needed.
    The dataframe calc CAN refer to the `content` field.
    If a DataFrame calculation is NOT needed, leave this field EMPTY.

    IMPORTANT: The DataFrame `df` in this calculation is the result of
    applying the FILTER AND REPHRASED QUERY to the documents.

    WATCH OUT!! When deciding the dataframe calc, if any, CAREFULLY
    note what the query is asking, and ensure that the result of your
    dataframe calc expression would answer the query.


    EXAMPLE:
    -------
    Suppose there is a document-set about crime reports, where:
    CONTENT = crime report,
    Filterable SCHEMA consists of City, Year, num_deaths.

    Then given this ORIGINAL QUERY:

    Total deaths in shoplifting crimes in Los Angeles in 2023?

    A POSSIBLE QUERY PLAN could be:

    FILTER: "City LIKE '%Los Angeles%' AND Year = 2023"
    REPHRASED QUERY: "shoplifting crime" --> this will be used to MATCH content of docs
    [NOTE: we dropped the FILTER fields City and Year since the
    ASSISTANT does not know about them and only uses the query to
    match the CONTENT of the docs.]
    DATAFRAME CALCULATION: "df["num_deaths"].sum()"
    NOTE!!! The DataFrame `df` in this calculation is the result of
    applying the FILTER AND REPHRASED QUERY to the documents,
    hence this computation will give the total deaths in shoplifting crimes.
    ------------- END OF EXAMPLE ----------------

    The FILTER must be a PANDAS-like condition, e.g.
    "year > 2000 AND genre = 'ScienceFiction'".
    To ensure you get useful results, you should make your FILTER
    NOT TOO STRICT, e.g. look for approximate match using LIKE, etc.
    E.g. "CEO LIKE '%Jobs%'" instead of "CEO = 'Steve Jobs'"
    Use DOT NOTATION to refer to nested fields, e.g. `metadata.year`, etc.

    You must FIRST present the QUERY PLAN using the `query_plan` tool/function.
    This will be handled by your document assistant, who will produce an ANSWER.

    You may receive FEEDBACK on your QUERY PLAN and received ANSWER,
    from the 'QueryPlanCritic' who may offer suggestions for
    a better FILTER, REPHRASED QUERY, or DATAFRAME CALCULATION.

    At the BEGINNING if there is no query, ASK the user what they want to know.
    """

    def set_system_message(self) -> None:
        """Fill the `{doc_schema}` placeholder of `system_message` in place.

        The template is run through `str.format`, so the stored message is
        overwritten with the rendered text.
        """
        template = self.system_message
        self.system_message = template.format(doc_schema=self.doc_schema)
|
129
|
+
|
130
|
+
|
131
|
+
class LanceQueryPlanAgent(ChatAgent):
    """Chat agent whose LLM turns a query into a `QueryPlanTool`.

    A valid plan is forwarded to the agent named by `config.doc_agent_name`.
    The `AnswerTool` that comes back is combined with the saved plan into a
    `QueryPlanAnswerTool`, and a later `QueryPlanFeedbackTool` either ends the
    exchange or asks the LLM for a revised plan.
    """

    def __init__(self, config: LanceQueryPlanAgentConfig):
        super().__init__(config)
        self.config: LanceQueryPlanAgentConfig = config

        # (tool class, LLM may generate it, this agent handles it)
        tool_settings = (
            # generated by the LLM, and also handled here for validation
            (QueryPlanTool, True, True),
            (QueryPlanFeedbackTool, False, True),
            (AnswerTool, False, True),
            # neither used nor handled: registered as a "known" tool so that
            # the planner can avoid processing it
            (QueryPlanAnswerTool, False, False),
            # generated by this agent's own handlers, never by the LLM
            (AgentDoneTool, False, True),
        )
        for tool_cls, llm_may_use, agent_handles in tool_settings:
            self.enable_message(tool_cls, use=llm_may_use, handle=agent_handles)

    def init_state(self) -> None:
        """Reset the per-conversation bookkeeping of the planner."""
        super().init_state()
        # most recent validated plan; cleared once its answer has been packaged
        self.curr_query_plan: QueryPlan | None = None
        # True while the next LLM message is expected to be a `query_plan`
        self.expecting_query_plan: bool = False
        # number of plan re-tries made in response to feedback
        self.n_retries: int = 0
        # consecutive reminders sent by `handle_message_fallback`
        self.n_query_plan_reminders: int = 0
        # answer received from LanceRAG
        self.result: str = ""

    def llm_response(
        self, message: Optional[str | ChatDocument] = None
    ) -> Optional[ChatDocument]:
        """Delegate to the parent LLM response, flagging that a plan is expected."""
        self.expecting_query_plan = True
        return super().llm_response(message)

    def query_plan(self, msg: QueryPlanTool) -> ForwardTool | str:
        """Validate the plan, remember it, and forward it to the RAG agent.

        A multi-line `dataframe_calc` is rejected with an instruction to
        retry. Otherwise the plan is stored (it is needed later to build the
        `QueryPlanAnswerTool`) and a `ForwardTool` addressed to
        `config.doc_agent_name` is returned.
        """
        if "\n" in msg.plan.dataframe_calc:
            return "DATAFRAME CALCULATION must be a SINGLE LINE; Retry the `query_plan`"

        self.curr_query_plan = msg.plan
        self.expecting_query_plan = False

        # Forwarding could also be done by adding a `recipient` to the tool
        # itself or by building an agent response carrying the tool; the
        # ForwardTool is the variant used here.
        return ForwardTool(agent=self.config.doc_agent_name)

    def query_plan_feedback(self, msg: QueryPlanFeedbackTool) -> str | AgentDoneTool:
        """Process Critic feedback on QueryPlan + Answer from RAG Agent"""
        # `self.result` is expected to hold the RAG answer by now, since
        # feedback is only sought after that answer has been received.
        got_answer = self.result != "" and NO_ANSWER not in self.result

        if got_answer and msg.suggested_fix == "":
            # Usable answer and nothing to fix: finish with that answer.
            # (An empty suggested_fix alone is not enough -- the critic
            # sometimes leaves it empty even when the result is NO_ANSWER.)
            self.n_retries = 0
            return AgentDoneTool(content=self.result)

        self.n_retries += 1
        if self.n_retries >= self.config.max_retries:
            # give up rather than loop forever
            self.n_retries = 0
            return AgentDoneTool(content=NO_ANSWER)

        if got_answer:
            # there is an answer, but the critic suggested a fix
            feedback = msg.feedback
            suggested = msg.suggested_fix
        else:
            # empty or NO_ANSWER result: ask for a new plan
            feedback = """
            There was no answer, which might mean there is a problem in your query.
            """
            suggested = "Retry the `query_plan` to try to get a non-null answer"

        self.expecting_query_plan = True

        return f"""
        here is FEEDBACK about your QUERY PLAN, and a SUGGESTED FIX.
        Modify the QUERY PLAN if needed:
        ANSWER: {self.result}
        FEEDBACK: {feedback}
        SUGGESTED FIX: {suggested}
        """

    def answer_tool(self, msg: AnswerTool) -> QueryPlanAnswerTool:
        """Handle AnswerTool received from LanceRagAgent:
        Construct a QueryPlanAnswerTool with the answer"""
        # keep the answer so that later feedback can be interpreted
        self.result = msg.answer
        saved_plan = self.curr_query_plan
        assert saved_plan is not None
        packaged = QueryPlanAnswerTool(plan=saved_plan, answer=msg.answer)
        # the plan has been consumed
        self.curr_query_plan = None
        return packaged

    def handle_message_fallback(
        self, msg: str | ChatDocument
    ) -> str | ChatDocument | None:
        """
        Remind to use QueryPlanTool if we are expecting it.
        """
        if not self.expecting_query_plan or self.n_query_plan_reminders >= 5:
            # nothing expected, or the reminder budget is used up
            self.n_query_plan_reminders = 0
            return None

        self.n_query_plan_reminders += 1
        return """
            You FORGOT to use the `query_plan` tool/function,
            OR you had a WRONG JSON SYNTAX when trying to use it.
            Re-try your response using the `query_plan` tool/function CORRECTLY.
            """
|
@@ -0,0 +1,61 @@
|
|
1
|
+
import logging
|
2
|
+
|
3
|
+
from langroid.agent.tool_message import ToolMessage
|
4
|
+
from langroid.pydantic_v1 import BaseModel, Field
|
5
|
+
|
6
|
+
logger = logging.getLogger(__name__)
|
7
|
+
|
8
|
+
|
9
|
+
class QueryPlan(BaseModel):
    # Structured plan: two required query strings plus an optional filter and
    # an optional dataframe calculation (both default to the empty string).
    # (`#` comments are used so the model's schema description is unchanged.)

    original_query: str = Field(
        default=...,
        description="The original query for reference",
    )
    query: str = Field(
        default=...,
        description="A possibly NON-EMPTY rephrased query",
    )
    filter: str = Field(
        default="",
        description="Filter condition if needed (or empty if no filter is needed)",
    )
    dataframe_calc: str = Field(
        default="",
        description="An optional Pandas-dataframe calculation/aggregation string",
    )
|
19
|
+
|
20
|
+
|
21
|
+
class QueryPlanTool(ToolMessage):
    # Tool message that carries a `QueryPlan`.

    # the agent method name that handles this tool
    request: str = "query_plan"
    purpose: str = """
    Given a user's query, generate a query <plan> consisting of:
    - <original_query> - the original query for reference
    - <filter> condition if needed (or empty string if no filter is needed)
    - <query> - a possibly NON-EMPTY rephrased query that can be used to match the
    CONTENT of the documents
    (can be same as <original_query> if no rephrasing is needed)
    - <dataframe_calc> - a Pandas-dataframe calculation/aggregation string
    that can be used to calculate the answer
    (or empty string if no calculation is needed).
    """
    plan: QueryPlan
|
35
|
+
|
36
|
+
|
37
|
+
class AnswerTool(ToolMessage):
    """Wrapper for answer from LanceDocChatAgent"""

    # the agent method name that handles this tool
    request: str = "answer_tool"
    purpose: str = "To package the answer from LanceDocChatAgent"
    # the answer text being wrapped
    answer: str
|
43
|
+
|
44
|
+
|
45
|
+
class QueryPlanAnswerTool(ToolMessage):
    # Tool message pairing a `QueryPlan` with the answer obtained for it.

    # the agent method name that handles this tool
    request: str = "query_plan_answer"
    purpose: str = """
    Assemble query <plan> and <answer>
    """
    plan: QueryPlan
    answer: str = Field(
        default=...,
        description="The answer received from the assistant",
    )
|
52
|
+
|
53
|
+
|
54
|
+
class QueryPlanFeedbackTool(ToolMessage):
    # Tool message carrying feedback on a query plan and an optional fix.

    # the agent method name that handles this tool
    request: str = "query_plan_feedback"
    purpose: str = """
    To give <feedback> regarding the query plan,
    along with a <suggested_fix> if any (empty string if no fix is suggested).
    """
    feedback: str
    # empty string when no fix is suggested (see `purpose`)
    suggested_fix: str
|
File without changes
|
@@ -0,0 +1,174 @@
|
|
1
|
+
from typing import List, Optional, Tuple
|
2
|
+
|
3
|
+
import pandas as pd
|
4
|
+
import typer
|
5
|
+
|
6
|
+
from langroid.agent.special.neo4j.neo4j_chat_agent import (
|
7
|
+
Neo4jChatAgent,
|
8
|
+
Neo4jChatAgentConfig,
|
9
|
+
)
|
10
|
+
from langroid.agent.tool_message import ToolMessage
|
11
|
+
from langroid.language_models.openai_gpt import OpenAIChatModel, OpenAIGPTConfig
|
12
|
+
from langroid.parsing.table_loader import read_tabular_data
|
13
|
+
from langroid.utils.output import status
|
14
|
+
from langroid.vector_store.base import VectorStoreConfig
|
15
|
+
|
16
|
+
app = typer.Typer()
|
17
|
+
|
18
|
+
|
19
|
+
# Prompt template for building a knowledge graph from tabular data.
# It is rendered with `str.format` and has exactly two placeholders:
# {header} and {sample_rows}.
BUILD_KG_INSTRUCTIONS = """
Your task is to build a knowledge graph based on a CSV file.

You need to generate the graph database based on this
header:

{header}

and these sample rows:

{sample_rows}.

Leverage the above information to:
- Define node labels and their properties
- Infer relationships
- Infer constraints
ASK me if you need further information to figure out the schema.
You can use the tool/function `pandas_to_kg` to display and confirm
the nodes and relationships.
"""

# Base system message for the CSV knowledge-graph agent; the rendered build
# instructions above are appended to it when data is loaded from a file/URL.
DEFAULT_CSV_KG_CHAT_SYSTEM_MESSAGE = """
You are an expert in Knowledge Graphs and analyzing them using Neo4j.
You will be asked to answer questions based on the knowledge graph.
"""
|
44
|
+
|
45
|
+
|
46
|
+
def _preprocess_dataframe_for_neo4j(
    df: pd.DataFrame, default_value: Optional[str] = None, remove_null_rows: bool = True
) -> pd.DataFrame:
    """
    Preprocess a DataFrame for Neo4j import by fixing mismatched quotes in string
    columns and handling null or missing values.

    The input DataFrame is left untouched; all changes are made on a copy.

    Args:
        df (DataFrame): The DataFrame to be preprocessed.
        default_value (str, optional): The default value to replace null values.
            This is ignored if remove_null_rows is True. Defaults to None.
        remove_null_rows (bool, optional): If True, rows with any null values will
            be removed. If False, null values are filled with default_value
            (or left as they are when default_value is None). Defaults to True.

    Returns:
        DataFrame: The preprocessed DataFrame ready for Neo4j import.
    """

    def _balance_quotes(value: object) -> object:
        # A string with an odd number of double quotes gets one appended;
        # every other value (including nulls) is passed through unchanged.
        if isinstance(value, str) and value.count('"') % 2 != 0:
            return value + '"'
        return value

    # Work on a copy: assigning columns below would otherwise modify the
    # caller's DataFrame as a hidden side effect.
    df = df.copy()

    # Fix mismatched quotes in string columns
    for column in df.select_dtypes(include=["object"]):
        df[column] = df[column].apply(_balance_quotes)

    # Handle null or missing values
    if remove_null_rows:
        return df.dropna()
    if default_value is not None:
        return df.fillna(default_value)
    return df
|
79
|
+
|
80
|
+
|
81
|
+
class CSVGraphAgentConfig(Neo4jChatAgentConfig):
    # Configuration for `CSVGraphAgent`: Neo4j chat settings plus the tabular
    # data source the knowledge graph is built from.

    system_message: str = DEFAULT_CSV_KG_CHAT_SYSTEM_MESSAGE
    # data file, URL, or DataFrame
    data: str | pd.DataFrame | None
    # separator for data file
    separator: Optional[str] = None
    vecdb: Optional[VectorStoreConfig] = None
    llm: OpenAIGPTConfig = OpenAIGPTConfig(chat_model=OpenAIChatModel.GPT4_TURBO)
|
89
|
+
|
90
|
+
|
91
|
+
class PandasToKGTool(ToolMessage):
    # Tool message carrying a parameterised Cypher query and the names of the
    # data columns that supply its parameters; handled by `pandas_to_kg`.

    request: str = "pandas_to_kg"
    purpose: str = """Use this tool to create ONLY nodes and their relationships based
    on the created model.
    Take into account that the Cypher query will be executed while iterating
    over the rows in the CSV file (e.g. `index, row in df.iterrows()`),
    so there NO NEED to load the CSV.
    Make sure you send me the cypher query in this format:
    - placeholders in <cypherQuery> should be based on the CSV header.
    - <args> an array wherein each element corresponds to a placeholder in the
    <cypherQuery> and provided in the same order as the headers.
    SO the <args> should be the result of: `[row_dict[header] for header in headers]`
    """
    # Cypher statement with `$name` placeholders
    cypherQuery: str
    # column names whose per-row values are bound to the placeholders
    args: list[str]

    @classmethod
    def examples(cls) -> List["ToolMessage" | Tuple[str, "ToolMessage"]]:
        """Return a single sample instance of this tool."""
        sample_args = ["employeeName", "employeeId", "departmentName", "employeeEmail"]
        sample = cls(
            cypherQuery="""MERGE (employee:Employee {name: $employeeName,
                id: $employeeId})\n
                MERGE (department:Department {name: $departmentName})\n
                MERGE (employee)-[:WORKS_IN]->(department)\n
                SET employee.email = $employeeEmail""",
            args=sample_args,
        )
        return [sample]
|
119
|
+
|
120
|
+
|
121
|
+
class CSVGraphAgent(Neo4jChatAgent):
    """Neo4j chat agent that builds a knowledge graph from tabular data.

    The data comes from `config.data` (a DataFrame, or a file path / URL that
    is loaded with `read_tabular_data`). The LLM proposes a Cypher query via
    `PandasToKGTool`, which `pandas_to_kg` runs once per data row.
    """

    def __init__(self, config: CSVGraphAgentConfig):
        """Load the data (if any) and extend the system message accordingly.

        Args:
            config (CSVGraphAgentConfig): agent settings; note that its
                `system_message` is updated in place.
        """
        formatted_build_instr = ""
        # Always define the attribute: without this, an agent created with no
        # data has no `df` at all and `pandas_to_kg` fails with AttributeError.
        self.df: Optional[pd.DataFrame] = None

        if isinstance(config.data, pd.DataFrame):
            # NOTE(review): a DataFrame passed directly is used as-is (no
            # cleaning, no build instructions). The nesting was reconstructed
            # from a flattened listing -- confirm this is intended.
            self.df = config.data
        elif config.data:
            df = read_tabular_data(config.data, config.separator)
            df_cleaned = _preprocess_dataframe_for_neo4j(df)

            # Trim column names and collapse runs of spaces to "_".
            df_cleaned.columns = df_cleaned.columns.str.strip().str.replace(
                " +", "_", regex=True
            )

            self.df = df_cleaned

            formatted_build_instr = BUILD_KG_INSTRUCTIONS.format(
                header=self.df.columns, sample_rows=self.df.head(3)
            )

        config.system_message = config.system_message + formatted_build_instr
        super().__init__(config)

        self.config: Neo4jChatAgentConfig = config

        self.enable_message(PandasToKGTool)

    def pandas_to_kg(self, msg: PandasToKGTool) -> str:
        """
        Creates nodes and relationships in the graph database based on the data in
        a CSV file.

        The Cypher query in `msg` is executed once per row, with the row's
        values for the columns listed in `msg.args` bound as parameters.

        Args:
            msg (PandasToKGTool): An instance of the PandasToKGTool class containing
                the necessary information for generating nodes.

        Returns:
            str: A string indicating the success or failure of the operation.
        """
        if self.df is None or not hasattr(self.df, "iterrows"):
            # Nothing to iterate over: say so instead of reporting success.
            return "No tabular data is loaded, so the graph database was not generated"

        # Report unknown column names as a message (which can be corrected on
        # a retry) rather than failing with a KeyError inside the loop.
        unknown = [header for header in msg.args if header not in self.df.columns]
        if unknown:
            return (
                f"These args are not columns of the data: {unknown}. "
                f"Available columns: {list(self.df.columns)}"
            )

        with status("[cyan]Generating graph database..."):
            for counter, (_, row) in enumerate(self.df.iterrows()):
                row_dict = row.to_dict()
                response = self.write_query(
                    msg.cypherQuery,
                    parameters={header: row_dict[header] for header in msg.args},
                )
                # The generated cypher query may be incorrect, so the response
                # for the first row is checked before continuing with the rest.
                if counter == 0 and not response.success:
                    return str(response.data)
        return "Graph database successfully generated"
|