npcpy 1.0.26__py3-none-any.whl → 1.2.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- npcpy/__init__.py +0 -7
- npcpy/data/audio.py +16 -99
- npcpy/data/image.py +43 -42
- npcpy/data/load.py +83 -124
- npcpy/data/text.py +28 -28
- npcpy/data/video.py +8 -32
- npcpy/data/web.py +51 -23
- npcpy/ft/diff.py +110 -0
- npcpy/ft/ge.py +115 -0
- npcpy/ft/memory_trainer.py +171 -0
- npcpy/ft/model_ensembler.py +357 -0
- npcpy/ft/rl.py +360 -0
- npcpy/ft/sft.py +248 -0
- npcpy/ft/usft.py +128 -0
- npcpy/gen/audio_gen.py +24 -0
- npcpy/gen/embeddings.py +13 -13
- npcpy/gen/image_gen.py +262 -117
- npcpy/gen/response.py +615 -415
- npcpy/gen/video_gen.py +53 -7
- npcpy/llm_funcs.py +1869 -437
- npcpy/main.py +1 -1
- npcpy/memory/command_history.py +844 -510
- npcpy/memory/kg_vis.py +833 -0
- npcpy/memory/knowledge_graph.py +892 -1845
- npcpy/memory/memory_processor.py +81 -0
- npcpy/memory/search.py +188 -90
- npcpy/mix/debate.py +192 -3
- npcpy/npc_compiler.py +1672 -801
- npcpy/npc_sysenv.py +593 -1266
- npcpy/serve.py +3120 -0
- npcpy/sql/ai_function_tools.py +257 -0
- npcpy/sql/database_ai_adapters.py +186 -0
- npcpy/sql/database_ai_functions.py +163 -0
- npcpy/sql/model_runner.py +19 -19
- npcpy/sql/npcsql.py +706 -507
- npcpy/sql/sql_model_compiler.py +156 -0
- npcpy/tools.py +183 -0
- npcpy/work/plan.py +13 -279
- npcpy/work/trigger.py +3 -3
- npcpy-1.2.32.dist-info/METADATA +803 -0
- npcpy-1.2.32.dist-info/RECORD +54 -0
- npcpy/data/dataframes.py +0 -171
- npcpy/memory/deep_research.py +0 -125
- npcpy/memory/sleep.py +0 -557
- npcpy/modes/_state.py +0 -78
- npcpy/modes/alicanto.py +0 -1075
- npcpy/modes/guac.py +0 -785
- npcpy/modes/mcp_npcsh.py +0 -822
- npcpy/modes/npc.py +0 -213
- npcpy/modes/npcsh.py +0 -1158
- npcpy/modes/plonk.py +0 -409
- npcpy/modes/pti.py +0 -234
- npcpy/modes/serve.py +0 -1637
- npcpy/modes/spool.py +0 -312
- npcpy/modes/wander.py +0 -549
- npcpy/modes/yap.py +0 -572
- npcpy/npc_team/alicanto.npc +0 -2
- npcpy/npc_team/alicanto.png +0 -0
- npcpy/npc_team/assembly_lines/test_pipeline.py +0 -181
- npcpy/npc_team/corca.npc +0 -13
- npcpy/npc_team/foreman.npc +0 -7
- npcpy/npc_team/frederic.npc +0 -6
- npcpy/npc_team/frederic4.png +0 -0
- npcpy/npc_team/guac.png +0 -0
- npcpy/npc_team/jinxs/automator.jinx +0 -18
- npcpy/npc_team/jinxs/bash_executer.jinx +0 -31
- npcpy/npc_team/jinxs/calculator.jinx +0 -11
- npcpy/npc_team/jinxs/edit_file.jinx +0 -96
- npcpy/npc_team/jinxs/file_chat.jinx +0 -14
- npcpy/npc_team/jinxs/gui_controller.jinx +0 -28
- npcpy/npc_team/jinxs/image_generation.jinx +0 -29
- npcpy/npc_team/jinxs/internet_search.jinx +0 -30
- npcpy/npc_team/jinxs/local_search.jinx +0 -152
- npcpy/npc_team/jinxs/npcsh_executor.jinx +0 -31
- npcpy/npc_team/jinxs/python_executor.jinx +0 -8
- npcpy/npc_team/jinxs/screen_cap.jinx +0 -25
- npcpy/npc_team/jinxs/sql_executor.jinx +0 -33
- npcpy/npc_team/kadiefa.npc +0 -3
- npcpy/npc_team/kadiefa.png +0 -0
- npcpy/npc_team/npcsh.ctx +0 -9
- npcpy/npc_team/npcsh_sibiji.png +0 -0
- npcpy/npc_team/plonk.npc +0 -2
- npcpy/npc_team/plonk.png +0 -0
- npcpy/npc_team/plonkjr.npc +0 -2
- npcpy/npc_team/plonkjr.png +0 -0
- npcpy/npc_team/sibiji.npc +0 -5
- npcpy/npc_team/sibiji.png +0 -0
- npcpy/npc_team/spool.png +0 -0
- npcpy/npc_team/templates/analytics/celona.npc +0 -0
- npcpy/npc_team/templates/hr_support/raone.npc +0 -0
- npcpy/npc_team/templates/humanities/eriane.npc +0 -4
- npcpy/npc_team/templates/it_support/lineru.npc +0 -0
- npcpy/npc_team/templates/marketing/slean.npc +0 -4
- npcpy/npc_team/templates/philosophy/maurawa.npc +0 -0
- npcpy/npc_team/templates/sales/turnic.npc +0 -4
- npcpy/npc_team/templates/software/welxor.npc +0 -0
- npcpy/npc_team/yap.png +0 -0
- npcpy/routes.py +0 -958
- npcpy/work/mcp_helpers.py +0 -357
- npcpy/work/mcp_server.py +0 -194
- npcpy-1.0.26.data/data/npcpy/npc_team/alicanto.npc +0 -2
- npcpy-1.0.26.data/data/npcpy/npc_team/alicanto.png +0 -0
- npcpy-1.0.26.data/data/npcpy/npc_team/automator.jinx +0 -18
- npcpy-1.0.26.data/data/npcpy/npc_team/bash_executer.jinx +0 -31
- npcpy-1.0.26.data/data/npcpy/npc_team/calculator.jinx +0 -11
- npcpy-1.0.26.data/data/npcpy/npc_team/celona.npc +0 -0
- npcpy-1.0.26.data/data/npcpy/npc_team/corca.npc +0 -13
- npcpy-1.0.26.data/data/npcpy/npc_team/edit_file.jinx +0 -96
- npcpy-1.0.26.data/data/npcpy/npc_team/eriane.npc +0 -4
- npcpy-1.0.26.data/data/npcpy/npc_team/file_chat.jinx +0 -14
- npcpy-1.0.26.data/data/npcpy/npc_team/foreman.npc +0 -7
- npcpy-1.0.26.data/data/npcpy/npc_team/frederic.npc +0 -6
- npcpy-1.0.26.data/data/npcpy/npc_team/frederic4.png +0 -0
- npcpy-1.0.26.data/data/npcpy/npc_team/guac.png +0 -0
- npcpy-1.0.26.data/data/npcpy/npc_team/gui_controller.jinx +0 -28
- npcpy-1.0.26.data/data/npcpy/npc_team/image_generation.jinx +0 -29
- npcpy-1.0.26.data/data/npcpy/npc_team/internet_search.jinx +0 -30
- npcpy-1.0.26.data/data/npcpy/npc_team/kadiefa.npc +0 -3
- npcpy-1.0.26.data/data/npcpy/npc_team/kadiefa.png +0 -0
- npcpy-1.0.26.data/data/npcpy/npc_team/lineru.npc +0 -0
- npcpy-1.0.26.data/data/npcpy/npc_team/local_search.jinx +0 -152
- npcpy-1.0.26.data/data/npcpy/npc_team/maurawa.npc +0 -0
- npcpy-1.0.26.data/data/npcpy/npc_team/npcsh.ctx +0 -9
- npcpy-1.0.26.data/data/npcpy/npc_team/npcsh_executor.jinx +0 -31
- npcpy-1.0.26.data/data/npcpy/npc_team/npcsh_sibiji.png +0 -0
- npcpy-1.0.26.data/data/npcpy/npc_team/plonk.npc +0 -2
- npcpy-1.0.26.data/data/npcpy/npc_team/plonk.png +0 -0
- npcpy-1.0.26.data/data/npcpy/npc_team/plonkjr.npc +0 -2
- npcpy-1.0.26.data/data/npcpy/npc_team/plonkjr.png +0 -0
- npcpy-1.0.26.data/data/npcpy/npc_team/python_executor.jinx +0 -8
- npcpy-1.0.26.data/data/npcpy/npc_team/raone.npc +0 -0
- npcpy-1.0.26.data/data/npcpy/npc_team/screen_cap.jinx +0 -25
- npcpy-1.0.26.data/data/npcpy/npc_team/sibiji.npc +0 -5
- npcpy-1.0.26.data/data/npcpy/npc_team/sibiji.png +0 -0
- npcpy-1.0.26.data/data/npcpy/npc_team/slean.npc +0 -4
- npcpy-1.0.26.data/data/npcpy/npc_team/spool.png +0 -0
- npcpy-1.0.26.data/data/npcpy/npc_team/sql_executor.jinx +0 -33
- npcpy-1.0.26.data/data/npcpy/npc_team/test_pipeline.py +0 -181
- npcpy-1.0.26.data/data/npcpy/npc_team/turnic.npc +0 -4
- npcpy-1.0.26.data/data/npcpy/npc_team/welxor.npc +0 -0
- npcpy-1.0.26.data/data/npcpy/npc_team/yap.png +0 -0
- npcpy-1.0.26.dist-info/METADATA +0 -827
- npcpy-1.0.26.dist-info/RECORD +0 -139
- npcpy-1.0.26.dist-info/entry_points.txt +0 -11
- /npcpy/{modes → ft}/__init__.py +0 -0
- {npcpy-1.0.26.dist-info → npcpy-1.2.32.dist-info}/WHEEL +0 -0
- {npcpy-1.0.26.dist-info → npcpy-1.2.32.dist-info}/licenses/LICENSE +0 -0
- {npcpy-1.0.26.dist-info → npcpy-1.2.32.dist-info}/top_level.txt +0 -0
npcpy/memory/knowledge_graph.py
CHANGED
|
@@ -1,22 +1,30 @@
|
|
|
1
|
-
import
|
|
2
|
-
import os
|
|
1
|
+
from collections import defaultdict
|
|
3
2
|
import datetime
|
|
4
|
-
|
|
5
|
-
import numpy as np
|
|
6
|
-
|
|
3
|
+
import json
|
|
7
4
|
try:
|
|
8
5
|
import kuzu
|
|
9
6
|
except ModuleNotFoundError:
|
|
10
7
|
print("kuzu not installed")
|
|
8
|
+
import os
|
|
9
|
+
import random
|
|
10
|
+
import pandas as pd
|
|
11
11
|
from typing import Optional, Dict, List, Union, Tuple, Any, Set
|
|
12
12
|
|
|
13
|
+
from npcpy.llm_funcs import (
|
|
14
|
+
abstract,
|
|
15
|
+
consolidate_facts_llm,
|
|
16
|
+
generate_groups,
|
|
17
|
+
get_facts,
|
|
18
|
+
get_llm_response,
|
|
19
|
+
get_related_concepts_multi,
|
|
20
|
+
get_related_facts_llm,
|
|
21
|
+
prune_fact_subset_llm,
|
|
22
|
+
remove_idempotent_groups,
|
|
23
|
+
zoom_in,
|
|
24
|
+
)
|
|
13
25
|
|
|
14
|
-
from npcpy.
|
|
15
|
-
from npcpy.npc_compiler import NPC
|
|
16
|
-
import sqlite3
|
|
17
|
-
|
|
26
|
+
from npcpy.memory.command_history import load_kg_from_db, save_kg_to_db
|
|
18
27
|
|
|
19
|
-
import random
|
|
20
28
|
def safe_kuzu_execute(conn, query, error_message="Kuzu query failed"):
|
|
21
29
|
"""Execute a Kuzu query with proper error handling"""
|
|
22
30
|
try:
|
|
@@ -35,7 +43,7 @@ def create_group(conn, name: str, metadata: str = ""):
|
|
|
35
43
|
return False
|
|
36
44
|
|
|
37
45
|
try:
|
|
38
|
-
|
|
46
|
+
|
|
39
47
|
escaped_name = name.replace('"', '\\"')
|
|
40
48
|
escaped_metadata = metadata.replace('"', '\\"')
|
|
41
49
|
|
|
@@ -70,13 +78,13 @@ def init_db(db_path: str, drop=False):
|
|
|
70
78
|
print("Database connection established successfully")
|
|
71
79
|
|
|
72
80
|
if drop:
|
|
73
|
-
|
|
81
|
+
|
|
74
82
|
safe_kuzu_execute(conn, "DROP TABLE IF EXISTS Contains")
|
|
75
|
-
safe_kuzu_execute(conn, "DROP TABLE IF EXISTS EvolvedFrom")
|
|
83
|
+
safe_kuzu_execute(conn, "DROP TABLE IF EXISTS EvolvedFrom")
|
|
76
84
|
safe_kuzu_execute(conn, "DROP TABLE IF EXISTS Fact")
|
|
77
85
|
safe_kuzu_execute(conn, "DROP TABLE IF EXISTS Groups")
|
|
78
86
|
|
|
79
|
-
|
|
87
|
+
|
|
80
88
|
safe_kuzu_execute(
|
|
81
89
|
conn,
|
|
82
90
|
"""
|
|
@@ -90,7 +98,7 @@ def init_db(db_path: str, drop=False):
|
|
|
90
98
|
"Failed to create Fact table",
|
|
91
99
|
)
|
|
92
100
|
|
|
93
|
-
|
|
101
|
+
|
|
94
102
|
safe_kuzu_execute(
|
|
95
103
|
conn,
|
|
96
104
|
"""
|
|
@@ -106,14 +114,14 @@ def init_db(db_path: str, drop=False):
|
|
|
106
114
|
)
|
|
107
115
|
print("Groups table (with generation tracking) created or already exists.")
|
|
108
116
|
|
|
109
|
-
|
|
117
|
+
|
|
110
118
|
safe_kuzu_execute(
|
|
111
119
|
conn,
|
|
112
120
|
"CREATE REL TABLE IF NOT EXISTS Contains(FROM Groups TO Fact);",
|
|
113
121
|
"Failed to create Contains relationship table",
|
|
114
122
|
)
|
|
115
123
|
|
|
116
|
-
|
|
124
|
+
|
|
117
125
|
safe_kuzu_execute(
|
|
118
126
|
conn,
|
|
119
127
|
"""
|
|
@@ -133,531 +141,500 @@ def init_db(db_path: str, drop=False):
|
|
|
133
141
|
print(f"Fatal error initializing database: {str(e)}")
|
|
134
142
|
traceback.print_exc()
|
|
135
143
|
return None
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def find_similar_groups(
|
|
148
|
+
conn,
|
|
149
|
+
fact: str,
|
|
150
|
+
model,
|
|
151
|
+
provider,
|
|
152
|
+
npc = None,
|
|
153
|
+
context: str = None,
|
|
154
|
+
**kwargs: Any
|
|
142
155
|
) -> List[str]:
|
|
143
|
-
"""
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
Star Wars franchise and I am a member of the 501st Legion."
|
|
163
|
-
You might extract the following facts:
|
|
164
|
-
- The individual is a software engineer
|
|
165
|
-
- The individual loves to play video games
|
|
166
|
-
- The individual is a huge fan of the Star Wars franchise
|
|
167
|
-
- The individual is a member of the 501st Legion
|
|
168
|
-
|
|
169
|
-
Another example:
|
|
170
|
-
"The quantum tunneling effect allows particles to pass through barriers
|
|
171
|
-
that classical physics says they shouldn't be able to cross. This has
|
|
172
|
-
huge implications for semiconductor design."
|
|
173
|
-
You might extract these facts:
|
|
174
|
-
- Quantum tunneling enables particles to pass through barriers that are
|
|
175
|
-
impassable according to classical physics
|
|
176
|
-
- The behavior of quantum tunneling has significant implications for
|
|
177
|
-
how semiconductors must be designed
|
|
178
|
-
|
|
179
|
-
Another example:
|
|
180
|
-
"People used to think the Earth was flat. Now we know it's spherical,
|
|
181
|
-
though technically it's an oblate spheroid due to its rotation."
|
|
182
|
-
You might extract these facts:
|
|
183
|
-
- People historically believed the Earth was flat
|
|
184
|
-
- It is now known that the Earth is an oblate spheroid
|
|
185
|
-
- The Earth's oblate spheroid shape is caused by its rotation
|
|
186
|
-
|
|
187
|
-
Another example:
|
|
188
|
-
"My research on black holes suggests they emit radiation, but my professor
|
|
189
|
-
says this conflicts with Einstein's work. After reading more papers, I
|
|
190
|
-
learned this is actually Hawking radiation and doesn't conflict at all."
|
|
191
|
-
You might extract the following facts:
|
|
192
|
-
- Black holes emit radiation
|
|
193
|
-
- The professor believes this radiation conflicts with Einstein's work
|
|
194
|
-
- The radiation from black holes is called Hawking radiation
|
|
195
|
-
- Hawking radiation does not conflict with Einstein's work
|
|
196
|
-
|
|
197
|
-
Another example:
|
|
198
|
-
"During the pandemic, many developers switched to remote work. I found
|
|
199
|
-
that I'm actually more productive at home, though my company initially
|
|
200
|
-
thought productivity would drop. Now they're keeping remote work permanent."
|
|
201
|
-
You might extract the following facts:
|
|
202
|
-
- The pandemic caused many developers to switch to remote work
|
|
203
|
-
- The individual discovered higher productivity when working from home
|
|
204
|
-
- The company predicted productivity would decrease with remote work
|
|
205
|
-
- The company decided to make remote work a permanent option
|
|
206
|
-
|
|
207
|
-
Thus, it is your mission to reliably extract lists of facts.
|
|
208
|
-
|
|
209
|
-
Return a JSON object with the following structure:
|
|
210
|
-
{
|
|
211
|
-
"fact_list": "a list containing the facts where each fact is a string",
|
|
212
|
-
}
|
|
213
|
-
"""
|
|
214
|
-
if len(context) > 0:
|
|
215
|
-
prompt+=f""" Here is some relevant user context: {context}"""
|
|
156
|
+
"""Find existing groups that might contain this fact"""
|
|
157
|
+
response = conn.execute(f"MATCH (g:Groups) RETURN g.name;")
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
groups = response.fetch_as_df()
|
|
162
|
+
|
|
163
|
+
if not groups:
|
|
164
|
+
return []
|
|
165
|
+
|
|
166
|
+
prompt = """Given a fact and a list of groups, determine which groups this fact belongs to.
|
|
167
|
+
A fact should belong to a group if it is semantically related to the group's theme or purpose.
|
|
168
|
+
For example, if a fact is "The user loves programming" and there's a group called "Technical_Interests",
|
|
169
|
+
that would be a match.
|
|
170
|
+
|
|
171
|
+
Return a JSON object with the following structure:
|
|
172
|
+
{
|
|
173
|
+
"group_list": "a list containing the names of matching groups"
|
|
174
|
+
}
|
|
216
175
|
|
|
217
|
-
prompt+="""
|
|
218
176
|
Return only the JSON object.
|
|
219
177
|
Do not include any additional markdown formatting.
|
|
220
178
|
"""
|
|
221
179
|
|
|
222
180
|
response = get_llm_response(
|
|
223
|
-
prompt + f"
|
|
181
|
+
prompt + f"\n\nFact: {fact}\nGroups: {json.dumps(groups)}",
|
|
224
182
|
model=model,
|
|
225
183
|
provider=provider,
|
|
226
184
|
format="json",
|
|
185
|
+
npc=npc,
|
|
186
|
+
context=context,
|
|
187
|
+
**kwargs
|
|
227
188
|
)
|
|
228
189
|
response = response["response"]
|
|
229
|
-
return response
|
|
190
|
+
return response["group_list"]
|
|
230
191
|
|
|
231
192
|
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
)
|
|
240
|
-
"""Condense the conversation context into a small set of key extractions."""
|
|
241
|
-
if not messages:
|
|
242
|
-
return {"output": {}, "messages": []}
|
|
193
|
+
def kg_initial(content,
|
|
194
|
+
model=None,
|
|
195
|
+
provider=None,
|
|
196
|
+
npc=None,
|
|
197
|
+
context='',
|
|
198
|
+
facts=None,
|
|
199
|
+
generation=None,
|
|
200
|
+
verbose=True,):
|
|
243
201
|
|
|
244
|
-
|
|
202
|
+
if generation is None:
|
|
203
|
+
CURRENT_GENERATION = 0
|
|
204
|
+
else:
|
|
205
|
+
CURRENT_GENERATION = generation
|
|
206
|
+
|
|
207
|
+
print(f"--- Running KG Structuring Process (Generation: {CURRENT_GENERATION}) ---")
|
|
208
|
+
|
|
209
|
+
if facts is None:
|
|
210
|
+
if not content:
|
|
211
|
+
raise ValueError("kg_initial requires either content_text or a list of facts.")
|
|
212
|
+
print(" - Mode: Deriving new facts from text content...")
|
|
213
|
+
all_facts = []
|
|
214
|
+
print(len(content))
|
|
215
|
+
if len(content)>10000:
|
|
216
|
+
# randomly sub sample 10000 characters
|
|
217
|
+
starting_point = random.randint(0, len(content)-10000)
|
|
218
|
+
|
|
219
|
+
content_to_sample = content[starting_point:starting_point+10000]
|
|
245
220
|
|
|
246
|
-
|
|
247
|
-
|
|
221
|
+
for n in range(len(content)//10000):
|
|
222
|
+
print(n)
|
|
223
|
+
print(starting_point)
|
|
224
|
+
print(content_to_sample[0:1000])
|
|
225
|
+
facts = get_facts(content_to_sample,
|
|
226
|
+
model=model,
|
|
227
|
+
provider=provider,
|
|
228
|
+
npc=npc,
|
|
229
|
+
context=context)
|
|
230
|
+
if verbose:
|
|
231
|
+
print(f" - Extracted {len(facts)} facts from segment {n+1}")
|
|
232
|
+
print(facts)
|
|
233
|
+
all_facts.extend(facts)
|
|
234
|
+
else:
|
|
235
|
+
print(content[0:1000] )
|
|
236
|
+
all_facts = get_facts(content,
|
|
237
|
+
model=model,
|
|
238
|
+
provider=provider,
|
|
239
|
+
npc=npc,
|
|
240
|
+
context=context)
|
|
241
|
+
if verbose:
|
|
242
|
+
print(f" - Extracted {len(all_facts)} facts from content")
|
|
243
|
+
print(all_facts)
|
|
244
|
+
for fact in all_facts:
|
|
245
|
+
|
|
246
|
+
fact['generation'] = CURRENT_GENERATION
|
|
247
|
+
else:
|
|
248
|
+
print(f" - Mode: Building structure from {len(facts)} pre-existing facts...")
|
|
249
|
+
|
|
250
|
+
print(" - Inferring implied facts (zooming in)...")
|
|
251
|
+
all_implied_facts = []
|
|
252
|
+
if len(all_facts) > 20:
|
|
253
|
+
# sub sample facts randomly to generate zoomed in facts
|
|
254
|
+
sampled_facts = random.sample(all_facts, k=20)
|
|
255
|
+
for n in range(len(all_facts) // 20):
|
|
256
|
+
implied_facts = zoom_in(sampled_facts,
|
|
257
|
+
model=model,
|
|
258
|
+
provider=provider,
|
|
259
|
+
npc=npc,
|
|
260
|
+
context=context)
|
|
261
|
+
all_implied_facts.extend(implied_facts)
|
|
262
|
+
if verbose:
|
|
263
|
+
print(f" - Inferred {len(implied_facts)} implied facts from sample {n+1}")
|
|
264
|
+
print(implied_facts)
|
|
265
|
+
else:
|
|
266
|
+
implied_facts = zoom_in(all_facts,
|
|
267
|
+
model=model,
|
|
268
|
+
provider=provider,
|
|
269
|
+
npc=npc,
|
|
270
|
+
context=context)
|
|
271
|
+
print(implied_facts)
|
|
272
|
+
|
|
273
|
+
all_implied_facts.extend(implied_facts)
|
|
274
|
+
|
|
275
|
+
if verbose:
|
|
276
|
+
print(f" - Inferred {len(implied_facts)} implied facts from all facts")
|
|
277
|
+
print(implied_facts)
|
|
278
|
+
for fact in all_implied_facts:
|
|
279
|
+
fact['generation'] = CURRENT_GENERATION
|
|
280
|
+
|
|
281
|
+
all_facts = all_facts + all_implied_facts
|
|
282
|
+
|
|
283
|
+
print(" - Generating concepts from all facts...")
|
|
284
|
+
concepts = generate_groups(all_facts,
|
|
285
|
+
model=model,
|
|
286
|
+
provider=provider,
|
|
287
|
+
npc=npc,
|
|
288
|
+
context=context)
|
|
289
|
+
for concept in concepts:
|
|
290
|
+
concept['generation'] = CURRENT_GENERATION
|
|
291
|
+
|
|
292
|
+
if verbose:
|
|
293
|
+
print(f" - Generated {len(concepts)} concepts")
|
|
294
|
+
print(concepts)
|
|
295
|
+
print(" - Linking facts to concepts...")
|
|
296
|
+
fact_to_concept_links = defaultdict(list)
|
|
297
|
+
concept_names = [c['name'] for c in concepts if c and 'name' in c]
|
|
298
|
+
for fact in all_facts:
|
|
299
|
+
|
|
300
|
+
fact_to_concept_links[fact['statement']] = get_related_concepts_multi(fact['statement'], "fact", concept_names, model, provider, npc, context)
|
|
301
|
+
if verbose:
|
|
302
|
+
print(fact_to_concept_links[fact['statement']])
|
|
303
|
+
print(" - Linking facts to other facts...")
|
|
304
|
+
fact_to_fact_links = []
|
|
305
|
+
fact_statements = [f['statement'] for f in all_facts]
|
|
306
|
+
for i, fact in enumerate(all_facts):
|
|
307
|
+
other_fact_statements = fact_statements[all_facts != fact]
|
|
308
|
+
print('checking fact: ', fact)
|
|
309
|
+
if other_fact_statements:
|
|
310
|
+
related_fact_stmts = get_related_facts_llm(fact['statement'],
|
|
311
|
+
other_fact_statements,
|
|
312
|
+
model=model,
|
|
313
|
+
provider=provider,
|
|
314
|
+
npc=npc,
|
|
315
|
+
context=context)
|
|
316
|
+
for related_stmt in related_fact_stmts:
|
|
317
|
+
|
|
318
|
+
fact_to_fact_links.append((fact['statement'], related_stmt))
|
|
319
|
+
if verbose:
|
|
320
|
+
print(fact['statement'], related_stmt)
|
|
248
321
|
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
"facts":
|
|
322
|
+
return {
|
|
323
|
+
"generation": CURRENT_GENERATION,
|
|
324
|
+
"facts": all_facts,
|
|
325
|
+
"concepts": concepts,
|
|
326
|
+
"concept_links": [],
|
|
327
|
+
"fact_to_concept_links": dict(fact_to_concept_links),
|
|
328
|
+
"fact_to_fact_links": fact_to_fact_links
|
|
252
329
|
}
|
|
253
330
|
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
)
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
)
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
provider=provider,
|
|
284
|
-
npc=npc,
|
|
285
|
-
min_top=min_top,
|
|
286
|
-
max_top=max_top,
|
|
287
|
-
max_levels=max_levels
|
|
288
|
-
)
|
|
331
|
+
|
|
332
|
+
def kg_evolve_incremental(existing_kg,
|
|
333
|
+
new_content_text=None,
|
|
334
|
+
new_facts=None,
|
|
335
|
+
model = None,
|
|
336
|
+
provider=None,
|
|
337
|
+
npc=None,
|
|
338
|
+
context='',
|
|
339
|
+
get_concepts=False,
|
|
340
|
+
link_concepts_facts = False,
|
|
341
|
+
link_concepts_concepts=False,
|
|
342
|
+
link_facts_facts = False,
|
|
343
|
+
):
|
|
344
|
+
|
|
345
|
+
current_gen = existing_kg.get('generation', 0)
|
|
346
|
+
next_gen = current_gen + 1
|
|
347
|
+
print(f"\n--- ABSORBING INFO: Gen {current_gen} -> Gen {next_gen} ---")
|
|
348
|
+
|
|
349
|
+
newly_added_concepts = []
|
|
350
|
+
concept_links = list(existing_kg.get('concept_links', []))
|
|
351
|
+
fact_to_concept_links = defaultdict(list,
|
|
352
|
+
existing_kg.get('fact_to_concept_links', {}))
|
|
353
|
+
fact_to_fact_links = list(existing_kg.get('fact_to_fact_links', []))
|
|
354
|
+
|
|
355
|
+
existing_facts = existing_kg.get('facts', [])
|
|
356
|
+
existing_concepts = existing_kg.get('concepts', [])
|
|
357
|
+
existing_concept_names = {c['name'] for c in existing_concepts}
|
|
358
|
+
existing_fact_statements = [f['statement'] for f in existing_facts]
|
|
359
|
+
all_concept_names = list(existing_concept_names)
|
|
289
360
|
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
"leaf_groups": initial_groups_for_hierarchy, # These are the groups that were NOT abstracted further
|
|
293
|
-
}
|
|
294
|
-
# --- Helper Functions for Hierarchy (unchanged from before) ---
|
|
295
|
-
def generate_group_candidates(
|
|
296
|
-
items: List[str],
|
|
297
|
-
item_type: str,
|
|
298
|
-
model: str,
|
|
299
|
-
provider: str,
|
|
300
|
-
npc: NPC = None,
|
|
301
|
-
n_passes: int = 3,
|
|
302
|
-
subset_size: int = 10
|
|
303
|
-
) -> List[str]:
|
|
304
|
-
"""Generate candidate groups for items (facts or groups) based on core semantic meaning."""
|
|
305
|
-
all_candidates = []
|
|
361
|
+
all_new_facts = []
|
|
362
|
+
print(npc, npc.model, npc.provider)
|
|
306
363
|
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
364
|
+
if new_facts:
|
|
365
|
+
all_new_facts = new_facts
|
|
366
|
+
print(f'using pre-approved facts: {len(all_new_facts)}')
|
|
367
|
+
elif new_content_text:
|
|
368
|
+
print('extracting facts from content...')
|
|
369
|
+
if len(new_content_text) > 10000:
|
|
370
|
+
starting_point = random.randint(0, len(new_content_text)-10000)
|
|
371
|
+
for n in range(len(new_content_text)//10000):
|
|
372
|
+
content_to_sample = new_content_text[n*10000:(n+1)*10000]
|
|
373
|
+
facts = get_facts(content_to_sample,
|
|
374
|
+
model=model,
|
|
375
|
+
provider=provider,
|
|
376
|
+
npc = npc,
|
|
377
|
+
context=context)
|
|
378
|
+
all_new_facts.extend(facts)
|
|
379
|
+
print(facts)
|
|
310
380
|
else:
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
381
|
+
all_new_facts = get_facts(new_content_text,
|
|
382
|
+
model=model,
|
|
383
|
+
provider=provider,
|
|
384
|
+
npc = npc,
|
|
385
|
+
context=context)
|
|
386
|
+
print(all_new_facts)
|
|
387
|
+
else:
|
|
388
|
+
print("No new content or facts provided")
|
|
389
|
+
return existing_kg, {}
|
|
390
|
+
|
|
391
|
+
for fact in all_new_facts:
|
|
392
|
+
fact['generation'] = next_gen
|
|
393
|
+
|
|
394
|
+
final_facts = existing_facts + all_new_facts
|
|
395
|
+
|
|
396
|
+
if get_concepts:
|
|
397
|
+
print('generating groups...')
|
|
398
|
+
|
|
399
|
+
candidate_concepts = generate_groups(all_new_facts,
|
|
400
|
+
model = model,
|
|
401
|
+
provider = provider,
|
|
402
|
+
npc=npc,
|
|
403
|
+
context=context)
|
|
404
|
+
print(candidate_concepts)
|
|
405
|
+
print('checking group uniqueness')
|
|
406
|
+
for cand_concept in candidate_concepts:
|
|
407
|
+
cand_name = cand_concept['name']
|
|
408
|
+
if cand_name in existing_concept_names:
|
|
409
|
+
continue
|
|
410
|
+
cand_concept['generation'] = next_gen
|
|
411
|
+
newly_added_concepts.append(cand_concept)
|
|
412
|
+
if link_concepts_concepts:
|
|
413
|
+
print('linking concepts and concepts...')
|
|
414
|
+
|
|
415
|
+
related_concepts = get_related_concepts_multi(cand_name,
|
|
416
|
+
"concept",
|
|
417
|
+
all_concept_names,
|
|
418
|
+
model,
|
|
419
|
+
provider,
|
|
420
|
+
npc,
|
|
421
|
+
context)
|
|
422
|
+
for related_name in related_concepts:
|
|
423
|
+
if related_name != cand_name:
|
|
424
|
+
concept_links.append((cand_name, related_name))
|
|
425
|
+
all_concept_names.append(cand_name)
|
|
426
|
+
|
|
427
|
+
final_concepts = existing_concepts + newly_added_concepts
|
|
428
|
+
|
|
429
|
+
if link_concepts_facts:
|
|
430
|
+
print('linking facts and concepts...')
|
|
431
|
+
for fact in all_new_facts:
|
|
432
|
+
fact_to_concept_links[fact['statement']] = get_related_concepts_multi(fact['statement'],
|
|
433
|
+
"fact",
|
|
434
|
+
all_concept_names,
|
|
435
|
+
model = model,
|
|
436
|
+
provider=provider,
|
|
437
|
+
npc = npc,
|
|
438
|
+
context= context)
|
|
439
|
+
else:
|
|
440
|
+
final_concepts = existing_concepts
|
|
441
|
+
if link_facts_facts:
|
|
442
|
+
print('linking facts and facts...')
|
|
443
|
+
|
|
444
|
+
for new_fact in all_new_facts:
|
|
445
|
+
related_fact_stmts = get_related_facts_llm(new_fact['statement'],
|
|
446
|
+
existing_fact_statements,
|
|
447
|
+
model = model,
|
|
448
|
+
provider = provider,
|
|
449
|
+
npc = npc,
|
|
450
|
+
context=context)
|
|
451
|
+
for related_stmt in related_fact_stmts:
|
|
452
|
+
fact_to_fact_links.append((new_fact['statement'], related_stmt))
|
|
453
|
+
|
|
454
|
+
final_kg = {
|
|
455
|
+
"generation": next_gen,
|
|
456
|
+
"facts": final_facts,
|
|
457
|
+
"concepts": final_concepts,
|
|
458
|
+
"concept_links": concept_links,
|
|
459
|
+
"fact_to_concept_links": dict(fact_to_concept_links),
|
|
460
|
+
"fact_to_fact_links": fact_to_fact_links
|
|
331
461
|
|
|
332
|
-
|
|
333
|
-
|
|
462
|
+
}
|
|
463
|
+
return final_kg, {}
|
|
464
|
+
|
|
465
|
+
|
|
466
|
+
|
|
467
|
+
|
|
468
|
+
def kg_sleep_process(existing_kg,
|
|
469
|
+
model=None,
|
|
470
|
+
provider=None,
|
|
471
|
+
npc=None,
|
|
472
|
+
context='',
|
|
473
|
+
operations_config=None):
|
|
474
|
+
current_gen = existing_kg.get('generation', 0)
|
|
475
|
+
next_gen = current_gen + 1
|
|
476
|
+
print(f"\n--- SLEEPING (Evolving Knowledge): Gen {current_gen} -> Gen {next_gen} ---")
|
|
477
|
+
|
|
478
|
+
|
|
479
|
+
facts_map = {f['statement']: f for f in existing_kg.get('facts', [])}
|
|
480
|
+
concepts_map = {c['name']: c for c in existing_kg.get('concepts', [])}
|
|
481
|
+
fact_links = defaultdict(list, {k: list(v) for k, v in existing_kg.get('fact_to_concept_links', {}).items()})
|
|
482
|
+
concept_links = set(tuple(sorted(link)) for link in existing_kg.get('concept_links', []))
|
|
483
|
+
fact_to_fact_links = set(tuple(sorted(link)) for link in existing_kg.get('fact_to_fact_links', []))
|
|
484
|
+
|
|
485
|
+
|
|
486
|
+
print(" - Phase 1: Checking for unstructured facts...")
|
|
487
|
+
facts_with_concepts = set(fact_links.keys())
|
|
488
|
+
orphaned_fact_statements = list(set(facts_map.keys()) - facts_with_concepts)
|
|
489
|
+
|
|
490
|
+
if len(orphaned_fact_statements) > 20:
|
|
491
|
+
print(f" - Found {len(orphaned_fact_statements)} orphaned facts. Applying full KG structuring process...")
|
|
492
|
+
orphaned_facts_as_dicts = [facts_map[s] for s in orphaned_fact_statements]
|
|
334
493
|
|
|
335
|
-
Return a JSON object:
|
|
336
|
-
{{
|
|
337
|
-
"groups": ["list of specific, precise, and relevant group names"]
|
|
338
|
-
}}
|
|
339
|
-
"""
|
|
340
|
-
# --- END PROMPT MODIFICATION ---
|
|
341
494
|
|
|
342
|
-
|
|
343
|
-
|
|
495
|
+
new_structure = kg_initial(
|
|
496
|
+
facts=orphaned_facts_as_dicts,
|
|
344
497
|
model=model,
|
|
345
498
|
provider=provider,
|
|
346
|
-
format="json",
|
|
347
499
|
npc=npc,
|
|
500
|
+
context=context,
|
|
501
|
+
generation=next_gen
|
|
348
502
|
)
|
|
349
|
-
|
|
350
|
-
candidates = response["response"].get("groups", [])
|
|
351
|
-
all_candidates.extend(candidates)
|
|
352
|
-
print(all_candidates)
|
|
353
|
-
return list(set(all_candidates))
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
def remove_idempotent_groups(
|
|
357
|
-
group_candidates: List[str],
|
|
358
|
-
model: str,
|
|
359
|
-
provider: str,
|
|
360
|
-
npc: NPC = None
|
|
361
|
-
) -> List[str]:
|
|
362
|
-
"""Remove groups that are essentially identical in meaning, favoring specificity and direct naming, and avoiding generic structures."""
|
|
363
|
-
|
|
364
|
-
prompt = f"""Compare these group names. Identify and list ONLY the groups that are conceptually distinct and specific.
|
|
365
|
-
|
|
366
|
-
GUIDELINES FOR SELECTING DISTINCT GROUPS:
|
|
367
|
-
1. **Prioritize Specificity and Direct Naming:** Favor precise nouns or noun phrases that directly name the subject.
|
|
368
|
-
2. **Prefer Concrete Entities/Actions:** If a name refers to a specific entity or action (e.g., "Earth", "Sun", "Water", "France", "User Authentication Module", "Lamb Shank Braising", "World War I"), keep it if it's distinct.
|
|
369
|
-
3. **Rephrase Gerunds:** If a name uses a gerund (e.g., "Understanding TDEs"), rephrase it to a noun or noun phrase (e.g., "Tidal Disruption Events").
|
|
370
|
-
4. **AVOID OVERLY GENERIC TERMS:** Do NOT use very broad or abstract terms that don't add specific meaning. Examples to avoid: "Concepts", "Processes", "Dynamics", "Mechanics", "Analysis", "Understanding", "Interactions", "Relationships", "Properties", "Structures", "Systems", "Frameworks", "Predictions", "Outcomes", "Effects", "Considerations", "Methods", "Techniques", "Data", "Theoretical", "Physical", "Spatial", "Temporal". If a group name seems overly generic or abstract, it should likely be removed or refined.
|
|
371
|
-
5. **Similarity Check:** If two groups are very similar, keep the one that is more descriptive or specific to the domain.
|
|
372
|
-
|
|
373
|
-
EXAMPLE 1:
|
|
374
|
-
Groups: ["Accretion Disk Formation", "Accretion Disk Dynamics", "Formation of Accretion Disks"]
|
|
375
|
-
Distinct Groups: ["Accretion Disk Formation", "Accretion Disk Dynamics"]
|
|
376
|
-
|
|
377
|
-
EXAMPLE 2:
|
|
378
|
-
Groups: ["Causes of Events", "Event Mechanisms", "Event Drivers"]
|
|
379
|
-
Distinct Groups: ["Event Causation", "Event Mechanisms"]
|
|
380
|
-
|
|
381
|
-
EXAMPLE 3:
|
|
382
|
-
Groups: ["Astrophysics Basics", "Fundamental Physics", "General Science Concepts"]
|
|
383
|
-
Distinct Groups: ["Fundamental Physics"]
|
|
384
|
-
|
|
385
|
-
EXAMPLE 4:
|
|
386
|
-
Groups: ["Earth", "The Planet Earth", "Sun", "Our Star"]
|
|
387
|
-
Distinct Groups: ["Earth", "Sun"]
|
|
388
|
-
|
|
389
|
-
EXAMPLE 5:
|
|
390
|
-
Groups: ["User Authentication Module", "Authentication System", "Login Process"]
|
|
391
|
-
Distinct Groups: ["User Authentication Module", "Login Process"]
|
|
392
|
-
|
|
393
|
-
---
|
|
394
|
-
|
|
395
|
-
Now, analyze the following groups:
|
|
396
|
-
Groups: {json.dumps(group_candidates)}
|
|
397
|
-
|
|
398
|
-
Return JSON:
|
|
399
|
-
{{
|
|
400
|
-
"distinct_groups": ["list of specific, precise, and distinct group names to keep"]
|
|
401
|
-
}}
|
|
402
|
-
"""
|
|
403
|
-
|
|
404
|
-
response = get_llm_response(
|
|
405
|
-
prompt,
|
|
406
|
-
model=model,
|
|
407
|
-
provider=provider,
|
|
408
|
-
format="json",
|
|
409
|
-
npc=npc
|
|
410
|
-
)
|
|
411
|
-
|
|
412
|
-
print(response['response']['distinct_groups'])
|
|
413
|
-
return response["response"]["distinct_groups"]
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
def build_hierarchy_dag(
|
|
417
|
-
groups: List[str],
|
|
418
|
-
model: str,
|
|
419
|
-
provider: str,
|
|
420
|
-
npc: NPC = None,
|
|
421
|
-
max_levels: int = 3,
|
|
422
|
-
target_top_count: int = 8,
|
|
423
|
-
n_passes: int = 3, # This is the number of times we query the LLM per level
|
|
424
|
-
subset_size: int = 10 # This is how many groups we pass to the LLM at once
|
|
425
|
-
) -> Dict:
|
|
426
|
-
"""Build DAG hierarchy iteratively from bottom up, abstracting groups."""
|
|
427
|
-
|
|
428
|
-
# Initialize DAG structure for the initial set of groups
|
|
429
|
-
dag = {group: {"parents": set(), "children": set(), "level": 0} for group in groups}
|
|
430
|
-
all_groups = set(groups)
|
|
431
|
-
current_level_items = groups # Start with the provided groups (the bottom layer)
|
|
432
|
-
level_num = 0
|
|
433
|
-
|
|
434
|
-
# Keep abstracting until we have a manageable number of top-level groups
|
|
435
|
-
# or reach max_levels. The condition checks the number of groups *currently* without parents.
|
|
436
|
-
while len([g for g in all_groups if not dag.get(g, {}).get("parents")]) > target_top_count and level_num < max_levels:
|
|
437
|
-
level_num += 1
|
|
438
|
-
print(f"Too many top groups ({len([g for g in all_groups if not dag.get(g, {}).get('parents')])}), abstracting level {level_num}")
|
|
439
|
-
|
|
440
|
-
# --- CRITICAL FIX: Re-introduce the multi-pass sampling for parent suggestions ---
|
|
441
|
-
potential_parents = []
|
|
442
|
-
# Multiple passes with resampling to explore different abstraction possibilities
|
|
443
|
-
for pass_num in range(n_passes): # Iterate n_passes times
|
|
444
|
-
# Sample a subset of groups from the current level for the LLM prompt
|
|
445
|
-
if len(current_level_items) > subset_size:
|
|
446
|
-
# Use a seed based on level and pass to ensure different samples each time
|
|
447
|
-
random.seed(level_num * 10 + pass_num)
|
|
448
|
-
group_subset = random.sample(current_level_items, min(subset_size, len(current_level_items)))
|
|
449
|
-
else:
|
|
450
|
-
group_subset = current_level_items # Use all if subset_size is larger than available groups
|
|
451
|
-
|
|
452
|
-
# Prompt the LLM to suggest parent categories for this subset of groups
|
|
453
|
-
prompt = f"""
|
|
454
|
-
What are broader parent categories that could contain these groups?
|
|
455
|
-
Suggest 1-3 broader categories. Make them distinct and meaningful.
|
|
456
|
-
|
|
457
|
-
Groups: {json.dumps(group_subset)}
|
|
458
|
-
|
|
459
|
-
Return JSON:
|
|
460
|
-
{{
|
|
461
|
-
"parents": ["list of parent categories"]
|
|
462
|
-
}}
|
|
463
|
-
"""
|
|
464
|
-
|
|
465
|
-
response = get_llm_response(
|
|
466
|
-
prompt, model=model, provider=provider, format="json", npc=npc
|
|
467
|
-
)
|
|
468
503
|
|
|
469
|
-
parents = response["response"].get("parents", [])
|
|
470
|
-
potential_parents.extend(parents)
|
|
471
504
|
|
|
472
|
-
|
|
505
|
+
print(" - Merging new structure into main KG...")
|
|
506
|
+
for concept in new_structure.get("concepts", []):
|
|
507
|
+
if concept['name'] not in concepts_map:
|
|
508
|
+
concepts_map[concept['name']] = concept
|
|
473
509
|
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
# Add these distinct parent groups to the DAG and update relationships
|
|
479
|
-
new_groups_for_next_level = set()
|
|
480
|
-
for parent in distinct_parents:
|
|
481
|
-
if parent not in dag: # If this is a completely new abstract group
|
|
482
|
-
dag[parent] = {
|
|
483
|
-
"parents": set(), # These new parents have no parents yet in this round
|
|
484
|
-
"children": set(current_level_items), # The groups from the previous level are their children
|
|
485
|
-
"level": level_num
|
|
486
|
-
}
|
|
487
|
-
all_groups.add(parent)
|
|
488
|
-
new_groups_for_next_level.add(parent)
|
|
489
|
-
else: # If the parent group already exists (e.g., from a different branch)
|
|
490
|
-
# Update its children to include the current level's groups
|
|
491
|
-
dag[parent]["children"].update(current_level_items)
|
|
492
|
-
|
|
493
|
-
# Update parent relationship for the children from the previous level
|
|
494
|
-
for child in current_level_items:
|
|
495
|
-
dag[child]["parents"].add(parent)
|
|
496
|
-
|
|
497
|
-
# The newly found parents become the input for the next abstraction level
|
|
498
|
-
current_level_items = list(new_groups_for_next_level)
|
|
510
|
+
for fact_stmt, new_links in new_structure.get("fact_to_concept_links", {}).items():
|
|
511
|
+
existing_links = set(fact_links.get(fact_stmt, []))
|
|
512
|
+
existing_links.update(new_links)
|
|
513
|
+
fact_links[fact_stmt] = list(existing_links)
|
|
499
514
|
|
|
500
|
-
|
|
501
|
-
|
|
515
|
+
for f1, f2 in new_structure.get("fact_to_fact_links", []):
|
|
516
|
+
fact_to_fact_links.add(tuple(sorted((f1, f2))))
|
|
517
|
+
else:
|
|
518
|
+
print(" - Knowledge graph is sufficiently structured. Proceeding to refinement.")
|
|
502
519
|
|
|
503
|
-
return {
|
|
504
|
-
"dag": dag,
|
|
505
|
-
"top_groups": top_groups_final,
|
|
506
|
-
"leaf_groups": groups, # The initial set of groups passed in, which are the base for the hierarchy
|
|
507
|
-
"max_level": level_num
|
|
508
|
-
}
|
|
509
|
-
|
|
510
520
|
|
|
521
|
+
if operations_config is None:
|
|
522
|
+
possible_ops = ['prune', 'deepen', 'abstract_link']
|
|
523
|
+
ops_to_run = random.sample(possible_ops, k=random.randint(1, 2))
|
|
524
|
+
else:
|
|
525
|
+
ops_to_run = operations_config
|
|
511
526
|
|
|
527
|
+
print(f" - Phase 2: Executing refinement operations: {ops_to_run}")
|
|
512
528
|
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
# Step 1: Get initial distinct groups from facts (already done by caller if passing leaf_groups)
|
|
524
|
-
# If leaf_groups is empty, we might want to generate them from facts first, but for now, assume they are provided.
|
|
525
|
-
|
|
526
|
-
# Step 2: Build the DAG structure, abstracting upwards until we have <= max_top groups
|
|
527
|
-
hierarchy = build_hierarchy_dag(
|
|
528
|
-
leaf_groups, model, provider, npc, max_levels, max_top, n_passes=3, subset_size=10
|
|
529
|
-
)
|
|
530
|
-
|
|
531
|
-
return hierarchy
|
|
529
|
+
for op in ops_to_run:
|
|
530
|
+
|
|
531
|
+
if op == 'prune' and (len(facts_map) > 10 or len(concepts_map) > 5):
|
|
532
|
+
print(" - Running 'prune' operation using consolidate_facts_llm...")
|
|
533
|
+
fact_to_check = random.choice(list(facts_map.values()))
|
|
534
|
+
other_facts = [f for f in facts_map.values() if f['statement'] != fact_to_check['statement']]
|
|
535
|
+
consolidation_result = consolidate_facts_llm(fact_to_check, other_facts, model, provider, npc, context)
|
|
536
|
+
if consolidation_result.get('decision') == 'redundant':
|
|
537
|
+
print(f" - Pruning redundant fact: '{fact_to_check['statement'][:80]}...'")
|
|
538
|
+
del facts_map[fact_to_check['statement']]
|
|
532
539
|
|
|
533
|
-
|
|
534
|
-
|
|
540
|
+
|
|
541
|
+
elif op == 'deepen' and facts_map:
|
|
542
|
+
print(" - Running 'deepen' operation using zoom_in...")
|
|
543
|
+
fact_to_deepen = random.choice(list(facts_map.values()))
|
|
544
|
+
implied_facts = zoom_in([fact_to_deepen], model, provider, npc, context)
|
|
545
|
+
new_fact_count = 0
|
|
546
|
+
for fact in implied_facts:
|
|
547
|
+
if fact['statement'] not in facts_map:
|
|
548
|
+
fact.update({'generation': next_gen, 'origin': 'deepen'})
|
|
549
|
+
facts_map[fact['statement']] = fact
|
|
550
|
+
new_fact_count += 1
|
|
551
|
+
if new_fact_count > 0: print(f" - Inferred {new_fact_count} new fact(s).")
|
|
552
|
+
|
|
553
|
+
else:
|
|
554
|
+
print(f" - SKIPPED: Operation '{op}' did not run (conditions not met).")
|
|
555
|
+
|
|
535
556
|
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
557
|
+
new_kg = {
|
|
558
|
+
"generation": next_gen,
|
|
559
|
+
"facts": list(facts_map.values()),
|
|
560
|
+
"concepts": list(concepts_map.values()),
|
|
561
|
+
"concept_links": [list(link) for link in concept_links],
|
|
562
|
+
"fact_to_concept_links": dict(fact_links),
|
|
563
|
+
"fact_to_fact_links": [list(link) for link in fact_to_fact_links]
|
|
564
|
+
}
|
|
565
|
+
return new_kg, {}
|
|
566
|
+
def kg_dream_process(existing_kg,
|
|
567
|
+
model = None,
|
|
568
|
+
provider = None,
|
|
569
|
+
npc=None,
|
|
570
|
+
context='',
|
|
571
|
+
num_seeds=3):
|
|
572
|
+
current_gen = existing_kg.get('generation', 0)
|
|
573
|
+
next_gen = current_gen + 1
|
|
574
|
+
print(f"\n--- DREAMING (Creative Synthesis): Gen {current_gen} -> Gen {next_gen} ---")
|
|
575
|
+
concepts = existing_kg.get('concepts', [])
|
|
576
|
+
if len(concepts) < num_seeds:
|
|
577
|
+
print(f" - Not enough concepts ({len(concepts)}) for dream. Skipping.")
|
|
578
|
+
return existing_kg, {}
|
|
579
|
+
seed_concepts = random.sample(concepts, k=num_seeds)
|
|
580
|
+
seed_names = [c['name'] for c in seed_concepts]
|
|
581
|
+
print(f" - Dream seeded with: {seed_names}")
|
|
582
|
+
prompt = f"""
|
|
583
|
+
Write a short, speculative paragraph (a 'dream') that plausibly connects the concepts of {json.dumps(seed_names)}.
|
|
584
|
+
Invent a brief narrative or a hypothetical situation.
|
|
585
|
+
Respond with JSON: {{"dream_text": "A short paragraph..."}}
|
|
586
|
+
"""
|
|
587
|
+
response = get_llm_response(prompt,
|
|
588
|
+
model=model,
|
|
589
|
+
provider=provider, npc = npc,
|
|
590
|
+
format="json", context=context)
|
|
591
|
+
dream_text = response['response'].get('dream_text')
|
|
592
|
+
if not dream_text:
|
|
593
|
+
print(" - Failed to generate a dream narrative. Skipping.")
|
|
594
|
+
return existing_kg, {}
|
|
595
|
+
print(f" - Generated Dream: '{dream_text[:150]}...'")
|
|
546
596
|
|
|
547
|
-
|
|
548
|
-
top_level_assignments = get_fact_assignments(fact, top_groups, model, provider, npc)
|
|
597
|
+
dream_kg, _ = kg_evolve_incremental(existing_kg, dream_text, model, provider, npc, context)
|
|
549
598
|
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
# Traverse down the hierarchy level by level
|
|
559
|
-
# We continue as long as there are groups at the current level that are assigned to the fact
|
|
560
|
-
# and these groups have children defined in the DAG.
|
|
561
|
-
processed_groups_in_level = set() # To avoid infinite loops if DAG has cycles (though should be acyclic)
|
|
599
|
+
original_fact_stmts = {f['statement'] for f in existing_kg['facts']}
|
|
600
|
+
for fact in dream_kg['facts']:
|
|
601
|
+
if fact['statement'] not in original_fact_stmts: fact['origin'] = 'dream'
|
|
602
|
+
original_concept_names = {c['name'] for c in existing_kg['concepts']}
|
|
603
|
+
for concept in dream_kg['concepts']:
|
|
604
|
+
if concept['name'] not in original_concept_names: concept['origin'] = 'dream'
|
|
605
|
+
print(" - Dream analysis complete. New knowledge integrated.")
|
|
606
|
+
return dream_kg, {}
|
|
562
607
|
|
|
563
|
-
while current_level_to_process:
|
|
564
|
-
next_level_to_process = set()
|
|
565
|
-
|
|
566
|
-
for current_group in current_level_to_process:
|
|
567
|
-
# Prevent reprocessing the same group in the same level traversal
|
|
568
|
-
if current_group in processed_groups_in_level:
|
|
569
|
-
continue
|
|
570
|
-
processed_groups_in_level.add(current_group)
|
|
571
|
-
|
|
572
|
-
# Get children of the current group
|
|
573
|
-
children = dag_data["dag"].get(current_group, {}).get("children", set())
|
|
574
|
-
|
|
575
|
-
if children:
|
|
576
|
-
# Get assignments for children
|
|
577
|
-
child_assignments = get_fact_assignments(fact, list(children), model, provider, npc)
|
|
578
|
-
|
|
579
|
-
# If the fact belongs to any children, add them to the next level to process
|
|
580
|
-
if child_assignments:
|
|
581
|
-
next_level_to_process.update(child_assignments)
|
|
582
|
-
all_assigned_groups.update(child_assignments)
|
|
583
|
-
|
|
584
|
-
# Update path segments for newly assigned children
|
|
585
|
-
for assigned_child in child_assignments:
|
|
586
|
-
# Append the child to the path of its parent
|
|
587
|
-
if current_group in path_segments:
|
|
588
|
-
path_segments[assigned_child] = path_segments[current_group] + [assigned_child]
|
|
589
|
-
else: # Should not happen if logic is correct, but as a safeguard
|
|
590
|
-
path_segments[assigned_child] = [assigned_child]
|
|
591
|
-
|
|
592
|
-
# Add completed paths to our final list
|
|
593
|
-
for group, path in path_segments.items():
|
|
594
|
-
if group in current_level_to_process and group not in processed_groups_in_level: # If it was processed and assigned
|
|
595
|
-
if path not in hierarchy_paths:
|
|
596
|
-
hierarchy_paths.append(' → '.join(path))
|
|
597
|
-
|
|
598
|
-
current_level_to_process = next_level_to_process
|
|
599
|
-
processed_groups_in_level = set() # Reset for the next level
|
|
600
608
|
|
|
601
|
-
|
|
602
|
-
for group in top_level_assignments:
|
|
603
|
-
if group in path_segments and ' → '.join(path_segments[group]) not in hierarchy_paths:
|
|
604
|
-
hierarchy_paths.append(' → '.join(path_segments[group]))
|
|
609
|
+
def save_kg_with_pandas(kg, path_prefix="kg_state"):
|
|
605
610
|
|
|
611
|
+
generation = kg.get("generation", 0)
|
|
606
612
|
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
}
|
|
612
|
-
|
|
613
|
-
def process_text_with_hierarchy(
|
|
614
|
-
text: str,
|
|
615
|
-
model: str,
|
|
616
|
-
provider: str,
|
|
617
|
-
db_path: str,
|
|
618
|
-
npc: NPC = None,
|
|
619
|
-
existing_knowledge_graph: Optional[Dict] = None
|
|
620
|
-
) -> Dict:
|
|
621
|
-
"""Full processing pipeline with hierarchical grouping"""
|
|
622
|
-
print('process_text_with_hierarchy: Starting processing')
|
|
623
|
-
facts = extract_facts(text, model, provider, npc)
|
|
624
|
-
print(f'process_text_with_hierarchy: Extracted Facts: {facts}')
|
|
613
|
+
nodes_data = []
|
|
614
|
+
for fact in kg.get('facts', []): nodes_data.append({'id': fact['statement'], 'type': 'fact', 'generation': fact.get('generation')})
|
|
615
|
+
for concept in kg.get('concepts', []): nodes_data.append({'id': concept['name'], 'type': 'concept', 'generation': concept.get('generation')})
|
|
616
|
+
pd.DataFrame(nodes_data).to_csv(f'{path_prefix}_gen{generation}_nodes.csv', index=False)
|
|
625
617
|
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
618
|
+
links_data = []
|
|
619
|
+
for fact_stmt, concepts in kg.get("fact_to_concept_links", {}).items():
|
|
620
|
+
for concept_name in concepts: links_data.append({'source': fact_stmt, 'target': concept_name, 'type': 'fact_to_concept'})
|
|
621
|
+
for c1, c2 in kg.get("concept_links", []):
|
|
622
|
+
links_data.append({'source': c1, 'target': c2, 'type': 'concept_to_concept'})
|
|
629
623
|
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
624
|
+
for f1, f2 in kg.get("fact_to_fact_links", []):
|
|
625
|
+
links_data.append({'source': f1, 'target': f2, 'type': 'fact_to_fact'})
|
|
626
|
+
pd.DataFrame(links_data).to_csv(f'{path_prefix}_gen{generation}_links.csv', index=False)
|
|
627
|
+
print(f"Saved KG Generation {generation} to CSV files.")
|
|
634
628
|
|
|
635
|
-
assignments = {}
|
|
636
|
-
for fact in facts:
|
|
637
|
-
# Assign facts using the top-down traversal logic
|
|
638
|
-
assignment = assign_fact_to_dag(fact, hierarchy_data, model, provider, npc)
|
|
639
|
-
|
|
640
|
-
# Store fact and its assignments in Kuzu
|
|
641
|
-
store_success = store_fact_and_group(conn, fact, assignment["all_groups"], "")
|
|
642
|
-
if not store_success:
|
|
643
|
-
print(f'process_text_with_hierarchy: Failed to store fact: {fact}')
|
|
644
|
-
|
|
645
|
-
assignments[fact] = assignment
|
|
646
|
-
|
|
647
|
-
conn.close()
|
|
648
|
-
|
|
649
|
-
print('process_text_with_hierarchy: Finished Processing')
|
|
650
|
-
return {
|
|
651
|
-
'facts': facts,
|
|
652
|
-
'leaf_groups': hierarchy_data.get("leaf_groups", []), # This should be the *final* leaf groups after abstraction
|
|
653
|
-
'hierarchy': hierarchy_data,
|
|
654
|
-
'assignments': assignments
|
|
655
|
-
}
|
|
656
629
|
|
|
630
|
+
def save_changelog_to_json(changelog, from_gen, to_gen, path_prefix="changelog"):
|
|
631
|
+
if not changelog: return
|
|
632
|
+
with open(f"{path_prefix}_gen{from_gen}_to_{to_gen}.json", 'w', encoding='utf-8') as f:
|
|
633
|
+
json.dump(changelog, f, indent=4)
|
|
634
|
+
print(f"Saved changelog for Gen {from_gen}->{to_gen}.")
|
|
657
635
|
|
|
658
636
|
|
|
659
637
|
|
|
660
|
-
### STORAGE
|
|
661
638
|
|
|
662
639
|
def store_fact_and_group(conn, fact: str,
|
|
663
640
|
groups: List[str], path: str) -> bool:
|
|
@@ -667,15 +644,15 @@ def store_fact_and_group(conn, fact: str,
|
|
|
667
644
|
return False
|
|
668
645
|
|
|
669
646
|
print(f"store_fact_and_group: Storing fact: {fact}, with groups:"
|
|
670
|
-
f" {groups}")
|
|
647
|
+
f" {groups}")
|
|
671
648
|
try:
|
|
672
|
-
|
|
673
|
-
insert_success = insert_fact(conn, fact, path)
|
|
649
|
+
|
|
650
|
+
insert_success = insert_fact(conn, fact, path)
|
|
674
651
|
if not insert_success:
|
|
675
652
|
print(f"store_fact_and_group: Failed to insert fact: {fact}")
|
|
676
653
|
return False
|
|
677
654
|
|
|
678
|
-
|
|
655
|
+
|
|
679
656
|
for group in groups:
|
|
680
657
|
assign_success = assign_fact_to_group_graph(conn, fact, group)
|
|
681
658
|
if not assign_success:
|
|
@@ -688,28 +665,26 @@ def store_fact_and_group(conn, fact: str,
|
|
|
688
665
|
print(f"store_fact_and_group: Error storing fact and group: {e}")
|
|
689
666
|
traceback.print_exc()
|
|
690
667
|
return False
|
|
691
|
-
|
|
692
668
|
def insert_fact(conn, fact: str, path: str) -> bool:
|
|
693
669
|
"""Insert a fact into the database with robust error handling"""
|
|
694
670
|
if conn is None:
|
|
695
671
|
print("insert_fact: Cannot insert fact:"
|
|
696
672
|
" database connection is None")
|
|
697
673
|
return False
|
|
698
|
-
|
|
699
674
|
try:
|
|
700
|
-
|
|
675
|
+
|
|
701
676
|
escaped_fact = fact.replace('"', '\\"')
|
|
702
677
|
escaped_path = os.path.expanduser(path).replace('"', '\\"')
|
|
703
678
|
|
|
704
|
-
|
|
679
|
+
|
|
705
680
|
timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
|
706
681
|
|
|
707
|
-
print(f"insert_fact: Attempting to insert fact: {fact}")
|
|
682
|
+
print(f"insert_fact: Attempting to insert fact: {fact}")
|
|
708
683
|
|
|
709
|
-
|
|
684
|
+
|
|
710
685
|
safe_kuzu_execute(conn, "BEGIN TRANSACTION")
|
|
711
686
|
|
|
712
|
-
|
|
687
|
+
|
|
713
688
|
check_query = f"""
|
|
714
689
|
MATCH (f:Fact {{content: "{escaped_fact}"}})
|
|
715
690
|
RETURN f
|
|
@@ -723,7 +698,7 @@ def insert_fact(conn, fact: str, path: str) -> bool:
|
|
|
723
698
|
print(f"insert_fact: Error checking if fact exists: {error}")
|
|
724
699
|
return False
|
|
725
700
|
|
|
726
|
-
|
|
701
|
+
|
|
727
702
|
if not result.has_next():
|
|
728
703
|
insert_query = f"""
|
|
729
704
|
CREATE (f:Fact {{
|
|
@@ -741,7 +716,7 @@ def insert_fact(conn, fact: str, path: str) -> bool:
|
|
|
741
716
|
print(f"insert_fact: Error inserting fact: {error}")
|
|
742
717
|
return False
|
|
743
718
|
|
|
744
|
-
|
|
719
|
+
|
|
745
720
|
safe_kuzu_execute(conn, "COMMIT")
|
|
746
721
|
print(f"insert_fact: Successfully inserted/found fact: {fact}")
|
|
747
722
|
return True
|
|
@@ -760,14 +735,14 @@ def assign_fact_to_group_graph(conn, fact: str, group: str) -> bool:
|
|
|
760
735
|
return False
|
|
761
736
|
|
|
762
737
|
try:
|
|
763
|
-
|
|
738
|
+
|
|
764
739
|
escaped_fact = fact.replace('"', '\\"')
|
|
765
740
|
escaped_group = group.replace('"', '\\"')
|
|
766
741
|
|
|
767
742
|
print(f"assign_fact_to_group_graph: Assigning fact: {fact} to group:"
|
|
768
|
-
f" {group}")
|
|
743
|
+
f" {group}")
|
|
769
744
|
|
|
770
|
-
|
|
745
|
+
|
|
771
746
|
check_query = f"""
|
|
772
747
|
MATCH (f:Fact {{content: "{escaped_fact}"}})
|
|
773
748
|
RETURN f
|
|
@@ -794,7 +769,7 @@ def assign_fact_to_group_graph(conn, fact: str, group: str) -> bool:
|
|
|
794
769
|
print(f"assign_fact_to_group_graph: Group not found: {group}")
|
|
795
770
|
return False
|
|
796
771
|
|
|
797
|
-
|
|
772
|
+
|
|
798
773
|
query = f"""
|
|
799
774
|
MATCH (f:Fact), (g:Groups)
|
|
800
775
|
WHERE f.content = "{escaped_fact}" AND g.name = "{escaped_group}"
|
|
@@ -818,119 +793,27 @@ def assign_fact_to_group_graph(conn, fact: str, group: str) -> bool:
|
|
|
818
793
|
f" {str(e)}")
|
|
819
794
|
traceback.print_exc()
|
|
820
795
|
return False
|
|
821
|
-
|
|
822
|
-
def get_fact_assignments(
|
|
823
|
-
fact: str,
|
|
824
|
-
groups: List[str],
|
|
825
|
-
model: str,
|
|
826
|
-
provider: str,
|
|
827
|
-
npc: NPC = None
|
|
828
|
-
) -> List[str]:
|
|
829
|
-
"""Get direct group assignments for a fact"""
|
|
830
|
-
|
|
831
|
-
prompt = f"""Which of these groups does this fact belong to?
|
|
832
|
-
Select ALL that apply.
|
|
833
|
-
|
|
834
|
-
Fact: {fact}
|
|
835
|
-
Groups: {json.dumps(groups)}
|
|
836
|
-
|
|
837
|
-
Return JSON:
|
|
838
|
-
{{
|
|
839
|
-
"selected_groups": ["list of relevant groups"]
|
|
840
|
-
}}
|
|
841
|
-
"""
|
|
842
|
-
response = get_llm_response(prompt,
|
|
843
|
-
model=model,
|
|
844
|
-
provider=provider,
|
|
845
|
-
format="json",
|
|
846
|
-
npc=npc)
|
|
847
|
-
return response["response"]["selected_groups"]
|
|
848
|
-
def get_ancestor_groups(group: str, dag: Dict) -> Set[str]:
|
|
849
|
-
"""Get all ancestor groups in the DAG for a given group."""
|
|
850
|
-
ancestors = set()
|
|
851
|
-
queue = [group]
|
|
852
|
-
|
|
853
|
-
while queue:
|
|
854
|
-
current = queue.pop(0)
|
|
855
|
-
# Ensure current group exists in DAG and has parents
|
|
856
|
-
if current in dag and dag[current].get("parents"):
|
|
857
|
-
for parent in dag[current]["parents"]:
|
|
858
|
-
if parent not in ancestors:
|
|
859
|
-
ancestors.add(parent)
|
|
860
|
-
queue.append(parent)
|
|
861
|
-
return ancestors
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
# --- Main Process Flow ---
|
|
865
|
-
def process_text_with_hierarchy(
|
|
866
|
-
text: str,
|
|
867
|
-
model: str,
|
|
868
|
-
provider: str,
|
|
869
|
-
db_path: str,
|
|
870
|
-
npc: NPC = None,
|
|
871
|
-
existing_knowledge_graph: Optional[Dict] = None
|
|
872
|
-
) -> Dict:
|
|
873
|
-
"""Full processing pipeline with hierarchical grouping"""
|
|
874
|
-
print("process_text_with_hierarchy: Starting processing")
|
|
875
|
-
# Step 1: Extract facts from text
|
|
876
|
-
facts = extract_facts(text, model, provider, npc)
|
|
877
|
-
print(f"process_text_with_hierarchy: Extracted Facts: {facts}")
|
|
878
|
-
|
|
879
|
-
# Build the DB connection
|
|
880
|
-
conn = init_db(db_path, drop=False)
|
|
881
|
-
if conn is None:
|
|
882
|
-
return None
|
|
883
796
|
|
|
884
|
-
# Use the existing leaf_groups for semantic evolution
|
|
885
|
-
if existing_knowledge_graph:
|
|
886
|
-
leaf_groups = existing_knowledge_graph.get("leaf_groups", [])
|
|
887
|
-
else:
|
|
888
|
-
leaf_groups = []
|
|
889
|
-
|
|
890
|
-
# Build the hierarchy from the database
|
|
891
|
-
hierarchy_data = build_full_hierarchy(leaf_groups, model, provider, npc)
|
|
892
|
-
|
|
893
|
-
# Step 3: Assign facts to hierarchy
|
|
894
|
-
assignments = {}
|
|
895
|
-
for fact in facts:
|
|
896
|
-
assignment = assign_fact_to_dag(fact, hierarchy_data, model, provider, npc)
|
|
897
|
-
# Store fact and group in kuzu
|
|
898
|
-
store_success = store_fact_and_group(conn, fact, assignment["all_groups"], "")
|
|
899
|
-
if not store_success:
|
|
900
|
-
print(f"process_text_with_hierarchy: Failed to store fact: {fact}")
|
|
901
|
-
assignments[fact] = assignment
|
|
902
|
-
|
|
903
|
-
conn.close()
|
|
904
|
-
|
|
905
|
-
print("process_text_with_hierarchy: Finished Processing")
|
|
906
|
-
return {
|
|
907
|
-
"facts": facts,
|
|
908
|
-
"leaf_groups": leaf_groups,
|
|
909
|
-
"hierarchy": hierarchy_data,
|
|
910
|
-
"assignments": assignments
|
|
911
|
-
}
|
|
912
797
|
|
|
913
|
-
|
|
914
|
-
#--- Kuzu Database integration ---
|
|
915
798
|
def store_fact_and_group(conn, fact: str, groups: List[str], path: str) -> bool:
|
|
916
799
|
"""Insert a fact into the database along with its groups"""
|
|
917
800
|
if not conn:
|
|
918
801
|
print("store_fact_and_group: Database connection is None")
|
|
919
802
|
return False
|
|
920
803
|
|
|
921
|
-
print(f"store_fact_and_group: Storing fact: {fact}, with groups: {groups}")
|
|
804
|
+
print(f"store_fact_and_group: Storing fact: {fact}, with groups: {groups}")
|
|
922
805
|
try:
|
|
923
|
-
|
|
924
|
-
insert_success = insert_fact(conn, fact, path)
|
|
806
|
+
|
|
807
|
+
insert_success = insert_fact(conn, fact, path)
|
|
925
808
|
if not insert_success:
|
|
926
|
-
print(f"store_fact_and_group: Failed to insert fact: {fact}")
|
|
809
|
+
print(f"store_fact_and_group: Failed to insert fact: {fact}")
|
|
927
810
|
return False
|
|
928
811
|
|
|
929
|
-
|
|
812
|
+
|
|
930
813
|
for group in groups:
|
|
931
814
|
assign_success = assign_fact_to_group_graph(conn, fact, group)
|
|
932
815
|
if not assign_success:
|
|
933
|
-
print(f"store_fact_and_group: Failed to assign fact {fact} to group {group}")
|
|
816
|
+
print(f"store_fact_and_group: Failed to assign fact {fact} to group {group}")
|
|
934
817
|
return False
|
|
935
818
|
|
|
936
819
|
return True
|
|
@@ -940,7 +823,7 @@ def store_fact_and_group(conn, fact: str, groups: List[str], path: str) -> bool:
|
|
|
940
823
|
return False
|
|
941
824
|
|
|
942
825
|
|
|
943
|
-
|
|
826
|
+
|
|
944
827
|
def safe_kuzu_execute(conn, query, error_message="Kuzu query failed"):
|
|
945
828
|
"""Execute a Kuzu query with proper error handling"""
|
|
946
829
|
try:
|
|
@@ -951,462 +834,52 @@ def safe_kuzu_execute(conn, query, error_message="Kuzu query failed"):
|
|
|
951
834
|
print(error)
|
|
952
835
|
return None, error
|
|
953
836
|
|
|
837
|
+
def process_text_with_chroma(
|
|
838
|
+
kuzu_db_path: str,
|
|
839
|
+
chroma_db_path: str,
|
|
840
|
+
text: str,
|
|
841
|
+
path: str,
|
|
842
|
+
model: str ,
|
|
843
|
+
provider: str ,
|
|
844
|
+
embedding_model: str ,
|
|
845
|
+
embedding_provider: str ,
|
|
846
|
+
npc = None,
|
|
847
|
+
batch_size: int = 5,
|
|
848
|
+
):
|
|
849
|
+
"""Process text and store facts in both Kuzu and Chroma DB
|
|
850
|
+
|
|
851
|
+
Args:
|
|
852
|
+
kuzu_db_path: Path to Kuzu graph database
|
|
853
|
+
chroma_db_path: Path to Chroma vector database
|
|
854
|
+
text: Input text to process
|
|
855
|
+
path: Source path or identifier
|
|
856
|
+
model: LLM model to use
|
|
857
|
+
provider: LLM provider
|
|
858
|
+
embedding_model: Model to use for embeddings
|
|
859
|
+
npc: Optional NPC instance
|
|
860
|
+
batch_size: Batch size for processing
|
|
954
861
|
|
|
955
|
-
|
|
956
|
-
|
|
957
|
-
text = """
|
|
958
|
-
npcsh is a Python-based command-line tool for integrating LLMs into daily workflows.
|
|
959
|
-
It features a smart interpreter that understands natural language commands.
|
|
960
|
-
The tool remembers command history and can reference previous commands.
|
|
961
|
-
It supports creating custom NPCs with specific personalities and directives.
|
|
962
|
-
Advanced customization is possible through configuration files.
|
|
862
|
+
Returns:
|
|
863
|
+
List of extracted facts
|
|
963
864
|
"""
|
|
964
865
|
|
|
965
|
-
|
|
966
|
-
|
|
967
|
-
|
|
968
|
-
|
|
969
|
-
|
|
970
|
-
|
|
971
|
-
|
|
972
|
-
# Print results
|
|
973
|
-
print("FACTS:")
|
|
974
|
-
for i, fact in enumerate(kg["facts"]):
|
|
975
|
-
print(f"{i+1}. {fact}")
|
|
976
|
-
|
|
977
|
-
print("\nHIERARCHY LEVELS:")
|
|
978
|
-
for level in range(kg["hierarchy"]["top_level"], -1, -1):
|
|
979
|
-
groups = kg["hierarchy"][f"level_{level}"]["groups"]
|
|
980
|
-
print(f"Level {level} ({len(groups)} groups):")
|
|
981
|
-
for group in groups:
|
|
982
|
-
print(f" - {group}")
|
|
866
|
+
kuzu_conn = init_db(kuzu_db_path, drop=False)
|
|
867
|
+
chroma_client, chroma_collection = setup_chroma_db(
|
|
868
|
+
"knowledge_graph",
|
|
869
|
+
"Facts extracted from various sources",
|
|
870
|
+
chroma_db_path
|
|
871
|
+
)
|
|
872
|
+
|
|
983
873
|
|
|
984
|
-
|
|
985
|
-
for fact, assignment in kg["assignments"].items():
|
|
986
|
-
print(f"\nFact: {fact}")
|
|
987
|
-
print("Assignments by level:")
|
|
988
|
-
for level, groups in assignment["all_assignments"].items():
|
|
989
|
-
print(f" Level {level}: {groups}")
|
|
874
|
+
facts = extract_facts(text, model=model, provider=provider, npc=npc)
|
|
990
875
|
|
|
991
|
-
|
|
992
|
-
|
|
993
|
-
|
|
994
|
-
|
|
995
|
-
provider: str = "ollama",
|
|
996
|
-
npc: NPC = None,
|
|
997
|
-
) -> List[str]:
|
|
998
|
-
"""Find existing groups that might contain this fact"""
|
|
999
|
-
response = conn.execute(f"MATCH (g:Groups) RETURN g.name;") # Execute query
|
|
1000
|
-
print(response)
|
|
1001
|
-
print(type(response))
|
|
1002
|
-
print(dir(response))
|
|
1003
|
-
groups = response.fetch_as_df()
|
|
1004
|
-
print(f"Groups: {groups}")
|
|
1005
|
-
if not groups:
|
|
1006
|
-
return []
|
|
876
|
+
|
|
877
|
+
for i in range(0, len(facts), batch_size):
|
|
878
|
+
batch = facts[i : i + batch_size]
|
|
879
|
+
print(f"\nProcessing batch {i//batch_size + 1} ({len(batch)} facts)")
|
|
1007
880
|
|
|
1008
|
-
|
|
1009
|
-
|
|
1010
|
-
For example, if a fact is "The user loves programming" and there's a group called "Technical_Interests",
|
|
1011
|
-
that would be a match.
|
|
1012
|
-
|
|
1013
|
-
Return a JSON object with the following structure:
|
|
1014
|
-
{
|
|
1015
|
-
"group_list": "a list containing the names of matching groups"
|
|
1016
|
-
}
|
|
1017
|
-
|
|
1018
|
-
Return only the JSON object.
|
|
1019
|
-
Do not include any additional markdown formatting.
|
|
1020
|
-
"""
|
|
1021
|
-
|
|
1022
|
-
response = get_llm_response(
|
|
1023
|
-
prompt + f"\n\nFact: {fact}\nGroups: {json.dumps(groups)}",
|
|
1024
|
-
model=model,
|
|
1025
|
-
provider=provider,
|
|
1026
|
-
format="json",
|
|
1027
|
-
npc=npc,
|
|
1028
|
-
)
|
|
1029
|
-
response = response["response"]
|
|
1030
|
-
return response["group_list"]
|
|
1031
|
-
|
|
1032
|
-
|
|
1033
|
-
def identify_groups(
|
|
1034
|
-
facts: List[str],
|
|
1035
|
-
model: str = "llama3.2",
|
|
1036
|
-
provider: str = "ollama",
|
|
1037
|
-
npc: NPC = None,
|
|
1038
|
-
) -> List[str]:
|
|
1039
|
-
"""Identify natural groups from a list of facts"""
|
|
1040
|
-
|
|
1041
|
-
|
|
1042
|
-
prompt = """What are the main groups these facts could be organized into?
|
|
1043
|
-
Express these groups in plain, natural language.
|
|
1044
|
-
|
|
1045
|
-
For example, given:
|
|
1046
|
-
- User enjoys programming in Python
|
|
1047
|
-
- User works on machine learning projects
|
|
1048
|
-
- User likes to play piano
|
|
1049
|
-
- User practices meditation daily
|
|
1050
|
-
|
|
1051
|
-
You might identify groups like:
|
|
1052
|
-
- Programming
|
|
1053
|
-
- Machine Learning
|
|
1054
|
-
- Musical Interests
|
|
1055
|
-
- Daily Practices
|
|
1056
|
-
|
|
1057
|
-
Return a JSON object with the following structure:
|
|
1058
|
-
`{
|
|
1059
|
-
"groups": ["list of group names"]
|
|
1060
|
-
}`
|
|
1061
|
-
|
|
1062
|
-
|
|
1063
|
-
Return only the JSON object. Do not include any additional markdown formatting or
|
|
1064
|
-
leading json characters.
|
|
1065
|
-
"""
|
|
1066
|
-
|
|
1067
|
-
response = get_llm_response(
|
|
1068
|
-
prompt + f"\n\nFacts: {json.dumps(facts)}",
|
|
1069
|
-
model=model,
|
|
1070
|
-
provider=provider,
|
|
1071
|
-
format="json",
|
|
1072
|
-
npc=npc,
|
|
1073
|
-
)
|
|
1074
|
-
return response["response"]["groups"]
|
|
1075
|
-
|
|
1076
|
-
|
|
1077
|
-
def assign_groups_to_fact(
|
|
1078
|
-
fact: str,
|
|
1079
|
-
groups: List[str],
|
|
1080
|
-
model: str = "llama3.2",
|
|
1081
|
-
provider: str = "ollama",
|
|
1082
|
-
npc: NPC = None,
|
|
1083
|
-
) -> Dict[str, List[str]]:
|
|
1084
|
-
"""Assign facts to the identified groups"""
|
|
1085
|
-
prompt = f"""Given this fact, assign it to any relevant groups.
|
|
1086
|
-
|
|
1087
|
-
A fact can belong to multiple groups if it fits.
|
|
1088
|
-
|
|
1089
|
-
Here is the fact: {fact}
|
|
1090
|
-
|
|
1091
|
-
Here are the groups: {groups}
|
|
1092
|
-
|
|
1093
|
-
Return a JSON object with the following structure:
|
|
1094
|
-
{{
|
|
1095
|
-
"groups": ["list of group names"]
|
|
1096
|
-
}}
|
|
1097
|
-
|
|
1098
|
-
Do not include any additional markdown formatting or leading json characters.
|
|
1099
|
-
|
|
1100
|
-
|
|
1101
|
-
"""
|
|
1102
|
-
|
|
1103
|
-
response = get_llm_response(
|
|
1104
|
-
prompt,
|
|
1105
|
-
model=model,
|
|
1106
|
-
provider=provider,
|
|
1107
|
-
format="json",
|
|
1108
|
-
npc=npc,
|
|
1109
|
-
)
|
|
1110
|
-
return response["response"]
|
|
1111
|
-
|
|
1112
|
-
|
|
1113
|
-
def save_facts_to_db(
|
|
1114
|
-
conn, facts: List[str], path: str, batch_size: int
|
|
1115
|
-
):
|
|
1116
|
-
"""Save a list of facts to the database in batches"""
|
|
1117
|
-
for i in range(0, len(facts), batch_size):
|
|
1118
|
-
batch = facts[i : i + batch_size]
|
|
1119
|
-
print(f"\nProcessing batch {i//batch_size + 1} ({len(batch)} facts)")
|
|
1120
|
-
|
|
1121
|
-
# Process each fact in the batch
|
|
1122
|
-
for fact in batch:
|
|
1123
|
-
try:
|
|
1124
|
-
print(f"Inserting fact: {fact}")
|
|
1125
|
-
print(f"With path: {path}")
|
|
1126
|
-
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
|
1127
|
-
print(f"With recorded_at: {timestamp}")
|
|
1128
|
-
|
|
1129
|
-
insert_fact(conn, fact, path)
|
|
1130
|
-
print("Success!")
|
|
1131
|
-
except Exception as e:
|
|
1132
|
-
print(f"Failed to insert fact: {fact}")
|
|
1133
|
-
print(f"Error: {e}")
|
|
1134
|
-
continue
|
|
1135
|
-
|
|
1136
|
-
print(f"Completed batch {i//batch_size + 1}")
|
|
1137
|
-
|
|
1138
|
-
|
|
1139
|
-
def process_text(
|
|
1140
|
-
db_path: str,
|
|
1141
|
-
text: str,
|
|
1142
|
-
path: str,
|
|
1143
|
-
model: str = "llama3.2",
|
|
1144
|
-
provider: str = "ollama",
|
|
1145
|
-
npc: NPC = None,
|
|
1146
|
-
batch_size: int = 5,
|
|
1147
|
-
conn=None,
|
|
1148
|
-
):
|
|
1149
|
-
"""Process text and add extracted facts to the database with robust error handling"""
|
|
1150
|
-
|
|
1151
|
-
try:
|
|
1152
|
-
# Initialize database
|
|
1153
|
-
if conn is None:
|
|
1154
|
-
conn = init_db(db_path, drop=False)
|
|
1155
|
-
|
|
1156
|
-
return []
|
|
1157
|
-
|
|
1158
|
-
# Extract facts
|
|
1159
|
-
facts = extract_facts(text, model=model, provider=provider, npc=npc)
|
|
1160
|
-
if not facts:
|
|
1161
|
-
print("No facts extracted")
|
|
1162
|
-
return []
|
|
1163
|
-
|
|
1164
|
-
print(f"Extracted {len(facts)} facts")
|
|
1165
|
-
for fact in facts:
|
|
1166
|
-
print(f"- {fact}")
|
|
1167
|
-
|
|
1168
|
-
# Process facts in batches
|
|
1169
|
-
for i in range(0, len(facts), batch_size):
|
|
1170
|
-
batch = facts[i : i + batch_size]
|
|
1171
|
-
print(f"\nProcessing batch {i//batch_size + 1} ({len(batch)} facts)")
|
|
1172
|
-
|
|
1173
|
-
for fact in batch:
|
|
1174
|
-
try:
|
|
1175
|
-
print(f"Inserting fact: {fact}")
|
|
1176
|
-
success = insert_fact(conn, fact, path)
|
|
1177
|
-
if success:
|
|
1178
|
-
print("Success!")
|
|
1179
|
-
else:
|
|
1180
|
-
print("Failed to insert fact")
|
|
1181
|
-
except Exception as e:
|
|
1182
|
-
print(f"Error processing fact: {str(e)}")
|
|
1183
|
-
traceback.print_exc()
|
|
1184
|
-
|
|
1185
|
-
print(f"Completed batch {i//batch_size + 1}")
|
|
1186
|
-
|
|
1187
|
-
return facts
|
|
1188
|
-
except Exception as e:
|
|
1189
|
-
print(f"Error processing text: {str(e)}")
|
|
1190
|
-
traceback.print_exc()
|
|
1191
|
-
return []
|
|
1192
|
-
|
|
1193
|
-
|
|
1194
|
-
import networkx as nx
|
|
1195
|
-
import matplotlib.pyplot as plt
|
|
1196
|
-
|
|
1197
|
-
|
|
1198
|
-
def visualize_graph(conn):
|
|
1199
|
-
"""Visualize the knowledge graph using networkx"""
|
|
1200
|
-
# Create a networkx graph
|
|
1201
|
-
G = nx.DiGraph()
|
|
1202
|
-
|
|
1203
|
-
# Get all facts and groups with their relationships
|
|
1204
|
-
facts_result = conn.execute("MATCH (f:Fact) RETURN f.content;").get_as_df()
|
|
1205
|
-
facts = [row["f.content"] for index, row in facts_result.iterrows()]
|
|
1206
|
-
|
|
1207
|
-
groups_result = conn.execute("MATCH (g:Groups) RETURN g.name;").get_as_df()
|
|
1208
|
-
groups = [row["g.name"] for index, row in groups_result.iterrows()]
|
|
1209
|
-
|
|
1210
|
-
relationships_result = conn.execute(
|
|
1211
|
-
"""
|
|
1212
|
-
MATCH (g:Groups)-[r:Contains]->(f:Fact)
|
|
1213
|
-
RETURN g.name, f.content;
|
|
1214
|
-
"""
|
|
1215
|
-
).get_as_df()
|
|
1216
|
-
|
|
1217
|
-
# Add nodes with different colors for facts and groups
|
|
1218
|
-
for fact in facts:
|
|
1219
|
-
G.add_node(fact, node_type="fact")
|
|
1220
|
-
for group in groups:
|
|
1221
|
-
G.add_node(group, node_type="group")
|
|
1222
|
-
|
|
1223
|
-
# Add edges from relationships
|
|
1224
|
-
for index, row in relationships_result.iterrows():
|
|
1225
|
-
G.add_edge(row["g.name"], row["f.content"]) # group name -> fact content
|
|
1226
|
-
|
|
1227
|
-
# Set up the visualization
|
|
1228
|
-
plt.figure(figsize=(20, 12))
|
|
1229
|
-
pos = nx.spring_layout(G, k=2, iterations=50)
|
|
1230
|
-
|
|
1231
|
-
# Draw groups (larger nodes, distinct color)
|
|
1232
|
-
group_nodes = [
|
|
1233
|
-
n for n, attr in G.nodes(data=True) if attr.get("node_type") == "group"
|
|
1234
|
-
]
|
|
1235
|
-
nx.draw_networkx_nodes(
|
|
1236
|
-
G, pos, nodelist=group_nodes, node_color="lightgreen", node_size=3000, alpha=0.7
|
|
1237
|
-
)
|
|
1238
|
-
|
|
1239
|
-
# Draw facts (smaller nodes, different color)
|
|
1240
|
-
fact_nodes = [
|
|
1241
|
-
n for n, attr in G.nodes(data=True) if attr.get("node_type") == "fact"
|
|
1242
|
-
]
|
|
1243
|
-
nx.draw_networkx_nodes(
|
|
1244
|
-
G, pos, nodelist=fact_nodes, node_color="lightblue", node_size=2000, alpha=0.5
|
|
1245
|
-
)
|
|
1246
|
-
|
|
1247
|
-
# Draw edges with arrows
|
|
1248
|
-
nx.draw_networkx_edges(G, pos, edge_color="gray", arrows=True, arrowsize=20)
|
|
1249
|
-
|
|
1250
|
-
# Add labels with different sizes for groups and facts
|
|
1251
|
-
group_labels = {node: node for node in group_nodes}
|
|
1252
|
-
fact_labels = {
|
|
1253
|
-
node: node[:50] + "..." if len(node) > 50 else node for node in fact_nodes
|
|
1254
|
-
}
|
|
1255
|
-
|
|
1256
|
-
nx.draw_networkx_labels(G, pos, group_labels, font_size=10, font_weight="bold")
|
|
1257
|
-
nx.draw_networkx_labels(G, pos, fact_labels, font_size=8)
|
|
1258
|
-
|
|
1259
|
-
plt.title("Knowledge Graph: Groups and Facts", pad=20, fontsize=16)
|
|
1260
|
-
plt.axis("off")
|
|
1261
|
-
plt.tight_layout()
|
|
1262
|
-
|
|
1263
|
-
# Print statistics
|
|
1264
|
-
print("\nKnowledge Graph Statistics:")
|
|
1265
|
-
print(f"Number of facts: {len(facts)}")
|
|
1266
|
-
print(f"Number of groups: {len(groups)}")
|
|
1267
|
-
print(f"Number of relationships: {len(relationships_result)}")
|
|
1268
|
-
|
|
1269
|
-
print("\nGroups:")
|
|
1270
|
-
for g in groups:
|
|
1271
|
-
related_facts = [
|
|
1272
|
-
row["f.content"]
|
|
1273
|
-
for index, row in relationships_result.iterrows()
|
|
1274
|
-
if row["g.name"] == g
|
|
1275
|
-
]
|
|
1276
|
-
print(f"\n{g}:")
|
|
1277
|
-
for f in related_facts:
|
|
1278
|
-
print(f" - {f}")
|
|
1279
|
-
|
|
1280
|
-
plt.show()
|
|
1281
|
-
|
|
1282
|
-
|
|
1283
|
-
def store_fact_with_embedding(
|
|
1284
|
-
collection, fact: str, metadata: dict, embedding: List[float]
|
|
1285
|
-
) -> str:
|
|
1286
|
-
"""Store a fact with its pre-generated embedding in Chroma DB
|
|
1287
|
-
|
|
1288
|
-
Args:
|
|
1289
|
-
collection: Chroma collection
|
|
1290
|
-
fact: The fact text
|
|
1291
|
-
metadata: Dictionary with metadata (path, source, timestamp, etc.)
|
|
1292
|
-
embedding: Pre-generated embedding vector from get_embeddings
|
|
1293
|
-
|
|
1294
|
-
Returns:
|
|
1295
|
-
ID of the stored fact
|
|
1296
|
-
"""
|
|
1297
|
-
try:
|
|
1298
|
-
# Generate a deterministic ID from the fact content
|
|
1299
|
-
import hashlib
|
|
1300
|
-
|
|
1301
|
-
fact_id = hashlib.md5(fact.encode()).hexdigest()
|
|
1302
|
-
|
|
1303
|
-
# Store document with pre-generated embedding
|
|
1304
|
-
collection.add(
|
|
1305
|
-
documents=[fact],
|
|
1306
|
-
embeddings=[embedding],
|
|
1307
|
-
metadatas=[metadata],
|
|
1308
|
-
ids=[fact_id],
|
|
1309
|
-
)
|
|
1310
|
-
|
|
1311
|
-
return fact_id
|
|
1312
|
-
except Exception as e:
|
|
1313
|
-
print(f"Error storing fact in Chroma: {e}")
|
|
1314
|
-
return None
|
|
1315
|
-
|
|
1316
|
-
|
|
1317
|
-
def find_similar_facts_chroma(
|
|
1318
|
-
collection,
|
|
1319
|
-
query: str,
|
|
1320
|
-
query_embedding: List[float],
|
|
1321
|
-
n_results: int = 5,
|
|
1322
|
-
metadata_filter: Optional[Dict] = None,
|
|
1323
|
-
) -> List[Dict]:
|
|
1324
|
-
"""Find facts similar to the query using pre-generated embedding
|
|
1325
|
-
|
|
1326
|
-
Args:
|
|
1327
|
-
collection: Chroma collection
|
|
1328
|
-
query: Query text (for reference only)
|
|
1329
|
-
query_embedding: Pre-generated embedding from get_embeddings
|
|
1330
|
-
n_results: Number of results to return
|
|
1331
|
-
metadata_filter: Optional filter for metadata fields
|
|
1332
|
-
|
|
1333
|
-
Returns:
|
|
1334
|
-
List of dictionaries with results
|
|
1335
|
-
"""
|
|
1336
|
-
try:
|
|
1337
|
-
# Perform query with optional metadata filtering
|
|
1338
|
-
results = collection.query(
|
|
1339
|
-
query_embeddings=[query_embedding],
|
|
1340
|
-
n_results=n_results,
|
|
1341
|
-
where=metadata_filter,
|
|
1342
|
-
)
|
|
1343
|
-
|
|
1344
|
-
# Format results
|
|
1345
|
-
formatted_results = []
|
|
1346
|
-
for i, doc in enumerate(results["documents"][0]):
|
|
1347
|
-
formatted_results.append(
|
|
1348
|
-
{
|
|
1349
|
-
"fact": doc,
|
|
1350
|
-
"metadata": results["metadatas"][0][i],
|
|
1351
|
-
"id": results["ids"][0][i],
|
|
1352
|
-
"distance": (
|
|
1353
|
-
results["distances"][0][i] if "distances" in results else None
|
|
1354
|
-
),
|
|
1355
|
-
}
|
|
1356
|
-
)
|
|
1357
|
-
|
|
1358
|
-
return formatted_results
|
|
1359
|
-
except Exception as e:
|
|
1360
|
-
print(f"Error searching in Chroma: {e}")
|
|
1361
|
-
return []
|
|
1362
|
-
|
|
1363
|
-
|
|
1364
|
-
def process_text_with_chroma(
|
|
1365
|
-
kuzu_db_path: str,
|
|
1366
|
-
chroma_db_path: str,
|
|
1367
|
-
text: str,
|
|
1368
|
-
path: str,
|
|
1369
|
-
model: str ,
|
|
1370
|
-
provider: str ,
|
|
1371
|
-
embedding_model: str ,
|
|
1372
|
-
embedding_provider: str ,
|
|
1373
|
-
npc: NPC = None,
|
|
1374
|
-
batch_size: int = 5,
|
|
1375
|
-
):
|
|
1376
|
-
"""Process text and store facts in both Kuzu and Chroma DB
|
|
1377
|
-
|
|
1378
|
-
Args:
|
|
1379
|
-
kuzu_db_path: Path to Kuzu graph database
|
|
1380
|
-
chroma_db_path: Path to Chroma vector database
|
|
1381
|
-
text: Input text to process
|
|
1382
|
-
path: Source path or identifier
|
|
1383
|
-
model: LLM model to use
|
|
1384
|
-
provider: LLM provider
|
|
1385
|
-
embedding_model: Model to use for embeddings
|
|
1386
|
-
npc: Optional NPC instance
|
|
1387
|
-
batch_size: Batch size for processing
|
|
1388
|
-
|
|
1389
|
-
Returns:
|
|
1390
|
-
List of extracted facts
|
|
1391
|
-
"""
|
|
1392
|
-
# Initialize databases
|
|
1393
|
-
kuzu_conn = init_db(kuzu_db_path, drop=False)
|
|
1394
|
-
chroma_client, chroma_collection = setup_chroma_db(
|
|
1395
|
-
"knowledge_graph",
|
|
1396
|
-
"Facts extracted from various sources",
|
|
1397
|
-
chroma_db_path
|
|
1398
|
-
)
|
|
1399
|
-
|
|
1400
|
-
# Extract facts
|
|
1401
|
-
facts = extract_facts(text, model=model, provider=provider, npc=npc)
|
|
1402
|
-
|
|
1403
|
-
# Process extracted facts
|
|
1404
|
-
for i in range(0, len(facts), batch_size):
|
|
1405
|
-
batch = facts[i : i + batch_size]
|
|
1406
|
-
print(f"\nProcessing batch {i//batch_size + 1} ({len(batch)} facts)")
|
|
1407
|
-
|
|
1408
|
-
# Generate embeddings for the batch using npcpy.llm_funcs.get_embeddings
|
|
1409
|
-
from npcpy.llm_funcs import get_embeddings
|
|
881
|
+
|
|
882
|
+
from npcpy.llm_funcs import get_embeddings
|
|
1410
883
|
|
|
1411
884
|
batch_embeddings = get_embeddings(
|
|
1412
885
|
batch,
|
|
@@ -1416,7 +889,7 @@ def process_text_with_chroma(
|
|
|
1416
889
|
print(f"Processing fact: {fact}")
|
|
1417
890
|
embedding = batch_embeddings[j]
|
|
1418
891
|
|
|
1419
|
-
|
|
892
|
+
|
|
1420
893
|
similar_facts = find_similar_facts_chroma(
|
|
1421
894
|
chroma_collection, fact, query_embedding=embedding, n_results=3
|
|
1422
895
|
)
|
|
@@ -1425,9 +898,9 @@ def process_text_with_chroma(
|
|
|
1425
898
|
print(f"Similar facts found:")
|
|
1426
899
|
for result in similar_facts:
|
|
1427
900
|
print(f" - {result['fact']} (distance: {result['distance']})")
|
|
1428
|
-
|
|
901
|
+
|
|
1429
902
|
|
|
1430
|
-
|
|
903
|
+
|
|
1431
904
|
metadata = {
|
|
1432
905
|
"path": path,
|
|
1433
906
|
"timestamp": datetime.now().isoformat(),
|
|
@@ -1435,10 +908,10 @@ def process_text_with_chroma(
|
|
|
1435
908
|
"source_provider": provider,
|
|
1436
909
|
}
|
|
1437
910
|
|
|
1438
|
-
|
|
911
|
+
|
|
1439
912
|
kuzu_success = insert_fact(kuzu_conn, fact, path)
|
|
1440
913
|
|
|
1441
|
-
|
|
914
|
+
|
|
1442
915
|
if kuzu_success:
|
|
1443
916
|
chroma_id = store_fact_with_embedding(
|
|
1444
917
|
chroma_collection, fact, metadata, embedding
|
|
@@ -1450,7 +923,7 @@ def process_text_with_chroma(
|
|
|
1450
923
|
else:
|
|
1451
924
|
print(f"Failed to save fact to Kuzu graph")
|
|
1452
925
|
|
|
1453
|
-
|
|
926
|
+
|
|
1454
927
|
kuzu_conn.close()
|
|
1455
928
|
|
|
1456
929
|
return facts
|
|
@@ -1479,12 +952,12 @@ def hybrid_search_with_chroma(
|
|
|
1479
952
|
Returns:
|
|
1480
953
|
List of dictionaries with combined results
|
|
1481
954
|
"""
|
|
1482
|
-
|
|
955
|
+
|
|
1483
956
|
from npcpy.llm_funcs import get_embeddings
|
|
1484
957
|
|
|
1485
958
|
query_embedding = get_embeddings([query])[0]
|
|
1486
959
|
|
|
1487
|
-
|
|
960
|
+
|
|
1488
961
|
vector_results = find_similar_facts_chroma(
|
|
1489
962
|
chroma_collection,
|
|
1490
963
|
query,
|
|
@@ -1493,13 +966,13 @@ def hybrid_search_with_chroma(
|
|
|
1493
966
|
metadata_filter=metadata_filter,
|
|
1494
967
|
)
|
|
1495
968
|
|
|
1496
|
-
|
|
969
|
+
|
|
1497
970
|
vector_facts = [result["fact"] for result in vector_results]
|
|
1498
971
|
|
|
1499
|
-
|
|
972
|
+
|
|
1500
973
|
expanded_results = []
|
|
1501
974
|
|
|
1502
|
-
|
|
975
|
+
|
|
1503
976
|
for result in vector_results:
|
|
1504
977
|
expanded_results.append(
|
|
1505
978
|
{
|
|
@@ -1511,13 +984,13 @@ def hybrid_search_with_chroma(
|
|
|
1511
984
|
}
|
|
1512
985
|
)
|
|
1513
986
|
|
|
1514
|
-
|
|
987
|
+
|
|
1515
988
|
for fact in vector_facts:
|
|
1516
989
|
try:
|
|
1517
|
-
|
|
990
|
+
|
|
1518
991
|
escaped_fact = fact.replace('"', '\\"')
|
|
1519
992
|
|
|
1520
|
-
|
|
993
|
+
|
|
1521
994
|
group_result = kuzu_conn.execute(
|
|
1522
995
|
f"""
|
|
1523
996
|
MATCH (g:Groups)-[:Contains]->(f:Fact)
|
|
@@ -1526,18 +999,18 @@ def hybrid_search_with_chroma(
|
|
|
1526
999
|
"""
|
|
1527
1000
|
).get_as_df()
|
|
1528
1001
|
|
|
1529
|
-
|
|
1002
|
+
|
|
1530
1003
|
fact_groups = [row["g.name"] for _, row in group_result.iterrows()]
|
|
1531
1004
|
|
|
1532
|
-
|
|
1005
|
+
|
|
1533
1006
|
if group_filter:
|
|
1534
1007
|
fact_groups = [g for g in fact_groups if g in group_filter]
|
|
1535
1008
|
|
|
1536
|
-
|
|
1009
|
+
|
|
1537
1010
|
for group in fact_groups:
|
|
1538
1011
|
escaped_group = group.replace('"', '\\"')
|
|
1539
1012
|
|
|
1540
|
-
|
|
1013
|
+
|
|
1541
1014
|
related_facts_result = kuzu_conn.execute(
|
|
1542
1015
|
f"""
|
|
1543
1016
|
MATCH (g:Groups)-[:Contains]->(f:Fact)
|
|
@@ -1547,7 +1020,7 @@ def hybrid_search_with_chroma(
|
|
|
1547
1020
|
"""
|
|
1548
1021
|
).get_as_df()
|
|
1549
1022
|
|
|
1550
|
-
|
|
1023
|
+
|
|
1551
1024
|
for _, row in related_facts_result.iterrows():
|
|
1552
1025
|
related_fact = {
|
|
1553
1026
|
"fact": row["f.content"],
|
|
@@ -1557,7 +1030,7 @@ def hybrid_search_with_chroma(
|
|
|
1557
1030
|
"recorded_at": row["f.recorded_at"],
|
|
1558
1031
|
}
|
|
1559
1032
|
|
|
1560
|
-
|
|
1033
|
+
|
|
1561
1034
|
if not any(
|
|
1562
1035
|
r.get("fact") == related_fact["fact"] for r in expanded_results
|
|
1563
1036
|
):
|
|
@@ -1566,806 +1039,380 @@ def hybrid_search_with_chroma(
|
|
|
1566
1039
|
except Exception as e:
|
|
1567
1040
|
print(f"Error expanding results via graph: {e}")
|
|
1568
1041
|
|
|
1569
|
-
|
|
1042
|
+
|
|
1570
1043
|
return expanded_results[:top_k]
|
|
1571
1044
|
|
|
1572
1045
|
|
|
1573
|
-
def
|
|
1574
|
-
|
|
1575
|
-
chroma_db_path: str,
|
|
1046
|
+
def find_similar_facts_chroma(
|
|
1047
|
+
collection,
|
|
1576
1048
|
query: str,
|
|
1577
|
-
|
|
1578
|
-
|
|
1579
|
-
|
|
1580
|
-
|
|
1049
|
+
query_embedding: List[float],
|
|
1050
|
+
n_results: int = 5,
|
|
1051
|
+
metadata_filter: Optional[Dict] = None,
|
|
1052
|
+
) -> List[Dict]:
|
|
1053
|
+
"""Find facts similar to the query using pre-generated embedding
|
|
1581
1054
|
|
|
1582
1055
|
Args:
|
|
1583
|
-
|
|
1584
|
-
|
|
1585
|
-
|
|
1586
|
-
|
|
1587
|
-
|
|
1588
|
-
embedding_model: Model to use for embeddings
|
|
1589
|
-
provider: Provider for embeddings
|
|
1056
|
+
collection: Chroma collection
|
|
1057
|
+
query: Query text (for reference only)
|
|
1058
|
+
query_embedding: Pre-generated embedding from get_embeddings
|
|
1059
|
+
n_results: Number of results to return
|
|
1060
|
+
metadata_filter: Optional filter for metadata fields
|
|
1590
1061
|
|
|
1591
1062
|
Returns:
|
|
1592
|
-
|
|
1063
|
+
List of dictionaries with results
|
|
1593
1064
|
"""
|
|
1594
|
-
|
|
1595
|
-
|
|
1596
|
-
|
|
1597
|
-
|
|
1598
|
-
|
|
1599
|
-
|
|
1600
|
-
|
|
1601
|
-
|
|
1602
|
-
# Perform hybrid search
|
|
1603
|
-
results = hybrid_search_with_chroma(
|
|
1604
|
-
kuzu_conn=kuzu_conn,
|
|
1605
|
-
chroma_collection=chroma_collection,
|
|
1606
|
-
query=query,
|
|
1607
|
-
group_filter=group_filters,
|
|
1608
|
-
top_k=top_k,
|
|
1609
|
-
)
|
|
1610
|
-
|
|
1611
|
-
# Format results as context for RAG
|
|
1612
|
-
context = "Related facts:\n\n"
|
|
1613
|
-
|
|
1614
|
-
# First include direct vector matches
|
|
1615
|
-
context += "Most relevant facts:\n"
|
|
1616
|
-
vector_matches = [r for r in results if r["source"] == "vector_search"]
|
|
1617
|
-
for i, item in enumerate(vector_matches):
|
|
1618
|
-
context += f"{i+1}. {item['fact']}\n"
|
|
1065
|
+
try:
|
|
1066
|
+
|
|
1067
|
+
results = collection.query(
|
|
1068
|
+
query_embeddings=[query_embedding],
|
|
1069
|
+
n_results=n_results,
|
|
1070
|
+
where=metadata_filter,
|
|
1071
|
+
)
|
|
1619
1072
|
|
|
1620
|
-
|
|
1621
|
-
|
|
1622
|
-
|
|
1623
|
-
|
|
1624
|
-
|
|
1625
|
-
|
|
1073
|
+
|
|
1074
|
+
formatted_results = []
|
|
1075
|
+
for i, doc in enumerate(results["documents"][0]):
|
|
1076
|
+
formatted_results.append(
|
|
1077
|
+
{
|
|
1078
|
+
"fact": doc,
|
|
1079
|
+
"metadata": results["metadatas"][0][i],
|
|
1080
|
+
"id": results["ids"][0][i],
|
|
1081
|
+
"distance": (
|
|
1082
|
+
results["distances"][0][i] if "distances" in results else None
|
|
1083
|
+
),
|
|
1084
|
+
}
|
|
1085
|
+
)
|
|
1626
1086
|
|
|
1627
|
-
|
|
1628
|
-
|
|
1087
|
+
return formatted_results
|
|
1088
|
+
except Exception as e:
|
|
1089
|
+
print(f"Error searching in Chroma: {e}")
|
|
1090
|
+
return []
|
|
1629
1091
|
|
|
1630
|
-
return context
|
|
1631
1092
|
|
|
1632
1093
|
|
|
1633
|
-
def
|
|
1634
|
-
|
|
1635
|
-
kuzu_db_path: str = os.path.expanduser("~/npcsh_graph.db"),
|
|
1636
|
-
chroma_db_path: str = os.path.expanduser("~/npcsh_chroma.db"),
|
|
1637
|
-
model: str = "ollama",
|
|
1638
|
-
provider: str = "llama3.2",
|
|
1639
|
-
embedding_model: str = "text-embedding-3-small",
|
|
1094
|
+
def store_fact_with_embedding(
|
|
1095
|
+
collection, fact: str, metadata: dict, embedding: List[float]
|
|
1640
1096
|
) -> str:
|
|
1641
|
-
"""
|
|
1097
|
+
"""Store a fact with its pre-generated embedding in Chroma DB
|
|
1642
1098
|
|
|
1643
1099
|
Args:
|
|
1644
|
-
|
|
1645
|
-
|
|
1646
|
-
|
|
1647
|
-
|
|
1648
|
-
provider: LLM provider
|
|
1649
|
-
embedding_model: Model to use for embeddings
|
|
1100
|
+
collection: Chroma collection
|
|
1101
|
+
fact: The fact text
|
|
1102
|
+
metadata: Dictionary with metadata (path, source, timestamp, etc.)
|
|
1103
|
+
embedding: Pre-generated embedding vector from get_embeddings
|
|
1650
1104
|
|
|
1651
1105
|
Returns:
|
|
1652
|
-
|
|
1653
|
-
"""
|
|
1654
|
-
# Get relevant facts using hybrid search
|
|
1655
|
-
context = get_facts_for_rag(
|
|
1656
|
-
kuzu_db_path,
|
|
1657
|
-
chroma_db_path,
|
|
1658
|
-
query,
|
|
1659
|
-
)
|
|
1660
|
-
|
|
1661
|
-
# Craft prompt with retrieved context
|
|
1662
|
-
prompt = f"""
|
|
1663
|
-
Answer this question based on the retrieved information.
|
|
1664
|
-
|
|
1665
|
-
Question: {query}
|
|
1666
|
-
|
|
1667
|
-
{context}
|
|
1668
|
-
|
|
1669
|
-
Please provide a comprehensive answer based on the facts above. If the information
|
|
1670
|
-
doesn't contain a direct answer, please indicate that clearly but try to synthesize
|
|
1671
|
-
from the available facts.
|
|
1106
|
+
ID of the stored fact
|
|
1672
1107
|
"""
|
|
1673
|
-
|
|
1674
|
-
# Get response from LLM
|
|
1675
|
-
response = get_llm_response(prompt, model=model, provider=provider)
|
|
1676
|
-
|
|
1677
|
-
return response["response"]
|
|
1678
|
-
|
|
1679
|
-
|
|
1680
|
-
|
|
1681
|
-
|
|
1682
|
-
|
|
1683
|
-
# --- New: KnowledgeGraphManager Class ---
|
|
1684
|
-
class KnowledgeGraphManager:
|
|
1685
|
-
def __init__(self, db_path: str, model: str, provider: str, npc: Optional[NPC] = None):
|
|
1686
|
-
self.db_path = db_path
|
|
1687
|
-
self.model = model
|
|
1688
|
-
self.provider = provider
|
|
1689
|
-
self.npc = npc
|
|
1690
|
-
self.conn = None
|
|
1691
|
-
self._initialize_database()
|
|
1692
|
-
self.current_generation = self._get_latest_generation()
|
|
1693
|
-
print(f"KnowledgeGraphManager initialized. Current generation: {self.current_generation}")
|
|
1694
|
-
|
|
1695
|
-
def _initialize_database(self, drop: bool = False):
|
|
1696
|
-
"""Initializes or connects to the Kuzu database."""
|
|
1697
|
-
self.conn = init_db(self.db_path, drop=drop)
|
|
1698
|
-
if self.conn is None:
|
|
1699
|
-
raise ConnectionError("Failed to initialize Kuzu database.")
|
|
1700
|
-
|
|
1701
|
-
def close(self):
|
|
1702
|
-
"""Closes the Kuzu database connection."""
|
|
1703
|
-
if self.conn:
|
|
1704
|
-
self.conn.close()
|
|
1705
|
-
print("Kuzu database connection closed.")
|
|
1706
|
-
|
|
1707
|
-
def _get_latest_generation(self) -> int:
|
|
1708
|
-
"""Queries the database for the latest generation number."""
|
|
1709
|
-
query = "MATCH (g:Groups) RETURN MAX(g.generation_created) AS max_gen;"
|
|
1710
|
-
result, error = safe_kuzu_execute(self.conn, query, "Failed to get max generation")
|
|
1711
|
-
if error:
|
|
1712
|
-
return -1 # Indicate no groups or error
|
|
1713
|
-
|
|
1714
|
-
# Kuzu returns a kuzu.result.QueryResult object
|
|
1715
|
-
# Need to fetch the value
|
|
1716
|
-
df = result.fetch_as_df()
|
|
1717
|
-
if not df.empty and not df['max_gen'].isnull().all():
|
|
1718
|
-
return int(df['max_gen'].iloc[0])
|
|
1719
|
-
return -1 # No groups yet
|
|
1720
|
-
|
|
1721
|
-
def _get_active_hierarchy_dag(self) -> Dict[str, Dict[str, Any]]:
|
|
1722
|
-
"""
|
|
1723
|
-
Queries the Kuzu database to construct the active conceptual hierarchy DAG
|
|
1724
|
-
(ParentOf relationships).
|
|
1725
|
-
Returns a dictionary representing the DAG structure:
|
|
1726
|
-
{
|
|
1727
|
-
'group_name': {
|
|
1728
|
-
'parents': set(),
|
|
1729
|
-
'children': set(),
|
|
1730
|
-
'is_active': bool,
|
|
1731
|
-
'generation_created': int
|
|
1732
|
-
},
|
|
1733
|
-
...
|
|
1734
|
-
}
|
|
1735
|
-
Also returns a list of top-level groups (roots) and leaf groups.
|
|
1736
|
-
"""
|
|
1737
|
-
dag = {}
|
|
1738
|
-
all_groups_query = "MATCH (g:Groups) RETURN g.name, g.is_active, g.generation_created;"
|
|
1739
|
-
groups_result, _ = safe_kuzu_execute(self.conn, all_groups_query)
|
|
1740
|
-
|
|
1741
|
-
if groups_result:
|
|
1742
|
-
for row in groups_result.fetch_as_df().itertuples():
|
|
1743
|
-
group_name = row._1 # Assuming the first column is g.name
|
|
1744
|
-
is_active = row._2 # Assuming the second column is g.is_active
|
|
1745
|
-
generation_created = row._3 # Assuming the third column is g.generation_created
|
|
1746
|
-
dag[group_name] = {
|
|
1747
|
-
"parents": set(),
|
|
1748
|
-
"children": set(),
|
|
1749
|
-
"is_active": is_active,
|
|
1750
|
-
"generation_created": generation_created
|
|
1751
|
-
}
|
|
1752
|
-
|
|
1753
|
-
parent_of_query = """
|
|
1754
|
-
MATCH (p:Groups)-[:ParentOf]->(c:Groups)
|
|
1755
|
-
RETURN p.name, c.name;
|
|
1756
|
-
"""
|
|
1757
|
-
relationships_result, _ = safe_kuzu_execute(self.conn, parent_of_query)
|
|
1758
|
-
|
|
1759
|
-
if relationships_result:
|
|
1760
|
-
for row in relationships_result.fetch_as_df().itertuples():
|
|
1761
|
-
parent_name = row._1
|
|
1762
|
-
child_name = row._2
|
|
1763
|
-
if child_name in dag and parent_name in dag: # Ensure both nodes exist in the active_dag structure
|
|
1764
|
-
dag[child_name]["parents"].add(parent_name)
|
|
1765
|
-
dag[parent_name]["children"].add(child_name)
|
|
1766
|
-
|
|
1767
|
-
# Filter for active groups and identify roots/leaves
|
|
1768
|
-
active_dag = {name: data for name, data in dag.items() if data['is_active']}
|
|
1108
|
+
try:
|
|
1769
1109
|
|
|
1770
|
-
|
|
1771
|
-
leaf_groups = [name for name, data in active_dag.items() if not data["children"]]
|
|
1772
|
-
|
|
1773
|
-
# Also get all active groups for potential random sampling
|
|
1774
|
-
all_active_groups = list(active_dag.keys())
|
|
1775
|
-
|
|
1776
|
-
return {
|
|
1777
|
-
"dag": active_dag,
|
|
1778
|
-
"top_groups": top_groups,
|
|
1779
|
-
"leaf_groups": leaf_groups,
|
|
1780
|
-
"all_active_groups": all_active_groups
|
|
1781
|
-
}
|
|
1782
|
-
|
|
1783
|
-
# --- LLM Abstraction Methods (wrap existing functions or define new prompts) ---
|
|
1784
|
-
|
|
1785
|
-
def _llm_extract_facts(self, text: str, context: str = "") -> List[str]:
|
|
1786
|
-
"""Wrapper for extract_facts."""
|
|
1787
|
-
return extract_facts(text, self.model, self.provider, self.npc, context)
|
|
1788
|
-
|
|
1789
|
-
def _llm_generate_concepts(self, items: List[str], item_type: str = "facts") -> List[str]:
|
|
1790
|
-
"""Wrapper for generate_group_candidates."""
|
|
1791
|
-
return generate_group_candidates(items, item_type, self.model, self.provider, self.npc)
|
|
1792
|
-
|
|
1793
|
-
def _llm_clean_concepts(self, concept_candidates: List[str]) -> List[str]:
|
|
1794
|
-
"""Wrapper for remove_idempotent_groups."""
|
|
1795
|
-
return remove_idempotent_groups(concept_candidates, self.model, self.provider, self.npc)
|
|
1796
|
-
|
|
1797
|
-
def _llm_build_initial_hierarchy(self, concepts: List[str]) -> Dict:
|
|
1798
|
-
"""
|
|
1799
|
-
Builds a hierarchy DAG from a flat list of concepts.
|
|
1800
|
-
This corresponds to LLM_BuildHierarchy in Algorithm 2.
|
|
1801
|
-
It uses the existing build_hierarchy_dag function.
|
|
1802
|
-
"""
|
|
1803
|
-
print(f"Building initial hierarchy in memory for {len(concepts)} concepts...")
|
|
1804
|
-
hierarchy_structure = build_hierarchy_dag(
|
|
1805
|
-
concepts, self.model, self.provider, self.npc,
|
|
1806
|
-
max_levels=5, # Can be tuned
|
|
1807
|
-
target_top_count=8 # Can be tuned
|
|
1808
|
-
)
|
|
1809
|
-
print("Initial hierarchy structure built in memory.")
|
|
1810
|
-
return hierarchy_structure['dag'] # Return just the DAG portion
|
|
1811
|
-
|
|
1812
|
-
def _llm_find_best_fit(self, item: str, candidates: List[str]) -> List[str]:
|
|
1813
|
-
"""
|
|
1814
|
-
Finds the best fit group(s) for an item (fact or concept) from a list of candidates.
|
|
1815
|
-
Corresponds to LLM_FindBestFit in Algorithm 3.
|
|
1816
|
-
"""
|
|
1817
|
-
return get_fact_assignments(item, candidates, self.model, self.provider, self.npc)
|
|
1818
|
-
|
|
1819
|
-
def _llm_check_direct_link(self, concept_a: str, concept_b: str) -> bool:
|
|
1820
|
-
"""
|
|
1821
|
-
Checks if there's a direct, meaningful semantic link between two concepts.
|
|
1822
|
-
Corresponds to LLM_CheckDirectLink in Algorithm 3.
|
|
1823
|
-
"""
|
|
1824
|
-
prompt = f"""Is there a direct and meaningful semantic relationship between "{concept_a}" and "{concept_b}"?
|
|
1825
|
-
Consider if one is a component of, a type of, strongly influences, or is directly associated with the other.
|
|
1826
|
-
Answer with "yes" or "no".
|
|
1827
|
-
|
|
1828
|
-
Concept A: {concept_a}
|
|
1829
|
-
Concept B: {concept_b}
|
|
1830
|
-
|
|
1831
|
-
Return JSON:
|
|
1832
|
-
{{
|
|
1833
|
-
"has_link": "yes" or "no"
|
|
1834
|
-
}}
|
|
1835
|
-
"""
|
|
1836
|
-
response = get_llm_response(
|
|
1837
|
-
prompt, model=self.model, provider=self.provider, format="json", npc=self.npc
|
|
1838
|
-
)
|
|
1839
|
-
return response["response"].get("has_link", "no").lower() == "yes"
|
|
1110
|
+
import hashlib
|
|
1840
1111
|
|
|
1841
|
-
|
|
1842
|
-
"""
|
|
1843
|
-
Identifies redundant or consolidatable groups within the hierarchy.
|
|
1844
|
-
Corresponds to LLM_FindRedundantNodes in Algorithm 1, Phase 3.
|
|
1845
|
-
Returns a list of tuples: (new_consolidated_name, [old_redundant_names]).
|
|
1846
|
-
"""
|
|
1847
|
-
if not all_active_groups:
|
|
1848
|
-
return []
|
|
1112
|
+
fact_id = hashlib.md5(fact.encode()).hexdigest()
|
|
1849
1113
|
|
|
1850
|
-
# It's better to process in batches if all_active_groups is very large
|
|
1851
|
-
# For simplicity, sending all for now, but consider batching for production.
|
|
1852
|
-
|
|
1853
|
-
prompt = f"""Given the following list of active conceptual groups, identify any groups that are highly redundant, overly specific, or could be consolidated into a single, more abstract, but still precise concept.
|
|
1854
|
-
For each set of redundant groups, propose a single, better consolidated group name.
|
|
1855
|
-
|
|
1856
|
-
GUIDELINES for Consolidation:
|
|
1857
|
-
1. **Semantic Overlap:** Only consolidate if groups are truly very similar or one is a very specific instance of another.
|
|
1858
|
-
2. **Naming:** The new consolidated name should be concise, specific, and accurately represent all merged concepts. Prioritize nouns/noun phrases. Avoid generic terms (e.g., "Concepts," "Processes").
|
|
1859
|
-
3. **Efficiency:** Aim for meaningful consolidation, not excessive merging.
|
|
1860
|
-
|
|
1861
|
-
Example:
|
|
1862
|
-
Active Groups: ["Tidal Disruption Events", "Black Hole Mergers", "Supernovae", "Neutron Star Collisions", "Astrophysical Transients", "Stellar Explosions"]
|
|
1863
|
-
Consolidation Candidates: [
|
|
1864
|
-
{{
|
|
1865
|
-
"new_concept": "Cataclysmic Astronomical Events",
|
|
1866
|
-
"old_concepts": ["Tidal Disruption Events", "Black Hole Mergers", "Supernovae", "Neutron Star Collisions"]
|
|
1867
|
-
}},
|
|
1868
|
-
{{
|
|
1869
|
-
"new_concept": "Stellar Explosions",
|
|
1870
|
-
"old_concepts": ["Supernovae", "Stellar Explosions"]
|
|
1871
|
-
}}
|
|
1872
|
-
]
|
|
1873
1114
|
|
|
1874
|
-
|
|
1875
|
-
|
|
1876
|
-
|
|
1877
|
-
|
|
1878
|
-
|
|
1879
|
-
{{
|
|
1880
|
-
"consolidation_candidates": [
|
|
1881
|
-
{{"new_concept": "Proposed Name", "old_concepts": ["Old Name 1", "Old Name 2"]}},
|
|
1882
|
-
...
|
|
1883
|
-
]
|
|
1884
|
-
}}
|
|
1885
|
-
"""
|
|
1886
|
-
response = get_llm_response(
|
|
1887
|
-
prompt, model=self.model, provider=self.provider, format="json", npc=self.npc
|
|
1115
|
+
collection.add(
|
|
1116
|
+
documents=[fact],
|
|
1117
|
+
embeddings=[embedding],
|
|
1118
|
+
metadatas=[metadata],
|
|
1119
|
+
ids=[fact_id],
|
|
1888
1120
|
)
|
|
1889
|
-
candidates_data = response["response"].get("consolidation_candidates", [])
|
|
1890
|
-
|
|
1891
|
-
# Convert to the desired format: List[Tuple[str, List[str]]]
|
|
1892
|
-
formatted_candidates = []
|
|
1893
|
-
for cand in candidates_data:
|
|
1894
|
-
new_concept = cand.get("new_concept")
|
|
1895
|
-
old_concepts = cand.get("old_concepts")
|
|
1896
|
-
if new_concept and isinstance(old_concepts, list) and old_concepts:
|
|
1897
|
-
# Filter out old_concepts that are not actually in all_active_groups
|
|
1898
|
-
# to avoid trying to merge non-existent or inactive groups.
|
|
1899
|
-
valid_old_concepts = [
|
|
1900
|
-
oc for oc in old_concepts if oc in all_active_groups
|
|
1901
|
-
]
|
|
1902
|
-
if valid_old_concepts: # Only add if there are valid old concepts to merge
|
|
1903
|
-
formatted_candidates.append((new_concept, valid_old_concepts))
|
|
1904
|
-
|
|
1905
|
-
return formatted_candidates
|
|
1906
|
-
|
|
1907
|
-
# --- Kuzu Graph Update Methods ---
|
|
1908
|
-
|
|
1909
|
-
def _add_parent_of_link(self, parent_name: str, child_name: str) -> bool:
|
|
1910
|
-
"""Creates a ParentOf relationship between two groups."""
|
|
1911
|
-
escaped_parent = parent_name.replace('"', '\\"')
|
|
1912
|
-
escaped_child = child_name.replace('"', '\\"')
|
|
1913
|
-
query = f"""
|
|
1914
|
-
MATCH (p:Groups), (c:Groups)
|
|
1915
|
-
WHERE p.name = "{escaped_parent}" AND c.name = "{escaped_child}"
|
|
1916
|
-
CREATE (p)-[:ParentOf]->(c)
|
|
1917
|
-
"""
|
|
1918
|
-
_, error = safe_kuzu_execute(self.conn, query, f"Failed to create ParentOf link: {parent_name} -> {child_name}")
|
|
1919
|
-
if error: print(f"Error creating ParentOf link: {error}")
|
|
1920
|
-
return error is None
|
|
1921
|
-
|
|
1922
|
-
def _add_associated_with_link(self, source_name: str, target_name: str) -> bool:
|
|
1923
|
-
"""Creates an AssociatedWith relationship between two groups."""
|
|
1924
|
-
escaped_source = source_name.replace('"', '\\"')
|
|
1925
|
-
escaped_target = target_name.replace('"', '\\"')
|
|
1926
|
-
query = f"""
|
|
1927
|
-
MATCH (s:Groups), (t:Groups)
|
|
1928
|
-
WHERE s.name = "{escaped_source}" AND t.name = "{escaped_target}"
|
|
1929
|
-
CREATE (s)-[:AssociatedWith]->(t)
|
|
1930
|
-
"""
|
|
1931
|
-
_, error = safe_kuzu_execute(self.conn, query, f"Failed to create AssociatedWith link: {source_name} - {target_name}")
|
|
1932
|
-
if error: print(f"Error creating AssociatedWith link: {error}")
|
|
1933
|
-
return error is None
|
|
1934
|
-
|
|
1935
|
-
def _record_evolution_link(self, old_group_name: str, new_group_name: str, event_type: str, reason: str):
|
|
1936
|
-
"""Records an EvolvedFrom link for genealogical tracking."""
|
|
1937
|
-
escaped_old = old_group_name.replace('"', '\\"')
|
|
1938
|
-
escaped_new = new_group_name.replace('"', '\\"')
|
|
1939
|
-
query = f"""
|
|
1940
|
-
MATCH (oldG:Groups), (newG:Groups)
|
|
1941
|
-
WHERE oldG.name = "{escaped_old}" AND newG.name = "{escaped_new}"
|
|
1942
|
-
CREATE (oldG)-[:EvolvedFrom {{event_type: "{event_type}", generation: {self.current_generation}, reason: "{reason}"}}]->(newG)
|
|
1943
|
-
"""
|
|
1944
|
-
_, error = safe_kuzu_execute(self.conn, query, f"Failed to record evolution link: {old_group_name} -> {new_group_name}")
|
|
1945
|
-
if error: print(f"Error recording evolution link: {error}")
|
|
1946
|
-
return error is None
|
|
1947
|
-
|
|
1948
|
-
def _set_group_active_status(self, group_name: str, is_active: bool):
|
|
1949
|
-
"""Sets the is_active status of a group."""
|
|
1950
|
-
escaped_name = group_name.replace('"', '\\"')
|
|
1951
|
-
query = f"""
|
|
1952
|
-
MATCH (g:Groups {{name: "{escaped_name}"}})
|
|
1953
|
-
SET g.is_active = {str(is_active).lower()}
|
|
1954
|
-
"""
|
|
1955
|
-
_, error = safe_kuzu_execute(self.conn, query, f"Failed to update active status for group: {group_name}")
|
|
1956
|
-
if error: print(f"Error setting group active status: {error}")
|
|
1957
|
-
return error is None
|
|
1958
|
-
|
|
1959
|
-
def _rewire_group_relationships(self, old_group_name: str, new_group_name: str):
|
|
1960
|
-
"""
|
|
1961
|
-
Rewires ParentOf, AssociatedWith, and Contains relationships from an old group to a new one.
|
|
1962
|
-
This is crucial during consolidation.
|
|
1963
|
-
"""
|
|
1964
|
-
escaped_old = old_group_name.replace('"', '\\"')
|
|
1965
|
-
escaped_new = new_group_name.replace('"', '\\"')
|
|
1966
|
-
|
|
1967
|
-
# Kuzu's `SET` on relationship destination or source is not direct.
|
|
1968
|
-
# The typical way to "rewire" in graph databases is to:
|
|
1969
|
-
# 1. Create new relationships from existing nodes to the new target.
|
|
1970
|
-
# 2. Delete the old relationships.
|
|
1971
|
-
# This requires careful transaction management if atomicity is critical,
|
|
1972
|
-
# but for simple delete-and-create within a loop, it's often fine.
|
|
1973
|
-
|
|
1974
|
-
# Rewire ParentOf where old_group is a child
|
|
1975
|
-
# (i.e., its parents should now point to new_group instead of old_group)
|
|
1976
|
-
query_parent_to_child = f"""
|
|
1977
|
-
MATCH (p:Groups)-[r:ParentOf]->(oldG:Groups)
|
|
1978
|
-
WHERE oldG.name = "{escaped_old}"
|
|
1979
|
-
AND NOT (p)-[:ParentOf]->(:Groups {{name: "{escaped_new}"}}) // Avoid duplicate relationships
|
|
1980
|
-
CREATE (p)-[:ParentOf]->(newG:Groups) WHERE newG.name = "{escaped_new}"
|
|
1981
|
-
DELETE r;
|
|
1982
|
-
"""
|
|
1983
|
-
_, error = safe_kuzu_execute(self.conn, query_parent_to_child, f"Failed to rewire ParentOf (parent to old): {old_group_name}")
|
|
1984
|
-
if error: print(f"Rewire error (ParentOf parent): {error}")
|
|
1985
|
-
|
|
1986
|
-
# Rewire ParentOf where old_group is a parent
|
|
1987
|
-
# (i.e., its children should now be children of new_group instead of old_group)
|
|
1988
|
-
query_child_to_parent = f"""
|
|
1989
|
-
MATCH (oldG:Groups)-[r:ParentOf]->(c:Groups)
|
|
1990
|
-
WHERE oldG.name = "{escaped_old}"
|
|
1991
|
-
AND NOT (:Groups {{name: "{escaped_new}"}})-[:ParentOf]->(c) // Avoid duplicate relationships
|
|
1992
|
-
CREATE (newG:Groups)-[:ParentOf]->(c) WHERE newG.name = "{escaped_new}"
|
|
1993
|
-
DELETE r;
|
|
1994
|
-
"""
|
|
1995
|
-
_, error = safe_kuzu_execute(self.conn, query_child_to_parent, f"Failed to rewire ParentOf (old to child): {old_group_name}")
|
|
1996
|
-
if error: print(f"Rewire error (ParentOf child): {error}")
|
|
1997
|
-
|
|
1998
|
-
# Rewire AssociatedWith where old_group is a source
|
|
1999
|
-
query_assoc_source = f"""
|
|
2000
|
-
MATCH (s:Groups)-[r:AssociatedWith]->(oldG:Groups)
|
|
2001
|
-
WHERE oldG.name = "{escaped_old}"
|
|
2002
|
-
AND NOT (s)-[:AssociatedWith]->(:Groups {{name: "{escaped_new}"}}) // Avoid duplicate relationships
|
|
2003
|
-
CREATE (s)-[:AssociatedWith]->(newG:Groups) WHERE newG.name = "{escaped_new}"
|
|
2004
|
-
DELETE r;
|
|
2005
|
-
"""
|
|
2006
|
-
_, error = safe_kuzu_execute(self.conn, query_assoc_source, f"Failed to rewire AssociatedWith (source to old): {old_group_name}")
|
|
2007
|
-
if error: print(f"Rewire error (AssociatedWith source): {error}")
|
|
2008
|
-
|
|
2009
|
-
# Rewire AssociatedWith where old_group is a target
|
|
2010
|
-
query_assoc_target = f"""
|
|
2011
|
-
MATCH (oldG:Groups)-[r:AssociatedWith]->(t:Groups)
|
|
2012
|
-
WHERE oldG.name = "{escaped_old}"
|
|
2013
|
-
AND NOT (:Groups {{name: "{escaped_new}"}})-[:AssociatedWith]->(t) // Avoid duplicate relationships
|
|
2014
|
-
CREATE (newG:Groups)-[:AssociatedWith]->(t) WHERE newG.name = "{escaped_new}"
|
|
2015
|
-
DELETE r;
|
|
2016
|
-
"""
|
|
2017
|
-
_, error = safe_kuzu_execute(self.conn, query_assoc_target, f"Failed to rewire AssociatedWith (old to target): {old_group_name}")
|
|
2018
|
-
if error: print(f"Rewire error (AssociatedWith target): {error}")
|
|
2019
|
-
|
|
2020
|
-
# Rewire 'Contains' relationships if facts were directly linked to the old group
|
|
2021
|
-
query_contains = f"""
|
|
2022
|
-
MATCH (oldG:Groups)-[r:Contains]->(f:Fact)
|
|
2023
|
-
WHERE oldG.name = "{escaped_old}"
|
|
2024
|
-
AND NOT (:Groups {{name: "{escaped_new}"}})-[:Contains]->(f) // Avoid duplicate relationships
|
|
2025
|
-
CREATE (newG:Groups)-[:Contains]->(f) WHERE newG.name = "{escaped_new}"
|
|
2026
|
-
DELETE r;
|
|
2027
|
-
"""
|
|
2028
|
-
_, error = safe_kuzu_execute(self.conn, query_contains, f"Failed to rewire Contains (old to fact): {old_group_name}")
|
|
2029
|
-
if error: print(f"Rewire error (Contains): {error}")
|
|
2030
|
-
|
|
2031
|
-
print(f"Rewired all relationships from '{old_group_name}' to '{new_group_name}'.")
|
|
2032
1121
|
|
|
2033
|
-
|
|
2034
|
-
|
|
2035
|
-
|
|
2036
|
-
|
|
2037
|
-
Helper for FindAllAssociationPaths: Recursively traverses the hierarchy to find paths.
|
|
2038
|
-
"""
|
|
2039
|
-
paths_results = set()
|
|
2040
|
-
|
|
2041
|
-
# Base case for recursion: if no current_nodes to evaluate, path terminates.
|
|
2042
|
-
# Add the current path to results if it's not empty and represents a complete segment.
|
|
2043
|
-
if not current_nodes:
|
|
2044
|
-
if current_path:
|
|
2045
|
-
paths_results.add(tuple(current_path))
|
|
2046
|
-
return paths_results
|
|
2047
|
-
|
|
2048
|
-
# Find best fit nodes among current_nodes for the new concept
|
|
2049
|
-
relevant_next_nodes = self._llm_find_best_fit(c_new, current_nodes)
|
|
2050
|
-
|
|
2051
|
-
if not relevant_next_nodes:
|
|
2052
|
-
# If no relevant children found among current_nodes, current path segment terminates.
|
|
2053
|
-
# Only add to results if this path segment is valid and contains at least one node.
|
|
2054
|
-
if current_path: # Ensures we don't add empty paths if initial_roots have no fit
|
|
2055
|
-
paths_results.add(tuple(current_path))
|
|
2056
|
-
return paths_results
|
|
2057
|
-
|
|
2058
|
-
for node_name in relevant_next_nodes:
|
|
2059
|
-
# Ensure the node being added to the path is not already the last node in the path
|
|
2060
|
-
# This prevents cycles in a path if LLM returns the same node.
|
|
2061
|
-
if current_path and node_name == current_path[-1]:
|
|
2062
|
-
continue
|
|
2063
|
-
|
|
2064
|
-
new_path = current_path + [node_name]
|
|
2065
|
-
|
|
2066
|
-
# Get active children of the current node from the DAG
|
|
2067
|
-
children_of_node = []
|
|
2068
|
-
if node_name in hierarchy_dag:
|
|
2069
|
-
children_of_node = [child for child in hierarchy_dag[node_name]["children"] if hierarchy_dag[child]["is_active"]]
|
|
2070
|
-
|
|
2071
|
-
if not children_of_node: # Reached a leaf node or no relevant active children
|
|
2072
|
-
paths_results.add(tuple(new_path))
|
|
2073
|
-
else:
|
|
2074
|
-
# Recurse down
|
|
2075
|
-
paths_results.update(self._recursive_traversal(c_new, list(children_of_node), hierarchy_dag, new_path))
|
|
2076
|
-
|
|
2077
|
-
return paths_results
|
|
2078
|
-
|
|
2079
|
-
def _find_all_association_paths(self, c_new: str, hierarchy_dag: Dict, theta_explore: float) -> Set[Tuple[str, ...]]:
|
|
2080
|
-
"""
|
|
2081
|
-
Algorithm 3: Finds all primary and serendipitous association paths for a new concept.
|
|
2082
|
-
Returns a set of tuples, where each tuple is a path of concept names.
|
|
2083
|
-
"""
|
|
2084
|
-
print(f"Finding association paths for new concept: {c_new}")
|
|
2085
|
-
|
|
2086
|
-
# Part A: Primary Top-Down Traversal
|
|
2087
|
-
# Start with active root nodes (groups with no active parents in the current hierarchy view)
|
|
2088
|
-
active_root_nodes = [name for name, data in hierarchy_dag.items() if not data["parents"] and data["is_active"]]
|
|
2089
|
-
if not active_root_nodes:
|
|
2090
|
-
print("No active root nodes found in hierarchy. Considering all active groups as potential starting points for primary traversal.")
|
|
2091
|
-
active_root_nodes = [node for node in hierarchy_dag.keys() if hierarchy_dag[node]["is_active"]]
|
|
2092
|
-
|
|
2093
|
-
# Perform initial filtering at the top level
|
|
2094
|
-
initial_relevant_roots = self._llm_find_best_fit(c_new, active_root_nodes)
|
|
2095
|
-
|
|
2096
|
-
primary_paths = set()
|
|
2097
|
-
for root in initial_relevant_roots:
|
|
2098
|
-
# Paths start *from* the root selected by LLM
|
|
2099
|
-
primary_paths.update(self._recursive_traversal(c_new, [root], hierarchy_dag, []))
|
|
2100
|
-
|
|
2101
|
-
print(f"Primary paths found: {primary_paths}")
|
|
2102
|
-
|
|
2103
|
-
# Part B: Serendipitous Random Exploration
|
|
2104
|
-
all_active_groups = [node for node in hierarchy_dag.keys() if hierarchy_dag[node]["is_active"]]
|
|
2105
|
-
|
|
2106
|
-
# Collect all nodes visited in primary paths to exclude them from serendipitous sample
|
|
2107
|
-
visited_in_primary = set()
|
|
2108
|
-
for path in primary_paths:
|
|
2109
|
-
visited_in_primary.update(path)
|
|
2110
|
-
|
|
2111
|
-
unvisited_groups = [g for g in all_active_groups if g not in visited_in_primary]
|
|
2112
|
-
|
|
2113
|
-
num_sample = int(len(unvisited_groups) * theta_explore)
|
|
2114
|
-
sampled_nodes = random.sample(unvisited_groups, min(num_sample, len(unvisited_groups)))
|
|
2115
|
-
print(f"Sampled {len(sampled_nodes)} nodes from {len(unvisited_groups)} unvisited for serendipitous exploration.")
|
|
2116
|
-
|
|
2117
|
-
serendipity_paths = set()
|
|
2118
|
-
for s_node in sampled_nodes:
|
|
2119
|
-
if self._llm_check_direct_link(c_new, s_node):
|
|
2120
|
-
print(f"Direct link found between '{c_new}' and serendipitous node '{s_node}'. Initiating branch traversal.")
|
|
2121
|
-
# Start a new traversal from this node. The path will start with this node.
|
|
2122
|
-
branch_paths = self._recursive_traversal(c_new, [s_node], hierarchy_dag, [])
|
|
2123
|
-
serendipity_paths.update(branch_paths)
|
|
2124
|
-
print(f"Serendipitous paths found: {serendipity_paths}")
|
|
2125
|
-
|
|
2126
|
-
return primary_paths.union(serendipity_paths)
|
|
2127
|
-
|
|
2128
|
-
# --- Algorithm 2: CreateInitialGraph ---
|
|
2129
|
-
|
|
2130
|
-
def create_initial_graph(self, initial_facts: List[str]) -> Dict:
|
|
2131
|
-
"""
|
|
2132
|
-
Algorithm 2: Creates the initial Knowledge Graph at generation 0.
|
|
2133
|
-
"""
|
|
2134
|
-
if self.current_generation >= 0:
|
|
2135
|
-
print(f"Warning: Knowledge Graph already exists at generation {self.current_generation}. Returning current state.")
|
|
2136
|
-
return self._get_active_hierarchy_dag()
|
|
2137
|
-
|
|
2138
|
-
print("Creating initial Knowledge Graph (Generation 0)...")
|
|
2139
|
-
self.current_generation = 0 # Set for initial creation
|
|
2140
|
-
|
|
2141
|
-
# Store initial facts
|
|
2142
|
-
for fact_content in initial_facts:
|
|
2143
|
-
self._insert_fact(fact_content, "initial_load")
|
|
2144
|
-
|
|
2145
|
-
# Generate concept candidates from initial facts
|
|
2146
|
-
concept_candidates = self._llm_generate_concepts(initial_facts, "facts")
|
|
2147
|
-
initial_concepts = self._llm_clean_concepts(concept_candidates)
|
|
2148
|
-
print(f"Initial concepts identified for hierarchy: {initial_concepts}")
|
|
2149
|
-
|
|
2150
|
-
# Build initial hierarchy structure (in-memory DAG)
|
|
2151
|
-
hierarchy_dag_structure = self._llm_build_initial_hierarchy(initial_concepts)
|
|
2152
|
-
print(f"Initial hierarchy structure built in memory for {len(hierarchy_dag_structure)} groups.")
|
|
2153
|
-
|
|
2154
|
-
# Instantiate the concepts (Groups nodes) in Kuzu for Generation 0
|
|
2155
|
-
all_groups_in_hierarchy = set(hierarchy_dag_structure.keys())
|
|
2156
|
-
for c_name in all_groups_in_hierarchy:
|
|
2157
|
-
self.create_group(self.conn, c_name, self.current_generation, is_active=True)
|
|
2158
|
-
# Record CREATE link for the new group (concept created in this generation)
|
|
2159
|
-
self._record_evolution_link(c_name, c_name, "CREATE", f"Initial creation at generation {self.current_generation}")
|
|
2160
|
-
|
|
2161
|
-
# Create 'ParentOf' links in Kuzu based on the hierarchy_dag_structure
|
|
2162
|
-
print("Creating ParentOf links in Kuzu...")
|
|
2163
|
-
for group_name, data in hierarchy_dag_structure.items():
|
|
2164
|
-
for parent_name in data["parents"]: # Parents are defined as the 'source' of ParentOf links
|
|
2165
|
-
self._add_parent_of_link(parent_name, group_name)
|
|
2166
|
-
|
|
2167
|
-
print("Initial graph creation complete.")
|
|
2168
|
-
return self._get_active_hierarchy_dag() # Return the state of the newly created graph
|
|
1122
|
+
return fact_id
|
|
1123
|
+
except Exception as e:
|
|
1124
|
+
print(f"Error storing fact in Chroma: {e}")
|
|
1125
|
+
return None
|
|
2169
1126
|
|
|
2170
|
-
|
|
1127
|
+
def save_facts_to_graph_db(
    conn, facts: List[str], path: str, batch_size: int
):
    """Save a list of facts to the graph database in batches.

    Args:
        conn: Open database connection, passed through to ``insert_fact``.
        facts: Fact strings to persist.
        path: Source path recorded alongside each fact.
        batch_size: Number of facts per batch; must be a positive integer.

    Raises:
        ValueError: If ``batch_size`` is less than 1. Without this guard,
            ``range(..., 0)`` would raise a confusing "arg 3 must not be
            zero" error, and a negative step would silently skip all facts.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    for i in range(0, len(facts), batch_size):
        batch = facts[i : i + batch_size]
        batch_number = i // batch_size + 1  # hoisted: used in two messages
        print(f"\nProcessing batch {batch_number} ({len(batch)} facts)")

        for fact in batch:
            try:
                print(f"Inserting fact: {fact}")
                print(f"With path: {path}")
                # NOTE(review): timestamp is logged but not passed to
                # insert_fact — presumably insert_fact stamps its own
                # recorded_at; confirm.
                timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                print(f"With recorded_at: {timestamp}")

                insert_fact(conn, fact, path)
                print("Success!")
            except Exception as e:
                # Best-effort: log the failure and keep going with the
                # rest of the batch rather than aborting the whole run.
                print(f"Failed to insert fact: {fact}")
                print(f"Error: {e}")
                continue

        print(f"Completed batch {batch_number}")
|
|
2326
1151
|
|
|
2327
|
-
check_group_query = f'MATCH (g:Groups {{name: "{escaped_group}"}}) RETURN g'
|
|
2328
|
-
group_result, group_error = safe_kuzu_execute(self.conn, check_group_query)
|
|
2329
|
-
if group_error or not group_result or not group_result.has_next():
|
|
2330
|
-
print(f"Group not found for assignment: {group_name}")
|
|
2331
|
-
return False
|
|
2332
1152
|
|
|
2333
|
-
# Check if relationship already exists to prevent duplicates
|
|
2334
|
-
check_rel_query = f"""
|
|
2335
|
-
MATCH (g:Groups {{name: "{escaped_group}"}})-[:Contains]->(f:Fact {{content: "{escaped_fact}"}})
|
|
2336
|
-
RETURN g, f
|
|
2337
|
-
"""
|
|
2338
|
-
rel_exists_result, _ = safe_kuzu_execute(self.conn, check_rel_query)
|
|
2339
|
-
if rel_exists_result and rel_exists_result.has_next():
|
|
2340
|
-
# print(f"Contains relationship already exists for fact '{fact_content}' to group '{group_name}'.")
|
|
2341
|
-
return True # Relationship already exists, so it's "successful"
|
|
2342
|
-
|
|
2343
|
-
# Create relationship
|
|
2344
|
-
query = f"""
|
|
2345
|
-
MATCH (f:Fact), (g:Groups)
|
|
2346
|
-
WHERE f.content = "{escaped_fact}" AND g.name = "{escaped_group}"
|
|
2347
|
-
CREATE (g)-[:Contains]->(f)
|
|
2348
|
-
"""
|
|
2349
|
-
_, error = safe_kuzu_execute(self.conn, query, f"Failed to create Contains relationship for fact {fact_content} to group {group_name}")
|
|
2350
|
-
return error is None
|
|
2351
|
-
except Exception as e:
|
|
2352
|
-
print(f"Error assigning fact to group: {str(e)}")
|
|
2353
|
-
traceback.print_exc()
|
|
2354
|
-
return False
|
|
2355
1153
|
|
|
2356
|
-
|
|
2357
|
-
|
|
2358
|
-
|
|
2359
|
-
|
|
2360
|
-
|
|
2361
|
-
|
|
2362
|
-
|
|
2363
|
-
|
|
2364
|
-
|
|
2365
|
-
|
|
2366
|
-
|
|
2367
|
-
|
|
2368
|
-
|
|
2369
|
-
|
|
2370
|
-
|
|
2371
|
-
|
|
1154
|
+
def kg_add_fact(
    engine,
    fact_text: str,
    npc=None,
    team=None,
    model=None,
    provider=None
):
    """Add a new fact to the knowledge graph.

    Loads the KG scoped to (team, npc, current working directory),
    appends a manually-entered fact record, and persists the result.

    Args:
        engine: Database engine handed to the load/save helpers.
        fact_text: The fact statement to record; also stored as the
            fact's source text.
        npc: Optional object whose ``name`` scopes the KG
            (falls back to ``'default_npc'``).
        team: Optional object whose ``name`` scopes the KG
            (falls back to ``'default_team'``).
        model: Unused; kept for a uniform tool signature.
        provider: Unused; kept for a uniform tool signature.

    Returns:
        A human-readable confirmation string.
    """
    directory_path = os.getcwd()
    team_name = getattr(team, 'name', 'default_team') if team else 'default_team'
    npc_name = npc.name if npc else 'default_npc'

    kg_data = load_kg_from_db(engine, team_name, npc_name, directory_path)

    new_fact = {
        "statement": fact_text,
        "source_text": fact_text,
        "type": "manual",
        "generation": kg_data.get('generation', 0),
        "origin": "manual_add"
    }

    # setdefault guards against a KG payload that lacks a 'facts' key,
    # matching the defensive .get('facts', []) reads in the sibling
    # kg_* helpers (a bare kg_data['facts'] would raise KeyError).
    kg_data.setdefault('facts', []).append(new_fact)
    save_kg_to_db(engine, kg_data, team_name, npc_name, directory_path)

    return f"Added fact: {fact_text}"
|
|
1181
|
+
|
|
1182
|
+
def kg_search_facts(
    engine,
    query: str,
    npc=None,
    team=None,
    model=None,
    provider=None
):
    """Return the statements of all facts containing the query.

    The match is a case-insensitive substring test against each fact's
    ``statement``; the KG is scoped to (team, npc, cwd) exactly as in
    the other kg_* helpers.
    """
    cwd = os.getcwd()
    team_name = getattr(team, 'name', 'default_team') if team else 'default_team'
    npc_name = npc.name if npc else 'default_npc'

    kg_data = load_kg_from_db(engine, team_name, npc_name, cwd)

    needle = query.lower()
    return [
        fact['statement']
        for fact in kg_data.get('facts', [])
        if needle in fact['statement'].lower()
    ]
|
|
1203
|
+
|
|
1204
|
+
def kg_remove_fact(
    engine,
    fact_text: str,
    npc=None,
    team=None,
    model=None,
    provider=None
):
    """Delete every fact whose statement exactly matches fact_text.

    Persists the KG only when at least one fact was removed; otherwise
    the stored KG is left untouched.
    """
    cwd = os.getcwd()
    team_name = getattr(team, 'name', 'default_team') if team else 'default_team'
    npc_name = npc.name if npc else 'default_npc'

    kg_data = load_kg_from_db(engine, team_name, npc_name, cwd)

    before = kg_data.get('facts', [])
    kept = [fact for fact in before if fact['statement'] != fact_text]
    kg_data['facts'] = kept
    removed_count = len(before) - len(kept)

    if removed_count > 0:
        save_kg_to_db(engine, kg_data, team_name, npc_name, cwd)
        return f"Removed {removed_count} matching fact(s)"

    return "No matching facts found"
|
|
1228
|
+
|
|
1229
|
+
def kg_list_concepts(
    engine,
    npc=None,
    team=None,
    model=None,
    provider=None
):
    """Return the names of every concept in the knowledge graph.

    The KG is scoped to (team, npc, cwd) with the same defaults as the
    other kg_* helpers.
    """
    cwd = os.getcwd()
    team_name = getattr(team, 'name', 'default_team') if team else 'default_team'
    npc_name = npc.name if npc else 'default_npc'

    kg_data = load_kg_from_db(engine, team_name, npc_name, cwd)

    return [concept['name'] for concept in kg_data.get('concepts', [])]
|
|
1245
|
+
|
|
1246
|
+
def kg_get_facts_for_concept(
    engine,
    concept_name: str,
    npc=None,
    team=None,
    model=None,
    provider=None
):
    """Collect every fact statement linked to the given concept.

    Scans the KG's fact-to-concept link map (statement -> list of
    concept names) and returns the statements whose link list contains
    ``concept_name``.
    """
    cwd = os.getcwd()
    team_name = getattr(team, 'name', 'default_team') if team else 'default_team'
    npc_name = npc.name if npc else 'default_npc'

    kg_data = load_kg_from_db(engine, team_name, npc_name, cwd)

    links = kg_data.get('fact_to_concept_links', {})
    return [
        statement
        for statement, concepts in links.items()
        if concept_name in concepts
    ]
|
|
1269
|
+
|
|
1270
|
+
def kg_add_concept(
    engine,
    concept_name: str,
    concept_description: str,
    npc=None,
    team=None,
    model=None,
    provider=None
):
    """Add a new concept to the knowledge graph.

    Loads the KG scoped to (team, npc, current working directory),
    appends a concept record, and persists the result.

    Args:
        engine: Database engine handed to the load/save helpers.
        concept_name: Name of the new concept.
        concept_description: Free-text description of the concept.
        npc: Optional object whose ``name`` scopes the KG
            (falls back to ``'default_npc'``).
        team: Optional object whose ``name`` scopes the KG
            (falls back to ``'default_team'``).
        model: Unused; kept for a uniform tool signature.
        provider: Unused; kept for a uniform tool signature.

    Returns:
        A human-readable confirmation string.
    """
    directory_path = os.getcwd()
    team_name = getattr(team, 'name', 'default_team') if team else 'default_team'
    npc_name = npc.name if npc else 'default_npc'

    kg_data = load_kg_from_db(engine, team_name, npc_name, directory_path)

    new_concept = {
        "name": concept_name,
        "description": concept_description,
        "generation": kg_data.get('generation', 0)
    }

    # setdefault guards against a KG payload that lacks a 'concepts' key,
    # matching the defensive .get('concepts', []) reads in the sibling
    # kg_* helpers (a bare kg_data['concepts'] would raise KeyError).
    kg_data.setdefault('concepts', []).append(new_concept)
    save_kg_to_db(engine, kg_data, team_name, npc_name, directory_path)

    return f"Added concept: {concept_name}"
|
|
1296
|
+
|
|
1297
|
+
def kg_remove_concept(
    engine,
    concept_name: str,
    npc=None,
    team=None,
    model=None,
    provider=None
):
    """Remove every concept named *concept_name* from the knowledge graph.

    Persists only when something was actually removed. Returns a
    confirmation string, or "Concept not found" when no entry matched.
    """
    directory_path = os.getcwd()
    team_name = getattr(team, 'name', 'default_team') if team else 'default_team'
    npc_name = npc.name if npc else 'default_npc'

    kg_data = load_kg_from_db(engine, team_name, npc_name, directory_path)

    existing = kg_data.get('concepts', [])
    surviving = [entry for entry in existing if entry['name'] != concept_name]
    kg_data['concepts'] = surviving

    # Nothing matched: leave the database untouched.
    if len(surviving) == len(existing):
        return "Concept not found"

    save_kg_to_db(engine, kg_data, team_name, npc_name, directory_path)
    return f"Removed concept: {concept_name}"
|
|
1321
|
+
|
|
1322
|
+
def kg_link_fact_to_concept(
    engine,
    fact_text: str,
    concept_name: str,
    npc=None,
    team=None,
    model=None,
    provider=None
):
    """Attach *fact_text* to *concept_name* in the fact-to-concept link table.

    Persists only when a new link was created. Returns a confirmation
    string, or "Fact already linked to concept" when the link existed.
    """
    directory_path = os.getcwd()
    team_name = getattr(team, 'name', 'default_team') if team else 'default_team'
    npc_name = npc.name if npc else 'default_npc'

    kg_data = load_kg_from_db(engine, team_name, npc_name, directory_path)

    links = kg_data.get('fact_to_concept_links', {})
    concepts_for_fact = links.setdefault(fact_text, [])

    if concept_name in concepts_for_fact:
        # Idempotent: an existing link is left alone and nothing is saved.
        return "Fact already linked to concept"

    concepts_for_fact.append(concept_name)
    kg_data['fact_to_concept_links'] = links
    save_kg_to_db(engine, kg_data, team_name, npc_name, directory_path)
    return f"Linked fact '{fact_text}' to concept '{concept_name}'"
|
|
1350
|
+
|
|
1351
|
+
def kg_get_all_facts(
    engine,
    npc=None,
    team=None,
    model=None,
    provider=None
):
    """Return the statement text of every fact in the knowledge graph."""
    scope_dir = os.getcwd()
    scope_team = getattr(team, 'name', 'default_team') if team else 'default_team'
    scope_npc = npc.name if npc else 'default_npc'

    kg_data = load_kg_from_db(engine, scope_team, scope_npc, scope_dir)

    # Each stored fact is a dict; only its 'statement' field is exposed.
    statements = []
    for fact in kg_data.get('facts', []):
        statements.append(fact['statement'])
    return statements
|
|
1367
|
+
|
|
1368
|
+
def kg_get_stats(
    engine,
    npc=None,
    team=None,
    model=None,
    provider=None
):
    """Summarize the knowledge graph: entity counts and generation number.

    Returns a dict with 'total_facts', 'total_concepts',
    'total_fact_concept_links' and 'generation'.
    """
    directory_path = os.getcwd()
    team_name = getattr(team, 'name', 'default_team') if team else 'default_team'
    npc_name = npc.name if npc else 'default_npc'

    kg_data = load_kg_from_db(engine, team_name, npc_name, directory_path)

    facts = kg_data.get('facts', [])
    concepts = kg_data.get('concepts', [])
    links = kg_data.get('fact_to_concept_links', {})

    return {
        "total_facts": len(facts),
        "total_concepts": len(concepts),
        "total_fact_concept_links": len(links),
        "generation": kg_data.get('generation', 0)
    }
|
|
1388
|
+
|
|
1389
|
+
def kg_evolve_knowledge(
    engine,
    content_text: str,
    npc=None,
    team=None,
    model=None,
    provider=None
):
    """Evolve the knowledge graph with new content and persist the result.

    Loads the KG scoped to the current working directory / team / NPC,
    runs kg_evolve_incremental over *content_text* (concept extraction
    enabled, all cross-linking passes disabled), saves the evolved KG,
    and returns a confirmation string.
    """
    directory_path = os.getcwd()
    team_name = getattr(team, 'name', 'default_team') if team else 'default_team'
    npc_name = npc.name if npc else 'default_npc'

    kg_data = load_kg_from_db(engine, team_name, npc_name, directory_path)

    # Prefer the NPC's configured model/provider, but fall back to the
    # explicit arguments. The original used `npc.model if npc else model`,
    # which discarded a caller-supplied model/provider whenever an npc was
    # present — even when the npc's own attribute was None.
    resolved_model = (getattr(npc, 'model', None) if npc else None) or model
    resolved_provider = (getattr(npc, 'provider', None) if npc else None) or provider

    evolved_kg, _ = kg_evolve_incremental(
        existing_kg=kg_data,
        new_content_text=content_text,
        model=resolved_model,
        provider=resolved_provider,
        npc=npc,
        get_concepts=True,
        link_concepts_facts=False,
        link_concepts_concepts=False,
        link_facts_facts=False
    )

    save_kg_to_db(engine, evolved_kg, team_name, npc_name, directory_path)

    return "Knowledge graph evolved with new content"
|