npcpy 1.0.26__py3-none-any.whl → 1.2.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (148) hide show
  1. npcpy/__init__.py +0 -7
  2. npcpy/data/audio.py +16 -99
  3. npcpy/data/image.py +43 -42
  4. npcpy/data/load.py +83 -124
  5. npcpy/data/text.py +28 -28
  6. npcpy/data/video.py +8 -32
  7. npcpy/data/web.py +51 -23
  8. npcpy/ft/diff.py +110 -0
  9. npcpy/ft/ge.py +115 -0
  10. npcpy/ft/memory_trainer.py +171 -0
  11. npcpy/ft/model_ensembler.py +357 -0
  12. npcpy/ft/rl.py +360 -0
  13. npcpy/ft/sft.py +248 -0
  14. npcpy/ft/usft.py +128 -0
  15. npcpy/gen/audio_gen.py +24 -0
  16. npcpy/gen/embeddings.py +13 -13
  17. npcpy/gen/image_gen.py +262 -117
  18. npcpy/gen/response.py +615 -415
  19. npcpy/gen/video_gen.py +53 -7
  20. npcpy/llm_funcs.py +1869 -437
  21. npcpy/main.py +1 -1
  22. npcpy/memory/command_history.py +844 -510
  23. npcpy/memory/kg_vis.py +833 -0
  24. npcpy/memory/knowledge_graph.py +892 -1845
  25. npcpy/memory/memory_processor.py +81 -0
  26. npcpy/memory/search.py +188 -90
  27. npcpy/mix/debate.py +192 -3
  28. npcpy/npc_compiler.py +1672 -801
  29. npcpy/npc_sysenv.py +593 -1266
  30. npcpy/serve.py +3120 -0
  31. npcpy/sql/ai_function_tools.py +257 -0
  32. npcpy/sql/database_ai_adapters.py +186 -0
  33. npcpy/sql/database_ai_functions.py +163 -0
  34. npcpy/sql/model_runner.py +19 -19
  35. npcpy/sql/npcsql.py +706 -507
  36. npcpy/sql/sql_model_compiler.py +156 -0
  37. npcpy/tools.py +183 -0
  38. npcpy/work/plan.py +13 -279
  39. npcpy/work/trigger.py +3 -3
  40. npcpy-1.2.32.dist-info/METADATA +803 -0
  41. npcpy-1.2.32.dist-info/RECORD +54 -0
  42. npcpy/data/dataframes.py +0 -171
  43. npcpy/memory/deep_research.py +0 -125
  44. npcpy/memory/sleep.py +0 -557
  45. npcpy/modes/_state.py +0 -78
  46. npcpy/modes/alicanto.py +0 -1075
  47. npcpy/modes/guac.py +0 -785
  48. npcpy/modes/mcp_npcsh.py +0 -822
  49. npcpy/modes/npc.py +0 -213
  50. npcpy/modes/npcsh.py +0 -1158
  51. npcpy/modes/plonk.py +0 -409
  52. npcpy/modes/pti.py +0 -234
  53. npcpy/modes/serve.py +0 -1637
  54. npcpy/modes/spool.py +0 -312
  55. npcpy/modes/wander.py +0 -549
  56. npcpy/modes/yap.py +0 -572
  57. npcpy/npc_team/alicanto.npc +0 -2
  58. npcpy/npc_team/alicanto.png +0 -0
  59. npcpy/npc_team/assembly_lines/test_pipeline.py +0 -181
  60. npcpy/npc_team/corca.npc +0 -13
  61. npcpy/npc_team/foreman.npc +0 -7
  62. npcpy/npc_team/frederic.npc +0 -6
  63. npcpy/npc_team/frederic4.png +0 -0
  64. npcpy/npc_team/guac.png +0 -0
  65. npcpy/npc_team/jinxs/automator.jinx +0 -18
  66. npcpy/npc_team/jinxs/bash_executer.jinx +0 -31
  67. npcpy/npc_team/jinxs/calculator.jinx +0 -11
  68. npcpy/npc_team/jinxs/edit_file.jinx +0 -96
  69. npcpy/npc_team/jinxs/file_chat.jinx +0 -14
  70. npcpy/npc_team/jinxs/gui_controller.jinx +0 -28
  71. npcpy/npc_team/jinxs/image_generation.jinx +0 -29
  72. npcpy/npc_team/jinxs/internet_search.jinx +0 -30
  73. npcpy/npc_team/jinxs/local_search.jinx +0 -152
  74. npcpy/npc_team/jinxs/npcsh_executor.jinx +0 -31
  75. npcpy/npc_team/jinxs/python_executor.jinx +0 -8
  76. npcpy/npc_team/jinxs/screen_cap.jinx +0 -25
  77. npcpy/npc_team/jinxs/sql_executor.jinx +0 -33
  78. npcpy/npc_team/kadiefa.npc +0 -3
  79. npcpy/npc_team/kadiefa.png +0 -0
  80. npcpy/npc_team/npcsh.ctx +0 -9
  81. npcpy/npc_team/npcsh_sibiji.png +0 -0
  82. npcpy/npc_team/plonk.npc +0 -2
  83. npcpy/npc_team/plonk.png +0 -0
  84. npcpy/npc_team/plonkjr.npc +0 -2
  85. npcpy/npc_team/plonkjr.png +0 -0
  86. npcpy/npc_team/sibiji.npc +0 -5
  87. npcpy/npc_team/sibiji.png +0 -0
  88. npcpy/npc_team/spool.png +0 -0
  89. npcpy/npc_team/templates/analytics/celona.npc +0 -0
  90. npcpy/npc_team/templates/hr_support/raone.npc +0 -0
  91. npcpy/npc_team/templates/humanities/eriane.npc +0 -4
  92. npcpy/npc_team/templates/it_support/lineru.npc +0 -0
  93. npcpy/npc_team/templates/marketing/slean.npc +0 -4
  94. npcpy/npc_team/templates/philosophy/maurawa.npc +0 -0
  95. npcpy/npc_team/templates/sales/turnic.npc +0 -4
  96. npcpy/npc_team/templates/software/welxor.npc +0 -0
  97. npcpy/npc_team/yap.png +0 -0
  98. npcpy/routes.py +0 -958
  99. npcpy/work/mcp_helpers.py +0 -357
  100. npcpy/work/mcp_server.py +0 -194
  101. npcpy-1.0.26.data/data/npcpy/npc_team/alicanto.npc +0 -2
  102. npcpy-1.0.26.data/data/npcpy/npc_team/alicanto.png +0 -0
  103. npcpy-1.0.26.data/data/npcpy/npc_team/automator.jinx +0 -18
  104. npcpy-1.0.26.data/data/npcpy/npc_team/bash_executer.jinx +0 -31
  105. npcpy-1.0.26.data/data/npcpy/npc_team/calculator.jinx +0 -11
  106. npcpy-1.0.26.data/data/npcpy/npc_team/celona.npc +0 -0
  107. npcpy-1.0.26.data/data/npcpy/npc_team/corca.npc +0 -13
  108. npcpy-1.0.26.data/data/npcpy/npc_team/edit_file.jinx +0 -96
  109. npcpy-1.0.26.data/data/npcpy/npc_team/eriane.npc +0 -4
  110. npcpy-1.0.26.data/data/npcpy/npc_team/file_chat.jinx +0 -14
  111. npcpy-1.0.26.data/data/npcpy/npc_team/foreman.npc +0 -7
  112. npcpy-1.0.26.data/data/npcpy/npc_team/frederic.npc +0 -6
  113. npcpy-1.0.26.data/data/npcpy/npc_team/frederic4.png +0 -0
  114. npcpy-1.0.26.data/data/npcpy/npc_team/guac.png +0 -0
  115. npcpy-1.0.26.data/data/npcpy/npc_team/gui_controller.jinx +0 -28
  116. npcpy-1.0.26.data/data/npcpy/npc_team/image_generation.jinx +0 -29
  117. npcpy-1.0.26.data/data/npcpy/npc_team/internet_search.jinx +0 -30
  118. npcpy-1.0.26.data/data/npcpy/npc_team/kadiefa.npc +0 -3
  119. npcpy-1.0.26.data/data/npcpy/npc_team/kadiefa.png +0 -0
  120. npcpy-1.0.26.data/data/npcpy/npc_team/lineru.npc +0 -0
  121. npcpy-1.0.26.data/data/npcpy/npc_team/local_search.jinx +0 -152
  122. npcpy-1.0.26.data/data/npcpy/npc_team/maurawa.npc +0 -0
  123. npcpy-1.0.26.data/data/npcpy/npc_team/npcsh.ctx +0 -9
  124. npcpy-1.0.26.data/data/npcpy/npc_team/npcsh_executor.jinx +0 -31
  125. npcpy-1.0.26.data/data/npcpy/npc_team/npcsh_sibiji.png +0 -0
  126. npcpy-1.0.26.data/data/npcpy/npc_team/plonk.npc +0 -2
  127. npcpy-1.0.26.data/data/npcpy/npc_team/plonk.png +0 -0
  128. npcpy-1.0.26.data/data/npcpy/npc_team/plonkjr.npc +0 -2
  129. npcpy-1.0.26.data/data/npcpy/npc_team/plonkjr.png +0 -0
  130. npcpy-1.0.26.data/data/npcpy/npc_team/python_executor.jinx +0 -8
  131. npcpy-1.0.26.data/data/npcpy/npc_team/raone.npc +0 -0
  132. npcpy-1.0.26.data/data/npcpy/npc_team/screen_cap.jinx +0 -25
  133. npcpy-1.0.26.data/data/npcpy/npc_team/sibiji.npc +0 -5
  134. npcpy-1.0.26.data/data/npcpy/npc_team/sibiji.png +0 -0
  135. npcpy-1.0.26.data/data/npcpy/npc_team/slean.npc +0 -4
  136. npcpy-1.0.26.data/data/npcpy/npc_team/spool.png +0 -0
  137. npcpy-1.0.26.data/data/npcpy/npc_team/sql_executor.jinx +0 -33
  138. npcpy-1.0.26.data/data/npcpy/npc_team/test_pipeline.py +0 -181
  139. npcpy-1.0.26.data/data/npcpy/npc_team/turnic.npc +0 -4
  140. npcpy-1.0.26.data/data/npcpy/npc_team/welxor.npc +0 -0
  141. npcpy-1.0.26.data/data/npcpy/npc_team/yap.png +0 -0
  142. npcpy-1.0.26.dist-info/METADATA +0 -827
  143. npcpy-1.0.26.dist-info/RECORD +0 -139
  144. npcpy-1.0.26.dist-info/entry_points.txt +0 -11
  145. /npcpy/{modes → ft}/__init__.py +0 -0
  146. {npcpy-1.0.26.dist-info → npcpy-1.2.32.dist-info}/WHEEL +0 -0
  147. {npcpy-1.0.26.dist-info → npcpy-1.2.32.dist-info}/licenses/LICENSE +0 -0
  148. {npcpy-1.0.26.dist-info → npcpy-1.2.32.dist-info}/top_level.txt +0 -0
@@ -1,22 +1,30 @@
1
- import json
2
- import os
1
+ from collections import defaultdict
3
2
  import datetime
4
-
5
- import numpy as np
6
-
3
+ import json
7
4
  try:
8
5
  import kuzu
9
6
  except ModuleNotFoundError:
10
7
  print("kuzu not installed")
8
+ import os
9
+ import random
10
+ import pandas as pd
11
11
  from typing import Optional, Dict, List, Union, Tuple, Any, Set
12
12
 
13
+ from npcpy.llm_funcs import (
14
+ abstract,
15
+ consolidate_facts_llm,
16
+ generate_groups,
17
+ get_facts,
18
+ get_llm_response,
19
+ get_related_concepts_multi,
20
+ get_related_facts_llm,
21
+ prune_fact_subset_llm,
22
+ remove_idempotent_groups,
23
+ zoom_in,
24
+ )
13
25
 
14
- from npcpy.llm_funcs import get_llm_response
15
- from npcpy.npc_compiler import NPC
16
- import sqlite3
17
-
26
+ from npcpy.memory.command_history import load_kg_from_db, save_kg_to_db
18
27
 
19
- import random
20
28
  def safe_kuzu_execute(conn, query, error_message="Kuzu query failed"):
21
29
  """Execute a Kuzu query with proper error handling"""
22
30
  try:
@@ -35,7 +43,7 @@ def create_group(conn, name: str, metadata: str = ""):
35
43
  return False
36
44
 
37
45
  try:
38
- # Properly escape quotes in strings
46
+
39
47
  escaped_name = name.replace('"', '\\"')
40
48
  escaped_metadata = metadata.replace('"', '\\"')
41
49
 
@@ -70,13 +78,13 @@ def init_db(db_path: str, drop=False):
70
78
  print("Database connection established successfully")
71
79
 
72
80
  if drop:
73
- # Drop tables in reverse order of dependency
81
+
74
82
  safe_kuzu_execute(conn, "DROP TABLE IF EXISTS Contains")
75
- safe_kuzu_execute(conn, "DROP TABLE IF EXISTS EvolvedFrom") # New
83
+ safe_kuzu_execute(conn, "DROP TABLE IF EXISTS EvolvedFrom")
76
84
  safe_kuzu_execute(conn, "DROP TABLE IF EXISTS Fact")
77
85
  safe_kuzu_execute(conn, "DROP TABLE IF EXISTS Groups")
78
86
 
79
- # Fact table remains the same
87
+
80
88
  safe_kuzu_execute(
81
89
  conn,
82
90
  """
@@ -90,7 +98,7 @@ def init_db(db_path: str, drop=False):
90
98
  "Failed to create Fact table",
91
99
  )
92
100
 
93
- # UPDATED Groups table with generational properties
101
+
94
102
  safe_kuzu_execute(
95
103
  conn,
96
104
  """
@@ -106,14 +114,14 @@ def init_db(db_path: str, drop=False):
106
114
  )
107
115
  print("Groups table (with generation tracking) created or already exists.")
108
116
 
109
- # Contains relationship remains the same
117
+
110
118
  safe_kuzu_execute(
111
119
  conn,
112
120
  "CREATE REL TABLE IF NOT EXISTS Contains(FROM Groups TO Fact);",
113
121
  "Failed to create Contains relationship table",
114
122
  )
115
123
 
116
- # NEW EvolvedFrom relationship table
124
+
117
125
  safe_kuzu_execute(
118
126
  conn,
119
127
  """
@@ -133,531 +141,500 @@ def init_db(db_path: str, drop=False):
133
141
  print(f"Fatal error initializing database: {str(e)}")
134
142
  traceback.print_exc()
135
143
  return None
136
- def extract_facts(
137
- text: str,
138
- model: str,
139
- provider: str,
140
- npc: NPC = None,
141
- context: str = ""
144
+
145
+
146
+
147
+ def find_similar_groups(
148
+ conn,
149
+ fact: str,
150
+ model,
151
+ provider,
152
+ npc = None,
153
+ context: str = None,
154
+ **kwargs: Any
142
155
  ) -> List[str]:
143
- """Extract concise facts from text using LLM (as defined earlier)"""
144
- # Implementation from your previous code
145
- prompt = """Extract concise facts from this text.
146
- A fact is a piece of information that makes a statement about the world.
147
- A fact is typically a sentence that is true or false.
148
- Facts may be simple or complex. They can also be conflicting with each other, usually
149
- because there is some hidden context that is not mentioned in the text.
150
- In any case, it is simply your job to extract a list of facts that could pertain to
151
- an individual's personality.
152
-
153
- For example, if a message says:
154
- "since I am a doctor I am often trying to think up new ways to help people.
155
- Can you help me set up a new kind of software to help with that?"
156
- You might extract the following facts:
157
- - The individual is a doctor
158
- - They are helpful
159
-
160
- Another example:
161
- "I am a software engineer who loves to play video games. I am also a huge fan of the
162
- Star Wars franchise and I am a member of the 501st Legion."
163
- You might extract the following facts:
164
- - The individual is a software engineer
165
- - The individual loves to play video games
166
- - The individual is a huge fan of the Star Wars franchise
167
- - The individual is a member of the 501st Legion
168
-
169
- Another example:
170
- "The quantum tunneling effect allows particles to pass through barriers
171
- that classical physics says they shouldn't be able to cross. This has
172
- huge implications for semiconductor design."
173
- You might extract these facts:
174
- - Quantum tunneling enables particles to pass through barriers that are
175
- impassable according to classical physics
176
- - The behavior of quantum tunneling has significant implications for
177
- how semiconductors must be designed
178
-
179
- Another example:
180
- "People used to think the Earth was flat. Now we know it's spherical,
181
- though technically it's an oblate spheroid due to its rotation."
182
- You might extract these facts:
183
- - People historically believed the Earth was flat
184
- - It is now known that the Earth is an oblate spheroid
185
- - The Earth's oblate spheroid shape is caused by its rotation
186
-
187
- Another example:
188
- "My research on black holes suggests they emit radiation, but my professor
189
- says this conflicts with Einstein's work. After reading more papers, I
190
- learned this is actually Hawking radiation and doesn't conflict at all."
191
- You might extract the following facts:
192
- - Black holes emit radiation
193
- - The professor believes this radiation conflicts with Einstein's work
194
- - The radiation from black holes is called Hawking radiation
195
- - Hawking radiation does not conflict with Einstein's work
196
-
197
- Another example:
198
- "During the pandemic, many developers switched to remote work. I found
199
- that I'm actually more productive at home, though my company initially
200
- thought productivity would drop. Now they're keeping remote work permanent."
201
- You might extract the following facts:
202
- - The pandemic caused many developers to switch to remote work
203
- - The individual discovered higher productivity when working from home
204
- - The company predicted productivity would decrease with remote work
205
- - The company decided to make remote work a permanent option
206
-
207
- Thus, it is your mission to reliably extract lists of facts.
208
-
209
- Return a JSON object with the following structure:
210
- {
211
- "fact_list": "a list containing the facts where each fact is a string",
212
- }
213
- """
214
- if len(context) > 0:
215
- prompt+=f""" Here is some relevant user context: {context}"""
156
+ """Find existing groups that might contain this fact"""
157
+ response = conn.execute(f"MATCH (g:Groups) RETURN g.name;")
158
+
159
+
160
+
161
+ groups = response.fetch_as_df()
162
+
163
+ if not groups:
164
+ return []
165
+
166
+ prompt = """Given a fact and a list of groups, determine which groups this fact belongs to.
167
+ A fact should belong to a group if it is semantically related to the group's theme or purpose.
168
+ For example, if a fact is "The user loves programming" and there's a group called "Technical_Interests",
169
+ that would be a match.
170
+
171
+ Return a JSON object with the following structure:
172
+ {
173
+ "group_list": "a list containing the names of matching groups"
174
+ }
216
175
 
217
- prompt+="""
218
176
  Return only the JSON object.
219
177
  Do not include any additional markdown formatting.
220
178
  """
221
179
 
222
180
  response = get_llm_response(
223
- prompt + f"HERE BEGINS THE TEXT TO INVESTIGATE:\n\nText: {text}",
181
+ prompt + f"\n\nFact: {fact}\nGroups: {json.dumps(groups)}",
224
182
  model=model,
225
183
  provider=provider,
226
184
  format="json",
185
+ npc=npc,
186
+ context=context,
187
+ **kwargs
227
188
  )
228
189
  response = response["response"]
229
- return response.get("fact_list", [])
190
+ return response["group_list"]
230
191
 
231
192
 
232
- # --- Breathe (Context Condensation) ---
233
- def breathe(
234
- messages: List[Dict[str, str]],
235
- model: str,
236
- provider: str,
237
- npc: NPC = None,
238
- context: str = None,
239
- ) -> Dict[str, Any]:
240
- """Condense the conversation context into a small set of key extractions."""
241
- if not messages:
242
- return {"output": {}, "messages": []}
193
+ def kg_initial(content,
194
+ model=None,
195
+ provider=None,
196
+ npc=None,
197
+ context='',
198
+ facts=None,
199
+ generation=None,
200
+ verbose=True,):
243
201
 
244
- conversation_text = "\n".join([f"{m['role']}: {m['content']}" for m in messages])
202
+ if generation is None:
203
+ CURRENT_GENERATION = 0
204
+ else:
205
+ CURRENT_GENERATION = generation
206
+
207
+ print(f"--- Running KG Structuring Process (Generation: {CURRENT_GENERATION}) ---")
208
+
209
+ if facts is None:
210
+ if not content:
211
+ raise ValueError("kg_initial requires either content_text or a list of facts.")
212
+ print(" - Mode: Deriving new facts from text content...")
213
+ all_facts = []
214
+ print(len(content))
215
+ if len(content)>10000:
216
+ # randomly sub sample 10000 characters
217
+ starting_point = random.randint(0, len(content)-10000)
218
+
219
+ content_to_sample = content[starting_point:starting_point+10000]
245
220
 
246
- # Extract facts, mistakes, and lessons learned
247
- facts = extract_facts(conversation_text, model, provider)
221
+ for n in range(len(content)//10000):
222
+ print(n)
223
+ print(starting_point)
224
+ print(content_to_sample[0:1000])
225
+ facts = get_facts(content_to_sample,
226
+ model=model,
227
+ provider=provider,
228
+ npc=npc,
229
+ context=context)
230
+ if verbose:
231
+ print(f" - Extracted {len(facts)} facts from segment {n+1}")
232
+ print(facts)
233
+ all_facts.extend(facts)
234
+ else:
235
+ print(content[0:1000] )
236
+ all_facts = get_facts(content,
237
+ model=model,
238
+ provider=provider,
239
+ npc=npc,
240
+ context=context)
241
+ if verbose:
242
+ print(f" - Extracted {len(all_facts)} facts from content")
243
+ print(all_facts)
244
+ for fact in all_facts:
245
+
246
+ fact['generation'] = CURRENT_GENERATION
247
+ else:
248
+ print(f" - Mode: Building structure from {len(facts)} pre-existing facts...")
249
+
250
+ print(" - Inferring implied facts (zooming in)...")
251
+ all_implied_facts = []
252
+ if len(all_facts) > 20:
253
+ # sub sample facts randomly to generate zoomed in facts
254
+ sampled_facts = random.sample(all_facts, k=20)
255
+ for n in range(len(all_facts) // 20):
256
+ implied_facts = zoom_in(sampled_facts,
257
+ model=model,
258
+ provider=provider,
259
+ npc=npc,
260
+ context=context)
261
+ all_implied_facts.extend(implied_facts)
262
+ if verbose:
263
+ print(f" - Inferred {len(implied_facts)} implied facts from sample {n+1}")
264
+ print(implied_facts)
265
+ else:
266
+ implied_facts = zoom_in(all_facts,
267
+ model=model,
268
+ provider=provider,
269
+ npc=npc,
270
+ context=context)
271
+ print(implied_facts)
272
+
273
+ all_implied_facts.extend(implied_facts)
274
+
275
+ if verbose:
276
+ print(f" - Inferred {len(implied_facts)} implied facts from all facts")
277
+ print(implied_facts)
278
+ for fact in all_implied_facts:
279
+ fact['generation'] = CURRENT_GENERATION
280
+
281
+ all_facts = all_facts + all_implied_facts
282
+
283
+ print(" - Generating concepts from all facts...")
284
+ concepts = generate_groups(all_facts,
285
+ model=model,
286
+ provider=provider,
287
+ npc=npc,
288
+ context=context)
289
+ for concept in concepts:
290
+ concept['generation'] = CURRENT_GENERATION
291
+
292
+ if verbose:
293
+ print(f" - Generated {len(concepts)} concepts")
294
+ print(concepts)
295
+ print(" - Linking facts to concepts...")
296
+ fact_to_concept_links = defaultdict(list)
297
+ concept_names = [c['name'] for c in concepts if c and 'name' in c]
298
+ for fact in all_facts:
299
+
300
+ fact_to_concept_links[fact['statement']] = get_related_concepts_multi(fact['statement'], "fact", concept_names, model, provider, npc, context)
301
+ if verbose:
302
+ print(fact_to_concept_links[fact['statement']])
303
+ print(" - Linking facts to other facts...")
304
+ fact_to_fact_links = []
305
+ fact_statements = [f['statement'] for f in all_facts]
306
+ for i, fact in enumerate(all_facts):
307
+ other_fact_statements = fact_statements[all_facts != fact]
308
+ print('checking fact: ', fact)
309
+ if other_fact_statements:
310
+ related_fact_stmts = get_related_facts_llm(fact['statement'],
311
+ other_fact_statements,
312
+ model=model,
313
+ provider=provider,
314
+ npc=npc,
315
+ context=context)
316
+ for related_stmt in related_fact_stmts:
317
+
318
+ fact_to_fact_links.append((fact['statement'], related_stmt))
319
+ if verbose:
320
+ print(fact['statement'], related_stmt)
248
321
 
249
- # Combine results for brevity
250
- output = {
251
- "facts": facts,
322
+ return {
323
+ "generation": CURRENT_GENERATION,
324
+ "facts": all_facts,
325
+ "concepts": concepts,
326
+ "concept_links": [],
327
+ "fact_to_concept_links": dict(fact_to_concept_links),
328
+ "fact_to_fact_links": fact_to_fact_links
252
329
  }
253
330
 
254
- return {"output": output, "messages": []}
255
-
256
- # --- Semantic Evolution (Sleep) ---
257
- def semantic_evolution(
258
- facts: List[str],
259
- existing_leaf_groups: List[str], # These are groups from previous steps, not necessarily facts
260
- model: str,
261
- provider: str,
262
- npc: NPC = None,
263
- min_top: int = 4,
264
- max_top: int = 10,
265
- max_levels: int = 5
266
- ) -> Dict:
267
- """Build hierarchical group structure iteratively from facts and existing groups."""
268
-
269
- # Step 1: Generate initial group candidates from the new facts
270
- new_group_candidates = generate_group_candidates(facts, "facts", model, provider, npc)
271
-
272
- # Step 2: Combine with existing leaf groups and remove idempotents to get our starting set
273
- # These will be the bottom-most groups that we will then try to abstract upwards.
274
- initial_groups_for_hierarchy = remove_idempotent_groups(
275
- new_group_candidates + existing_leaf_groups, model, provider, npc
276
- )
277
-
278
- # Step 3: Build the hierarchy iteratively from these initial groups
279
- # We pass these initial groups, and build_full_hierarchy will abstract them upwards.
280
- hierarchy_data = build_full_hierarchy(
281
- initial_groups_for_hierarchy, # Use the cleaned list of groups
282
- model=model,
283
- provider=provider,
284
- npc=npc,
285
- min_top=min_top,
286
- max_top=max_top,
287
- max_levels=max_levels
288
- )
331
+
332
+ def kg_evolve_incremental(existing_kg,
333
+ new_content_text=None,
334
+ new_facts=None,
335
+ model = None,
336
+ provider=None,
337
+ npc=None,
338
+ context='',
339
+ get_concepts=False,
340
+ link_concepts_facts = False,
341
+ link_concepts_concepts=False,
342
+ link_facts_facts = False,
343
+ ):
344
+
345
+ current_gen = existing_kg.get('generation', 0)
346
+ next_gen = current_gen + 1
347
+ print(f"\n--- ABSORBING INFO: Gen {current_gen} -> Gen {next_gen} ---")
348
+
349
+ newly_added_concepts = []
350
+ concept_links = list(existing_kg.get('concept_links', []))
351
+ fact_to_concept_links = defaultdict(list,
352
+ existing_kg.get('fact_to_concept_links', {}))
353
+ fact_to_fact_links = list(existing_kg.get('fact_to_fact_links', []))
354
+
355
+ existing_facts = existing_kg.get('facts', [])
356
+ existing_concepts = existing_kg.get('concepts', [])
357
+ existing_concept_names = {c['name'] for c in existing_concepts}
358
+ existing_fact_statements = [f['statement'] for f in existing_facts]
359
+ all_concept_names = list(existing_concept_names)
289
360
 
290
- return {
291
- "hierarchy": hierarchy_data,
292
- "leaf_groups": initial_groups_for_hierarchy, # These are the groups that were NOT abstracted further
293
- }
294
- # --- Helper Functions for Hierarchy (unchanged from before) ---
295
- def generate_group_candidates(
296
- items: List[str],
297
- item_type: str,
298
- model: str,
299
- provider: str,
300
- npc: NPC = None,
301
- n_passes: int = 3,
302
- subset_size: int = 10
303
- ) -> List[str]:
304
- """Generate candidate groups for items (facts or groups) based on core semantic meaning."""
305
- all_candidates = []
361
+ all_new_facts = []
362
+ print(npc, npc.model, npc.provider)
306
363
 
307
- for pass_num in range(n_passes):
308
- if len(items) > subset_size:
309
- item_subset = random.sample(items, min(subset_size, len(items)))
364
+ if new_facts:
365
+ all_new_facts = new_facts
366
+ print(f'using pre-approved facts: {len(all_new_facts)}')
367
+ elif new_content_text:
368
+ print('extracting facts from content...')
369
+ if len(new_content_text) > 10000:
370
+ starting_point = random.randint(0, len(new_content_text)-10000)
371
+ for n in range(len(new_content_text)//10000):
372
+ content_to_sample = new_content_text[n*10000:(n+1)*10000]
373
+ facts = get_facts(content_to_sample,
374
+ model=model,
375
+ provider=provider,
376
+ npc = npc,
377
+ context=context)
378
+ all_new_facts.extend(facts)
379
+ print(facts)
310
380
  else:
311
- item_subset = items
312
-
313
- # --- PROMPT MODIFICATION: Focus on semantic essence, avoid gerunds/adverbs, favor subjects ---
314
- prompt = f"""From the following {item_type}, identify specific and relevant conceptual groups.
315
- Think about the core subject or entity being discussed.
316
-
317
- GUIDELINES FOR GROUP NAMES:
318
- 1. **Prioritize Specificity:** Names should be precise and directly reflect the content.
319
- 2. **Favor Nouns and Noun Phrases:** Use descriptive nouns or noun phrases.
320
- 3. **AVOID:**
321
- * Gerunds (words ending in -ing when used as nouns, like "Understanding", "Analyzing", "Processing"). If a gerund is unavoidable, try to make it a specific action (e.g., "User Authentication Module" is better than "Authenticating Users").
322
- * Adverbs or descriptive adjectives that don't form a core part of the subject's identity (e.g., "Quickly calculating", "Effectively managing").
323
- * Overly generic terms (e.g., "Concepts", "Processes", "Dynamics", "Mechanics", "Analysis", "Understanding", "Interactions", "Relationships", "Properties", "Structures", "Systems", "Frameworks", "Predictions", "Outcomes", "Effects", "Considerations", "Methods", "Techniques", "Data", "Theoretical", "Physical", "Spatial", "Temporal").
324
- 4. **Direct Naming:** If an item is a specific entity or action, it can be a group name itself (e.g., "Earth", "Lamb Shank Braising", "World War I").
325
-
326
- EXAMPLE:
327
- Input {item_type.capitalize()}: ["Self-intersection shocks drive accretion disk formation.", "Gravity stretches star into stream.", "Energy dissipation in shocks influences capture fraction."]
328
- Desired Output Groups: ["Accretion Disk Formation (Self-Intersection Shocks)", "Stellar Tidal Stretching", "Energy Dissipation from Shocks"]
329
-
330
- ---
381
+ all_new_facts = get_facts(new_content_text,
382
+ model=model,
383
+ provider=provider,
384
+ npc = npc,
385
+ context=context)
386
+ print(all_new_facts)
387
+ else:
388
+ print("No new content or facts provided")
389
+ return existing_kg, {}
390
+
391
+ for fact in all_new_facts:
392
+ fact['generation'] = next_gen
393
+
394
+ final_facts = existing_facts + all_new_facts
395
+
396
+ if get_concepts:
397
+ print('generating groups...')
398
+
399
+ candidate_concepts = generate_groups(all_new_facts,
400
+ model = model,
401
+ provider = provider,
402
+ npc=npc,
403
+ context=context)
404
+ print(candidate_concepts)
405
+ print('checking group uniqueness')
406
+ for cand_concept in candidate_concepts:
407
+ cand_name = cand_concept['name']
408
+ if cand_name in existing_concept_names:
409
+ continue
410
+ cand_concept['generation'] = next_gen
411
+ newly_added_concepts.append(cand_concept)
412
+ if link_concepts_concepts:
413
+ print('linking concepts and concepts...')
414
+
415
+ related_concepts = get_related_concepts_multi(cand_name,
416
+ "concept",
417
+ all_concept_names,
418
+ model,
419
+ provider,
420
+ npc,
421
+ context)
422
+ for related_name in related_concepts:
423
+ if related_name != cand_name:
424
+ concept_links.append((cand_name, related_name))
425
+ all_concept_names.append(cand_name)
426
+
427
+ final_concepts = existing_concepts + newly_added_concepts
428
+
429
+ if link_concepts_facts:
430
+ print('linking facts and concepts...')
431
+ for fact in all_new_facts:
432
+ fact_to_concept_links[fact['statement']] = get_related_concepts_multi(fact['statement'],
433
+ "fact",
434
+ all_concept_names,
435
+ model = model,
436
+ provider=provider,
437
+ npc = npc,
438
+ context= context)
439
+ else:
440
+ final_concepts = existing_concepts
441
+ if link_facts_facts:
442
+ print('linking facts and facts...')
443
+
444
+ for new_fact in all_new_facts:
445
+ related_fact_stmts = get_related_facts_llm(new_fact['statement'],
446
+ existing_fact_statements,
447
+ model = model,
448
+ provider = provider,
449
+ npc = npc,
450
+ context=context)
451
+ for related_stmt in related_fact_stmts:
452
+ fact_to_fact_links.append((new_fact['statement'], related_stmt))
453
+
454
+ final_kg = {
455
+ "generation": next_gen,
456
+ "facts": final_facts,
457
+ "concepts": final_concepts,
458
+ "concept_links": concept_links,
459
+ "fact_to_concept_links": dict(fact_to_concept_links),
460
+ "fact_to_fact_links": fact_to_fact_links
331
461
 
332
- Now, analyze the following {item_type}:
333
- {item_type.capitalize()}: {json.dumps(item_subset)}
462
+ }
463
+ return final_kg, {}
464
+
465
+
466
+
467
+
468
+ def kg_sleep_process(existing_kg,
469
+ model=None,
470
+ provider=None,
471
+ npc=None,
472
+ context='',
473
+ operations_config=None):
474
+ current_gen = existing_kg.get('generation', 0)
475
+ next_gen = current_gen + 1
476
+ print(f"\n--- SLEEPING (Evolving Knowledge): Gen {current_gen} -> Gen {next_gen} ---")
477
+
478
+
479
+ facts_map = {f['statement']: f for f in existing_kg.get('facts', [])}
480
+ concepts_map = {c['name']: c for c in existing_kg.get('concepts', [])}
481
+ fact_links = defaultdict(list, {k: list(v) for k, v in existing_kg.get('fact_to_concept_links', {}).items()})
482
+ concept_links = set(tuple(sorted(link)) for link in existing_kg.get('concept_links', []))
483
+ fact_to_fact_links = set(tuple(sorted(link)) for link in existing_kg.get('fact_to_fact_links', []))
484
+
485
+
486
+ print(" - Phase 1: Checking for unstructured facts...")
487
+ facts_with_concepts = set(fact_links.keys())
488
+ orphaned_fact_statements = list(set(facts_map.keys()) - facts_with_concepts)
489
+
490
+ if len(orphaned_fact_statements) > 20:
491
+ print(f" - Found {len(orphaned_fact_statements)} orphaned facts. Applying full KG structuring process...")
492
+ orphaned_facts_as_dicts = [facts_map[s] for s in orphaned_fact_statements]
334
493
 
335
- Return a JSON object:
336
- {{
337
- "groups": ["list of specific, precise, and relevant group names"]
338
- }}
339
- """
340
- # --- END PROMPT MODIFICATION ---
341
494
 
342
- response = get_llm_response(
343
- prompt,
495
+ new_structure = kg_initial(
496
+ facts=orphaned_facts_as_dicts,
344
497
  model=model,
345
498
  provider=provider,
346
- format="json",
347
499
  npc=npc,
500
+ context=context,
501
+ generation=next_gen
348
502
  )
349
-
350
- candidates = response["response"].get("groups", [])
351
- all_candidates.extend(candidates)
352
- print(all_candidates)
353
- return list(set(all_candidates))
354
-
355
-
356
- def remove_idempotent_groups(
357
- group_candidates: List[str],
358
- model: str,
359
- provider: str,
360
- npc: NPC = None
361
- ) -> List[str]:
362
- """Remove groups that are essentially identical in meaning, favoring specificity and direct naming, and avoiding generic structures."""
363
-
364
- prompt = f"""Compare these group names. Identify and list ONLY the groups that are conceptually distinct and specific.
365
-
366
- GUIDELINES FOR SELECTING DISTINCT GROUPS:
367
- 1. **Prioritize Specificity and Direct Naming:** Favor precise nouns or noun phrases that directly name the subject.
368
- 2. **Prefer Concrete Entities/Actions:** If a name refers to a specific entity or action (e.g., "Earth", "Sun", "Water", "France", "User Authentication Module", "Lamb Shank Braising", "World War I"), keep it if it's distinct.
369
- 3. **Rephrase Gerunds:** If a name uses a gerund (e.g., "Understanding TDEs"), rephrase it to a noun or noun phrase (e.g., "Tidal Disruption Events").
370
- 4. **AVOID OVERLY GENERIC TERMS:** Do NOT use very broad or abstract terms that don't add specific meaning. Examples to avoid: "Concepts", "Processes", "Dynamics", "Mechanics", "Analysis", "Understanding", "Interactions", "Relationships", "Properties", "Structures", "Systems", "Frameworks", "Predictions", "Outcomes", "Effects", "Considerations", "Methods", "Techniques", "Data", "Theoretical", "Physical", "Spatial", "Temporal". If a group name seems overly generic or abstract, it should likely be removed or refined.
371
- 5. **Similarity Check:** If two groups are very similar, keep the one that is more descriptive or specific to the domain.
372
-
373
- EXAMPLE 1:
374
- Groups: ["Accretion Disk Formation", "Accretion Disk Dynamics", "Formation of Accretion Disks"]
375
- Distinct Groups: ["Accretion Disk Formation", "Accretion Disk Dynamics"]
376
-
377
- EXAMPLE 2:
378
- Groups: ["Causes of Events", "Event Mechanisms", "Event Drivers"]
379
- Distinct Groups: ["Event Causation", "Event Mechanisms"]
380
-
381
- EXAMPLE 3:
382
- Groups: ["Astrophysics Basics", "Fundamental Physics", "General Science Concepts"]
383
- Distinct Groups: ["Fundamental Physics"]
384
-
385
- EXAMPLE 4:
386
- Groups: ["Earth", "The Planet Earth", "Sun", "Our Star"]
387
- Distinct Groups: ["Earth", "Sun"]
388
-
389
- EXAMPLE 5:
390
- Groups: ["User Authentication Module", "Authentication System", "Login Process"]
391
- Distinct Groups: ["User Authentication Module", "Login Process"]
392
-
393
- ---
394
-
395
- Now, analyze the following groups:
396
- Groups: {json.dumps(group_candidates)}
397
-
398
- Return JSON:
399
- {{
400
- "distinct_groups": ["list of specific, precise, and distinct group names to keep"]
401
- }}
402
- """
403
-
404
- response = get_llm_response(
405
- prompt,
406
- model=model,
407
- provider=provider,
408
- format="json",
409
- npc=npc
410
- )
411
-
412
- print(response['response']['distinct_groups'])
413
- return response["response"]["distinct_groups"]
414
-
415
-
416
- def build_hierarchy_dag(
417
- groups: List[str],
418
- model: str,
419
- provider: str,
420
- npc: NPC = None,
421
- max_levels: int = 3,
422
- target_top_count: int = 8,
423
- n_passes: int = 3, # This is the number of times we query the LLM per level
424
- subset_size: int = 10 # This is how many groups we pass to the LLM at once
425
- ) -> Dict:
426
- """Build DAG hierarchy iteratively from bottom up, abstracting groups."""
427
-
428
- # Initialize DAG structure for the initial set of groups
429
- dag = {group: {"parents": set(), "children": set(), "level": 0} for group in groups}
430
- all_groups = set(groups)
431
- current_level_items = groups # Start with the provided groups (the bottom layer)
432
- level_num = 0
433
-
434
- # Keep abstracting until we have a manageable number of top-level groups
435
- # or reach max_levels. The condition checks the number of groups *currently* without parents.
436
- while len([g for g in all_groups if not dag.get(g, {}).get("parents")]) > target_top_count and level_num < max_levels:
437
- level_num += 1
438
- print(f"Too many top groups ({len([g for g in all_groups if not dag.get(g, {}).get('parents')])}), abstracting level {level_num}")
439
-
440
- # --- CRITICAL FIX: Re-introduce the multi-pass sampling for parent suggestions ---
441
- potential_parents = []
442
- # Multiple passes with resampling to explore different abstraction possibilities
443
- for pass_num in range(n_passes): # Iterate n_passes times
444
- # Sample a subset of groups from the current level for the LLM prompt
445
- if len(current_level_items) > subset_size:
446
- # Use a seed based on level and pass to ensure different samples each time
447
- random.seed(level_num * 10 + pass_num)
448
- group_subset = random.sample(current_level_items, min(subset_size, len(current_level_items)))
449
- else:
450
- group_subset = current_level_items # Use all if subset_size is larger than available groups
451
-
452
- # Prompt the LLM to suggest parent categories for this subset of groups
453
- prompt = f"""
454
- What are broader parent categories that could contain these groups?
455
- Suggest 1-3 broader categories. Make them distinct and meaningful.
456
-
457
- Groups: {json.dumps(group_subset)}
458
-
459
- Return JSON:
460
- {{
461
- "parents": ["list of parent categories"]
462
- }}
463
- """
464
-
465
- response = get_llm_response(
466
- prompt, model=model, provider=provider, format="json", npc=npc
467
- )
468
503
 
469
- parents = response["response"].get("parents", [])
470
- potential_parents.extend(parents)
471
504
 
472
- distinct_parents = remove_idempotent_groups(potential_parents, model, provider, npc)
505
+ print(" - Merging new structure into main KG...")
506
+ for concept in new_structure.get("concepts", []):
507
+ if concept['name'] not in concepts_map:
508
+ concepts_map[concept['name']] = concept
473
509
 
474
- if not distinct_parents: # Stop if no new abstract groups were generated
475
- print("No distinct parent groups generated, stopping abstraction.")
476
- break
477
-
478
- # Add these distinct parent groups to the DAG and update relationships
479
- new_groups_for_next_level = set()
480
- for parent in distinct_parents:
481
- if parent not in dag: # If this is a completely new abstract group
482
- dag[parent] = {
483
- "parents": set(), # These new parents have no parents yet in this round
484
- "children": set(current_level_items), # The groups from the previous level are their children
485
- "level": level_num
486
- }
487
- all_groups.add(parent)
488
- new_groups_for_next_level.add(parent)
489
- else: # If the parent group already exists (e.g., from a different branch)
490
- # Update its children to include the current level's groups
491
- dag[parent]["children"].update(current_level_items)
492
-
493
- # Update parent relationship for the children from the previous level
494
- for child in current_level_items:
495
- dag[child]["parents"].add(parent)
496
-
497
- # The newly found parents become the input for the next abstraction level
498
- current_level_items = list(new_groups_for_next_level)
510
+ for fact_stmt, new_links in new_structure.get("fact_to_concept_links", {}).items():
511
+ existing_links = set(fact_links.get(fact_stmt, []))
512
+ existing_links.update(new_links)
513
+ fact_links[fact_stmt] = list(existing_links)
499
514
 
500
- # After the loop, identify the final top groups (those with no parents in the constructed DAG)
501
- top_groups_final = [g for g in all_groups if not dag.get(g, {}).get("parents")]
515
+ for f1, f2 in new_structure.get("fact_to_fact_links", []):
516
+ fact_to_fact_links.add(tuple(sorted((f1, f2))))
517
+ else:
518
+ print(" - Knowledge graph is sufficiently structured. Proceeding to refinement.")
502
519
 
503
- return {
504
- "dag": dag,
505
- "top_groups": top_groups_final,
506
- "leaf_groups": groups, # The initial set of groups passed in, which are the base for the hierarchy
507
- "max_level": level_num
508
- }
509
-
510
520
 
521
+ if operations_config is None:
522
+ possible_ops = ['prune', 'deepen', 'abstract_link']
523
+ ops_to_run = random.sample(possible_ops, k=random.randint(1, 2))
524
+ else:
525
+ ops_to_run = operations_config
511
526
 
527
+ print(f" - Phase 2: Executing refinement operations: {ops_to_run}")
512
528
 
513
- def build_full_hierarchy(
514
- leaf_groups: List[str],
515
- model: str,
516
- provider: str,
517
- npc: NPC = None,
518
- min_top: int = 4,
519
- max_top: int = 10,
520
- max_levels: int = 5
521
- ) -> Dict:
522
- """Build full hierarchy from initial leaf groups up to top groups."""
523
- # Step 1: Get initial distinct groups from facts (already done by caller if passing leaf_groups)
524
- # If leaf_groups is empty, we might want to generate them from facts first, but for now, assume they are provided.
525
-
526
- # Step 2: Build the DAG structure, abstracting upwards until we have <= max_top groups
527
- hierarchy = build_hierarchy_dag(
528
- leaf_groups, model, provider, npc, max_levels, max_top, n_passes=3, subset_size=10
529
- )
530
-
531
- return hierarchy
529
+ for op in ops_to_run:
530
+
531
+ if op == 'prune' and (len(facts_map) > 10 or len(concepts_map) > 5):
532
+ print(" - Running 'prune' operation using consolidate_facts_llm...")
533
+ fact_to_check = random.choice(list(facts_map.values()))
534
+ other_facts = [f for f in facts_map.values() if f['statement'] != fact_to_check['statement']]
535
+ consolidation_result = consolidate_facts_llm(fact_to_check, other_facts, model, provider, npc, context)
536
+ if consolidation_result.get('decision') == 'redundant':
537
+ print(f" - Pruning redundant fact: '{fact_to_check['statement'][:80]}...'")
538
+ del facts_map[fact_to_check['statement']]
532
539
 
533
- def assign_fact_to_dag(fact: str, dag_data: Dict, model: str, provider: str, npc: NPC = None) -> Dict:
534
- """Assign fact to DAG starting from top-level abstract concepts, traversing down."""
540
+
541
+ elif op == 'deepen' and facts_map:
542
+ print(" - Running 'deepen' operation using zoom_in...")
543
+ fact_to_deepen = random.choice(list(facts_map.values()))
544
+ implied_facts = zoom_in([fact_to_deepen], model, provider, npc, context)
545
+ new_fact_count = 0
546
+ for fact in implied_facts:
547
+ if fact['statement'] not in facts_map:
548
+ fact.update({'generation': next_gen, 'origin': 'deepen'})
549
+ facts_map[fact['statement']] = fact
550
+ new_fact_count += 1
551
+ if new_fact_count > 0: print(f" - Inferred {new_fact_count} new fact(s).")
552
+
553
+ else:
554
+ print(f" - SKIPPED: Operation '{op}' did not run (conditions not met).")
555
+
535
556
 
536
- top_groups = dag_data.get("top_groups", [])
537
- if not top_groups: # Handle case where no hierarchy was built
538
- print(f"Warning: No top groups found for fact: {fact}. Assigning to all leaf groups.")
539
- # Fallback: assign to leaf groups if no hierarchy exists
540
- leaf_groups = dag_data.get("leaf_groups", [])
541
- if not leaf_groups: return {'top_level_groups': [], 'all_groups': [], 'hierarchy_paths': []}
542
- assignments = get_fact_assignments(fact, leaf_groups, model, provider, npc)
543
- return {'top_level_groups': assignments, 'all_groups': assignments, 'hierarchy_paths': [f"{g}" for g in assignments]}
544
-
545
- print(f"assign_fact_to_dag: Assigning fact: {fact[:50]}...")
557
+ new_kg = {
558
+ "generation": next_gen,
559
+ "facts": list(facts_map.values()),
560
+ "concepts": list(concepts_map.values()),
561
+ "concept_links": [list(link) for link in concept_links],
562
+ "fact_to_concept_links": dict(fact_links),
563
+ "fact_to_fact_links": [list(link) for link in fact_to_fact_links]
564
+ }
565
+ return new_kg, {}
566
+ def kg_dream_process(existing_kg,
567
+ model = None,
568
+ provider = None,
569
+ npc=None,
570
+ context='',
571
+ num_seeds=3):
572
+ current_gen = existing_kg.get('generation', 0)
573
+ next_gen = current_gen + 1
574
+ print(f"\n--- DREAMING (Creative Synthesis): Gen {current_gen} -> Gen {next_gen} ---")
575
+ concepts = existing_kg.get('concepts', [])
576
+ if len(concepts) < num_seeds:
577
+ print(f" - Not enough concepts ({len(concepts)}) for dream. Skipping.")
578
+ return existing_kg, {}
579
+ seed_concepts = random.sample(concepts, k=num_seeds)
580
+ seed_names = [c['name'] for c in seed_concepts]
581
+ print(f" - Dream seeded with: {seed_names}")
582
+ prompt = f"""
583
+ Write a short, speculative paragraph (a 'dream') that plausibly connects the concepts of {json.dumps(seed_names)}.
584
+ Invent a brief narrative or a hypothetical situation.
585
+ Respond with JSON: {{"dream_text": "A short paragraph..."}}
586
+ """
587
+ response = get_llm_response(prompt,
588
+ model=model,
589
+ provider=provider, npc = npc,
590
+ format="json", context=context)
591
+ dream_text = response['response'].get('dream_text')
592
+ if not dream_text:
593
+ print(" - Failed to generate a dream narrative. Skipping.")
594
+ return existing_kg, {}
595
+ print(f" - Generated Dream: '{dream_text[:150]}...'")
546
596
 
547
- # Start assignment process from the top-level groups
548
- top_level_assignments = get_fact_assignments(fact, top_groups, model, provider, npc)
597
+ dream_kg, _ = kg_evolve_incremental(existing_kg, dream_text, model, provider, npc, context)
549
598
 
550
- # Initialize tracking for all relevant groups and paths
551
- all_assigned_groups = set(top_level_assignments)
552
- current_level_to_process = top_level_assignments # Groups at the current level we need to check children for
553
- hierarchy_paths = [] # Stores the path from top-level to the most specific assigned group
554
-
555
- # Store path segments as we go down
556
- path_segments = {group: [group] for group in top_level_assignments}
557
-
558
- # Traverse down the hierarchy level by level
559
- # We continue as long as there are groups at the current level that are assigned to the fact
560
- # and these groups have children defined in the DAG.
561
- processed_groups_in_level = set() # To avoid infinite loops if DAG has cycles (though should be acyclic)
599
+ original_fact_stmts = {f['statement'] for f in existing_kg['facts']}
600
+ for fact in dream_kg['facts']:
601
+ if fact['statement'] not in original_fact_stmts: fact['origin'] = 'dream'
602
+ original_concept_names = {c['name'] for c in existing_kg['concepts']}
603
+ for concept in dream_kg['concepts']:
604
+ if concept['name'] not in original_concept_names: concept['origin'] = 'dream'
605
+ print(" - Dream analysis complete. New knowledge integrated.")
606
+ return dream_kg, {}
562
607
 
563
- while current_level_to_process:
564
- next_level_to_process = set()
565
-
566
- for current_group in current_level_to_process:
567
- # Prevent reprocessing the same group in the same level traversal
568
- if current_group in processed_groups_in_level:
569
- continue
570
- processed_groups_in_level.add(current_group)
571
-
572
- # Get children of the current group
573
- children = dag_data["dag"].get(current_group, {}).get("children", set())
574
-
575
- if children:
576
- # Get assignments for children
577
- child_assignments = get_fact_assignments(fact, list(children), model, provider, npc)
578
-
579
- # If the fact belongs to any children, add them to the next level to process
580
- if child_assignments:
581
- next_level_to_process.update(child_assignments)
582
- all_assigned_groups.update(child_assignments)
583
-
584
- # Update path segments for newly assigned children
585
- for assigned_child in child_assignments:
586
- # Append the child to the path of its parent
587
- if current_group in path_segments:
588
- path_segments[assigned_child] = path_segments[current_group] + [assigned_child]
589
- else: # Should not happen if logic is correct, but as a safeguard
590
- path_segments[assigned_child] = [assigned_child]
591
-
592
- # Add completed paths to our final list
593
- for group, path in path_segments.items():
594
- if group in current_level_to_process and group not in processed_groups_in_level: # If it was processed and assigned
595
- if path not in hierarchy_paths:
596
- hierarchy_paths.append(' → '.join(path))
597
-
598
- current_level_to_process = next_level_to_process
599
- processed_groups_in_level = set() # Reset for the next level
600
608
 
601
- # Ensure all paths are captured even if a fact is only assigned to top-level groups
602
- for group in top_level_assignments:
603
- if group in path_segments and ' → '.join(path_segments[group]) not in hierarchy_paths:
604
- hierarchy_paths.append(' → '.join(path_segments[group]))
609
+ def save_kg_with_pandas(kg, path_prefix="kg_state"):
605
610
 
611
+ generation = kg.get("generation", 0)
606
612
 
607
- return {
608
- "top_level_groups": top_level_assignments,
609
- "all_groups": list(all_assigned_groups),
610
- "hierarchy_paths": hierarchy_paths
611
- }
612
-
613
- def process_text_with_hierarchy(
614
- text: str,
615
- model: str,
616
- provider: str,
617
- db_path: str,
618
- npc: NPC = None,
619
- existing_knowledge_graph: Optional[Dict] = None
620
- ) -> Dict:
621
- """Full processing pipeline with hierarchical grouping"""
622
- print('process_text_with_hierarchy: Starting processing')
623
- facts = extract_facts(text, model, provider, npc)
624
- print(f'process_text_with_hierarchy: Extracted Facts: {facts}')
613
+ nodes_data = []
614
+ for fact in kg.get('facts', []): nodes_data.append({'id': fact['statement'], 'type': 'fact', 'generation': fact.get('generation')})
615
+ for concept in kg.get('concepts', []): nodes_data.append({'id': concept['name'], 'type': 'concept', 'generation': concept.get('generation')})
616
+ pd.DataFrame(nodes_data).to_csv(f'{path_prefix}_gen{generation}_nodes.csv', index=False)
625
617
 
626
- conn = init_db(db_path, drop=False)
627
- if conn is None:
628
- return None
618
+ links_data = []
619
+ for fact_stmt, concepts in kg.get("fact_to_concept_links", {}).items():
620
+ for concept_name in concepts: links_data.append({'source': fact_stmt, 'target': concept_name, 'type': 'fact_to_concept'})
621
+ for c1, c2 in kg.get("concept_links", []):
622
+ links_data.append({'source': c1, 'target': c2, 'type': 'concept_to_concept'})
629
623
 
630
- leaf_groups = existing_knowledge_graph.get("leaf_groups", []) if existing_knowledge_graph else []
631
-
632
- # Build the hierarchy from the extracted facts (and any existing leaf groups)
633
- hierarchy_data = build_full_hierarchy(facts + leaf_groups, model, provider, npc) # Pass facts to generate initial groups
624
+ for f1, f2 in kg.get("fact_to_fact_links", []):
625
+ links_data.append({'source': f1, 'target': f2, 'type': 'fact_to_fact'})
626
+ pd.DataFrame(links_data).to_csv(f'{path_prefix}_gen{generation}_links.csv', index=False)
627
+ print(f"Saved KG Generation {generation} to CSV files.")
634
628
 
635
- assignments = {}
636
- for fact in facts:
637
- # Assign facts using the top-down traversal logic
638
- assignment = assign_fact_to_dag(fact, hierarchy_data, model, provider, npc)
639
-
640
- # Store fact and its assignments in Kuzu
641
- store_success = store_fact_and_group(conn, fact, assignment["all_groups"], "")
642
- if not store_success:
643
- print(f'process_text_with_hierarchy: Failed to store fact: {fact}')
644
-
645
- assignments[fact] = assignment
646
-
647
- conn.close()
648
-
649
- print('process_text_with_hierarchy: Finished Processing')
650
- return {
651
- 'facts': facts,
652
- 'leaf_groups': hierarchy_data.get("leaf_groups", []), # This should be the *final* leaf groups after abstraction
653
- 'hierarchy': hierarchy_data,
654
- 'assignments': assignments
655
- }
656
629
 
630
+ def save_changelog_to_json(changelog, from_gen, to_gen, path_prefix="changelog"):
631
+ if not changelog: return
632
+ with open(f"{path_prefix}_gen{from_gen}_to_{to_gen}.json", 'w', encoding='utf-8') as f:
633
+ json.dump(changelog, f, indent=4)
634
+ print(f"Saved changelog for Gen {from_gen}->{to_gen}.")
657
635
 
658
636
 
659
637
 
660
- ### STORAGE
661
638
 
662
639
  def store_fact_and_group(conn, fact: str,
663
640
  groups: List[str], path: str) -> bool:
@@ -667,15 +644,15 @@ def store_fact_and_group(conn, fact: str,
667
644
  return False
668
645
 
669
646
  print(f"store_fact_and_group: Storing fact: {fact}, with groups:"
670
- f" {groups}") # DEBUG
647
+ f" {groups}")
671
648
  try:
672
- # Insert the fact
673
- insert_success = insert_fact(conn, fact, path) # Capture return
649
+
650
+ insert_success = insert_fact(conn, fact, path)
674
651
  if not insert_success:
675
652
  print(f"store_fact_and_group: Failed to insert fact: {fact}")
676
653
  return False
677
654
 
678
- # Assign fact to groups
655
+
679
656
  for group in groups:
680
657
  assign_success = assign_fact_to_group_graph(conn, fact, group)
681
658
  if not assign_success:
@@ -688,28 +665,26 @@ def store_fact_and_group(conn, fact: str,
688
665
  print(f"store_fact_and_group: Error storing fact and group: {e}")
689
666
  traceback.print_exc()
690
667
  return False
691
-
692
668
  def insert_fact(conn, fact: str, path: str) -> bool:
693
669
  """Insert a fact into the database with robust error handling"""
694
670
  if conn is None:
695
671
  print("insert_fact: Cannot insert fact:"
696
672
  " database connection is None")
697
673
  return False
698
-
699
674
  try:
700
- # Properly escape quotes in strings
675
+
701
676
  escaped_fact = fact.replace('"', '\\"')
702
677
  escaped_path = os.path.expanduser(path).replace('"', '\\"')
703
678
 
704
- # Generate timestamp
679
+
705
680
  timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
706
681
 
707
- print(f"insert_fact: Attempting to insert fact: {fact}") #DEBUG
682
+ print(f"insert_fact: Attempting to insert fact: {fact}")
708
683
 
709
- # Begin transaction
684
+
710
685
  safe_kuzu_execute(conn, "BEGIN TRANSACTION")
711
686
 
712
- # Check if fact already exists
687
+
713
688
  check_query = f"""
714
689
  MATCH (f:Fact {{content: "{escaped_fact}"}})
715
690
  RETURN f
@@ -723,7 +698,7 @@ def insert_fact(conn, fact: str, path: str) -> bool:
723
698
  print(f"insert_fact: Error checking if fact exists: {error}")
724
699
  return False
725
700
 
726
- # Insert fact if it doesn't exist
701
+
727
702
  if not result.has_next():
728
703
  insert_query = f"""
729
704
  CREATE (f:Fact {{
@@ -741,7 +716,7 @@ def insert_fact(conn, fact: str, path: str) -> bool:
741
716
  print(f"insert_fact: Error inserting fact: {error}")
742
717
  return False
743
718
 
744
- # Commit transaction
719
+
745
720
  safe_kuzu_execute(conn, "COMMIT")
746
721
  print(f"insert_fact: Successfully inserted/found fact: {fact}")
747
722
  return True
@@ -760,14 +735,14 @@ def assign_fact_to_group_graph(conn, fact: str, group: str) -> bool:
760
735
  return False
761
736
 
762
737
  try:
763
- # Properly escape quotes in strings
738
+
764
739
  escaped_fact = fact.replace('"', '\\"')
765
740
  escaped_group = group.replace('"', '\\"')
766
741
 
767
742
  print(f"assign_fact_to_group_graph: Assigning fact: {fact} to group:"
768
- f" {group}") #DEBUG
743
+ f" {group}")
769
744
 
770
- # Check if both fact and group exist before creating relationship
745
+
771
746
  check_query = f"""
772
747
  MATCH (f:Fact {{content: "{escaped_fact}"}})
773
748
  RETURN f
@@ -794,7 +769,7 @@ def assign_fact_to_group_graph(conn, fact: str, group: str) -> bool:
794
769
  print(f"assign_fact_to_group_graph: Group not found: {group}")
795
770
  return False
796
771
 
797
- # Create relationship
772
+
798
773
  query = f"""
799
774
  MATCH (f:Fact), (g:Groups)
800
775
  WHERE f.content = "{escaped_fact}" AND g.name = "{escaped_group}"
@@ -818,119 +793,27 @@ def assign_fact_to_group_graph(conn, fact: str, group: str) -> bool:
818
793
  f" {str(e)}")
819
794
  traceback.print_exc()
820
795
  return False
821
-
822
- def get_fact_assignments(
823
- fact: str,
824
- groups: List[str],
825
- model: str,
826
- provider: str,
827
- npc: NPC = None
828
- ) -> List[str]:
829
- """Get direct group assignments for a fact"""
830
-
831
- prompt = f"""Which of these groups does this fact belong to?
832
- Select ALL that apply.
833
-
834
- Fact: {fact}
835
- Groups: {json.dumps(groups)}
836
-
837
- Return JSON:
838
- {{
839
- "selected_groups": ["list of relevant groups"]
840
- }}
841
- """
842
- response = get_llm_response(prompt,
843
- model=model,
844
- provider=provider,
845
- format="json",
846
- npc=npc)
847
- return response["response"]["selected_groups"]
848
- def get_ancestor_groups(group: str, dag: Dict) -> Set[str]:
849
- """Get all ancestor groups in the DAG for a given group."""
850
- ancestors = set()
851
- queue = [group]
852
-
853
- while queue:
854
- current = queue.pop(0)
855
- # Ensure current group exists in DAG and has parents
856
- if current in dag and dag[current].get("parents"):
857
- for parent in dag[current]["parents"]:
858
- if parent not in ancestors:
859
- ancestors.add(parent)
860
- queue.append(parent)
861
- return ancestors
862
-
863
-
864
- # --- Main Process Flow ---
865
- def process_text_with_hierarchy(
866
- text: str,
867
- model: str,
868
- provider: str,
869
- db_path: str,
870
- npc: NPC = None,
871
- existing_knowledge_graph: Optional[Dict] = None
872
- ) -> Dict:
873
- """Full processing pipeline with hierarchical grouping"""
874
- print("process_text_with_hierarchy: Starting processing")
875
- # Step 1: Extract facts from text
876
- facts = extract_facts(text, model, provider, npc)
877
- print(f"process_text_with_hierarchy: Extracted Facts: {facts}")
878
-
879
- # Build the DB connection
880
- conn = init_db(db_path, drop=False)
881
- if conn is None:
882
- return None
883
796
 
884
- # Use the existing leaf_groups for semantic evolution
885
- if existing_knowledge_graph:
886
- leaf_groups = existing_knowledge_graph.get("leaf_groups", [])
887
- else:
888
- leaf_groups = []
889
-
890
- # Build the hierarchy from the database
891
- hierarchy_data = build_full_hierarchy(leaf_groups, model, provider, npc)
892
-
893
- # Step 3: Assign facts to hierarchy
894
- assignments = {}
895
- for fact in facts:
896
- assignment = assign_fact_to_dag(fact, hierarchy_data, model, provider, npc)
897
- # Store fact and group in kuzu
898
- store_success = store_fact_and_group(conn, fact, assignment["all_groups"], "")
899
- if not store_success:
900
- print(f"process_text_with_hierarchy: Failed to store fact: {fact}")
901
- assignments[fact] = assignment
902
-
903
- conn.close()
904
-
905
- print("process_text_with_hierarchy: Finished Processing")
906
- return {
907
- "facts": facts,
908
- "leaf_groups": leaf_groups,
909
- "hierarchy": hierarchy_data,
910
- "assignments": assignments
911
- }
912
797
 
913
-
914
- #--- Kuzu Database integration ---
915
798
  def store_fact_and_group(conn, fact: str, groups: List[str], path: str) -> bool:
916
799
  """Insert a fact into the database along with its groups"""
917
800
  if not conn:
918
801
  print("store_fact_and_group: Database connection is None")
919
802
  return False
920
803
 
921
- print(f"store_fact_and_group: Storing fact: {fact}, with groups: {groups}") # DEBUG
804
+ print(f"store_fact_and_group: Storing fact: {fact}, with groups: {groups}")
922
805
  try:
923
- # Insert the fact
924
- insert_success = insert_fact(conn, fact, path) # Capture return value
806
+
807
+ insert_success = insert_fact(conn, fact, path)
925
808
  if not insert_success:
926
- print(f"store_fact_and_group: Failed to insert fact: {fact}") #DEBUG
809
+ print(f"store_fact_and_group: Failed to insert fact: {fact}")
927
810
  return False
928
811
 
929
- # Assign fact to groups
812
+
930
813
  for group in groups:
931
814
  assign_success = assign_fact_to_group_graph(conn, fact, group)
932
815
  if not assign_success:
933
- print(f"store_fact_and_group: Failed to assign fact {fact} to group {group}") #DEBUG
816
+ print(f"store_fact_and_group: Failed to assign fact {fact} to group {group}")
934
817
  return False
935
818
 
936
819
  return True
@@ -940,7 +823,7 @@ def store_fact_and_group(conn, fact: str, groups: List[str], path: str) -> bool:
940
823
  return False
941
824
 
942
825
 
943
- # ---Database and other helper methods---
826
+
944
827
  def safe_kuzu_execute(conn, query, error_message="Kuzu query failed"):
945
828
  """Execute a Kuzu query with proper error handling"""
946
829
  try:
@@ -951,462 +834,52 @@ def safe_kuzu_execute(conn, query, error_message="Kuzu query failed"):
951
834
  print(error)
952
835
  return None, error
953
836
 
837
+ def process_text_with_chroma(
838
+ kuzu_db_path: str,
839
+ chroma_db_path: str,
840
+ text: str,
841
+ path: str,
842
+ model: str ,
843
+ provider: str ,
844
+ embedding_model: str ,
845
+ embedding_provider: str ,
846
+ npc = None,
847
+ batch_size: int = 5,
848
+ ):
849
+ """Process text and store facts in both Kuzu and Chroma DB
850
+
851
+ Args:
852
+ kuzu_db_path: Path to Kuzu graph database
853
+ chroma_db_path: Path to Chroma vector database
854
+ text: Input text to process
855
+ path: Source path or identifier
856
+ model: LLM model to use
857
+ provider: LLM provider
858
+ embedding_model: Model to use for embeddings
859
+ npc: Optional NPC instance
860
+ batch_size: Batch size for processing
954
861
 
955
- def test_hierarchical_knowledge_graph():
956
- """Test the full hierarchical knowledge graph implementation"""
957
- text = """
958
- npcsh is a Python-based command-line tool for integrating LLMs into daily workflows.
959
- It features a smart interpreter that understands natural language commands.
960
- The tool remembers command history and can reference previous commands.
961
- It supports creating custom NPCs with specific personalities and directives.
962
- Advanced customization is possible through configuration files.
862
+ Returns:
863
+ List of extracted facts
963
864
  """
964
865
 
965
- # Initialize with model and provider
966
- model = "gpt-4o-mini"
967
- provider = "openai"
968
-
969
- # Create knowledge graph
970
- kg = create_knowledge_graph(text, model, provider)
971
-
972
- # Print results
973
- print("FACTS:")
974
- for i, fact in enumerate(kg["facts"]):
975
- print(f"{i+1}. {fact}")
976
-
977
- print("\nHIERARCHY LEVELS:")
978
- for level in range(kg["hierarchy"]["top_level"], -1, -1):
979
- groups = kg["hierarchy"][f"level_{level}"]["groups"]
980
- print(f"Level {level} ({len(groups)} groups):")
981
- for group in groups:
982
- print(f" - {group}")
866
+ kuzu_conn = init_db(kuzu_db_path, drop=False)
867
+ chroma_client, chroma_collection = setup_chroma_db(
868
+ "knowledge_graph",
869
+ "Facts extracted from various sources",
870
+ chroma_db_path
871
+ )
872
+
983
873
 
984
- print("\nASSIGNMENTS:")
985
- for fact, assignment in kg["assignments"].items():
986
- print(f"\nFact: {fact}")
987
- print("Assignments by level:")
988
- for level, groups in assignment["all_assignments"].items():
989
- print(f" Level {level}: {groups}")
874
+ facts = extract_facts(text, model=model, provider=provider, npc=npc)
990
875
 
991
- def find_similar_groups(
992
- conn,
993
- fact: str, # Ensure fact is passed as a string
994
- model: str = "llama3.2",
995
- provider: str = "ollama",
996
- npc: NPC = None,
997
- ) -> List[str]:
998
- """Find existing groups that might contain this fact"""
999
- response = conn.execute(f"MATCH (g:Groups) RETURN g.name;") # Execute query
1000
- print(response)
1001
- print(type(response))
1002
- print(dir(response))
1003
- groups = response.fetch_as_df()
1004
- print(f"Groups: {groups}")
1005
- if not groups:
1006
- return []
876
+
877
+ for i in range(0, len(facts), batch_size):
878
+ batch = facts[i : i + batch_size]
879
+ print(f"\nProcessing batch {i//batch_size + 1} ({len(batch)} facts)")
1007
880
 
1008
- prompt = """Given a fact and a list of groups, determine which groups this fact belongs to.
1009
- A fact should belong to a group if it is semantically related to the group's theme or purpose.
1010
- For example, if a fact is "The user loves programming" and there's a group called "Technical_Interests",
1011
- that would be a match.
1012
-
1013
- Return a JSON object with the following structure:
1014
- {
1015
- "group_list": "a list containing the names of matching groups"
1016
- }
1017
-
1018
- Return only the JSON object.
1019
- Do not include any additional markdown formatting.
1020
- """
1021
-
1022
- response = get_llm_response(
1023
- prompt + f"\n\nFact: {fact}\nGroups: {json.dumps(groups)}",
1024
- model=model,
1025
- provider=provider,
1026
- format="json",
1027
- npc=npc,
1028
- )
1029
- response = response["response"]
1030
- return response["group_list"]
1031
-
1032
-
1033
- def identify_groups(
1034
- facts: List[str],
1035
- model: str = "llama3.2",
1036
- provider: str = "ollama",
1037
- npc: NPC = None,
1038
- ) -> List[str]:
1039
- """Identify natural groups from a list of facts"""
1040
-
1041
-
1042
- prompt = """What are the main groups these facts could be organized into?
1043
- Express these groups in plain, natural language.
1044
-
1045
- For example, given:
1046
- - User enjoys programming in Python
1047
- - User works on machine learning projects
1048
- - User likes to play piano
1049
- - User practices meditation daily
1050
-
1051
- You might identify groups like:
1052
- - Programming
1053
- - Machine Learning
1054
- - Musical Interests
1055
- - Daily Practices
1056
-
1057
- Return a JSON object with the following structure:
1058
- `{
1059
- "groups": ["list of group names"]
1060
- }`
1061
-
1062
-
1063
- Return only the JSON object. Do not include any additional markdown formatting or
1064
- leading json characters.
1065
- """
1066
-
1067
- response = get_llm_response(
1068
- prompt + f"\n\nFacts: {json.dumps(facts)}",
1069
- model=model,
1070
- provider=provider,
1071
- format="json",
1072
- npc=npc,
1073
- )
1074
- return response["response"]["groups"]
1075
-
1076
-
1077
- def assign_groups_to_fact(
1078
- fact: str,
1079
- groups: List[str],
1080
- model: str = "llama3.2",
1081
- provider: str = "ollama",
1082
- npc: NPC = None,
1083
- ) -> Dict[str, List[str]]:
1084
- """Assign facts to the identified groups"""
1085
- prompt = f"""Given this fact, assign it to any relevant groups.
1086
-
1087
- A fact can belong to multiple groups if it fits.
1088
-
1089
- Here is the fact: {fact}
1090
-
1091
- Here are the groups: {groups}
1092
-
1093
- Return a JSON object with the following structure:
1094
- {{
1095
- "groups": ["list of group names"]
1096
- }}
1097
-
1098
- Do not include any additional markdown formatting or leading json characters.
1099
-
1100
-
1101
- """
1102
-
1103
- response = get_llm_response(
1104
- prompt,
1105
- model=model,
1106
- provider=provider,
1107
- format="json",
1108
- npc=npc,
1109
- )
1110
- return response["response"]
1111
-
1112
-
1113
- def save_facts_to_db(
1114
- conn, facts: List[str], path: str, batch_size: int
1115
- ):
1116
- """Save a list of facts to the database in batches"""
1117
- for i in range(0, len(facts), batch_size):
1118
- batch = facts[i : i + batch_size]
1119
- print(f"\nProcessing batch {i//batch_size + 1} ({len(batch)} facts)")
1120
-
1121
- # Process each fact in the batch
1122
- for fact in batch:
1123
- try:
1124
- print(f"Inserting fact: {fact}")
1125
- print(f"With path: {path}")
1126
- timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
1127
- print(f"With recorded_at: {timestamp}")
1128
-
1129
- insert_fact(conn, fact, path)
1130
- print("Success!")
1131
- except Exception as e:
1132
- print(f"Failed to insert fact: {fact}")
1133
- print(f"Error: {e}")
1134
- continue
1135
-
1136
- print(f"Completed batch {i//batch_size + 1}")
1137
-
1138
-
1139
- def process_text(
1140
- db_path: str,
1141
- text: str,
1142
- path: str,
1143
- model: str = "llama3.2",
1144
- provider: str = "ollama",
1145
- npc: NPC = None,
1146
- batch_size: int = 5,
1147
- conn=None,
1148
- ):
1149
- """Process text and add extracted facts to the database with robust error handling"""
1150
-
1151
- try:
1152
- # Initialize database
1153
- if conn is None:
1154
- conn = init_db(db_path, drop=False)
1155
-
1156
- return []
1157
-
1158
- # Extract facts
1159
- facts = extract_facts(text, model=model, provider=provider, npc=npc)
1160
- if not facts:
1161
- print("No facts extracted")
1162
- return []
1163
-
1164
- print(f"Extracted {len(facts)} facts")
1165
- for fact in facts:
1166
- print(f"- {fact}")
1167
-
1168
- # Process facts in batches
1169
- for i in range(0, len(facts), batch_size):
1170
- batch = facts[i : i + batch_size]
1171
- print(f"\nProcessing batch {i//batch_size + 1} ({len(batch)} facts)")
1172
-
1173
- for fact in batch:
1174
- try:
1175
- print(f"Inserting fact: {fact}")
1176
- success = insert_fact(conn, fact, path)
1177
- if success:
1178
- print("Success!")
1179
- else:
1180
- print("Failed to insert fact")
1181
- except Exception as e:
1182
- print(f"Error processing fact: {str(e)}")
1183
- traceback.print_exc()
1184
-
1185
- print(f"Completed batch {i//batch_size + 1}")
1186
-
1187
- return facts
1188
- except Exception as e:
1189
- print(f"Error processing text: {str(e)}")
1190
- traceback.print_exc()
1191
- return []
1192
-
1193
-
1194
- import networkx as nx
1195
- import matplotlib.pyplot as plt
1196
-
1197
-
1198
- def visualize_graph(conn):
1199
- """Visualize the knowledge graph using networkx"""
1200
- # Create a networkx graph
1201
- G = nx.DiGraph()
1202
-
1203
- # Get all facts and groups with their relationships
1204
- facts_result = conn.execute("MATCH (f:Fact) RETURN f.content;").get_as_df()
1205
- facts = [row["f.content"] for index, row in facts_result.iterrows()]
1206
-
1207
- groups_result = conn.execute("MATCH (g:Groups) RETURN g.name;").get_as_df()
1208
- groups = [row["g.name"] for index, row in groups_result.iterrows()]
1209
-
1210
- relationships_result = conn.execute(
1211
- """
1212
- MATCH (g:Groups)-[r:Contains]->(f:Fact)
1213
- RETURN g.name, f.content;
1214
- """
1215
- ).get_as_df()
1216
-
1217
- # Add nodes with different colors for facts and groups
1218
- for fact in facts:
1219
- G.add_node(fact, node_type="fact")
1220
- for group in groups:
1221
- G.add_node(group, node_type="group")
1222
-
1223
- # Add edges from relationships
1224
- for index, row in relationships_result.iterrows():
1225
- G.add_edge(row["g.name"], row["f.content"]) # group name -> fact content
1226
-
1227
- # Set up the visualization
1228
- plt.figure(figsize=(20, 12))
1229
- pos = nx.spring_layout(G, k=2, iterations=50)
1230
-
1231
- # Draw groups (larger nodes, distinct color)
1232
- group_nodes = [
1233
- n for n, attr in G.nodes(data=True) if attr.get("node_type") == "group"
1234
- ]
1235
- nx.draw_networkx_nodes(
1236
- G, pos, nodelist=group_nodes, node_color="lightgreen", node_size=3000, alpha=0.7
1237
- )
1238
-
1239
- # Draw facts (smaller nodes, different color)
1240
- fact_nodes = [
1241
- n for n, attr in G.nodes(data=True) if attr.get("node_type") == "fact"
1242
- ]
1243
- nx.draw_networkx_nodes(
1244
- G, pos, nodelist=fact_nodes, node_color="lightblue", node_size=2000, alpha=0.5
1245
- )
1246
-
1247
- # Draw edges with arrows
1248
- nx.draw_networkx_edges(G, pos, edge_color="gray", arrows=True, arrowsize=20)
1249
-
1250
- # Add labels with different sizes for groups and facts
1251
- group_labels = {node: node for node in group_nodes}
1252
- fact_labels = {
1253
- node: node[:50] + "..." if len(node) > 50 else node for node in fact_nodes
1254
- }
1255
-
1256
- nx.draw_networkx_labels(G, pos, group_labels, font_size=10, font_weight="bold")
1257
- nx.draw_networkx_labels(G, pos, fact_labels, font_size=8)
1258
-
1259
- plt.title("Knowledge Graph: Groups and Facts", pad=20, fontsize=16)
1260
- plt.axis("off")
1261
- plt.tight_layout()
1262
-
1263
- # Print statistics
1264
- print("\nKnowledge Graph Statistics:")
1265
- print(f"Number of facts: {len(facts)}")
1266
- print(f"Number of groups: {len(groups)}")
1267
- print(f"Number of relationships: {len(relationships_result)}")
1268
-
1269
- print("\nGroups:")
1270
- for g in groups:
1271
- related_facts = [
1272
- row["f.content"]
1273
- for index, row in relationships_result.iterrows()
1274
- if row["g.name"] == g
1275
- ]
1276
- print(f"\n{g}:")
1277
- for f in related_facts:
1278
- print(f" - {f}")
1279
-
1280
- plt.show()
1281
-
1282
-
1283
- def store_fact_with_embedding(
1284
- collection, fact: str, metadata: dict, embedding: List[float]
1285
- ) -> str:
1286
- """Store a fact with its pre-generated embedding in Chroma DB
1287
-
1288
- Args:
1289
- collection: Chroma collection
1290
- fact: The fact text
1291
- metadata: Dictionary with metadata (path, source, timestamp, etc.)
1292
- embedding: Pre-generated embedding vector from get_embeddings
1293
-
1294
- Returns:
1295
- ID of the stored fact
1296
- """
1297
- try:
1298
- # Generate a deterministic ID from the fact content
1299
- import hashlib
1300
-
1301
- fact_id = hashlib.md5(fact.encode()).hexdigest()
1302
-
1303
- # Store document with pre-generated embedding
1304
- collection.add(
1305
- documents=[fact],
1306
- embeddings=[embedding],
1307
- metadatas=[metadata],
1308
- ids=[fact_id],
1309
- )
1310
-
1311
- return fact_id
1312
- except Exception as e:
1313
- print(f"Error storing fact in Chroma: {e}")
1314
- return None
1315
-
1316
-
1317
- def find_similar_facts_chroma(
1318
- collection,
1319
- query: str,
1320
- query_embedding: List[float],
1321
- n_results: int = 5,
1322
- metadata_filter: Optional[Dict] = None,
1323
- ) -> List[Dict]:
1324
- """Find facts similar to the query using pre-generated embedding
1325
-
1326
- Args:
1327
- collection: Chroma collection
1328
- query: Query text (for reference only)
1329
- query_embedding: Pre-generated embedding from get_embeddings
1330
- n_results: Number of results to return
1331
- metadata_filter: Optional filter for metadata fields
1332
-
1333
- Returns:
1334
- List of dictionaries with results
1335
- """
1336
- try:
1337
- # Perform query with optional metadata filtering
1338
- results = collection.query(
1339
- query_embeddings=[query_embedding],
1340
- n_results=n_results,
1341
- where=metadata_filter,
1342
- )
1343
-
1344
- # Format results
1345
- formatted_results = []
1346
- for i, doc in enumerate(results["documents"][0]):
1347
- formatted_results.append(
1348
- {
1349
- "fact": doc,
1350
- "metadata": results["metadatas"][0][i],
1351
- "id": results["ids"][0][i],
1352
- "distance": (
1353
- results["distances"][0][i] if "distances" in results else None
1354
- ),
1355
- }
1356
- )
1357
-
1358
- return formatted_results
1359
- except Exception as e:
1360
- print(f"Error searching in Chroma: {e}")
1361
- return []
1362
-
1363
-
1364
- def process_text_with_chroma(
1365
- kuzu_db_path: str,
1366
- chroma_db_path: str,
1367
- text: str,
1368
- path: str,
1369
- model: str ,
1370
- provider: str ,
1371
- embedding_model: str ,
1372
- embedding_provider: str ,
1373
- npc: NPC = None,
1374
- batch_size: int = 5,
1375
- ):
1376
- """Process text and store facts in both Kuzu and Chroma DB
1377
-
1378
- Args:
1379
- kuzu_db_path: Path to Kuzu graph database
1380
- chroma_db_path: Path to Chroma vector database
1381
- text: Input text to process
1382
- path: Source path or identifier
1383
- model: LLM model to use
1384
- provider: LLM provider
1385
- embedding_model: Model to use for embeddings
1386
- npc: Optional NPC instance
1387
- batch_size: Batch size for processing
1388
-
1389
- Returns:
1390
- List of extracted facts
1391
- """
1392
- # Initialize databases
1393
- kuzu_conn = init_db(kuzu_db_path, drop=False)
1394
- chroma_client, chroma_collection = setup_chroma_db(
1395
- "knowledge_graph",
1396
- "Facts extracted from various sources",
1397
- chroma_db_path
1398
- )
1399
-
1400
- # Extract facts
1401
- facts = extract_facts(text, model=model, provider=provider, npc=npc)
1402
-
1403
- # Process extracted facts
1404
- for i in range(0, len(facts), batch_size):
1405
- batch = facts[i : i + batch_size]
1406
- print(f"\nProcessing batch {i//batch_size + 1} ({len(batch)} facts)")
1407
-
1408
- # Generate embeddings for the batch using npcpy.llm_funcs.get_embeddings
1409
- from npcpy.llm_funcs import get_embeddings
881
+
882
+ from npcpy.llm_funcs import get_embeddings
1410
883
 
1411
884
  batch_embeddings = get_embeddings(
1412
885
  batch,
@@ -1416,7 +889,7 @@ def process_text_with_chroma(
1416
889
  print(f"Processing fact: {fact}")
1417
890
  embedding = batch_embeddings[j]
1418
891
 
1419
- # Check for similar facts in Chroma before inserting
892
+
1420
893
  similar_facts = find_similar_facts_chroma(
1421
894
  chroma_collection, fact, query_embedding=embedding, n_results=3
1422
895
  )
@@ -1425,9 +898,9 @@ def process_text_with_chroma(
1425
898
  print(f"Similar facts found:")
1426
899
  for result in similar_facts:
1427
900
  print(f" - {result['fact']} (distance: {result['distance']})")
1428
- # Note: Could implement a similarity threshold here to skip highly similar facts
901
+
1429
902
 
1430
- # Prepare metadata
903
+
1431
904
  metadata = {
1432
905
  "path": path,
1433
906
  "timestamp": datetime.now().isoformat(),
@@ -1435,10 +908,10 @@ def process_text_with_chroma(
1435
908
  "source_provider": provider,
1436
909
  }
1437
910
 
1438
- # Insert into Kuzu graph DB
911
+
1439
912
  kuzu_success = insert_fact(kuzu_conn, fact, path)
1440
913
 
1441
- # Insert into Chroma vector DB if Kuzu insert was successful
914
+
1442
915
  if kuzu_success:
1443
916
  chroma_id = store_fact_with_embedding(
1444
917
  chroma_collection, fact, metadata, embedding
@@ -1450,7 +923,7 @@ def process_text_with_chroma(
1450
923
  else:
1451
924
  print(f"Failed to save fact to Kuzu graph")
1452
925
 
1453
- # Close Kuzu connection
926
+
1454
927
  kuzu_conn.close()
1455
928
 
1456
929
  return facts
@@ -1479,12 +952,12 @@ def hybrid_search_with_chroma(
1479
952
  Returns:
1480
953
  List of dictionaries with combined results
1481
954
  """
1482
- # Get embedding for query using npcpy.llm_funcs.get_embeddings
955
+
1483
956
  from npcpy.llm_funcs import get_embeddings
1484
957
 
1485
958
  query_embedding = get_embeddings([query])[0]
1486
959
 
1487
- # Step 1: Find similar facts using Chroma vector search
960
+
1488
961
  vector_results = find_similar_facts_chroma(
1489
962
  chroma_collection,
1490
963
  query,
@@ -1493,13 +966,13 @@ def hybrid_search_with_chroma(
1493
966
  metadata_filter=metadata_filter,
1494
967
  )
1495
968
 
1496
- # Extract just the fact texts from vector results
969
+
1497
970
  vector_facts = [result["fact"] for result in vector_results]
1498
971
 
1499
- # Step 2: Expand context using graph relationships
972
+
1500
973
  expanded_results = []
1501
974
 
1502
- # Add vector search results
975
+
1503
976
  for result in vector_results:
1504
977
  expanded_results.append(
1505
978
  {
@@ -1511,13 +984,13 @@ def hybrid_search_with_chroma(
1511
984
  }
1512
985
  )
1513
986
 
1514
- # For each vector-matched fact, find related facts in the graph
987
+
1515
988
  for fact in vector_facts:
1516
989
  try:
1517
- # Safely escape fact text for Kuzu query
990
+
1518
991
  escaped_fact = fact.replace('"', '\\"')
1519
992
 
1520
- # Find groups containing this fact
993
+
1521
994
  group_result = kuzu_conn.execute(
1522
995
  f"""
1523
996
  MATCH (g:Groups)-[:Contains]->(f:Fact)
@@ -1526,18 +999,18 @@ def hybrid_search_with_chroma(
1526
999
  """
1527
1000
  ).get_as_df()
1528
1001
 
1529
- # Extract group names
1002
+
1530
1003
  fact_groups = [row["g.name"] for _, row in group_result.iterrows()]
1531
1004
 
1532
- # Apply group filter if provided
1005
+
1533
1006
  if group_filter:
1534
1007
  fact_groups = [g for g in fact_groups if g in group_filter]
1535
1008
 
1536
- # For each group, find other related facts
1009
+
1537
1010
  for group in fact_groups:
1538
1011
  escaped_group = group.replace('"', '\\"')
1539
1012
 
1540
- # Find facts in the same group
1013
+
1541
1014
  related_facts_result = kuzu_conn.execute(
1542
1015
  f"""
1543
1016
  MATCH (g:Groups)-[:Contains]->(f:Fact)
@@ -1547,7 +1020,7 @@ def hybrid_search_with_chroma(
1547
1020
  """
1548
1021
  ).get_as_df()
1549
1022
 
1550
- # Add these related facts to results
1023
+
1551
1024
  for _, row in related_facts_result.iterrows():
1552
1025
  related_fact = {
1553
1026
  "fact": row["f.content"],
@@ -1557,7 +1030,7 @@ def hybrid_search_with_chroma(
1557
1030
  "recorded_at": row["f.recorded_at"],
1558
1031
  }
1559
1032
 
1560
- # Avoid duplicates
1033
+
1561
1034
  if not any(
1562
1035
  r.get("fact") == related_fact["fact"] for r in expanded_results
1563
1036
  ):
@@ -1566,806 +1039,380 @@ def hybrid_search_with_chroma(
1566
1039
  except Exception as e:
1567
1040
  print(f"Error expanding results via graph: {e}")
1568
1041
 
1569
- # Return results, limiting to top_k if needed
1042
+
1570
1043
  return expanded_results[:top_k]
1571
1044
 
1572
1045
 
1573
- def get_facts_for_rag(
1574
- kuzu_db_path: str,
1575
- chroma_db_path: str,
1046
+ def find_similar_facts_chroma(
1047
+ collection,
1576
1048
  query: str,
1577
- group_filters: Optional[List[str]] = None,
1578
- top_k: int = 10,
1579
- ) -> str:
1580
- """Get facts for RAG by combining vector and graph search
1049
+ query_embedding: List[float],
1050
+ n_results: int = 5,
1051
+ metadata_filter: Optional[Dict] = None,
1052
+ ) -> List[Dict]:
1053
+ """Find facts similar to the query using pre-generated embedding
1581
1054
 
1582
1055
  Args:
1583
- kuzu_db_path: Path to Kuzu graph database
1584
- chroma_db_path: Path to Chroma vector database
1585
- query: Search query
1586
- group_filters: Optional list of groups to filter by
1587
- top_k: Number of results to return
1588
- embedding_model: Model to use for embeddings
1589
- provider: Provider for embeddings
1056
+ collection: Chroma collection
1057
+ query: Query text (for reference only)
1058
+ query_embedding: Pre-generated embedding from get_embeddings
1059
+ n_results: Number of results to return
1060
+ metadata_filter: Optional filter for metadata fields
1590
1061
 
1591
1062
  Returns:
1592
- Formatted context string with retrieved facts
1063
+ List of dictionaries with results
1593
1064
  """
1594
- # Initialize connections
1595
- kuzu_conn = init_db(kuzu_db_path)
1596
- chroma_client, chroma_collection = setup_chroma_db(
1597
- "knowledge_graph",
1598
- "Facts extracted from various sources",
1599
- chroma_db_path
1600
- )
1601
-
1602
- # Perform hybrid search
1603
- results = hybrid_search_with_chroma(
1604
- kuzu_conn=kuzu_conn,
1605
- chroma_collection=chroma_collection,
1606
- query=query,
1607
- group_filter=group_filters,
1608
- top_k=top_k,
1609
- )
1610
-
1611
- # Format results as context for RAG
1612
- context = "Related facts:\n\n"
1613
-
1614
- # First include direct vector matches
1615
- context += "Most relevant facts:\n"
1616
- vector_matches = [r for r in results if r["source"] == "vector_search"]
1617
- for i, item in enumerate(vector_matches):
1618
- context += f"{i+1}. {item['fact']}\n"
1065
+ try:
1066
+
1067
+ results = collection.query(
1068
+ query_embeddings=[query_embedding],
1069
+ n_results=n_results,
1070
+ where=metadata_filter,
1071
+ )
1619
1072
 
1620
- # Then include graph-related facts
1621
- context += "\nRelated concepts:\n"
1622
- graph_matches = [r for r in results if r["source"] != "vector_search"]
1623
- for i, item in enumerate(graph_matches):
1624
- group = item["source"].replace("graph_relation_via_", "")
1625
- context += f"{i+1}. {item['fact']} (related via {group})\n"
1073
+
1074
+ formatted_results = []
1075
+ for i, doc in enumerate(results["documents"][0]):
1076
+ formatted_results.append(
1077
+ {
1078
+ "fact": doc,
1079
+ "metadata": results["metadatas"][0][i],
1080
+ "id": results["ids"][0][i],
1081
+ "distance": (
1082
+ results["distances"][0][i] if "distances" in results else None
1083
+ ),
1084
+ }
1085
+ )
1626
1086
 
1627
- # Close connections
1628
- kuzu_conn.close()
1087
+ return formatted_results
1088
+ except Exception as e:
1089
+ print(f"Error searching in Chroma: {e}")
1090
+ return []
1629
1091
 
1630
- return context
1631
1092
 
1632
1093
 
1633
- def answer_with_rag(
1634
- query: str,
1635
- kuzu_db_path: str = os.path.expanduser("~/npcsh_graph.db"),
1636
- chroma_db_path: str = os.path.expanduser("~/npcsh_chroma.db"),
1637
- model: str = "ollama",
1638
- provider: str = "llama3.2",
1639
- embedding_model: str = "text-embedding-3-small",
1094
+ def store_fact_with_embedding(
1095
+ collection, fact: str, metadata: dict, embedding: List[float]
1640
1096
  ) -> str:
1641
- """Answer a query using RAG with facts from the knowledge base
1097
+ """Store a fact with its pre-generated embedding in Chroma DB
1642
1098
 
1643
1099
  Args:
1644
- query: User query
1645
- kuzu_db_path: Path to Kuzu graph database
1646
- chroma_db_path: Path to Chroma vector database
1647
- model: LLM model to use
1648
- provider: LLM provider
1649
- embedding_model: Model to use for embeddings
1100
+ collection: Chroma collection
1101
+ fact: The fact text
1102
+ metadata: Dictionary with metadata (path, source, timestamp, etc.)
1103
+ embedding: Pre-generated embedding vector from get_embeddings
1650
1104
 
1651
1105
  Returns:
1652
- Answer from the model
1653
- """
1654
- # Get relevant facts using hybrid search
1655
- context = get_facts_for_rag(
1656
- kuzu_db_path,
1657
- chroma_db_path,
1658
- query,
1659
- )
1660
-
1661
- # Craft prompt with retrieved context
1662
- prompt = f"""
1663
- Answer this question based on the retrieved information.
1664
-
1665
- Question: {query}
1666
-
1667
- {context}
1668
-
1669
- Please provide a comprehensive answer based on the facts above. If the information
1670
- doesn't contain a direct answer, please indicate that clearly but try to synthesize
1671
- from the available facts.
1106
+ ID of the stored fact
1672
1107
  """
1673
-
1674
- # Get response from LLM
1675
- response = get_llm_response(prompt, model=model, provider=provider)
1676
-
1677
- return response["response"]
1678
-
1679
-
1680
-
1681
-
1682
-
1683
- # --- New: KnowledgeGraphManager Class ---
1684
- class KnowledgeGraphManager:
1685
- def __init__(self, db_path: str, model: str, provider: str, npc: Optional[NPC] = None):
1686
- self.db_path = db_path
1687
- self.model = model
1688
- self.provider = provider
1689
- self.npc = npc
1690
- self.conn = None
1691
- self._initialize_database()
1692
- self.current_generation = self._get_latest_generation()
1693
- print(f"KnowledgeGraphManager initialized. Current generation: {self.current_generation}")
1694
-
1695
- def _initialize_database(self, drop: bool = False):
1696
- """Initializes or connects to the Kuzu database."""
1697
- self.conn = init_db(self.db_path, drop=drop)
1698
- if self.conn is None:
1699
- raise ConnectionError("Failed to initialize Kuzu database.")
1700
-
1701
- def close(self):
1702
- """Closes the Kuzu database connection."""
1703
- if self.conn:
1704
- self.conn.close()
1705
- print("Kuzu database connection closed.")
1706
-
1707
- def _get_latest_generation(self) -> int:
1708
- """Queries the database for the latest generation number."""
1709
- query = "MATCH (g:Groups) RETURN MAX(g.generation_created) AS max_gen;"
1710
- result, error = safe_kuzu_execute(self.conn, query, "Failed to get max generation")
1711
- if error:
1712
- return -1 # Indicate no groups or error
1713
-
1714
- # Kuzu returns a kuzu.result.QueryResult object
1715
- # Need to fetch the value
1716
- df = result.fetch_as_df()
1717
- if not df.empty and not df['max_gen'].isnull().all():
1718
- return int(df['max_gen'].iloc[0])
1719
- return -1 # No groups yet
1720
-
1721
- def _get_active_hierarchy_dag(self) -> Dict[str, Dict[str, Any]]:
1722
- """
1723
- Queries the Kuzu database to construct the active conceptual hierarchy DAG
1724
- (ParentOf relationships).
1725
- Returns a dictionary representing the DAG structure:
1726
- {
1727
- 'group_name': {
1728
- 'parents': set(),
1729
- 'children': set(),
1730
- 'is_active': bool,
1731
- 'generation_created': int
1732
- },
1733
- ...
1734
- }
1735
- Also returns a list of top-level groups (roots) and leaf groups.
1736
- """
1737
- dag = {}
1738
- all_groups_query = "MATCH (g:Groups) RETURN g.name, g.is_active, g.generation_created;"
1739
- groups_result, _ = safe_kuzu_execute(self.conn, all_groups_query)
1740
-
1741
- if groups_result:
1742
- for row in groups_result.fetch_as_df().itertuples():
1743
- group_name = row._1 # Assuming the first column is g.name
1744
- is_active = row._2 # Assuming the second column is g.is_active
1745
- generation_created = row._3 # Assuming the third column is g.generation_created
1746
- dag[group_name] = {
1747
- "parents": set(),
1748
- "children": set(),
1749
- "is_active": is_active,
1750
- "generation_created": generation_created
1751
- }
1752
-
1753
- parent_of_query = """
1754
- MATCH (p:Groups)-[:ParentOf]->(c:Groups)
1755
- RETURN p.name, c.name;
1756
- """
1757
- relationships_result, _ = safe_kuzu_execute(self.conn, parent_of_query)
1758
-
1759
- if relationships_result:
1760
- for row in relationships_result.fetch_as_df().itertuples():
1761
- parent_name = row._1
1762
- child_name = row._2
1763
- if child_name in dag and parent_name in dag: # Ensure both nodes exist in the active_dag structure
1764
- dag[child_name]["parents"].add(parent_name)
1765
- dag[parent_name]["children"].add(child_name)
1766
-
1767
- # Filter for active groups and identify roots/leaves
1768
- active_dag = {name: data for name, data in dag.items() if data['is_active']}
1108
+ try:
1769
1109
 
1770
- top_groups = [name for name, data in active_dag.items() if not data["parents"]]
1771
- leaf_groups = [name for name, data in active_dag.items() if not data["children"]]
1772
-
1773
- # Also get all active groups for potential random sampling
1774
- all_active_groups = list(active_dag.keys())
1775
-
1776
- return {
1777
- "dag": active_dag,
1778
- "top_groups": top_groups,
1779
- "leaf_groups": leaf_groups,
1780
- "all_active_groups": all_active_groups
1781
- }
1782
-
1783
- # --- LLM Abstraction Methods (wrap existing functions or define new prompts) ---
1784
-
1785
- def _llm_extract_facts(self, text: str, context: str = "") -> List[str]:
1786
- """Wrapper for extract_facts."""
1787
- return extract_facts(text, self.model, self.provider, self.npc, context)
1788
-
1789
- def _llm_generate_concepts(self, items: List[str], item_type: str = "facts") -> List[str]:
1790
- """Wrapper for generate_group_candidates."""
1791
- return generate_group_candidates(items, item_type, self.model, self.provider, self.npc)
1792
-
1793
- def _llm_clean_concepts(self, concept_candidates: List[str]) -> List[str]:
1794
- """Wrapper for remove_idempotent_groups."""
1795
- return remove_idempotent_groups(concept_candidates, self.model, self.provider, self.npc)
1796
-
1797
- def _llm_build_initial_hierarchy(self, concepts: List[str]) -> Dict:
1798
- """
1799
- Builds a hierarchy DAG from a flat list of concepts.
1800
- This corresponds to LLM_BuildHierarchy in Algorithm 2.
1801
- It uses the existing build_hierarchy_dag function.
1802
- """
1803
- print(f"Building initial hierarchy in memory for {len(concepts)} concepts...")
1804
- hierarchy_structure = build_hierarchy_dag(
1805
- concepts, self.model, self.provider, self.npc,
1806
- max_levels=5, # Can be tuned
1807
- target_top_count=8 # Can be tuned
1808
- )
1809
- print("Initial hierarchy structure built in memory.")
1810
- return hierarchy_structure['dag'] # Return just the DAG portion
1811
-
1812
- def _llm_find_best_fit(self, item: str, candidates: List[str]) -> List[str]:
1813
- """
1814
- Finds the best fit group(s) for an item (fact or concept) from a list of candidates.
1815
- Corresponds to LLM_FindBestFit in Algorithm 3.
1816
- """
1817
- return get_fact_assignments(item, candidates, self.model, self.provider, self.npc)
1818
-
1819
- def _llm_check_direct_link(self, concept_a: str, concept_b: str) -> bool:
1820
- """
1821
- Checks if there's a direct, meaningful semantic link between two concepts.
1822
- Corresponds to LLM_CheckDirectLink in Algorithm 3.
1823
- """
1824
- prompt = f"""Is there a direct and meaningful semantic relationship between "{concept_a}" and "{concept_b}"?
1825
- Consider if one is a component of, a type of, strongly influences, or is directly associated with the other.
1826
- Answer with "yes" or "no".
1827
-
1828
- Concept A: {concept_a}
1829
- Concept B: {concept_b}
1830
-
1831
- Return JSON:
1832
- {{
1833
- "has_link": "yes" or "no"
1834
- }}
1835
- """
1836
- response = get_llm_response(
1837
- prompt, model=self.model, provider=self.provider, format="json", npc=self.npc
1838
- )
1839
- return response["response"].get("has_link", "no").lower() == "yes"
1110
+ import hashlib
1840
1111
 
1841
- def _llm_find_redundant_nodes(self, all_active_groups: List[str]) -> List[Tuple[str, List[str]]]:
1842
- """
1843
- Identifies redundant or consolidatable groups within the hierarchy.
1844
- Corresponds to LLM_FindRedundantNodes in Algorithm 1, Phase 3.
1845
- Returns a list of tuples: (new_consolidated_name, [old_redundant_names]).
1846
- """
1847
- if not all_active_groups:
1848
- return []
1112
+ fact_id = hashlib.md5(fact.encode()).hexdigest()
1849
1113
 
1850
- # It's better to process in batches if all_active_groups is very large
1851
- # For simplicity, sending all for now, but consider batching for production.
1852
-
1853
- prompt = f"""Given the following list of active conceptual groups, identify any groups that are highly redundant, overly specific, or could be consolidated into a single, more abstract, but still precise concept.
1854
- For each set of redundant groups, propose a single, better consolidated group name.
1855
-
1856
- GUIDELINES for Consolidation:
1857
- 1. **Semantic Overlap:** Only consolidate if groups are truly very similar or one is a very specific instance of another.
1858
- 2. **Naming:** The new consolidated name should be concise, specific, and accurately represent all merged concepts. Prioritize nouns/noun phrases. Avoid generic terms (e.g., "Concepts," "Processes").
1859
- 3. **Efficiency:** Aim for meaningful consolidation, not excessive merging.
1860
-
1861
- Example:
1862
- Active Groups: ["Tidal Disruption Events", "Black Hole Mergers", "Supernovae", "Neutron Star Collisions", "Astrophysical Transients", "Stellar Explosions"]
1863
- Consolidation Candidates: [
1864
- {{
1865
- "new_concept": "Cataclysmic Astronomical Events",
1866
- "old_concepts": ["Tidal Disruption Events", "Black Hole Mergers", "Supernovae", "Neutron Star Collisions"]
1867
- }},
1868
- {{
1869
- "new_concept": "Stellar Explosions",
1870
- "old_concepts": ["Supernovae", "Stellar Explosions"]
1871
- }}
1872
- ]
1873
1114
 
1874
- Note: "Astrophysical Transients" might be a broader category that subsumes the events, but if we have the specific events, we consolidate specific events first, then potentially link them to broader concepts in the hierarchy.
1875
-
1876
- Active Groups: {json.dumps(all_active_groups)}
1877
-
1878
- Return JSON:
1879
- {{
1880
- "consolidation_candidates": [
1881
- {{"new_concept": "Proposed Name", "old_concepts": ["Old Name 1", "Old Name 2"]}},
1882
- ...
1883
- ]
1884
- }}
1885
- """
1886
- response = get_llm_response(
1887
- prompt, model=self.model, provider=self.provider, format="json", npc=self.npc
1115
+ collection.add(
1116
+ documents=[fact],
1117
+ embeddings=[embedding],
1118
+ metadatas=[metadata],
1119
+ ids=[fact_id],
1888
1120
  )
1889
- candidates_data = response["response"].get("consolidation_candidates", [])
1890
-
1891
- # Convert to the desired format: List[Tuple[str, List[str]]]
1892
- formatted_candidates = []
1893
- for cand in candidates_data:
1894
- new_concept = cand.get("new_concept")
1895
- old_concepts = cand.get("old_concepts")
1896
- if new_concept and isinstance(old_concepts, list) and old_concepts:
1897
- # Filter out old_concepts that are not actually in all_active_groups
1898
- # to avoid trying to merge non-existent or inactive groups.
1899
- valid_old_concepts = [
1900
- oc for oc in old_concepts if oc in all_active_groups
1901
- ]
1902
- if valid_old_concepts: # Only add if there are valid old concepts to merge
1903
- formatted_candidates.append((new_concept, valid_old_concepts))
1904
-
1905
- return formatted_candidates
1906
-
1907
- # --- Kuzu Graph Update Methods ---
1908
-
1909
- def _add_parent_of_link(self, parent_name: str, child_name: str) -> bool:
1910
- """Creates a ParentOf relationship between two groups."""
1911
- escaped_parent = parent_name.replace('"', '\\"')
1912
- escaped_child = child_name.replace('"', '\\"')
1913
- query = f"""
1914
- MATCH (p:Groups), (c:Groups)
1915
- WHERE p.name = "{escaped_parent}" AND c.name = "{escaped_child}"
1916
- CREATE (p)-[:ParentOf]->(c)
1917
- """
1918
- _, error = safe_kuzu_execute(self.conn, query, f"Failed to create ParentOf link: {parent_name} -> {child_name}")
1919
- if error: print(f"Error creating ParentOf link: {error}")
1920
- return error is None
1921
-
1922
- def _add_associated_with_link(self, source_name: str, target_name: str) -> bool:
1923
- """Creates an AssociatedWith relationship between two groups."""
1924
- escaped_source = source_name.replace('"', '\\"')
1925
- escaped_target = target_name.replace('"', '\\"')
1926
- query = f"""
1927
- MATCH (s:Groups), (t:Groups)
1928
- WHERE s.name = "{escaped_source}" AND t.name = "{escaped_target}"
1929
- CREATE (s)-[:AssociatedWith]->(t)
1930
- """
1931
- _, error = safe_kuzu_execute(self.conn, query, f"Failed to create AssociatedWith link: {source_name} - {target_name}")
1932
- if error: print(f"Error creating AssociatedWith link: {error}")
1933
- return error is None
1934
-
1935
- def _record_evolution_link(self, old_group_name: str, new_group_name: str, event_type: str, reason: str):
1936
- """Records an EvolvedFrom link for genealogical tracking."""
1937
- escaped_old = old_group_name.replace('"', '\\"')
1938
- escaped_new = new_group_name.replace('"', '\\"')
1939
- query = f"""
1940
- MATCH (oldG:Groups), (newG:Groups)
1941
- WHERE oldG.name = "{escaped_old}" AND newG.name = "{escaped_new}"
1942
- CREATE (oldG)-[:EvolvedFrom {{event_type: "{event_type}", generation: {self.current_generation}, reason: "{reason}"}}]->(newG)
1943
- """
1944
- _, error = safe_kuzu_execute(self.conn, query, f"Failed to record evolution link: {old_group_name} -> {new_group_name}")
1945
- if error: print(f"Error recording evolution link: {error}")
1946
- return error is None
1947
-
1948
- def _set_group_active_status(self, group_name: str, is_active: bool):
1949
- """Sets the is_active status of a group."""
1950
- escaped_name = group_name.replace('"', '\\"')
1951
- query = f"""
1952
- MATCH (g:Groups {{name: "{escaped_name}"}})
1953
- SET g.is_active = {str(is_active).lower()}
1954
- """
1955
- _, error = safe_kuzu_execute(self.conn, query, f"Failed to update active status for group: {group_name}")
1956
- if error: print(f"Error setting group active status: {error}")
1957
- return error is None
1958
-
1959
- def _rewire_group_relationships(self, old_group_name: str, new_group_name: str):
1960
- """
1961
- Rewires ParentOf, AssociatedWith, and Contains relationships from an old group to a new one.
1962
- This is crucial during consolidation.
1963
- """
1964
- escaped_old = old_group_name.replace('"', '\\"')
1965
- escaped_new = new_group_name.replace('"', '\\"')
1966
-
1967
- # Kuzu's `SET` on relationship destination or source is not direct.
1968
- # The typical way to "rewire" in graph databases is to:
1969
- # 1. Create new relationships from existing nodes to the new target.
1970
- # 2. Delete the old relationships.
1971
- # This requires careful transaction management if atomicity is critical,
1972
- # but for simple delete-and-create within a loop, it's often fine.
1973
-
1974
- # Rewire ParentOf where old_group is a child
1975
- # (i.e., its parents should now point to new_group instead of old_group)
1976
- query_parent_to_child = f"""
1977
- MATCH (p:Groups)-[r:ParentOf]->(oldG:Groups)
1978
- WHERE oldG.name = "{escaped_old}"
1979
- AND NOT (p)-[:ParentOf]->(:Groups {{name: "{escaped_new}"}}) // Avoid duplicate relationships
1980
- CREATE (p)-[:ParentOf]->(newG:Groups) WHERE newG.name = "{escaped_new}"
1981
- DELETE r;
1982
- """
1983
- _, error = safe_kuzu_execute(self.conn, query_parent_to_child, f"Failed to rewire ParentOf (parent to old): {old_group_name}")
1984
- if error: print(f"Rewire error (ParentOf parent): {error}")
1985
-
1986
- # Rewire ParentOf where old_group is a parent
1987
- # (i.e., its children should now be children of new_group instead of old_group)
1988
- query_child_to_parent = f"""
1989
- MATCH (oldG:Groups)-[r:ParentOf]->(c:Groups)
1990
- WHERE oldG.name = "{escaped_old}"
1991
- AND NOT (:Groups {{name: "{escaped_new}"}})-[:ParentOf]->(c) // Avoid duplicate relationships
1992
- CREATE (newG:Groups)-[:ParentOf]->(c) WHERE newG.name = "{escaped_new}"
1993
- DELETE r;
1994
- """
1995
- _, error = safe_kuzu_execute(self.conn, query_child_to_parent, f"Failed to rewire ParentOf (old to child): {old_group_name}")
1996
- if error: print(f"Rewire error (ParentOf child): {error}")
1997
-
1998
- # Rewire AssociatedWith where old_group is a source
1999
- query_assoc_source = f"""
2000
- MATCH (s:Groups)-[r:AssociatedWith]->(oldG:Groups)
2001
- WHERE oldG.name = "{escaped_old}"
2002
- AND NOT (s)-[:AssociatedWith]->(:Groups {{name: "{escaped_new}"}}) // Avoid duplicate relationships
2003
- CREATE (s)-[:AssociatedWith]->(newG:Groups) WHERE newG.name = "{escaped_new}"
2004
- DELETE r;
2005
- """
2006
- _, error = safe_kuzu_execute(self.conn, query_assoc_source, f"Failed to rewire AssociatedWith (source to old): {old_group_name}")
2007
- if error: print(f"Rewire error (AssociatedWith source): {error}")
2008
-
2009
- # Rewire AssociatedWith where old_group is a target
2010
- query_assoc_target = f"""
2011
- MATCH (oldG:Groups)-[r:AssociatedWith]->(t:Groups)
2012
- WHERE oldG.name = "{escaped_old}"
2013
- AND NOT (:Groups {{name: "{escaped_new}"}})-[:AssociatedWith]->(t) // Avoid duplicate relationships
2014
- CREATE (newG:Groups)-[:AssociatedWith]->(t) WHERE newG.name = "{escaped_new}"
2015
- DELETE r;
2016
- """
2017
- _, error = safe_kuzu_execute(self.conn, query_assoc_target, f"Failed to rewire AssociatedWith (old to target): {old_group_name}")
2018
- if error: print(f"Rewire error (AssociatedWith target): {error}")
2019
-
2020
- # Rewire 'Contains' relationships if facts were directly linked to the old group
2021
- query_contains = f"""
2022
- MATCH (oldG:Groups)-[r:Contains]->(f:Fact)
2023
- WHERE oldG.name = "{escaped_old}"
2024
- AND NOT (:Groups {{name: "{escaped_new}"}})-[:Contains]->(f) // Avoid duplicate relationships
2025
- CREATE (newG:Groups)-[:Contains]->(f) WHERE newG.name = "{escaped_new}"
2026
- DELETE r;
2027
- """
2028
- _, error = safe_kuzu_execute(self.conn, query_contains, f"Failed to rewire Contains (old to fact): {old_group_name}")
2029
- if error: print(f"Rewire error (Contains): {error}")
2030
-
2031
- print(f"Rewired all relationships from '{old_group_name}' to '{new_group_name}'.")
2032
1121
 
2033
- # --- Algorithm 3: FindAllAssociationPaths ---
2034
-
2035
- def _recursive_traversal(self, c_new: str, current_nodes: List[str], hierarchy_dag: Dict, current_path: List[str]) -> Set[Tuple[str, ...]]:
2036
- """
2037
- Helper for FindAllAssociationPaths: Recursively traverses the hierarchy to find paths.
2038
- """
2039
- paths_results = set()
2040
-
2041
- # Base case for recursion: if no current_nodes to evaluate, path terminates.
2042
- # Add the current path to results if it's not empty and represents a complete segment.
2043
- if not current_nodes:
2044
- if current_path:
2045
- paths_results.add(tuple(current_path))
2046
- return paths_results
2047
-
2048
- # Find best fit nodes among current_nodes for the new concept
2049
- relevant_next_nodes = self._llm_find_best_fit(c_new, current_nodes)
2050
-
2051
- if not relevant_next_nodes:
2052
- # If no relevant children found among current_nodes, current path segment terminates.
2053
- # Only add to results if this path segment is valid and contains at least one node.
2054
- if current_path: # Ensures we don't add empty paths if initial_roots have no fit
2055
- paths_results.add(tuple(current_path))
2056
- return paths_results
2057
-
2058
- for node_name in relevant_next_nodes:
2059
- # Ensure the node being added to the path is not already the last node in the path
2060
- # This prevents cycles in a path if LLM returns the same node.
2061
- if current_path and node_name == current_path[-1]:
2062
- continue
2063
-
2064
- new_path = current_path + [node_name]
2065
-
2066
- # Get active children of the current node from the DAG
2067
- children_of_node = []
2068
- if node_name in hierarchy_dag:
2069
- children_of_node = [child for child in hierarchy_dag[node_name]["children"] if hierarchy_dag[child]["is_active"]]
2070
-
2071
- if not children_of_node: # Reached a leaf node or no relevant active children
2072
- paths_results.add(tuple(new_path))
2073
- else:
2074
- # Recurse down
2075
- paths_results.update(self._recursive_traversal(c_new, list(children_of_node), hierarchy_dag, new_path))
2076
-
2077
- return paths_results
2078
-
2079
- def _find_all_association_paths(self, c_new: str, hierarchy_dag: Dict, theta_explore: float) -> Set[Tuple[str, ...]]:
2080
- """
2081
- Algorithm 3: Finds all primary and serendipitous association paths for a new concept.
2082
- Returns a set of tuples, where each tuple is a path of concept names.
2083
- """
2084
- print(f"Finding association paths for new concept: {c_new}")
2085
-
2086
- # Part A: Primary Top-Down Traversal
2087
- # Start with active root nodes (groups with no active parents in the current hierarchy view)
2088
- active_root_nodes = [name for name, data in hierarchy_dag.items() if not data["parents"] and data["is_active"]]
2089
- if not active_root_nodes:
2090
- print("No active root nodes found in hierarchy. Considering all active groups as potential starting points for primary traversal.")
2091
- active_root_nodes = [node for node in hierarchy_dag.keys() if hierarchy_dag[node]["is_active"]]
2092
-
2093
- # Perform initial filtering at the top level
2094
- initial_relevant_roots = self._llm_find_best_fit(c_new, active_root_nodes)
2095
-
2096
- primary_paths = set()
2097
- for root in initial_relevant_roots:
2098
- # Paths start *from* the root selected by LLM
2099
- primary_paths.update(self._recursive_traversal(c_new, [root], hierarchy_dag, []))
2100
-
2101
- print(f"Primary paths found: {primary_paths}")
2102
-
2103
- # Part B: Serendipitous Random Exploration
2104
- all_active_groups = [node for node in hierarchy_dag.keys() if hierarchy_dag[node]["is_active"]]
2105
-
2106
- # Collect all nodes visited in primary paths to exclude them from serendipitous sample
2107
- visited_in_primary = set()
2108
- for path in primary_paths:
2109
- visited_in_primary.update(path)
2110
-
2111
- unvisited_groups = [g for g in all_active_groups if g not in visited_in_primary]
2112
-
2113
- num_sample = int(len(unvisited_groups) * theta_explore)
2114
- sampled_nodes = random.sample(unvisited_groups, min(num_sample, len(unvisited_groups)))
2115
- print(f"Sampled {len(sampled_nodes)} nodes from {len(unvisited_groups)} unvisited for serendipitous exploration.")
2116
-
2117
- serendipity_paths = set()
2118
- for s_node in sampled_nodes:
2119
- if self._llm_check_direct_link(c_new, s_node):
2120
- print(f"Direct link found between '{c_new}' and serendipitous node '{s_node}'. Initiating branch traversal.")
2121
- # Start a new traversal from this node. The path will start with this node.
2122
- branch_paths = self._recursive_traversal(c_new, [s_node], hierarchy_dag, [])
2123
- serendipity_paths.update(branch_paths)
2124
- print(f"Serendipitous paths found: {serendipity_paths}")
2125
-
2126
- return primary_paths.union(serendipity_paths)
2127
-
2128
- # --- Algorithm 2: CreateInitialGraph ---
2129
-
2130
- def create_initial_graph(self, initial_facts: List[str]) -> Dict:
2131
- """
2132
- Algorithm 2: Creates the initial Knowledge Graph at generation 0.
2133
- """
2134
- if self.current_generation >= 0:
2135
- print(f"Warning: Knowledge Graph already exists at generation {self.current_generation}. Returning current state.")
2136
- return self._get_active_hierarchy_dag()
2137
-
2138
- print("Creating initial Knowledge Graph (Generation 0)...")
2139
- self.current_generation = 0 # Set for initial creation
2140
-
2141
- # Store initial facts
2142
- for fact_content in initial_facts:
2143
- self._insert_fact(fact_content, "initial_load")
2144
-
2145
- # Generate concept candidates from initial facts
2146
- concept_candidates = self._llm_generate_concepts(initial_facts, "facts")
2147
- initial_concepts = self._llm_clean_concepts(concept_candidates)
2148
- print(f"Initial concepts identified for hierarchy: {initial_concepts}")
2149
-
2150
- # Build initial hierarchy structure (in-memory DAG)
2151
- hierarchy_dag_structure = self._llm_build_initial_hierarchy(initial_concepts)
2152
- print(f"Initial hierarchy structure built in memory for {len(hierarchy_dag_structure)} groups.")
2153
-
2154
- # Instantiate the concepts (Groups nodes) in Kuzu for Generation 0
2155
- all_groups_in_hierarchy = set(hierarchy_dag_structure.keys())
2156
- for c_name in all_groups_in_hierarchy:
2157
- self.create_group(self.conn, c_name, self.current_generation, is_active=True)
2158
- # Record CREATE link for the new group (concept created in this generation)
2159
- self._record_evolution_link(c_name, c_name, "CREATE", f"Initial creation at generation {self.current_generation}")
2160
-
2161
- # Create 'ParentOf' links in Kuzu based on the hierarchy_dag_structure
2162
- print("Creating ParentOf links in Kuzu...")
2163
- for group_name, data in hierarchy_dag_structure.items():
2164
- for parent_name in data["parents"]: # Parents are defined as the 'source' of ParentOf links
2165
- self._add_parent_of_link(parent_name, group_name)
2166
-
2167
- print("Initial graph creation complete.")
2168
- return self._get_active_hierarchy_dag() # Return the state of the newly created graph
1122
+ return fact_id
1123
+ except Exception as e:
1124
+ print(f"Error storing fact in Chroma: {e}")
1125
+ return None
2169
1126
 
2170
- # --- Algorithm 1: EvolveKnowledgeGraph ---
1127
+ def save_facts_to_graph_db(
1128
+ conn, facts: List[str], path: str, batch_size: int
1129
+ ):
1130
+ """Save a list of facts to the database in batches"""
1131
+ for i in range(0, len(facts), batch_size):
1132
+ batch = facts[i : i + batch_size]
1133
+ print(f"\nProcessing batch {i//batch_size + 1} ({len(batch)} facts)")
2171
1134
 
2172
- def evolve_knowledge_graph(self, new_facts: List[str], theta_explore: float = 0.1) -> Dict:
2173
- """
2174
- Algorithm 1: Generational Knowledge Hierarchy Evolution (EvoSem-MHI).
2175
- """
2176
- if self.current_generation == -1:
2177
- print("No initial graph found. Calling create_initial_graph first for current facts.")
2178
- return self.create_initial_graph(new_facts)
2179
-
2180
- self.current_generation += 1
2181
- print(f"\n--- Starting Evolution for Generation {self.current_generation} ---")
2182
-
2183
- # Phase 1: Discovery of New Concepts
2184
- print("Phase 1: Discovery of New Concepts")
2185
- # LLM_GenerateConcepts for new facts
2186
- candidate_new_concepts = self._llm_generate_concepts(new_facts, "facts")
2187
- # LLM_CleanConcepts
2188
- cleaned_new_concepts = self._llm_clean_concepts(candidate_new_concepts)
2189
1135
 
2190
- # Store initial facts
2191
- for fact_content in new_facts:
2192
- self._insert_fact(fact_content, f"generation_{self.current_generation}_input")
2193
-
2194
- # Ensure new concepts are created as Groups nodes, even if not immediately integrated into hierarchy
2195
- for concept_name in cleaned_new_concepts:
2196
- self.create_group(self.conn, concept_name, self.current_generation, is_active=True)
2197
- # Record CREATE link for the new group (concept created in this generation)
2198
- self._record_evolution_link(concept_name, concept_name, "CREATE", f"Discovered in generation {self.current_generation}")
2199
-
2200
- print(f"Discovered and prepared {len(cleaned_new_concepts)} new concepts.")
2201
-
2202
- # Capture the current state of the hierarchy *before* MHI
2203
- # This DAG needs to include all active groups, including potentially new ones from this generation if they exist
2204
- current_hierarchy_state = self._get_active_hierarchy_dag()
2205
- current_dag_for_mhi = current_hierarchy_state["dag"]
2206
-
2207
- # Phase 2: Multiplicative Hierarchical Integration
2208
- print("\nPhase 2: Multiplicative Hierarchical Integration")
2209
- for new_concept_name in cleaned_new_concepts:
2210
- all_association_paths = self._find_all_association_paths(
2211
- new_concept_name, current_dag_for_mhi, theta_explore
2212
- )
2213
-
2214
- print(f"Paths for '{new_concept_name}': {all_association_paths}")
2215
-
2216
- # Create AssociatedWith links for all nodes along all paths
2217
- # The new concept is linked *to* existing concepts in the hierarchy.
2218
- for path in all_association_paths:
2219
- if not path: continue # Skip empty paths
2220
- for node_in_path in path:
2221
- # Ensure the node in path is an active group.
2222
- if node_in_path in current_dag_for_mhi and current_dag_for_mhi[node_in_path]["is_active"]:
2223
- self._add_associated_with_link(new_concept_name, node_in_path)
2224
- # print(f"Added 'AssociatedWith' link: '{new_concept_name}' -> '{node_in_path}'") # Too verbose
2225
-
2226
- print("Phase 2: Integration complete.")
2227
-
2228
- # Phase 3: Pruning and Consolidation
2229
- print("\nPhase 3: Pruning and Consolidation")
2230
- # Get the *updated* list of all active groups for consolidation check
2231
- # This includes newly created groups from this generation (Phase 1)
2232
- # and existing active groups.
2233
- updated_hierarchy_state_for_pruning = self._get_active_hierarchy_dag()
2234
- all_active_groups_for_consolidation = updated_hierarchy_state_for_pruning["all_active_groups"]
2235
-
2236
- redundant_candidates = self._llm_find_redundant_nodes(all_active_groups_for_consolidation)
2237
-
2238
- if not redundant_candidates:
2239
- print("No redundant concepts identified for consolidation.")
2240
- else:
2241
- print(f"Identified {len(redundant_candidates)} consolidation candidates.")
2242
-
2243
- for new_consolidated_name, old_concept_names in redundant_candidates:
2244
- # Ensure the new_consolidated_name is not one of the old_concept_names
2245
- # If LLM suggests merging "A" into "A", skip.
2246
- if new_consolidated_name in old_concept_names:
2247
- print(f"Skipping consolidation where new concept '{new_consolidated_name}' is also an old concept. This should be handled by LLM.")
2248
- old_concept_names.remove(new_consolidated_name)
2249
- if not old_concept_names: continue # If no other old concepts, skip
2250
-
2251
- print(f"Consolidating: {old_concept_names} into '{new_consolidated_name}'")
2252
- # Create the new consolidated group if it doesn't exist
2253
- # It will be active and created in the current generation
2254
- self.create_group(self.conn, new_consolidated_name, self.current_generation, is_active=True)
2255
-
2256
- # Link old concepts to the new consolidated group and mark them inactive
2257
- for old_name in old_concept_names:
2258
- # Record evolution link from old to new
2259
- self._record_evolution_link(old_name, new_consolidated_name, "SUBSUMED_BY", f"Consolidated in generation {self.current_generation}")
2260
-
2261
- # Mark old group as inactive
2262
- self._set_group_active_status(old_name, False)
2263
-
2264
- # Rewire all relationships (ParentOf, AssociatedWith, Contains) from old to new
2265
- self._rewire_group_relationships(old_name, new_consolidated_name)
2266
- print(f"Marked '{old_name}' as inactive and rewired its connections to '{new_consolidated_name}'.")
2267
-
2268
- print(f"\n--- Evolution for Generation {self.current_generation} Complete ---")
2269
- return self._get_active_hierarchy_dag() # Return the final state of the graph after this generation
2270
-
2271
- # --- Fact Storage (from original code, slightly adapted for self.conn) ---
2272
- def _insert_fact(self, fact_content: str, path: str) -> bool:
2273
- """Insert a fact into the database with robust error handling."""
2274
- if self.conn is None:
2275
- print("Cannot insert fact: database connection is None")
2276
- return False
2277
-
2278
- try:
2279
- escaped_fact = fact_content.replace('"', '\\"')
2280
- escaped_path = os.path.expanduser(path).replace('"', '\\"')
2281
- timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
2282
-
2283
- safe_kuzu_execute(self.conn, "BEGIN TRANSACTION")
2284
- check_query = f'MATCH (f:Fact {{content: "{escaped_fact}"}}) RETURN f'
2285
- result, error = safe_kuzu_execute(self.conn, check_query, "Failed to check if fact exists")
2286
- if error:
2287
- safe_kuzu_execute(self.conn, "ROLLBACK")
2288
- return False
2289
-
2290
- if not result.has_next():
2291
- insert_query = f"""
2292
- CREATE (f:Fact {{
2293
- content: "{escaped_fact}",
2294
- path: "{escaped_path}",
2295
- recorded_at: "{timestamp}"
2296
- }})
2297
- """
2298
- _, error = safe_kuzu_execute(self.conn, insert_query, "Failed to insert fact")
2299
- if error:
2300
- safe_kuzu_execute(self.conn, "ROLLBACK")
2301
- return False
2302
- safe_kuzu_execute(self.conn, "COMMIT")
2303
- return True
2304
- except Exception as e:
2305
- print(f"Error inserting fact: {str(e)}")
2306
- traceback.print_exc()
2307
- safe_kuzu_execute(self.conn, "ROLLBACK")
2308
- return False
1136
+ for fact in batch:
1137
+ try:
1138
+ print(f"Inserting fact: {fact}")
1139
+ print(f"With path: {path}")
1140
+ timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
1141
+ print(f"With recorded_at: {timestamp}")
2309
1142
 
2310
- def _assign_fact_to_group_graph(self, fact_content: str, group_name: str) -> bool:
2311
- """Create a Contains relationship between a fact and a group."""
2312
- if self.conn is None:
2313
- print("Cannot assign fact to group: database connection is None")
2314
- return False
1143
+ insert_fact(conn, fact, path)
1144
+ print("Success!")
1145
+ except Exception as e:
1146
+ print(f"Failed to insert fact: {fact}")
1147
+ print(f"Error: {e}")
1148
+ continue
2315
1149
 
2316
- try:
2317
- escaped_fact = fact_content.replace('"', '\\"')
2318
- escaped_group = group_name.replace('"', '\\"')
2319
-
2320
- # Check if both fact and group exist before creating relationship
2321
- check_fact_query = f'MATCH (f:Fact {{content: "{escaped_fact}"}}) RETURN f'
2322
- fact_result, fact_error = safe_kuzu_execute(self.conn, check_fact_query)
2323
- if fact_error or not fact_result or not fact_result.has_next():
2324
- print(f"Fact not found for assignment: {fact_content}")
2325
- return False
1150
+ print(f"Completed batch {i//batch_size + 1}")
2326
1151
 
2327
- check_group_query = f'MATCH (g:Groups {{name: "{escaped_group}"}}) RETURN g'
2328
- group_result, group_error = safe_kuzu_execute(self.conn, check_group_query)
2329
- if group_error or not group_result or not group_result.has_next():
2330
- print(f"Group not found for assignment: {group_name}")
2331
- return False
2332
1152
 
2333
- # Check if relationship already exists to prevent duplicates
2334
- check_rel_query = f"""
2335
- MATCH (g:Groups {{name: "{escaped_group}"}})-[:Contains]->(f:Fact {{content: "{escaped_fact}"}})
2336
- RETURN g, f
2337
- """
2338
- rel_exists_result, _ = safe_kuzu_execute(self.conn, check_rel_query)
2339
- if rel_exists_result and rel_exists_result.has_next():
2340
- # print(f"Contains relationship already exists for fact '{fact_content}' to group '{group_name}'.")
2341
- return True # Relationship already exists, so it's "successful"
2342
-
2343
- # Create relationship
2344
- query = f"""
2345
- MATCH (f:Fact), (g:Groups)
2346
- WHERE f.content = "{escaped_fact}" AND g.name = "{escaped_group}"
2347
- CREATE (g)-[:Contains]->(f)
2348
- """
2349
- _, error = safe_kuzu_execute(self.conn, query, f"Failed to create Contains relationship for fact {fact_content} to group {group_name}")
2350
- return error is None
2351
- except Exception as e:
2352
- print(f"Error assigning fact to group: {str(e)}")
2353
- traceback.print_exc()
2354
- return False
2355
1153
 
2356
- def store_fact_and_group(self, fact_content: str, groups: List[str], path: str = "unknown_source") -> bool:
2357
- """
2358
- Public method to store a fact and associate it with groups.
2359
- This handles the `Contains` relationships.
2360
- """
2361
- success = self._insert_fact(fact_content, path)
2362
- if not success:
2363
- print(f"Failed to insert fact: {fact_content}")
2364
- return False
2365
-
2366
- for group in groups:
2367
- # Assign fact to group (creates Contains link)
2368
- if not self._assign_fact_to_group_graph(fact_content, group):
2369
- print(f"Failed to assign fact {fact_content} to group {group}")
2370
- success = False # Still continue with other groups but mark overall failure
2371
- return success
1154
+ def kg_add_fact(
1155
+ engine,
1156
+ fact_text: str,
1157
+ npc=None,
1158
+ team=None,
1159
+ model=None,
1160
+ provider=None
1161
+ ):
1162
+ """Add a new fact to the knowledge graph"""
1163
+ directory_path = os.getcwd()
1164
+ team_name = getattr(team, 'name', 'default_team') if team else 'default_team'
1165
+ npc_name = npc.name if npc else 'default_npc'
1166
+
1167
+ kg_data = load_kg_from_db(engine, team_name, npc_name, directory_path)
1168
+
1169
+ new_fact = {
1170
+ "statement": fact_text,
1171
+ "source_text": fact_text,
1172
+ "type": "manual",
1173
+ "generation": kg_data.get('generation', 0),
1174
+ "origin": "manual_add"
1175
+ }
1176
+
1177
+ kg_data['facts'].append(new_fact)
1178
+ save_kg_to_db(engine, kg_data, team_name, npc_name, directory_path)
1179
+
1180
+ return f"Added fact: {fact_text}"
1181
+
1182
+ def kg_search_facts(
1183
+ engine,
1184
+ query: str,
1185
+ npc=None,
1186
+ team=None,
1187
+ model=None,
1188
+ provider=None
1189
+ ):
1190
+ """Search facts in the knowledge graph"""
1191
+ directory_path = os.getcwd()
1192
+ team_name = getattr(team, 'name', 'default_team') if team else 'default_team'
1193
+ npc_name = npc.name if npc else 'default_npc'
1194
+
1195
+ kg_data = load_kg_from_db(engine, team_name, npc_name, directory_path)
1196
+
1197
+ matching_facts = []
1198
+ for fact in kg_data.get('facts', []):
1199
+ if query.lower() in fact['statement'].lower():
1200
+ matching_facts.append(fact['statement'])
1201
+
1202
+ return matching_facts
1203
+
1204
+ def kg_remove_fact(
1205
+ engine,
1206
+ fact_text: str,
1207
+ npc=None,
1208
+ team=None,
1209
+ model=None,
1210
+ provider=None
1211
+ ):
1212
+ """Remove a fact from the knowledge graph"""
1213
+ directory_path = os.getcwd()
1214
+ team_name = getattr(team, 'name', 'default_team') if team else 'default_team'
1215
+ npc_name = npc.name if npc else 'default_npc'
1216
+
1217
+ kg_data = load_kg_from_db(engine, team_name, npc_name, directory_path)
1218
+
1219
+ original_count = len(kg_data.get('facts', []))
1220
+ kg_data['facts'] = [f for f in kg_data.get('facts', []) if f['statement'] != fact_text]
1221
+ removed_count = original_count - len(kg_data['facts'])
1222
+
1223
+ if removed_count > 0:
1224
+ save_kg_to_db(engine, kg_data, team_name, npc_name, directory_path)
1225
+ return f"Removed {removed_count} matching fact(s)"
1226
+
1227
+ return "No matching facts found"
1228
+
1229
+ def kg_list_concepts(
1230
+ engine,
1231
+ npc=None,
1232
+ team=None,
1233
+ model=None,
1234
+ provider=None
1235
+ ):
1236
+ """List all concepts in the knowledge graph"""
1237
+ directory_path = os.getcwd()
1238
+ team_name = getattr(team, 'name', 'default_team') if team else 'default_team'
1239
+ npc_name = npc.name if npc else 'default_npc'
1240
+
1241
+ kg_data = load_kg_from_db(engine, team_name, npc_name, directory_path)
1242
+
1243
+ concepts = [c['name'] for c in kg_data.get('concepts', [])]
1244
+ return concepts
1245
+
1246
def kg_get_facts_for_concept(
    engine,
    concept_name: str,
    npc=None,
    team=None,
    model=None,
    provider=None
):
    """Return the statements of all facts linked to a given concept.

    Args:
        engine: SQLAlchemy engine for the KG database.
        concept_name: Concept whose linked facts should be returned.
        npc: Optional NPC; its name scopes the KG (defaults to 'default_npc').
        team: Optional team; its name scopes the KG (defaults to 'default_team').
        model, provider: Accepted for signature parity with sibling tools; unused here.

    Returns:
        List of fact statement strings linked to concept_name (possibly empty).
    """
    directory_path = os.getcwd()
    team_name = getattr(team, 'name', 'default_team') if team else 'default_team'
    npc_name = npc.name if npc else 'default_npc'

    kg_data = load_kg_from_db(engine, team_name, npc_name, directory_path)

    # Links are stored as {fact_statement: [concept_name, ...]}.
    links = kg_data.get('fact_to_concept_links', {})
    return [
        statement
        for statement, concepts in links.items()
        if concept_name in concepts
    ]
1269
+
1270
def kg_add_concept(
    engine,
    concept_name: str,
    concept_description: str,
    npc=None,
    team=None,
    model=None,
    provider=None
):
    """Add a new concept to the knowledge graph and persist it.

    Args:
        engine: SQLAlchemy engine for the KG database.
        concept_name: Name of the concept to add.
        concept_description: Free-text description of the concept.
        npc: Optional NPC; its name scopes the KG (defaults to 'default_npc').
        team: Optional team; its name scopes the KG (defaults to 'default_team').
        model, provider: Accepted for signature parity with sibling tools; unused here.

    Returns:
        Confirmation string naming the added concept.
    """
    directory_path = os.getcwd()
    team_name = getattr(team, 'name', 'default_team') if team else 'default_team'
    npc_name = npc.name if npc else 'default_npc'

    kg_data = load_kg_from_db(engine, team_name, npc_name, directory_path)

    new_concept = {
        "name": concept_name,
        "description": concept_description,
        # Tag the concept with the KG's current generation counter.
        "generation": kg_data.get('generation', 0)
    }

    # Fix: use setdefault so a KG loaded without a 'concepts' key does not
    # raise KeyError (sibling functions all guard with .get('concepts', [])).
    kg_data.setdefault('concepts', []).append(new_concept)
    save_kg_to_db(engine, kg_data, team_name, npc_name, directory_path)

    return f"Added concept: {concept_name}"
1296
+
1297
def kg_remove_concept(
    engine,
    concept_name: str,
    npc=None,
    team=None,
    model=None,
    provider=None
):
    """Remove every concept with the given name from the knowledge graph.

    Args:
        engine: SQLAlchemy engine for the KG database.
        concept_name: Name of the concept(s) to remove.
        npc: Optional NPC; its name scopes the KG (defaults to 'default_npc').
        team: Optional team; its name scopes the KG (defaults to 'default_team').
        model, provider: Accepted for signature parity with sibling tools; unused here.

    Returns:
        Confirmation string when at least one concept was removed, otherwise
        "Concept not found".
    """
    directory_path = os.getcwd()
    team_name = getattr(team, 'name', 'default_team') if team else 'default_team'
    npc_name = npc.name if npc else 'default_npc'

    kg_data = load_kg_from_db(engine, team_name, npc_name, directory_path)

    concepts = kg_data.get('concepts', [])
    remaining = [entry for entry in concepts if entry['name'] != concept_name]
    kg_data['concepts'] = remaining

    if len(remaining) < len(concepts):
        # Only persist when something actually changed.
        save_kg_to_db(engine, kg_data, team_name, npc_name, directory_path)
        return f"Removed concept: {concept_name}"

    return "Concept not found"
1321
+
1322
def kg_link_fact_to_concept(
    engine,
    fact_text: str,
    concept_name: str,
    npc=None,
    team=None,
    model=None,
    provider=None
):
    """Associate a fact statement with a concept in the knowledge graph.

    Args:
        engine: SQLAlchemy engine for the KG database.
        fact_text: Exact statement text of the fact to link.
        concept_name: Concept to link the fact to.
        npc: Optional NPC; its name scopes the KG (defaults to 'default_npc').
        team: Optional team; its name scopes the KG (defaults to 'default_team').
        model, provider: Accepted for signature parity with sibling tools; unused here.

    Returns:
        Confirmation string on success, or "Fact already linked to concept"
        when the association already exists.
    """
    directory_path = os.getcwd()
    team_name = getattr(team, 'name', 'default_team') if team else 'default_team'
    npc_name = npc.name if npc else 'default_npc'

    kg_data = load_kg_from_db(engine, team_name, npc_name, directory_path)

    # Links are stored as {fact_statement: [concept_name, ...]}.
    links = kg_data.get('fact_to_concept_links', {})

    # Guard clause: nothing to do when the pair already exists.
    if concept_name in links.get(fact_text, []):
        return "Fact already linked to concept"

    links.setdefault(fact_text, []).append(concept_name)
    kg_data['fact_to_concept_links'] = links
    save_kg_to_db(engine, kg_data, team_name, npc_name, directory_path)
    return f"Linked fact '{fact_text}' to concept '{concept_name}'"
1350
+
1351
def kg_get_all_facts(
    engine,
    npc=None,
    team=None,
    model=None,
    provider=None
):
    """Return the statements of every fact in the scoped knowledge graph.

    Args:
        engine: SQLAlchemy engine for the KG database.
        npc: Optional NPC; its name scopes the KG (defaults to 'default_npc').
        team: Optional team; its name scopes the KG (defaults to 'default_team').
        model, provider: Accepted for signature parity with sibling tools; unused here.

    Returns:
        List of fact statement strings (empty when the KG has no facts).
    """
    directory_path = os.getcwd()
    team_name = getattr(team, 'name', 'default_team') if team else 'default_team'
    npc_name = npc.name if npc else 'default_npc'

    kg_data = load_kg_from_db(engine, team_name, npc_name, directory_path)

    statements = []
    for entry in kg_data.get('facts', []):
        statements.append(entry['statement'])
    return statements
1367
+
1368
def kg_get_stats(
    engine,
    npc=None,
    team=None,
    model=None,
    provider=None
):
    """Summarize the scoped knowledge graph as simple count statistics.

    Args:
        engine: SQLAlchemy engine for the KG database.
        npc: Optional NPC; its name scopes the KG (defaults to 'default_npc').
        team: Optional team; its name scopes the KG (defaults to 'default_team').
        model, provider: Accepted for signature parity with sibling tools; unused here.

    Returns:
        Dict with keys 'total_facts', 'total_concepts',
        'total_fact_concept_links', and 'generation'.
    """
    directory_path = os.getcwd()
    team_name = getattr(team, 'name', 'default_team') if team else 'default_team'
    npc_name = npc.name if npc else 'default_npc'

    kg_data = load_kg_from_db(engine, team_name, npc_name, directory_path)

    fact_count = len(kg_data.get('facts', []))
    concept_count = len(kg_data.get('concepts', []))
    # Counts fact entries that have at least one concept link, i.e. the
    # number of keys in the link map.
    link_count = len(kg_data.get('fact_to_concept_links', {}))
    generation = kg_data.get('generation', 0)

    return {
        "total_facts": fact_count,
        "total_concepts": concept_count,
        "total_fact_concept_links": link_count,
        "generation": generation
    }
1388
+
1389
def kg_evolve_knowledge(
    engine,
    content_text: str,
    npc=None,
    team=None,
    model=None,
    provider=None
):
    """Incrementally evolve the knowledge graph with new content and persist it.

    Args:
        engine: SQLAlchemy engine for the KG database.
        content_text: New raw text to fold into the knowledge graph.
        npc: Optional NPC; its name scopes the KG and its model/provider are
            preferred when set (defaults to 'default_npc').
        team: Optional team; its name scopes the KG (defaults to 'default_team').
        model: LLM model to use when the NPC does not specify one.
        provider: LLM provider to use when the NPC does not specify one.

    Returns:
        Confirmation string once the evolved KG has been saved.
    """
    directory_path = os.getcwd()
    team_name = getattr(team, 'name', 'default_team') if team else 'default_team'
    npc_name = npc.name if npc else 'default_npc'

    kg_data = load_kg_from_db(engine, team_name, npc_name, directory_path)

    # Fix: prefer the NPC's model/provider but fall back to the explicit
    # arguments when the NPC has none set — previously a supplied `model`/
    # `provider` was silently discarded (replaced by None) whenever an npc
    # was passed.
    resolved_model = (getattr(npc, 'model', None) if npc else None) or model
    resolved_provider = (getattr(npc, 'provider', None) if npc else None) or provider

    evolved_kg, _ = kg_evolve_incremental(
        existing_kg=kg_data,
        new_content_text=content_text,
        model=resolved_model,
        provider=resolved_provider,
        npc=npc,
        get_concepts=True,
        link_concepts_facts=False,
        link_concepts_concepts=False,
        link_facts_facts=False
    )

    save_kg_to_db(engine, evolved_kg, team_name, npc_name, directory_path)

    return "Knowledge graph evolved with new content"