sirchmunk 0.0.0__py3-none-any.whl → 0.0.1.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sirchmunk/__init__.py +8 -0
- sirchmunk/base.py +17 -0
- sirchmunk/insight/__init__.py +4 -0
- sirchmunk/insight/text_insights.py +292 -0
- sirchmunk/learnings/__init__.py +1 -0
- sirchmunk/learnings/evidence_processor.py +525 -0
- sirchmunk/learnings/knowledge_base.py +232 -0
- sirchmunk/llm/__init__.py +2 -0
- sirchmunk/llm/openai_chat.py +247 -0
- sirchmunk/llm/prompts.py +216 -0
- sirchmunk/retrieve/__init__.py +1 -0
- sirchmunk/retrieve/base.py +25 -0
- sirchmunk/retrieve/text_retriever.py +1026 -0
- sirchmunk/scan/__init__.py +1 -0
- sirchmunk/scan/base.py +18 -0
- sirchmunk/scan/file_scanner.py +373 -0
- sirchmunk/scan/web_scanner.py +18 -0
- sirchmunk/scheduler/__init__.py +0 -0
- sirchmunk/schema/__init__.py +2 -0
- sirchmunk/schema/cognition.py +106 -0
- sirchmunk/schema/context.py +25 -0
- sirchmunk/schema/knowledge.py +318 -0
- sirchmunk/schema/metadata.py +658 -0
- sirchmunk/schema/request.py +221 -0
- sirchmunk/schema/response.py +20 -0
- sirchmunk/schema/snapshot.py +346 -0
- sirchmunk/search.py +475 -0
- sirchmunk/storage/__init__.py +7 -0
- sirchmunk/storage/duckdb.py +676 -0
- sirchmunk/storage/knowledge_manager.py +720 -0
- sirchmunk/utils/__init__.py +15 -0
- sirchmunk/utils/constants.py +15 -0
- sirchmunk/utils/deps.py +23 -0
- sirchmunk/utils/file_utils.py +70 -0
- sirchmunk/utils/install_rga.py +124 -0
- sirchmunk/utils/log_utils.py +360 -0
- sirchmunk/utils/tokenizer_util.py +55 -0
- sirchmunk/utils/utils.py +108 -0
- sirchmunk/version.py +1 -1
- sirchmunk-0.0.1.post1.dist-info/METADATA +483 -0
- sirchmunk-0.0.1.post1.dist-info/RECORD +45 -0
- {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.post1.dist-info}/WHEEL +1 -1
- sirchmunk-0.0.0.dist-info/METADATA +0 -26
- sirchmunk-0.0.0.dist-info/RECORD +0 -8
- {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.post1.dist-info}/entry_points.txt +0 -0
- {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.post1.dist-info}/licenses/LICENSE +0 -0
- {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.post1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,720 @@
# Copyright (c) ModelScope Contributors. All rights reserved.
"""
Knowledge Manager using DuckDB and Parquet
Manages KnowledgeCluster objects with persistence
"""

import os
import json
from typing import Dict, Any, List, Optional
from pathlib import Path
from datetime import datetime
from loguru import logger

from .duckdb import DuckDBManager
from sirchmunk.schema.knowledge import (
    KnowledgeCluster,
    EvidenceUnit,
    Constraint,
    WeakSemanticEdge,
    Lifecycle,
    AbstractionLevel
)
from ..utils.constants import DEFAULT_WORK_PATH


class KnowledgeManager:
    """
    Manages persistent storage of KnowledgeCluster objects using DuckDB and Parquet

    Architecture:
    - Uses KnowledgeCluster as core schema
    - Stores data in Parquet format for efficient storage and querying
    - Provides full CRUD operations with fuzzy search capabilities
    - Follows Single Responsibility Principle (SRP)

    Storage Path: {WORK_PATH}/.cache/knowledge/
    """

    def __init__(self, work_path: Optional[str] = None):
        """
        Initialize Knowledge Manager

        Args:
            work_path: Base work path. If None, uses WORK_PATH env variable
        """
        # Get work path from env if not provided
        if work_path is None:
            work_path = os.getenv("WORK_PATH", DEFAULT_WORK_PATH)

        # Create knowledge storage path
        self.knowledge_path = Path(work_path) / ".cache" / "knowledge"
        self.knowledge_path.mkdir(parents=True, exist_ok=True)

        # Parquet file path
        self.parquet_file = str(self.knowledge_path / "knowledge_clusters.parquet")

        # Initialize DuckDB (in-memory for fast operations)
        self.db = DuckDBManager(db_path=None)  # In-memory database

        # Table name
        self.table_name = "knowledge_clusters"

        # Load data from parquet if exists
        self._load_from_parquet()

        logger.info(f"Knowledge Manager initialized at: {self.knowledge_path}")

    def _load_from_parquet(self):
        """Load knowledge clusters from parquet file into DuckDB"""
        try:
            if Path(self.parquet_file).exists():
                # Drop existing table first to avoid conflicts
                self.db.drop_table(self.table_name, if_exists=True)
                # Load parquet file into DuckDB table
                self.db.import_from_parquet(self.table_name, self.parquet_file, create_table=True)
                count = self.db.get_table_count(self.table_name)
                logger.info(f"Loaded {count} knowledge clusters from {self.parquet_file}")
            else:
                # Create empty table with schema
                self._create_table()
                logger.info("Created new knowledge clusters table")
        except Exception as e:
            logger.error(f"Failed to load from parquet: {e}")
            # Try to recreate table
            self.db.drop_table(self.table_name, if_exists=True)
            self._create_table()

    def _create_table(self):
        """Create knowledge clusters table with schema"""
        schema = {
            "id": "VARCHAR PRIMARY KEY",
            "name": "VARCHAR NOT NULL",
            "description": "VARCHAR",
            "content": "VARCHAR",
            "scripts": "VARCHAR",  # JSON array
            "resources": "VARCHAR",  # JSON array
            "evidences": "VARCHAR",  # JSON array
            "patterns": "VARCHAR",  # JSON array
            "constraints": "VARCHAR",  # JSON array
            "confidence": "DOUBLE",
            "abstraction_level": "VARCHAR",
            "landmark_potential": "DOUBLE",
            "hotness": "DOUBLE",
            "lifecycle": "VARCHAR",
            "create_time": "TIMESTAMP",
            "last_modified": "TIMESTAMP",
            "version": "INTEGER",
            "related_clusters": "VARCHAR",  # JSON array
            "search_results": "VARCHAR",  # JSON array
        }
        self.db.create_table(self.table_name, schema, if_not_exists=True)
        logger.info(f"Created table {self.table_name}")

    def _save_to_parquet(self):
        """Save current knowledge clusters to parquet file"""
        try:
            # Export table to parquet
            self.db.export_to_parquet(self.table_name, self.parquet_file)
            logger.debug(f"Saved knowledge clusters to {self.parquet_file}")
        except Exception as e:
            logger.error(f"Failed to save to parquet: {e}")
            raise

    def _cluster_to_row(self, cluster: KnowledgeCluster) -> Dict[str, Any]:
        """Convert KnowledgeCluster to database row"""
        # Handle list/string fields for description and content
        description_str = (
            json.dumps(cluster.description)
            if isinstance(cluster.description, list)
            else cluster.description
        )
        content_str = (
            json.dumps(cluster.content)
            if isinstance(cluster.content, list)
            else cluster.content
        )

        return {
            "id": cluster.id,
            "name": cluster.name,
            "description": description_str,
            "content": content_str,
            "scripts": json.dumps(cluster.scripts) if cluster.scripts else None,
            "resources": json.dumps(cluster.resources) if cluster.resources else None,
            "evidences": json.dumps([e.to_dict() for e in cluster.evidences]),
            "patterns": json.dumps(cluster.patterns),
            "constraints": json.dumps([c.to_dict() for c in cluster.constraints]),
            "confidence": cluster.confidence,
            "abstraction_level": cluster.abstraction_level.name if cluster.abstraction_level else None,
            "landmark_potential": cluster.landmark_potential,
            "hotness": cluster.hotness,
            "lifecycle": cluster.lifecycle.name,
            "create_time": cluster.create_time.isoformat() if cluster.create_time else None,
            "last_modified": cluster.last_modified.isoformat() if cluster.last_modified else None,
            "version": cluster.version,
            "related_clusters": json.dumps([rc.to_dict() for rc in cluster.related_clusters]),
            "search_results": json.dumps(cluster.search_results) if cluster.search_results else None,
        }

    def _row_to_cluster(self, row: tuple) -> KnowledgeCluster:
        """Convert database row to KnowledgeCluster"""
        # Unpack row (order matches schema). Older tables may not include search_results.
        if len(row) == 19:
            (
                id, name, description, content, scripts, resources, evidences, patterns,
                constraints, confidence, abstraction_level, landmark_potential, hotness,
                lifecycle, create_time, last_modified, version, related_clusters, search_results
            ) = row
        elif len(row) == 18:
            (
                id, name, description, content, scripts, resources, evidences, patterns,
                constraints, confidence, abstraction_level, landmark_potential, hotness,
                lifecycle, create_time, last_modified, version, related_clusters
            ) = row
            search_results = None
        elif len(row) == 17:
            (
                id, name, description, content, scripts, resources, evidences, patterns,
                constraints, confidence, abstraction_level, landmark_potential, hotness,
                lifecycle, create_time, last_modified, version
            ) = row
            related_clusters = None
            search_results = None
        else:
            raise ValueError(f"Unexpected knowledge_clusters row length: {len(row)}")

        # Parse JSON fields
        try:
            description_parsed = json.loads(description) if description and description.startswith('[') else description
        except:
            description_parsed = description

        try:
            content_parsed = json.loads(content) if content and content.startswith('[') else content
        except:
            content_parsed = content

        scripts_parsed = json.loads(scripts) if scripts else None
        resources_parsed = json.loads(resources) if resources else None
        patterns_parsed = json.loads(patterns) if patterns else []

        # Parse evidences
        evidences_parsed = []
        if evidences:
            evidences_data = json.loads(evidences)
            for ev_dict in evidences_data:
                evidences_parsed.append(EvidenceUnit(
                    doc_id=ev_dict["doc_id"],
                    file_or_url=Path(ev_dict["file_or_url"]),
                    summary=ev_dict["summary"],
                    is_found=ev_dict["is_found"],
                    snippets=ev_dict["snippets"],
                    extracted_at=datetime.fromisoformat(ev_dict["extracted_at"]),
                    conflict_group=ev_dict.get("conflict_group")
                ))

        # Parse constraints
        constraints_parsed = []
        if constraints:
            constraints_data = json.loads(constraints)
            for c_dict in constraints_data:
                constraints_parsed.append(Constraint.from_dict(c_dict))

        # Parse related clusters
        related_clusters_parsed = []
        if related_clusters:
            related_data = json.loads(related_clusters)
            for rc_dict in related_data:
                related_clusters_parsed.append(WeakSemanticEdge.from_dict(rc_dict))

        # Parse search results
        search_results_parsed = []
        if search_results:
            search_results_parsed = json.loads(search_results)

        return KnowledgeCluster(
            id=id,
            name=name,
            description=description_parsed,
            content=content_parsed,
            scripts=scripts_parsed,
            resources=resources_parsed,
            evidences=evidences_parsed,
            patterns=patterns_parsed,
            constraints=constraints_parsed,
            confidence=confidence,
            abstraction_level=AbstractionLevel[abstraction_level] if abstraction_level else None,
            landmark_potential=landmark_potential,
            hotness=hotness,
            lifecycle=Lifecycle[lifecycle],
            create_time=datetime.fromisoformat(create_time) if create_time else None,
            last_modified=datetime.fromisoformat(last_modified) if last_modified else None,
            version=version,
            related_clusters=related_clusters_parsed,
            search_results=search_results_parsed,
        )

    async def get(self, cluster_id: str) -> Optional[KnowledgeCluster]:
        """
        Get a knowledge cluster by ID (exact match)

        Args:
            cluster_id: Unique cluster ID

        Returns:
            KnowledgeCluster if found, None otherwise
        """
        try:
            row = self.db.fetch_one(
                f"SELECT * FROM {self.table_name} WHERE id = ?",
                [cluster_id]
            )

            if row:
                return self._row_to_cluster(row)
            return None

        except Exception as e:
            logger.error(f"Failed to get cluster {cluster_id}: {e}")
            return None

    async def insert(self, cluster: KnowledgeCluster) -> bool:
        """
        Insert a new knowledge cluster

        Args:
            cluster: KnowledgeCluster to insert

        Returns:
            True if successful, False otherwise
        """
        try:
            # Check if cluster already exists
            existing = await self.get(cluster.id)
            if existing:
                logger.warning(f"Cluster {cluster.id} already exists, use update() instead")
                return False

            # Set creation and modification times if not set
            if not cluster.create_time:
                cluster.create_time = datetime.now()
            if not cluster.last_modified:
                cluster.last_modified = datetime.now()
            if cluster.version is None:
                cluster.version = 1

            # Insert into database
            row = self._cluster_to_row(cluster)
            self.db.insert_data(self.table_name, row)

            # Save to parquet
            self._save_to_parquet()

            logger.info(f"Inserted cluster: {cluster.id}")
            return True

        except Exception as e:
            logger.error(f"Failed to insert cluster {cluster.id}: {e}")
            return False

    async def update(self, cluster: KnowledgeCluster) -> bool:
        """
        Update an existing knowledge cluster

        Args:
            cluster: KnowledgeCluster with updated data

        Returns:
            True if successful, False otherwise
        """
        try:
            # Check if cluster exists
            existing = await self.get(cluster.id)
            if not existing:
                logger.warning(f"Cluster {cluster.id} does not exist, use insert() instead")
                return False

            # Update modification time and version
            cluster.last_modified = datetime.now()
            cluster.version = (cluster.version or 0) + 1

            # Prepare update data
            row = self._cluster_to_row(cluster)
            set_clause = {k: v for k, v in row.items() if k != "id"}

            # Update in database
            self.db.update_data(
                self.table_name,
                set_clause=set_clause,
                where_clause="id = ?",
                where_params=[cluster.id]
            )

            # Save to parquet
            self._save_to_parquet()

            logger.info(f"Updated cluster: {cluster.id} (version {cluster.version})")
            return True

        except Exception as e:
            logger.error(f"Failed to update cluster {cluster.id}: {e}")
            return False

    async def remove(self, cluster_id: str) -> bool:
        """
        Remove a knowledge cluster by ID

        Args:
            cluster_id: Unique cluster ID

        Returns:
            True if successful, False otherwise
        """
        try:
            # Check if cluster exists
            existing = await self.get(cluster_id)
            if not existing:
                logger.warning(f"Cluster {cluster_id} does not exist")
                return False

            # Delete from database
            self.db.delete_data(self.table_name, "id = ?", [cluster_id])

            # Save to parquet
            self._save_to_parquet()

            logger.info(f"Removed cluster: {cluster_id}")
            return True

        except Exception as e:
            logger.error(f"Failed to remove cluster {cluster_id}: {e}")
            return False

    async def clear(self) -> bool:
        """
        Clear all knowledge clusters

        Returns:
            True if successful, False otherwise
        """
        try:
            # Drop and recreate table
            self.db.drop_table(self.table_name, if_exists=True)
            self._create_table()

            # Delete parquet file
            if Path(self.parquet_file).exists():
                Path(self.parquet_file).unlink()

            logger.info("Cleared all knowledge clusters")
            return True

        except Exception as e:
            logger.error(f"Failed to clear knowledge clusters: {e}")
            return False

    async def find(self, query: str, limit: int = 10) -> List[KnowledgeCluster]:
        """
        Find knowledge clusters using fuzzy search
        Searches in: id, name, description, content, patterns

        Args:
            query: Search query string
            limit: Maximum number of results to return

        Returns:
            List of matching KnowledgeCluster objects
        """
        try:
            # Fuzzy search using LIKE with wildcards
            search_pattern = f"%{query}%"

            sql = f"""
                SELECT * FROM {self.table_name}
                WHERE
                    id LIKE ? OR
                    name LIKE ? OR
                    description LIKE ? OR
                    content LIKE ? OR
                    patterns LIKE ?
                ORDER BY
                    CASE
                        WHEN id = ? THEN 1
                        WHEN name LIKE ? THEN 2
                        WHEN description LIKE ? THEN 3
                        ELSE 4
                    END
                LIMIT ?
            """

            params = [
                search_pattern,  # id LIKE
                search_pattern,  # name LIKE
                search_pattern,  # description LIKE
                search_pattern,  # content LIKE
                search_pattern,  # patterns LIKE
                query,  # exact id match
                f"{query}%",  # name starts with
                f"%{query}%",  # description contains
                limit
            ]

            rows = self.db.fetch_all(sql, params)

            clusters = [self._row_to_cluster(row) for row in rows]

            logger.debug(f"Found {len(clusters)} clusters matching '{query}'")
            return clusters

        except Exception as e:
            logger.error(f"Failed to search clusters with query '{query}': {e}")
            return []

    async def merge(self, clusters: List[KnowledgeCluster]) -> Optional[KnowledgeCluster]:
        """
        Merge multiple knowledge clusters into one

        Strategy:
        - Use first cluster as base
        - Merge evidences, patterns, constraints from all clusters
        - Average numeric scores (confidence, hotness, etc.)
        - Update version and timestamps

        Args:
            clusters: List of KnowledgeCluster objects to merge

        Returns:
            Merged KnowledgeCluster, or None if merge fails
        """
        if not clusters:
            logger.warning("No clusters to merge")
            return None

        if len(clusters) == 1:
            logger.warning("Only one cluster provided, returning as-is")
            return clusters[0]

        try:
            # Use first cluster as base
            merged = clusters[0]

            # Merge content and descriptions
            all_descriptions = []
            all_contents = []

            for cluster in clusters:
                # Handle descriptions
                if isinstance(cluster.description, list):
                    all_descriptions.extend(cluster.description)
                else:
                    all_descriptions.append(cluster.description)

                # Handle contents
                if isinstance(cluster.content, list):
                    all_contents.extend(cluster.content)
                else:
                    all_contents.append(cluster.content)

            merged.description = list(set(all_descriptions))  # Deduplicate
            merged.content = list(set(all_contents))  # Deduplicate

            # Merge evidences (deduplicate by doc_id)
            evidences_map = {}
            for cluster in clusters:
                for evidence in cluster.evidences:
                    if evidence.doc_id not in evidences_map:
                        evidences_map[evidence.doc_id] = evidence
            merged.evidences = list(evidences_map.values())

            # Merge patterns (deduplicate)
            all_patterns = []
            for cluster in clusters:
                all_patterns.extend(cluster.patterns)
            merged.patterns = list(set(all_patterns))

            # Merge constraints (deduplicate by condition)
            constraints_map = {}
            for cluster in clusters:
                for constraint in cluster.constraints:
                    if constraint.condition not in constraints_map:
                        constraints_map[constraint.condition] = constraint
            merged.constraints = list(constraints_map.values())

            # Merge related clusters (deduplicate by target_cluster_id)
            related_map = {}
            for cluster in clusters:
                for related in cluster.related_clusters:
                    if related.target_cluster_id not in related_map:
                        related_map[related.target_cluster_id] = related
                    else:
                        # Average weights if duplicate
                        existing = related_map[related.target_cluster_id]
                        existing.weight = (existing.weight + related.weight) / 2
            merged.related_clusters = list(related_map.values())

            # Average numeric scores
            valid_confidences = [c.confidence for c in clusters if c.confidence is not None]
            if valid_confidences:
                merged.confidence = sum(valid_confidences) / len(valid_confidences)

            valid_hotness = [c.hotness for c in clusters if c.hotness is not None]
            if valid_hotness:
                merged.hotness = sum(valid_hotness) / len(valid_hotness)

            valid_landmark = [c.landmark_potential for c in clusters if c.landmark_potential is not None]
            if valid_landmark:
                merged.landmark_potential = sum(valid_landmark) / len(valid_landmark)

            # Update metadata
            merged.name = f"{merged.name} (merged)"
            merged.last_modified = datetime.now()
            merged.version = (merged.version or 0) + 1

            # Update the merged cluster in database
            await self.update(merged)

            # Remove source clusters (except the first one which is now merged)
            for cluster in clusters[1:]:
                await self.remove(cluster.id)

            logger.info(f"Merged {len(clusters)} clusters into {merged.id}")
            return merged

        except Exception as e:
            logger.error(f"Failed to merge clusters: {e}")
            return None

    async def split(self, cluster: KnowledgeCluster, num_splits: int = 2) -> List[KnowledgeCluster]:
        """
        Split a knowledge cluster into multiple smaller clusters

        Strategy:
        - Split evidences evenly across new clusters
        - Distribute patterns and constraints
        - Create new cluster IDs based on original ID

        Args:
            cluster: KnowledgeCluster to split
            num_splits: Number of clusters to split into (default: 2)

        Returns:
            List of new KnowledgeCluster objects
        """
        if num_splits < 2:
            logger.warning("num_splits must be >= 2, returning original cluster")
            return [cluster]

        try:
            new_clusters = []

            # Split evidences
            evidences_per_cluster = len(cluster.evidences) // num_splits
            if evidences_per_cluster == 0:
                logger.warning("Not enough evidences to split, returning original cluster")
                return [cluster]

            for i in range(num_splits):
                # Create new cluster ID
                new_id = f"{cluster.id}_split{i+1}"

                # Calculate evidence range
                start_idx = i * evidences_per_cluster
                end_idx = start_idx + evidences_per_cluster if i < num_splits - 1 else len(cluster.evidences)

                # Create new cluster
                new_cluster = KnowledgeCluster(
                    id=new_id,
                    name=f"{cluster.name} (part {i+1})",
                    description=cluster.description,
                    content=cluster.content,
                    scripts=cluster.scripts,
                    resources=cluster.resources,
                    evidences=cluster.evidences[start_idx:end_idx],
                    patterns=cluster.patterns[i::num_splits],  # Distribute patterns
                    constraints=cluster.constraints[i::num_splits],  # Distribute constraints
                    confidence=cluster.confidence,
                    abstraction_level=cluster.abstraction_level,
                    landmark_potential=cluster.landmark_potential,
                    hotness=cluster.hotness,
                    lifecycle=Lifecycle.EMERGING,  # New clusters are emerging
                    create_time=datetime.now(),
                    last_modified=datetime.now(),
                    version=1,
                    related_clusters=cluster.related_clusters,
                )

                # Insert new cluster
                await self.insert(new_cluster)
                new_clusters.append(new_cluster)

            # Remove original cluster
            await self.remove(cluster.id)

            logger.info(f"Split cluster {cluster.id} into {num_splits} clusters")
            return new_clusters

        except Exception as e:
            logger.error(f"Failed to split cluster {cluster.id}: {e}")
            return [cluster]

    def get_stats(self) -> Dict[str, Any]:
        """
        Get statistics about stored knowledge clusters

        Returns:
            Dictionary with statistics
        """
        try:
            stats = self.db.analyze_table(self.table_name)

            # Add custom stats
            total_count = self.db.get_table_count(self.table_name)

            # Count by lifecycle
            lifecycle_counts = {}
            for lifecycle in Lifecycle:
                count_row = self.db.fetch_one(
                    f"SELECT COUNT(*) FROM {self.table_name} WHERE lifecycle = ?",
                    [lifecycle.name]
                )
                lifecycle_counts[lifecycle.name] = count_row[0] if count_row else 0

            # Average confidence
            avg_confidence_row = self.db.fetch_one(
                f"SELECT AVG(confidence) FROM {self.table_name} WHERE confidence IS NOT NULL"
            )
            avg_confidence = avg_confidence_row[0] if avg_confidence_row and avg_confidence_row[0] else 0

            stats["custom_stats"] = {
                "total_clusters": total_count,
                "lifecycle_distribution": lifecycle_counts,
                "average_confidence": round(avg_confidence, 4) if avg_confidence else None,
                "parquet_file": self.parquet_file,
                "parquet_exists": Path(self.parquet_file).exists(),
            }

            return stats

        except Exception as e:
            logger.error(f"Failed to get stats: {e}")
            return {}

    def close(self):
        """Close database connection"""
        if self.db:
            self.db.close()
            logger.info("Knowledge Manager closed")

    def __enter__(self):
        """Context manager entry"""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit"""
        self.close()

    def __del__(self):
        """Destructor to ensure connection is closed"""
        if hasattr(self, 'db') and self.db:
            self.close()