sirchmunk-0.0.0-py3-none-any.whl → sirchmunk-0.0.1.post1-py3-none-any.whl

This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (47)
  1. sirchmunk/__init__.py +8 -0
  2. sirchmunk/base.py +17 -0
  3. sirchmunk/insight/__init__.py +4 -0
  4. sirchmunk/insight/text_insights.py +292 -0
  5. sirchmunk/learnings/__init__.py +1 -0
  6. sirchmunk/learnings/evidence_processor.py +525 -0
  7. sirchmunk/learnings/knowledge_base.py +232 -0
  8. sirchmunk/llm/__init__.py +2 -0
  9. sirchmunk/llm/openai_chat.py +247 -0
  10. sirchmunk/llm/prompts.py +216 -0
  11. sirchmunk/retrieve/__init__.py +1 -0
  12. sirchmunk/retrieve/base.py +25 -0
  13. sirchmunk/retrieve/text_retriever.py +1026 -0
  14. sirchmunk/scan/__init__.py +1 -0
  15. sirchmunk/scan/base.py +18 -0
  16. sirchmunk/scan/file_scanner.py +373 -0
  17. sirchmunk/scan/web_scanner.py +18 -0
  18. sirchmunk/scheduler/__init__.py +0 -0
  19. sirchmunk/schema/__init__.py +2 -0
  20. sirchmunk/schema/cognition.py +106 -0
  21. sirchmunk/schema/context.py +25 -0
  22. sirchmunk/schema/knowledge.py +318 -0
  23. sirchmunk/schema/metadata.py +658 -0
  24. sirchmunk/schema/request.py +221 -0
  25. sirchmunk/schema/response.py +20 -0
  26. sirchmunk/schema/snapshot.py +346 -0
  27. sirchmunk/search.py +475 -0
  28. sirchmunk/storage/__init__.py +7 -0
  29. sirchmunk/storage/duckdb.py +676 -0
  30. sirchmunk/storage/knowledge_manager.py +720 -0
  31. sirchmunk/utils/__init__.py +15 -0
  32. sirchmunk/utils/constants.py +15 -0
  33. sirchmunk/utils/deps.py +23 -0
  34. sirchmunk/utils/file_utils.py +70 -0
  35. sirchmunk/utils/install_rga.py +124 -0
  36. sirchmunk/utils/log_utils.py +360 -0
  37. sirchmunk/utils/tokenizer_util.py +55 -0
  38. sirchmunk/utils/utils.py +108 -0
  39. sirchmunk/version.py +1 -1
  40. sirchmunk-0.0.1.post1.dist-info/METADATA +483 -0
  41. sirchmunk-0.0.1.post1.dist-info/RECORD +45 -0
  42. {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.post1.dist-info}/WHEEL +1 -1
  43. sirchmunk-0.0.0.dist-info/METADATA +0 -26
  44. sirchmunk-0.0.0.dist-info/RECORD +0 -8
  45. {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.post1.dist-info}/entry_points.txt +0 -0
  46. {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.post1.dist-info}/licenses/LICENSE +0 -0
  47. {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.post1.dist-info}/top_level.txt +0 -0
sirchmunk/storage/knowledge_manager.py
@@ -0,0 +1,720 @@
+ # Copyright (c) ModelScope Contributors. All rights reserved.
+ """
+ Knowledge Manager using DuckDB and Parquet
+ Manages KnowledgeCluster objects with persistence
+ """
+
+ import os
+ import json
+ from typing import Dict, Any, List, Optional
+ from pathlib import Path
+ from datetime import datetime
+ from loguru import logger
+
+ from .duckdb import DuckDBManager
+ from sirchmunk.schema.knowledge import (
+     KnowledgeCluster,
+     EvidenceUnit,
+     Constraint,
+     WeakSemanticEdge,
+     Lifecycle,
+     AbstractionLevel
+ )
+ from ..utils.constants import DEFAULT_WORK_PATH
+
+
+ class KnowledgeManager:
+     """
+     Manages persistent storage of KnowledgeCluster objects using DuckDB and Parquet
+
+     Architecture:
+     - Uses KnowledgeCluster as core schema
+     - Stores data in Parquet format for efficient storage and querying
+     - Provides full CRUD operations with fuzzy search capabilities
+     - Follows Single Responsibility Principle (SRP)
+
+     Storage Path: {WORK_PATH}/.cache/knowledge/
+     """
+
+     def __init__(self, work_path: Optional[str] = None):
+         """
+         Initialize Knowledge Manager
+
+         Args:
+             work_path: Base work path. If None, uses WORK_PATH env variable
+         """
+         # Get work path from env if not provided
+         if work_path is None:
+             work_path = os.getenv("WORK_PATH", DEFAULT_WORK_PATH)
+
+         # Create knowledge storage path
+         self.knowledge_path = Path(work_path) / ".cache" / "knowledge"
+         self.knowledge_path.mkdir(parents=True, exist_ok=True)
+
+         # Parquet file path
+         self.parquet_file = str(self.knowledge_path / "knowledge_clusters.parquet")
+
+         # Initialize DuckDB (in-memory for fast operations)
+         self.db = DuckDBManager(db_path=None)  # In-memory database
+
+         # Table name
+         self.table_name = "knowledge_clusters"
+
+         # Load data from parquet if exists
+         self._load_from_parquet()
+
+         logger.info(f"Knowledge Manager initialized at: {self.knowledge_path}")
+
+     def _load_from_parquet(self):
+         """Load knowledge clusters from parquet file into DuckDB"""
+         try:
+             if Path(self.parquet_file).exists():
+                 # Drop existing table first to avoid conflicts
+                 self.db.drop_table(self.table_name, if_exists=True)
+                 # Load parquet file into DuckDB table
+                 self.db.import_from_parquet(self.table_name, self.parquet_file, create_table=True)
+                 count = self.db.get_table_count(self.table_name)
+                 logger.info(f"Loaded {count} knowledge clusters from {self.parquet_file}")
+             else:
+                 # Create empty table with schema
+                 self._create_table()
+                 logger.info("Created new knowledge clusters table")
+         except Exception as e:
+             logger.error(f"Failed to load from parquet: {e}")
+             # Try to recreate table
+             self.db.drop_table(self.table_name, if_exists=True)
+             self._create_table()
+
+     def _create_table(self):
+         """Create knowledge clusters table with schema"""
+         schema = {
+             "id": "VARCHAR PRIMARY KEY",
+             "name": "VARCHAR NOT NULL",
+             "description": "VARCHAR",
+             "content": "VARCHAR",
+             "scripts": "VARCHAR",  # JSON array
+             "resources": "VARCHAR",  # JSON array
+             "evidences": "VARCHAR",  # JSON array
+             "patterns": "VARCHAR",  # JSON array
+             "constraints": "VARCHAR",  # JSON array
+             "confidence": "DOUBLE",
+             "abstraction_level": "VARCHAR",
+             "landmark_potential": "DOUBLE",
+             "hotness": "DOUBLE",
+             "lifecycle": "VARCHAR",
+             "create_time": "TIMESTAMP",
+             "last_modified": "TIMESTAMP",
+             "version": "INTEGER",
+             "related_clusters": "VARCHAR",  # JSON array
+             "search_results": "VARCHAR",  # JSON array
+         }
+         self.db.create_table(self.table_name, schema, if_not_exists=True)
+         logger.info(f"Created table {self.table_name}")
+
+     def _save_to_parquet(self):
+         """Save current knowledge clusters to parquet file"""
+         try:
+             # Export table to parquet
+             self.db.export_to_parquet(self.table_name, self.parquet_file)
+             logger.debug(f"Saved knowledge clusters to {self.parquet_file}")
+         except Exception as e:
+             logger.error(f"Failed to save to parquet: {e}")
+             raise
+
+     def _cluster_to_row(self, cluster: KnowledgeCluster) -> Dict[str, Any]:
+         """Convert KnowledgeCluster to database row"""
+         # Handle list/string fields for description and content
+         description_str = (
+             json.dumps(cluster.description)
+             if isinstance(cluster.description, list)
+             else cluster.description
+         )
+         content_str = (
+             json.dumps(cluster.content)
+             if isinstance(cluster.content, list)
+             else cluster.content
+         )
+
+         return {
+             "id": cluster.id,
+             "name": cluster.name,
+             "description": description_str,
+             "content": content_str,
+             "scripts": json.dumps(cluster.scripts) if cluster.scripts else None,
+             "resources": json.dumps(cluster.resources) if cluster.resources else None,
+             "evidences": json.dumps([e.to_dict() for e in cluster.evidences]),
+             "patterns": json.dumps(cluster.patterns),
+             "constraints": json.dumps([c.to_dict() for c in cluster.constraints]),
+             "confidence": cluster.confidence,
+             "abstraction_level": cluster.abstraction_level.name if cluster.abstraction_level else None,
+             "landmark_potential": cluster.landmark_potential,
+             "hotness": cluster.hotness,
+             "lifecycle": cluster.lifecycle.name,
+             "create_time": cluster.create_time.isoformat() if cluster.create_time else None,
+             "last_modified": cluster.last_modified.isoformat() if cluster.last_modified else None,
+             "version": cluster.version,
+             "related_clusters": json.dumps([rc.to_dict() for rc in cluster.related_clusters]),
+             "search_results": json.dumps(cluster.search_results) if cluster.search_results else None,
+         }
+
+     def _row_to_cluster(self, row: tuple) -> KnowledgeCluster:
+         """Convert database row to KnowledgeCluster"""
+         # Unpack row (order matches schema). Older tables may not include search_results.
+         if len(row) == 19:
+             (
+                 id, name, description, content, scripts, resources, evidences, patterns,
+                 constraints, confidence, abstraction_level, landmark_potential, hotness,
+                 lifecycle, create_time, last_modified, version, related_clusters, search_results
+             ) = row
+         elif len(row) == 18:
+             (
+                 id, name, description, content, scripts, resources, evidences, patterns,
+                 constraints, confidence, abstraction_level, landmark_potential, hotness,
+                 lifecycle, create_time, last_modified, version, related_clusters
+             ) = row
+             search_results = None
+         elif len(row) == 17:
+             (
+                 id, name, description, content, scripts, resources, evidences, patterns,
+                 constraints, confidence, abstraction_level, landmark_potential, hotness,
+                 lifecycle, create_time, last_modified, version
+             ) = row
+             related_clusters = None
+             search_results = None
+         else:
+             raise ValueError(f"Unexpected knowledge_clusters row length: {len(row)}")
+
+         # Parse JSON fields
+         try:
+             description_parsed = json.loads(description) if description and description.startswith('[') else description
+         except:
+             description_parsed = description
+
+         try:
+             content_parsed = json.loads(content) if content and content.startswith('[') else content
+         except:
+             content_parsed = content
+
+         scripts_parsed = json.loads(scripts) if scripts else None
+         resources_parsed = json.loads(resources) if resources else None
+         patterns_parsed = json.loads(patterns) if patterns else []
+
+         # Parse evidences
+         evidences_parsed = []
+         if evidences:
+             evidences_data = json.loads(evidences)
+             for ev_dict in evidences_data:
+                 evidences_parsed.append(EvidenceUnit(
+                     doc_id=ev_dict["doc_id"],
+                     file_or_url=Path(ev_dict["file_or_url"]),
+                     summary=ev_dict["summary"],
+                     is_found=ev_dict["is_found"],
+                     snippets=ev_dict["snippets"],
+                     extracted_at=datetime.fromisoformat(ev_dict["extracted_at"]),
+                     conflict_group=ev_dict.get("conflict_group")
+                 ))
+
+         # Parse constraints
+         constraints_parsed = []
+         if constraints:
+             constraints_data = json.loads(constraints)
+             for c_dict in constraints_data:
+                 constraints_parsed.append(Constraint.from_dict(c_dict))
+
+         # Parse related clusters
+         related_clusters_parsed = []
+         if related_clusters:
+             related_data = json.loads(related_clusters)
+             for rc_dict in related_data:
+                 related_clusters_parsed.append(WeakSemanticEdge.from_dict(rc_dict))
+
+         # Parse search results
+         search_results_parsed = []
+         if search_results:
+             search_results_parsed = json.loads(search_results)
+
+         return KnowledgeCluster(
+             id=id,
+             name=name,
+             description=description_parsed,
+             content=content_parsed,
+             scripts=scripts_parsed,
+             resources=resources_parsed,
+             evidences=evidences_parsed,
+             patterns=patterns_parsed,
+             constraints=constraints_parsed,
+             confidence=confidence,
+             abstraction_level=AbstractionLevel[abstraction_level] if abstraction_level else None,
+             landmark_potential=landmark_potential,
+             hotness=hotness,
+             lifecycle=Lifecycle[lifecycle],
+             create_time=datetime.fromisoformat(create_time) if create_time else None,
+             last_modified=datetime.fromisoformat(last_modified) if last_modified else None,
+             version=version,
+             related_clusters=related_clusters_parsed,
+             search_results=search_results_parsed,
+         )
+
+     async def get(self, cluster_id: str) -> Optional[KnowledgeCluster]:
+         """
+         Get a knowledge cluster by ID (exact match)
+
+         Args:
+             cluster_id: Unique cluster ID
+
+         Returns:
+             KnowledgeCluster if found, None otherwise
+         """
+         try:
+             row = self.db.fetch_one(
+                 f"SELECT * FROM {self.table_name} WHERE id = ?",
+                 [cluster_id]
+             )
+
+             if row:
+                 return self._row_to_cluster(row)
+             return None
+
+         except Exception as e:
+             logger.error(f"Failed to get cluster {cluster_id}: {e}")
+             return None
+
+     async def insert(self, cluster: KnowledgeCluster) -> bool:
+         """
+         Insert a new knowledge cluster
+
+         Args:
+             cluster: KnowledgeCluster to insert
+
+         Returns:
+             True if successful, False otherwise
+         """
+         try:
+             # Check if cluster already exists
+             existing = await self.get(cluster.id)
+             if existing:
+                 logger.warning(f"Cluster {cluster.id} already exists, use update() instead")
+                 return False
+
+             # Set creation and modification times if not set
+             if not cluster.create_time:
+                 cluster.create_time = datetime.now()
+             if not cluster.last_modified:
+                 cluster.last_modified = datetime.now()
+             if cluster.version is None:
+                 cluster.version = 1
+
+             # Insert into database
+             row = self._cluster_to_row(cluster)
+             self.db.insert_data(self.table_name, row)
+
+             # Save to parquet
+             self._save_to_parquet()
+
+             logger.info(f"Inserted cluster: {cluster.id}")
+             return True
+
+         except Exception as e:
+             logger.error(f"Failed to insert cluster {cluster.id}: {e}")
+             return False
+
+     async def update(self, cluster: KnowledgeCluster) -> bool:
+         """
+         Update an existing knowledge cluster
+
+         Args:
+             cluster: KnowledgeCluster with updated data
+
+         Returns:
+             True if successful, False otherwise
+         """
+         try:
+             # Check if cluster exists
+             existing = await self.get(cluster.id)
+             if not existing:
+                 logger.warning(f"Cluster {cluster.id} does not exist, use insert() instead")
+                 return False
+
+             # Update modification time and version
+             cluster.last_modified = datetime.now()
+             cluster.version = (cluster.version or 0) + 1
+
+             # Prepare update data
+             row = self._cluster_to_row(cluster)
+             set_clause = {k: v for k, v in row.items() if k != "id"}
+
+             # Update in database
+             self.db.update_data(
+                 self.table_name,
+                 set_clause=set_clause,
+                 where_clause="id = ?",
+                 where_params=[cluster.id]
+             )
+
+             # Save to parquet
+             self._save_to_parquet()
+
+             logger.info(f"Updated cluster: {cluster.id} (version {cluster.version})")
+             return True
+
+         except Exception as e:
+             logger.error(f"Failed to update cluster {cluster.id}: {e}")
+             return False
+
+     async def remove(self, cluster_id: str) -> bool:
+         """
+         Remove a knowledge cluster by ID
+
+         Args:
+             cluster_id: Unique cluster ID
+
+         Returns:
+             True if successful, False otherwise
+         """
+         try:
+             # Check if cluster exists
+             existing = await self.get(cluster_id)
+             if not existing:
+                 logger.warning(f"Cluster {cluster_id} does not exist")
+                 return False
+
+             # Delete from database
+             self.db.delete_data(self.table_name, "id = ?", [cluster_id])
+
+             # Save to parquet
+             self._save_to_parquet()
+
+             logger.info(f"Removed cluster: {cluster_id}")
+             return True
+
+         except Exception as e:
+             logger.error(f"Failed to remove cluster {cluster_id}: {e}")
+             return False
+
+     async def clear(self) -> bool:
+         """
+         Clear all knowledge clusters
+
+         Returns:
+             True if successful, False otherwise
+         """
+         try:
+             # Drop and recreate table
+             self.db.drop_table(self.table_name, if_exists=True)
+             self._create_table()
+
+             # Delete parquet file
+             if Path(self.parquet_file).exists():
+                 Path(self.parquet_file).unlink()
+
+             logger.info("Cleared all knowledge clusters")
+             return True
+
+         except Exception as e:
+             logger.error(f"Failed to clear knowledge clusters: {e}")
+             return False
+
+     async def find(self, query: str, limit: int = 10) -> List[KnowledgeCluster]:
+         """
+         Find knowledge clusters using fuzzy search
+         Searches in: id, name, description, content, patterns
+
+         Args:
+             query: Search query string
+             limit: Maximum number of results to return
+
+         Returns:
+             List of matching KnowledgeCluster objects
+         """
+         try:
+             # Fuzzy search using LIKE with wildcards
+             search_pattern = f"%{query}%"
+
+             sql = f"""
+                 SELECT * FROM {self.table_name}
+                 WHERE
+                     id LIKE ? OR
+                     name LIKE ? OR
+                     description LIKE ? OR
+                     content LIKE ? OR
+                     patterns LIKE ?
+                 ORDER BY
+                     CASE
+                         WHEN id = ? THEN 1
+                         WHEN name LIKE ? THEN 2
+                         WHEN description LIKE ? THEN 3
+                         ELSE 4
+                     END
+                 LIMIT ?
+             """
+
+             params = [
+                 search_pattern,  # id LIKE
+                 search_pattern,  # name LIKE
+                 search_pattern,  # description LIKE
+                 search_pattern,  # content LIKE
+                 search_pattern,  # patterns LIKE
+                 query,  # exact id match
+                 f"{query}%",  # name starts with
+                 f"%{query}%",  # description contains
+                 limit
+             ]
+
+             rows = self.db.fetch_all(sql, params)
+
+             clusters = [self._row_to_cluster(row) for row in rows]
+
+             logger.debug(f"Found {len(clusters)} clusters matching '{query}'")
+             return clusters
+
+         except Exception as e:
+             logger.error(f"Failed to search clusters with query '{query}': {e}")
+             return []
+
+     async def merge(self, clusters: List[KnowledgeCluster]) -> Optional[KnowledgeCluster]:
+         """
+         Merge multiple knowledge clusters into one
+
+         Strategy:
+         - Use first cluster as base
+         - Merge evidences, patterns, constraints from all clusters
+         - Average numeric scores (confidence, hotness, etc.)
+         - Update version and timestamps
+
+         Args:
+             clusters: List of KnowledgeCluster objects to merge
+
+         Returns:
+             Merged KnowledgeCluster, or None if merge fails
+         """
+         if not clusters:
+             logger.warning("No clusters to merge")
+             return None
+
+         if len(clusters) == 1:
+             logger.warning("Only one cluster provided, returning as-is")
+             return clusters[0]
+
+         try:
+             # Use first cluster as base
+             merged = clusters[0]
+
+             # Merge content and descriptions
+             all_descriptions = []
+             all_contents = []
+
+             for cluster in clusters:
+                 # Handle descriptions
+                 if isinstance(cluster.description, list):
+                     all_descriptions.extend(cluster.description)
+                 else:
+                     all_descriptions.append(cluster.description)
+
+                 # Handle contents
+                 if isinstance(cluster.content, list):
+                     all_contents.extend(cluster.content)
+                 else:
+                     all_contents.append(cluster.content)
+
+             merged.description = list(set(all_descriptions))  # Deduplicate
+             merged.content = list(set(all_contents))  # Deduplicate
+
+             # Merge evidences (deduplicate by doc_id)
+             evidences_map = {}
+             for cluster in clusters:
+                 for evidence in cluster.evidences:
+                     if evidence.doc_id not in evidences_map:
+                         evidences_map[evidence.doc_id] = evidence
+             merged.evidences = list(evidences_map.values())
+
+             # Merge patterns (deduplicate)
+             all_patterns = []
+             for cluster in clusters:
+                 all_patterns.extend(cluster.patterns)
+             merged.patterns = list(set(all_patterns))
+
+             # Merge constraints (deduplicate by condition)
+             constraints_map = {}
+             for cluster in clusters:
+                 for constraint in cluster.constraints:
+                     if constraint.condition not in constraints_map:
+                         constraints_map[constraint.condition] = constraint
+             merged.constraints = list(constraints_map.values())
+
+             # Merge related clusters (deduplicate by target_cluster_id)
+             related_map = {}
+             for cluster in clusters:
+                 for related in cluster.related_clusters:
+                     if related.target_cluster_id not in related_map:
+                         related_map[related.target_cluster_id] = related
+                     else:
+                         # Average weights if duplicate
+                         existing = related_map[related.target_cluster_id]
+                         existing.weight = (existing.weight + related.weight) / 2
+             merged.related_clusters = list(related_map.values())
+
+             # Average numeric scores
+             valid_confidences = [c.confidence for c in clusters if c.confidence is not None]
+             if valid_confidences:
+                 merged.confidence = sum(valid_confidences) / len(valid_confidences)
+
+             valid_hotness = [c.hotness for c in clusters if c.hotness is not None]
+             if valid_hotness:
+                 merged.hotness = sum(valid_hotness) / len(valid_hotness)
+
+             valid_landmark = [c.landmark_potential for c in clusters if c.landmark_potential is not None]
+             if valid_landmark:
+                 merged.landmark_potential = sum(valid_landmark) / len(valid_landmark)
+
+             # Update metadata
+             merged.name = f"{merged.name} (merged)"
+             merged.last_modified = datetime.now()
+             merged.version = (merged.version or 0) + 1
+
+             # Update the merged cluster in database
+             await self.update(merged)
+
+             # Remove source clusters (except the first one which is now merged)
+             for cluster in clusters[1:]:
+                 await self.remove(cluster.id)
+
+             logger.info(f"Merged {len(clusters)} clusters into {merged.id}")
+             return merged
+
+         except Exception as e:
+             logger.error(f"Failed to merge clusters: {e}")
+             return None
+
+     async def split(self, cluster: KnowledgeCluster, num_splits: int = 2) -> List[KnowledgeCluster]:
+         """
+         Split a knowledge cluster into multiple smaller clusters
+
+         Strategy:
+         - Split evidences evenly across new clusters
+         - Distribute patterns and constraints
+         - Create new cluster IDs based on original ID
+
+         Args:
+             cluster: KnowledgeCluster to split
+             num_splits: Number of clusters to split into (default: 2)
+
+         Returns:
+             List of new KnowledgeCluster objects
+         """
+         if num_splits < 2:
+             logger.warning("num_splits must be >= 2, returning original cluster")
+             return [cluster]
+
+         try:
+             new_clusters = []
+
+             # Split evidences
+             evidences_per_cluster = len(cluster.evidences) // num_splits
+             if evidences_per_cluster == 0:
+                 logger.warning("Not enough evidences to split, returning original cluster")
+                 return [cluster]
+
+             for i in range(num_splits):
+                 # Create new cluster ID
+                 new_id = f"{cluster.id}_split{i+1}"
+
+                 # Calculate evidence range
+                 start_idx = i * evidences_per_cluster
+                 end_idx = start_idx + evidences_per_cluster if i < num_splits - 1 else len(cluster.evidences)
+
+                 # Create new cluster
+                 new_cluster = KnowledgeCluster(
+                     id=new_id,
+                     name=f"{cluster.name} (part {i+1})",
+                     description=cluster.description,
+                     content=cluster.content,
+                     scripts=cluster.scripts,
+                     resources=cluster.resources,
+                     evidences=cluster.evidences[start_idx:end_idx],
+                     patterns=cluster.patterns[i::num_splits],  # Distribute patterns
+                     constraints=cluster.constraints[i::num_splits],  # Distribute constraints
+                     confidence=cluster.confidence,
+                     abstraction_level=cluster.abstraction_level,
+                     landmark_potential=cluster.landmark_potential,
+                     hotness=cluster.hotness,
+                     lifecycle=Lifecycle.EMERGING,  # New clusters are emerging
+                     create_time=datetime.now(),
+                     last_modified=datetime.now(),
+                     version=1,
+                     related_clusters=cluster.related_clusters,
+                 )
+
+                 # Insert new cluster
+                 await self.insert(new_cluster)
+                 new_clusters.append(new_cluster)
+
+             # Remove original cluster
+             await self.remove(cluster.id)
+
+             logger.info(f"Split cluster {cluster.id} into {num_splits} clusters")
+             return new_clusters
+
+         except Exception as e:
+             logger.error(f"Failed to split cluster {cluster.id}: {e}")
+             return [cluster]
+
+     def get_stats(self) -> Dict[str, Any]:
+         """
+         Get statistics about stored knowledge clusters
+
+         Returns:
+             Dictionary with statistics
+         """
+         try:
+             stats = self.db.analyze_table(self.table_name)
+
+             # Add custom stats
+             total_count = self.db.get_table_count(self.table_name)
+
+             # Count by lifecycle
+             lifecycle_counts = {}
+             for lifecycle in Lifecycle:
+                 count_row = self.db.fetch_one(
+                     f"SELECT COUNT(*) FROM {self.table_name} WHERE lifecycle = ?",
+                     [lifecycle.name]
+                 )
+                 lifecycle_counts[lifecycle.name] = count_row[0] if count_row else 0
+
+             # Average confidence
+             avg_confidence_row = self.db.fetch_one(
+                 f"SELECT AVG(confidence) FROM {self.table_name} WHERE confidence IS NOT NULL"
+             )
+             avg_confidence = avg_confidence_row[0] if avg_confidence_row and avg_confidence_row[0] else 0
+
+             stats["custom_stats"] = {
+                 "total_clusters": total_count,
+                 "lifecycle_distribution": lifecycle_counts,
+                 "average_confidence": round(avg_confidence, 4) if avg_confidence else None,
+                 "parquet_file": self.parquet_file,
+                 "parquet_exists": Path(self.parquet_file).exists(),
+             }
+
+             return stats
+
+         except Exception as e:
+             logger.error(f"Failed to get stats: {e}")
+             return {}
+
+     def close(self):
+         """Close database connection"""
+         if self.db:
+             self.db.close()
+             logger.info("Knowledge Manager closed")
+
+     def __enter__(self):
+         """Context manager entry"""
+         return self
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         """Context manager exit"""
+         self.close()
+
+     def __del__(self):
+         """Destructor to ensure connection is closed"""
+         if hasattr(self, 'db') and self.db:
+             self.close()
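
For orientation, below is a minimal usage sketch of the KnowledgeManager added in this release. It is not taken from the package documentation: the work_path value is hypothetical, and it assumes KnowledgeCluster (defined in sirchmunk/schema/knowledge.py, not shown in this hunk) can be constructed from just id, name, description, and content with the remaining fields defaulting. The async CRUD calls, find(), get_stats(), and close() follow the signatures visible in the diff above.

import asyncio

from sirchmunk.schema.knowledge import KnowledgeCluster
from sirchmunk.storage.knowledge_manager import KnowledgeManager


async def main():
    # Hypothetical work path; clusters are persisted under <work_path>/.cache/knowledge/
    manager = KnowledgeManager(work_path="/tmp/sirchmunk-demo")
    try:
        # Assumption: the remaining KnowledgeCluster fields (evidences, confidence, ...) have defaults
        cluster = KnowledgeCluster(
            id="duckdb-parquet-notes",
            name="DuckDB + Parquet persistence",
            description="How sirchmunk persists knowledge clusters",
            content="Clusters live in an in-memory DuckDB table mirrored to a Parquet file.",
        )
        await manager.insert(cluster)        # returns False if the id already exists
        hits = await manager.find("duckdb")  # fuzzy LIKE search over id/name/description/content/patterns
        print([c.id for c in hits])
        print(manager.get_stats().get("custom_stats", {}))
    finally:
        manager.close()


asyncio.run(main())

Note that every insert(), update(), and remove() re-exports the whole table to the Parquet file, so bulk loads pay one export per call.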