sirchmunk 0.0.0__py3-none-any.whl → 0.0.1.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. sirchmunk/__init__.py +8 -0
  2. sirchmunk/base.py +17 -0
  3. sirchmunk/insight/__init__.py +4 -0
  4. sirchmunk/insight/text_insights.py +292 -0
  5. sirchmunk/learnings/__init__.py +1 -0
  6. sirchmunk/learnings/evidence_processor.py +525 -0
  7. sirchmunk/learnings/knowledge_base.py +232 -0
  8. sirchmunk/llm/__init__.py +2 -0
  9. sirchmunk/llm/openai_chat.py +247 -0
  10. sirchmunk/llm/prompts.py +216 -0
  11. sirchmunk/retrieve/__init__.py +1 -0
  12. sirchmunk/retrieve/base.py +25 -0
  13. sirchmunk/retrieve/text_retriever.py +1026 -0
  14. sirchmunk/scan/__init__.py +1 -0
  15. sirchmunk/scan/base.py +18 -0
  16. sirchmunk/scan/file_scanner.py +373 -0
  17. sirchmunk/scan/web_scanner.py +18 -0
  18. sirchmunk/scheduler/__init__.py +0 -0
  19. sirchmunk/schema/__init__.py +2 -0
  20. sirchmunk/schema/cognition.py +106 -0
  21. sirchmunk/schema/context.py +25 -0
  22. sirchmunk/schema/knowledge.py +318 -0
  23. sirchmunk/schema/metadata.py +658 -0
  24. sirchmunk/schema/request.py +221 -0
  25. sirchmunk/schema/response.py +20 -0
  26. sirchmunk/schema/snapshot.py +346 -0
  27. sirchmunk/search.py +475 -0
  28. sirchmunk/storage/__init__.py +7 -0
  29. sirchmunk/storage/duckdb.py +676 -0
  30. sirchmunk/storage/knowledge_manager.py +720 -0
  31. sirchmunk/utils/__init__.py +15 -0
  32. sirchmunk/utils/constants.py +15 -0
  33. sirchmunk/utils/deps.py +23 -0
  34. sirchmunk/utils/file_utils.py +70 -0
  35. sirchmunk/utils/install_rga.py +124 -0
  36. sirchmunk/utils/log_utils.py +360 -0
  37. sirchmunk/utils/tokenizer_util.py +55 -0
  38. sirchmunk/utils/utils.py +108 -0
  39. sirchmunk/version.py +1 -1
  40. sirchmunk-0.0.1.post1.dist-info/METADATA +483 -0
  41. sirchmunk-0.0.1.post1.dist-info/RECORD +45 -0
  42. {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.post1.dist-info}/WHEEL +1 -1
  43. sirchmunk-0.0.0.dist-info/METADATA +0 -26
  44. sirchmunk-0.0.0.dist-info/RECORD +0 -8
  45. {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.post1.dist-info}/entry_points.txt +0 -0
  46. {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.post1.dist-info}/licenses/LICENSE +0 -0
  47. {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.post1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,318 @@
1
+ # Copyright (c) ModelScope Contributors. All rights reserved.
2
+ from dataclasses import dataclass, field
3
+ from datetime import datetime, timezone
4
+ from enum import Enum
5
+ from pathlib import Path
6
+ from typing import Any, Dict, List, Optional, Set, Union
7
+
8
+
9
class Lifecycle(Enum):
    """Lifecycle status of a knowledge cluster.

    Consumed by the Cognition Layer when planning paths: the status tells the
    planner how much trust to place in cached routes through this cluster.
    """

    STABLE = "stable"          # well-established, safe to rely on
    EMERGING = "emerging"      # newly formed, still accumulating evidence
    CONTESTED = "contested"    # conflicting evidence exists
    DEPRECATED = "deprecated"  # superseded or invalidated
16
+
17
+
18
class AbstractionLevel(Enum):
    """Abstraction tier used for cognitive mapping and navigation depth control.

    Lower numbers are concrete techniques; higher numbers are increasingly
    abstract principles.
    """

    TECHNIQUE = 1   # e.g., QLoRA, Speculative Decoding
    PRINCIPLE = 2   # e.g., Low-Rank Update, Token Pruning
    PARADIGM = 3    # e.g., Parameter-Efficient Learning
    FOUNDATION = 4  # e.g., Gradient-Based Optimization
    PHILOSOPHY = 5  # e.g., Occam's Razor in ML
26
+
27
+
28
@dataclass
class EvidenceUnit:
    """
    Lightweight reference to an evidence unit in the Evidence Layer.

    Enables traceability and dynamic validation without storing full content.
    """

    # ID of the source doc, pointing to FileInfo cache key (FileInfo.get_cache_key())
    # If URL-based, this can be a hash of the URL
    doc_id: str

    # Path or URL to the source document (from `file_or_url` in FileInfo)
    file_or_url: Union[str, Path]

    # Summary of the extracted snippets
    summary: str

    # Whether relevant evidence was actually found in the document
    is_found: bool

    # Segments within the document (e.g., paragraph, code snippet)
    # Format: {"snippet": "xxx", "start": 7, "end": 65, "score": 9.0, "reasoning": "xxx"}
    snippets: List[Dict[str, Any]]

    # When this evidence was processed
    extracted_at: datetime

    # IDs of conflict group if this evidence contradicts others
    conflict_group: Optional[List[str]] = None

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize EvidenceUnit to a dictionary.

        `file_or_url` is coerced to str and `extracted_at` to an ISO-8601
        string so the result is JSON-serializable.
        """
        return {
            "doc_id": self.doc_id,
            "file_or_url": str(self.file_or_url),
            "summary": self.summary,
            "is_found": self.is_found,
            "snippets": self.snippets,
            "extracted_at": self.extracted_at.isoformat(),
            "conflict_group": self.conflict_group,
        }

    @staticmethod
    def from_dict(data: Dict[str, Any]) -> "EvidenceUnit":
        """
        Deserialize EvidenceUnit from a dictionary (inverse of `to_dict`).

        Added for consistency with Constraint.from_dict and
        WeakSemanticEdge.from_dict.
        """
        return EvidenceUnit(
            doc_id=data["doc_id"],
            file_or_url=data["file_or_url"],
            summary=data["summary"],
            is_found=data["is_found"],
            snippets=data["snippets"],
            extracted_at=datetime.fromisoformat(data["extracted_at"]),
            conflict_group=data.get("conflict_group"),
        )
73
+
74
+
75
@dataclass
class Constraint:
    """
    Boundary condition for safe application of the cluster's conclusions.

    Used by Cognition Layer to activate/deactivate Barrier edges.
    """

    # DSL expression, e.g., "data_size < 100", "model_arch == 'decoder'"
    condition: str

    # "low", "medium", "high" — affects path blocking in Cognition Layer
    severity: str

    # Human-readable explanation of the constraint
    description: str

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize Constraint to a dictionary.
        """
        return dict(
            condition=self.condition,
            severity=self.severity,
            description=self.description,
        )

    @staticmethod
    def from_dict(data: Dict[str, Any]) -> "Constraint":
        """
        Deserialize Constraint from a dictionary.
        """
        return Constraint(
            data["condition"],
            data["severity"],
            data["description"],
        )
112
+
113
+
114
@dataclass
class WeakSemanticEdge:
    """
    A lightweight, statistical, probabilistic association to another cluster.
    Undirected edge.

    Used for:
    - Fast nearest-neighbor search (e.g., "similar topics")
    - Cognitive map layout (force-directed positioning)
    - Fallback path suggestion when rich edges fail
    - Cold-start cluster grouping (e.g., for multi-cluster nodes)

    Weight semantics depend on source:
    - co_occur: P(B | A) from evidence co-mention
    - query_seq: P(next=B | current=A) from user logs
    - embed_sim: cosine similarity of cluster embeddings (if temporarily used)
    """

    target_cluster_id: str  # e.g., "C1005"
    weight: float           # [0.0, 1.0]; higher = stronger association
    source: str             # e.g., "co_occur", "query_seq", "embed_sim"

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize WeakSemanticEdge to a dictionary.
        """
        return dict(
            target_cluster_id=self.target_cluster_id,
            weight=self.weight,
            source=self.source,
        )

    @staticmethod
    def from_dict(data: Dict[str, Any]) -> "WeakSemanticEdge":
        """
        Deserialize WeakSemanticEdge from a dictionary.
        """
        return WeakSemanticEdge(
            data["target_cluster_id"],
            data["weight"],
            data["source"],
        )
155
+
156
+
157
@dataclass
class KnowledgeCluster:
    """
    A high-level, dynamic, consensus-based knowledge unit distilled from multiple evidence sources.

    Long-term memory notes with persistence.

    Serves as the bridge between raw evidence (Evidence Layer) and cognitive navigation (Cognition Layer).
    Designed for efficient retrieval, evolution, and integration into a cognitive map.
    """

    # Globally unique cluster ID, e.g., "C1007"
    id: str

    # Concise, human-readable name, e.g., "QLoRA: 4-bit Quantized Low-Rank Adaptation"
    name: str

    # Detailed abstract/summary of the knowledge cluster from different perspectives
    description: Union[str, List[str]]

    # The markdown main content of the knowledge cluster, could be table of contents, references, etc.
    content: Union[str, List[str]]

    # Optional code snippets to process or demonstrate the knowledge
    # Each item should be standard code string (e.g., Python, Bash) with function annotations.
    scripts: Optional[List[str]] = None

    # Related resources such as files, URLs
    # Example: [{"type": "url", "value": "https://example.com"}, {"type": "file", "value": "/path/to/file"}]
    resources: Optional[List[Dict[str, Any]]] = None

    # References to supporting evidence items (e.g., paragraphs, code snippets)
    evidences: List[EvidenceUnit] = field(default_factory=list)

    # 3–5 generalizable design principles or mechanisms
    patterns: List[str] = field(default_factory=list)

    # Boundary conditions for safe application
    constraints: List[Constraint] = field(default_factory=list)

    # Total consensus score: aggregated from evidence weights, co-occurrence, and consistency.
    # Range: [0.0, 1.0]; dynamically updated upon ingestion of new evidence.
    confidence: Optional[float] = None

    # Abstraction level; guides hierarchy placement and path traversal depth in the cognitive map.
    abstraction_level: Optional[AbstractionLevel] = None

    # Estimated suitability as a cognitive landmark node (e.g., for navigation shortcuts).
    # Range: [0.0, 1.0]
    landmark_potential: Optional[float] = None

    # Activity score reflecting query coverage or update frequency.
    # Range: [0.0, 1.0]
    hotness: Optional[float] = None

    # Structural lifecycle classification of knowledge cluster node;
    # critical for path planning and validity of cached shortcuts in Cognition Layer.
    lifecycle: Lifecycle = Lifecycle.EMERGING

    # ISO 8601 timestamp of creation
    create_time: Optional[datetime] = None

    # ISO 8601 timestamp of last structural or semantic update
    last_modified: Optional[datetime] = None

    # Version number; incremented on structural changes (e.g., pattern/constraint updates)
    version: Optional[int] = None

    # Related knowledge clusters for estimated weak semantic connections.
    # default_factory avoids the shared-None sentinel; explicit None is still
    # normalized in __post_init__ for backward compatibility.
    related_clusters: List[WeakSemanticEdge] = field(default_factory=list)

    # Search results: list of file paths or URLs that were retrieved.
    # Used to track which sources contributed to this knowledge cluster.
    search_results: List[str] = field(default_factory=list)

    def __post_init__(self):
        # Normalize explicit-None arguments and fill in timestamps/version.
        if self.related_clusters is None:
            self.related_clusters = []

        if self.search_results is None:
            self.search_results = []

        if self.create_time is None:
            self.create_time = datetime.now(timezone.utc)

        if self.last_modified is None:
            self.last_modified = datetime.now(timezone.utc)

        if self.version is None:
            self.version = 0

    @property
    def primary_evidence_files(self) -> Set[str]:
        """Return set of unique file IDs backing this cluster — useful for evidence-layer prefetch."""
        return {ref.doc_id for ref in self.evidences}

    def get_conflict_groups(self) -> Set[str]:
        """Extract conflict group IDs for cognitive conflict-aware reasoning.

        Bug fix: ``EvidenceUnit.conflict_group`` is an Optional[List[str]];
        the original code put the lists themselves into a set, which raises
        TypeError (lists are unhashable). Flatten the lists into a set of
        group IDs instead, matching the declared ``Set[str]`` return type.
        """
        return {
            group_id
            for ref in self.evidences
            if ref.conflict_group
            for group_id in ref.conflict_group
        }

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize KnowledgeCluster to a dictionary.

        Nested dataclasses are serialized via their own to_dict(); enums are
        reduced to their names and datetimes to ISO-8601 strings.
        """
        return {
            "id": self.id,
            "name": self.name,
            "description": self.description,
            "content": self.content,
            "scripts": self.scripts,
            "resources": self.resources,
            "patterns": self.patterns,
            "constraints": [c.to_dict() for c in self.constraints],
            "evidences": [er.to_dict() for er in self.evidences],
            "confidence": self.confidence,
            "abstraction_level": self.abstraction_level.name if self.abstraction_level else None,
            "landmark_potential": self.landmark_potential,
            "hotness": self.hotness,
            "lifecycle": self.lifecycle.name,
            "create_time": self.create_time.isoformat() if self.create_time else None,
            "last_modified": self.last_modified.isoformat() if self.last_modified else None,
            "version": self.version,
            "related_clusters": [rc.to_dict() for rc in self.related_clusters],
            "search_results": self.search_results,
        }
283
+
284
+
285
if __name__ == "__main__":
    # Smoke test: build a fully-populated cluster and print its serialized form.
    cluster = KnowledgeCluster(
        id="C1001",
        name="Test Cluster",
        description=["A desc from perspective A.", "A desc from perspective B."],
        content="Detailed content of the knowledge cluster.",
        scripts=["print('Hello World')"],
        resources=[
            {"type": "url", "value": "https://example.com"},
            {"type": "file", "value": "/data/image1.png"},
        ],
        patterns=["pattern A", "pattern B"],
        constraints=[Constraint("x > 0", "low", "x must be positive")],
        evidences=[
            # Fixed: the original passed `segment=` and `score=` keyword
            # arguments that EvidenceUnit does not define (TypeError at
            # runtime) and omitted the required `summary`, `is_found`, and
            # `snippets` fields. Use the actual dataclass fields.
            EvidenceUnit(
                doc_id="doc1",
                file_or_url=Path("/data/file.txt"),
                summary="supporting text",
                is_found=True,
                snippets=[
                    {
                        "snippet": "supporting text",
                        "start": 10,
                        "end": 25,
                        "score": 0.9,
                        "reasoning": "direct match",
                    }
                ],
                extracted_at=datetime(2025, 1, 1),
            )
        ],
        confidence=0.85,
        abstraction_level=AbstractionLevel.PRINCIPLE,
        landmark_potential=0.6,
        hotness=0.4,
        lifecycle=Lifecycle.STABLE,
        create_time=datetime(2025, 1, 1),
        last_modified=datetime(2025, 1, 2),
    )

    print(cluster.to_dict())