sirchmunk 0.0.0__py3-none-any.whl → 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sirchmunk/__init__.py +8 -0
- sirchmunk/base.py +17 -0
- sirchmunk/insight/__init__.py +4 -0
- sirchmunk/insight/text_insights.py +292 -0
- sirchmunk/learnings/__init__.py +1 -0
- sirchmunk/learnings/evidence_processor.py +525 -0
- sirchmunk/learnings/knowledge_base.py +232 -0
- sirchmunk/llm/__init__.py +2 -0
- sirchmunk/llm/openai_chat.py +247 -0
- sirchmunk/llm/prompts.py +216 -0
- sirchmunk/retrieve/__init__.py +1 -0
- sirchmunk/retrieve/base.py +25 -0
- sirchmunk/retrieve/text_retriever.py +1026 -0
- sirchmunk/scan/__init__.py +1 -0
- sirchmunk/scan/base.py +18 -0
- sirchmunk/scan/file_scanner.py +373 -0
- sirchmunk/scan/web_scanner.py +18 -0
- sirchmunk/scheduler/__init__.py +0 -0
- sirchmunk/schema/__init__.py +2 -0
- sirchmunk/schema/cognition.py +106 -0
- sirchmunk/schema/context.py +25 -0
- sirchmunk/schema/knowledge.py +318 -0
- sirchmunk/schema/metadata.py +658 -0
- sirchmunk/schema/request.py +221 -0
- sirchmunk/schema/response.py +20 -0
- sirchmunk/schema/snapshot.py +346 -0
- sirchmunk/search.py +475 -0
- sirchmunk/storage/__init__.py +7 -0
- sirchmunk/storage/duckdb.py +676 -0
- sirchmunk/storage/knowledge_manager.py +720 -0
- sirchmunk/utils/__init__.py +15 -0
- sirchmunk/utils/constants.py +15 -0
- sirchmunk/utils/deps.py +23 -0
- sirchmunk/utils/file_utils.py +70 -0
- sirchmunk/utils/install_rga.py +124 -0
- sirchmunk/utils/log_utils.py +360 -0
- sirchmunk/utils/tokenizer_util.py +55 -0
- sirchmunk/utils/utils.py +108 -0
- sirchmunk/version.py +1 -1
- sirchmunk-0.0.1.dist-info/METADATA +416 -0
- sirchmunk-0.0.1.dist-info/RECORD +45 -0
- {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.dist-info}/WHEEL +1 -1
- sirchmunk-0.0.0.dist-info/METADATA +0 -26
- sirchmunk-0.0.0.dist-info/RECORD +0 -8
- {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.dist-info}/entry_points.txt +0 -0
- {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.dist-info}/licenses/LICENSE +0 -0
- {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.dist-info}/top_level.txt +0 -0
sirchmunk/schema/knowledge.py
@@ -0,0 +1,318 @@
# Copyright (c) ModelScope Contributors. All rights reserved.
from dataclasses import dataclass, field
from datetime import datetime, timezone
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Optional, Set, Union


class Lifecycle(Enum):
    """Lifecycle status of the knowledge cluster, used by Cognition Layer for path planning."""

    STABLE = "stable"
    EMERGING = "emerging"
    CONTESTED = "contested"
    DEPRECATED = "deprecated"


class AbstractionLevel(Enum):
    """Abstraction tier for cognitive mapping and navigation depth control."""

    TECHNIQUE = 1  # e.g., QLoRA, Speculative Decoding
    PRINCIPLE = 2  # e.g., Low-Rank Update, Token Pruning
    PARADIGM = 3  # e.g., Parameter-Efficient Learning
    FOUNDATION = 4  # e.g., Gradient-Based Optimization
    PHILOSOPHY = 5  # e.g., Occam's Razor in ML


@dataclass
class EvidenceUnit:
    """
    Lightweight reference to an evidence unit in the Evidence Layer.

    Enables traceability and dynamic validation without storing full content.
    """

    # ID of the source doc, pointing to FileInfo cache key (FileInfo.get_cache_key())
    # If URL-based, this can be a hash of the URL
    doc_id: str

    # Path or URL to the source document
    # From `file_or_url` in FileInfo
    file_or_url: Union[str, Path]

    # Summary of the snippets
    summary: str

    # Whether relevant evidence was found in the document
    is_found: bool

    # Segments within the document (e.g., paragraph, code snippet)
    # Format: {"snippet": "xxx", "start": 7, "end": 65, "score": 9.0, "reasoning": "xxx"}
    snippets: List[Dict[str, Any]]

    # When this evidence was processed
    extracted_at: datetime

    # IDs of conflict group if this evidence contradicts others
    conflict_group: Optional[List[str]] = None

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize EvidenceUnit to a dictionary.
        """
        return {
            "doc_id": self.doc_id,
            "file_or_url": str(self.file_or_url),
            "summary": self.summary,
            "is_found": self.is_found,
            "snippets": self.snippets,
            "extracted_at": self.extracted_at.isoformat(),
            "conflict_group": self.conflict_group,
        }


@dataclass
class Constraint:
    """
    Boundary condition for safe application of the cluster's conclusions.

    Used by Cognition Layer to activate/deactivate Barrier edges.
    """

    # DSL expression, e.g., "data_size < 100", "model_arch == 'decoder'"
    condition: str

    # "low", "medium", "high" — affects path blocking in Cognition Layer
    severity: str

    # Human-readable explanation of the constraint
    description: str

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize Constraint to a dictionary.
        """
        return {
            "condition": self.condition,
            "severity": self.severity,
            "description": self.description,
        }

    @staticmethod
    def from_dict(data: Dict[str, Any]) -> "Constraint":
        """
        Deserialize Constraint from a dictionary.
        """
        return Constraint(
            condition=data["condition"],
            severity=data["severity"],
            description=data["description"],
        )


@dataclass
class WeakSemanticEdge:
    """
    A lightweight, statistical, probabilistic association to another cluster. Undirected edge.

    Used for:
    - Fast nearest-neighbor search (e.g., "similar topics")
    - Cognitive map layout (force-directed positioning)
    - Fallback path suggestion when rich edges fail
    - Cold-start cluster grouping (e.g., for multi-cluster nodes)

    Weight semantics depend on source:
    - co_occur: P(B | A) from evidence co-mention
    - query_seq: P(next=B | current=A) from user logs
    - embed_sim: cosine similarity of cluster embeddings (if temporarily used)
    """

    target_cluster_id: str  # e.g., "C1005"
    weight: float  # [0.0, 1.0]; higher = stronger association
    source: str  # e.g., "co_occur", "query_seq", "embed_sim"

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize WeakSemanticEdge to a dictionary.
        """
        return {
            "target_cluster_id": self.target_cluster_id,
            "weight": self.weight,
            "source": self.source,
        }

    @staticmethod
    def from_dict(data: Dict[str, Any]) -> "WeakSemanticEdge":
        """
        Deserialize WeakSemanticEdge from a dictionary.
        """
        return WeakSemanticEdge(
            target_cluster_id=data["target_cluster_id"],
            weight=data["weight"],
            source=data["source"],
        )


@dataclass
class KnowledgeCluster:
    """
    A high-level, dynamic, consensus-based knowledge unit distilled from multiple evidence sources.

    Long-term memory notes with persistence.

    Serves as the bridge between raw evidence (Evidence Layer) and cognitive navigation (Cognition Layer).
    Designed for efficient retrieval, evolution, and integration into a cognitive map.
    """

    # Globally unique cluster ID, e.g., "C1007"
    id: str

    # Concise, human-readable name, e.g., "QLoRA: 4-bit Quantized Low-Rank Adaptation"
    name: str

    # Detailed abstract/summary of the knowledge cluster from different perspectives
    description: Union[str, List[str]]

    # The markdown main content of the knowledge cluster, could be table of contents, references, etc.
    content: Union[str, List[str]]

    # Optional code snippets to process or demonstrate the knowledge
    # Each item should be standard code string (e.g., Python, Bash) with function annotations.
    scripts: Optional[List[str]] = None

    # Related resources such as files, URLs
    # Example: [{"type": "url", "value": "https://example.com"}, {"type": "file", "value": "/path/to/file"}]
    resources: Optional[List[Dict[str, Any]]] = None

    # References to supporting evidence items (e.g., paragraphs, code snippets)
    evidences: List[EvidenceUnit] = field(default_factory=list)

    # 3–5 generalizable design principles or mechanisms
    patterns: List[str] = field(default_factory=list)

    # Boundary conditions for safe application
    constraints: List[Constraint] = field(default_factory=list)

    # Total consensus score: aggregated from evidence weights, co-occurrence, and consistency.
    # Range: [0.0, 1.0]; dynamically updated upon ingestion of new evidence.
    confidence: Optional[float] = None

    # Abstraction level (e.g., "conceptual", "architectural", "implementation");
    # guides hierarchy placement and path traversal depth in the cognitive map.
    abstraction_level: Optional[AbstractionLevel] = None

    # Estimated suitability as a cognitive landmark node (e.g., for navigation shortcuts).
    # Range: [0.0, 1.0]
    landmark_potential: Optional[float] = None

    # Activity score reflecting query coverage or update frequency.
    # Range: [0.0, 1.0]
    hotness: Optional[float] = None

    # Structural lifecycle classification of knowledge cluster node
    # critical for path planning and validity of cached shortcuts in Cognition Layer.
    lifecycle: Lifecycle = Lifecycle.EMERGING

    # ISO 8601 timestamp of creation
    create_time: Optional[datetime] = None

    # ISO 8601 timestamp of last structural or semantic update
    last_modified: Optional[datetime] = None

    # Version number; incremented on structural changes (e.g., pattern/constraint updates)
    version: Optional[int] = None

    # Related knowledge clusters for estimated weak semantic connections
    related_clusters: List[WeakSemanticEdge] = None

    # Search results: list of file paths or URLs that were retrieved
    # Used to track which sources contributed to this knowledge cluster
    search_results: List[str] = None

    def __post_init__(self):
        if self.related_clusters is None:
            self.related_clusters = []

        if self.search_results is None:
            self.search_results = []

        if self.create_time is None:
            self.create_time = datetime.now(timezone.utc)

        if self.last_modified is None:
            self.last_modified = datetime.now(timezone.utc)

        if self.version is None:
            self.version = 0

    @property
    def primary_evidence_files(self) -> Set[str]:
        """Return set of unique file IDs backing this cluster — useful for evidence-layer prefetch."""
        return {ref.doc_id for ref in self.evidences}

    def get_conflict_groups(self) -> Set[str]:
        """Extract conflict group IDs for cognitive conflict-aware reasoning."""
        return {gid for ref in self.evidences if ref.conflict_group for gid in ref.conflict_group}

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize KnowledgeCluster to a dictionary.
        """
        return {
            "id": self.id,
            "name": self.name,
            "description": self.description,
            "content": self.content,
            "scripts": self.scripts,
            "resources": self.resources,
            "patterns": self.patterns,
            "constraints": [c.to_dict() for c in self.constraints],
            "evidences": [er.to_dict() for er in self.evidences],
            "confidence": self.confidence,
            "abstraction_level": self.abstraction_level.name if self.abstraction_level else None,
            "landmark_potential": self.landmark_potential,
            "hotness": self.hotness,
            "lifecycle": self.lifecycle.name,
            "create_time": self.create_time.isoformat() if self.create_time else None,
            "last_modified": self.last_modified.isoformat() if self.last_modified else None,
            "version": self.version,
            "related_clusters": [rc.to_dict() for rc in self.related_clusters],
            "search_results": self.search_results,
        }


if __name__ == "__main__":

    # Create instance
    cluster = KnowledgeCluster(
        id="C1001",
        name="Test Cluster",
        description=["A desc from perspective A.", "A desc from perspective B."],
        content="Detailed content of the knowledge cluster.",
        scripts=["print('Hello World')"],
        resources=[
            {"type": "url", "value": "https://example.com"},
            {"type": "file", "value": "/data/image1.png"},
        ],
        patterns=["pattern A", "pattern B"],
        constraints=[Constraint("x > 0", "low", "x must be positive")],
        evidences=[
            EvidenceUnit(
                doc_id="doc1",
                file_or_url=Path("/data/file.txt"),
                summary="Supporting text for the cluster.",
                is_found=True,
                snippets=[{"snippet": "supporting text", "start": 7, "end": 65, "score": 0.9, "reasoning": "direct match"}],
                extracted_at=datetime(2025, 1, 1),
            )
        ],
        confidence=0.85,
        abstraction_level=AbstractionLevel.PRINCIPLE,
        landmark_potential=0.6,
        hotness=0.4,
        lifecycle=Lifecycle.STABLE,
        create_time=datetime(2025, 1, 1),
        last_modified=datetime(2025, 1, 2),
    )

    print(cluster.to_dict())
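
For reference, below is a minimal usage sketch of the schema added in this file. It assumes the module is importable as sirchmunk.schema.knowledge (matching the wheel's file layout) and uses only the classes and methods shown in this diff; the concrete values are illustrative, not part of the package.

# Illustrative sketch; import path assumed from the wheel layout above.
import json
from datetime import datetime, timezone
from pathlib import Path

from sirchmunk.schema.knowledge import (
    AbstractionLevel,
    Constraint,
    EvidenceUnit,
    KnowledgeCluster,
    Lifecycle,
    WeakSemanticEdge,
)

# Constraint and WeakSemanticEdge round-trip through their dict helpers.
constraint = Constraint.from_dict(
    {"condition": "data_size < 100", "severity": "high", "description": "Small-data regime only"}
)
edge = WeakSemanticEdge.from_dict(
    {"target_cluster_id": "C1005", "weight": 0.7, "source": "co_occur"}
)

# A KnowledgeCluster serializes to a JSON-friendly dict via to_dict();
# __post_init__ fills in create_time, last_modified, and version.
cluster = KnowledgeCluster(
    id="C1002",
    name="Example Cluster",
    description="One-line abstract of the cluster.",
    content="Markdown body of the cluster.",
    constraints=[constraint],
    evidences=[
        EvidenceUnit(
            doc_id="doc42",
            file_or_url=Path("/data/paper.pdf"),
            summary="Summary of the matched snippets.",
            is_found=True,
            snippets=[{"snippet": "supporting sentence", "start": 7, "end": 65, "score": 9.0, "reasoning": "keyword match"}],
            extracted_at=datetime.now(timezone.utc),
        )
    ],
    related_clusters=[edge],
    lifecycle=Lifecycle.EMERGING,
    abstraction_level=AbstractionLevel.TECHNIQUE,
)

print(json.dumps(cluster.to_dict(), indent=2))

Note that in the code shown here only Constraint and WeakSemanticEdge expose from_dict(), while EvidenceUnit and KnowledgeCluster expose to_dict() only, so reconstructing a full cluster from JSON would still require custom deserialization.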