sirchmunk 0.0.0__py3-none-any.whl → 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sirchmunk/__init__.py +8 -0
- sirchmunk/base.py +17 -0
- sirchmunk/insight/__init__.py +4 -0
- sirchmunk/insight/text_insights.py +292 -0
- sirchmunk/learnings/__init__.py +1 -0
- sirchmunk/learnings/evidence_processor.py +525 -0
- sirchmunk/learnings/knowledge_base.py +232 -0
- sirchmunk/llm/__init__.py +2 -0
- sirchmunk/llm/openai_chat.py +247 -0
- sirchmunk/llm/prompts.py +216 -0
- sirchmunk/retrieve/__init__.py +1 -0
- sirchmunk/retrieve/base.py +25 -0
- sirchmunk/retrieve/text_retriever.py +1026 -0
- sirchmunk/scan/__init__.py +1 -0
- sirchmunk/scan/base.py +18 -0
- sirchmunk/scan/file_scanner.py +373 -0
- sirchmunk/scan/web_scanner.py +18 -0
- sirchmunk/scheduler/__init__.py +0 -0
- sirchmunk/schema/__init__.py +2 -0
- sirchmunk/schema/cognition.py +106 -0
- sirchmunk/schema/context.py +25 -0
- sirchmunk/schema/knowledge.py +318 -0
- sirchmunk/schema/metadata.py +658 -0
- sirchmunk/schema/request.py +221 -0
- sirchmunk/schema/response.py +20 -0
- sirchmunk/schema/snapshot.py +346 -0
- sirchmunk/search.py +475 -0
- sirchmunk/storage/__init__.py +7 -0
- sirchmunk/storage/duckdb.py +676 -0
- sirchmunk/storage/knowledge_manager.py +720 -0
- sirchmunk/utils/__init__.py +15 -0
- sirchmunk/utils/constants.py +15 -0
- sirchmunk/utils/deps.py +23 -0
- sirchmunk/utils/file_utils.py +70 -0
- sirchmunk/utils/install_rga.py +124 -0
- sirchmunk/utils/log_utils.py +360 -0
- sirchmunk/utils/tokenizer_util.py +55 -0
- sirchmunk/utils/utils.py +108 -0
- sirchmunk/version.py +1 -1
- sirchmunk-0.0.1.dist-info/METADATA +416 -0
- sirchmunk-0.0.1.dist-info/RECORD +45 -0
- {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.dist-info}/WHEEL +1 -1
- sirchmunk-0.0.0.dist-info/METADATA +0 -26
- sirchmunk-0.0.0.dist-info/RECORD +0 -8
- {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.dist-info}/entry_points.txt +0 -0
- {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.dist-info}/licenses/LICENSE +0 -0
- {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.dist-info}/top_level.txt +0 -0
sirchmunk/utils/utils.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
# Copyright (c) ModelScope Contributors. All rights reserved.
|
|
2
|
+
import math
|
|
3
|
+
import re
|
|
4
|
+
from typing import Dict, List, LiteralString, Optional
|
|
5
|
+
|
|
6
|
+
from pydantic import RootModel, model_validator
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class KeywordValidation(RootModel):
    # Root model whose payload maps each keyword to a float score.
    # After validation every score is clamped into [1.0, 10.0]; values
    # outside that range are adjusted rather than rejected.
    root: Dict[str, float]

    @model_validator(mode="after")
    def validate_values(self) -> "KeywordValidation":
        """Clamp every keyword score into the closed range [1.0, 10.0]."""
        clamped = {
            keyword: max(1.0, min(10.0, score))
            for keyword, score in self.root.items()
        }
        # Write back into the existing mapping so the same dict object is kept.
        self.root.update(clamped)
        return self
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def log_tf_norm(count: int) -> float:
    """Log normalization for term frequency.

    Args:
        count (int): Raw number of occurrences of a term.

    Returns:
        float: ``1 + ln(count)`` when ``count`` is positive, otherwise ``0.0``.
    """
    if count > 0:
        return 1.0 + math.log(count)
    # Return a float here as well so callers always receive the same type.
    return 0.0
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def log_tf_norm_penalty(count, ideal_range=(1, 5), penalty_alpha=0.2):
    """Log-scaled term-frequency score that is damped outside an ideal range.

    Args:
        count: Number of occurrences of the term.
        ideal_range: ``(low, high)`` bounds; counts below ``low`` or above
            ``high`` have their score reduced.
        penalty_alpha: Decay rate of the exponential penalty applied to
            counts above ``high``.

    Returns:
        float: ``0.0`` for non-positive counts, otherwise ``ln(count + 1)``
        multiplied by whichever penalties apply.
    """
    if count <= 0:
        return 0.0

    lower, upper = ideal_range
    result = math.log(count + 1)

    # Too rare: scale linearly by how far the count falls short of ``lower``.
    if count < lower:
        result = result * (count / lower)

    # Too frequent: exponential decay in the square root of the excess.
    if count > upper:
        excess = count - upper
        result = result * math.exp(-penalty_alpha * (excess**0.5))

    return result
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def extract_fields(
    content: str, tags: Optional[List[str]] = None
) -> Dict[str, Optional[str]]:
    """
    Extracts specified fields from the LLM output content.
    e.g. <DESCRIPTION>xxx</DESCRIPTION>, <NAME>xxx</NAME>, <CONTENT>xxx</CONTENT>.

    Only the first occurrence of each tag pair is used. Tag names are matched
    literally (regex metacharacters in a tag are escaped) and case-sensitively.

    Args:
        content (str): The raw output content from the LLM.
        tags (Optional[List[str]]): List of tags to extract. Defaults to
            ["DESCRIPTION", "NAME", "CONTENT"] when omitted or empty.

    Returns:
        Dict[str, Optional[str]]: A dictionary with extracted fields.
            Keys are the lowercase tag names, and values are the extracted
            content (whitespace-stripped) or None if not found.
    """
    tags = tags or ["DESCRIPTION", "NAME", "CONTENT"]
    extracted_data: Dict[str, Optional[str]] = {}

    for tag in tags:
        # Escape the tag so names such as "C++" or "A.B" are treated as plain
        # text instead of being interpreted as regex syntax.
        escaped = re.escape(tag)
        # Non-greedy capture between the opening and closing tag; re.DOTALL
        # lets "." span newlines so multi-line content is captured.
        match = re.search(rf"<{escaped}>(.*?)</{escaped}>", content, re.DOTALL)
        # A missing tag (e.g. the LLM omitted it) is reported as None.
        extracted_data[tag.lower()] = match.group(1).strip() if match else None

    return extracted_data
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
if __name__ == "__main__":

    # --- Test Case ---
    # Manual smoke test: sample LLM output with free text before the tags,
    # two multi-line fields and one single-line field.
    llm_raw_output = """
Some irrelevant preamble from the LLM...
<DESCRIPTION>
This document set provides a detailed overview of the installation steps for Open-Agentic-Search.
It covers environment configuration and core dependencies.
</DESCRIPTION>
<NAME>Environment Setup Guide</NAME>
<CONTENT>
1. Install Python 3.10+
2. Run pip install -r requirements.txt
3. Configure the .env environment variables.
</CONTENT>
"""

    # No tags are passed, so the default DESCRIPTION/NAME/CONTENT set is used.
    result = extract_fields(llm_raw_output)

    # Print results
    print(f"Name: {result['name']}")
    print(f"Description: {result['description']}")
    print(f"Content: \n{result['content']}")
|
sirchmunk/version.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.0.
|
|
1
|
+
# Package version string.
__version__ = "0.0.1"
|
|
@@ -0,0 +1,416 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sirchmunk
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: Sirchmunk: From raw data to self-evolving real-time intelligence.
|
|
5
|
+
Home-page: https://github.com/modelscope/sirchmunk
|
|
6
|
+
Author: ModelScope Team
|
|
7
|
+
Author-email: contact@modelscope.cn
|
|
8
|
+
License: Apache License 2.0
|
|
9
|
+
Project-URL: Homepage, https://github.com/modelscope/sirchmunk
|
|
10
|
+
Keywords: LLM,Agentic,Search,RAG,Indexless,Self-evolving,Real-time Intelligence,Multi-modal
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
19
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
20
|
+
Requires-Python: >=3.10
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Provides-Extra: docs
|
|
24
|
+
Provides-Extra: tests
|
|
25
|
+
Provides-Extra: web
|
|
26
|
+
Provides-Extra: all
|
|
27
|
+
Dynamic: home-page
|
|
28
|
+
Dynamic: license-file
|
|
29
|
+
Dynamic: provides-extra
|
|
30
|
+
|
|
31
|
+
<div align="center">
|
|
32
|
+
|
|
33
|
+
<img src="web/public/logo-v2.png" alt="Sirchmunk Logo" width="250" style="border-radius: 15px;">
|
|
34
|
+
|
|
35
|
+
# Sirchmunk: Raw data to self-evolving intelligence, real-time.
|
|
36
|
+
|
|
37
|
+
[](https://www.python.org/downloads/)
|
|
38
|
+
[](https://fastapi.tiangolo.com/)
|
|
39
|
+
[](https://nextjs.org/)
|
|
40
|
+
[](https://tailwindcss.com/)
|
|
41
|
+
[](https://duckdb.org/)
|
|
42
|
+
[](LICENSE)
|
|
43
|
+
[](https://github.com/phiresky/ripgrep-all)
|
|
44
|
+
[](https://github.com/openai/openai-python)
|
|
45
|
+
[](https://github.com/kreuzberg-dev/kreuzberg)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
[**Quick Start**](#-quick-start) · [**Key Features**](#-key-features) · [**Web UI**](#-web-ui) · [**How it Works**](#-how-it-works) · [**FAQ**](#-faq)
|
|
49
|
+
|
|
50
|
+
[🇨🇳 中文](README_zh.md)
|
|
51
|
+
|
|
52
|
+
</div>
|
|
53
|
+
|
|
54
|
+
<div align="center">
|
|
55
|
+
|
|
56
|
+
🔍 **Agentic Search** • 🧠 **Knowledge Clustering** • 📊 **Monte Carlo Evidence Sampling**<br>
|
|
57
|
+
⚡ **Indexless Retrieval** • 🔄 **Self-Evolving Knowledge Base** • 💬 **Real-time Chat**
|
|
58
|
+
|
|
59
|
+
</div>
|
|
60
|
+
|
|
61
|
+
---
|
|
62
|
+
|
|
63
|
+
## 🌰 Why “Sirchmunk”?
|
|
64
|
+
|
|
65
|
+
Intelligence pipelines built upon vector-based retrieval can be _rigid and brittle_. They rely on static vector embeddings that are **expensive to compute, blind to real-time changes, and detached from the raw context**. We introduce **Sirchmunk** to usher in a more agile paradigm, where data is no longer treated as a snapshot, and insights can evolve together with the data.
|
|
66
|
+
|
|
67
|
+
---
|
|
68
|
+
|
|
69
|
+
## ✨ Key Features
|
|
70
|
+
|
|
71
|
+
### 1. Embedding-Free: Data in its Purest Form
|
|
72
|
+
|
|
73
|
+
**Sirchmunk** works directly with **raw data** -- bypassing the heavy overhead of squeezing your rich files into fixed-dimensional vectors.
|
|
74
|
+
|
|
75
|
+
* **Instant Search:** Eliminates complex pre-processing pipelines and hours-long indexing; just drop your files and search immediately.
|
|
76
|
+
* **Full Fidelity:** Zero information loss — stay true to your data without vector approximation.
|
|
77
|
+
|
|
78
|
+
### 2. Self-Evolving: A Living Index
|
|
79
|
+
|
|
80
|
+
Data is a stream, not a snapshot. **Sirchmunk** is **dynamic by design**, while vector DB can become obsolete the moment your data changes.
|
|
81
|
+
|
|
82
|
+
* **Context-Aware:** Evolves in real-time with your data context.
|
|
83
|
+
* **LLM-Powered Autonomy:** Designed for Agents that perceive data as it lives, utilizing **token-efficient** reasoning that triggers LLM inference only when necessary to maximize intelligence while minimizing cost.
|
|
84
|
+
|
|
85
|
+
### 3. Intelligence at Scale: Real-Time & Massive
|
|
86
|
+
|
|
87
|
+
**Sirchmunk** bridges massive local repositories and the web with **high-scale throughput** and **real-time awareness**. <br/>
|
|
88
|
+
It serves as a unified intelligent hub for AI agents, delivering deep insights across vast datasets at the speed of thought.
|
|
89
|
+
|
|
90
|
+
---
|
|
91
|
+
|
|
92
|
+
### Traditional RAG vs. Sirchmunk
|
|
93
|
+
|
|
94
|
+
<div style="display: flex; justify-content: center; width: 100%;">
|
|
95
|
+
<table style="width: 100%; max-width: 900px; border-collapse: separate; border-spacing: 0; overflow: hidden; border-radius: 12px; font-family: sans-serif; border: 1px solid rgba(128, 128, 128, 0.2); margin: 0 auto;">
|
|
96
|
+
<colgroup>
|
|
97
|
+
<col style="width: 25%;">
|
|
98
|
+
<col style="width: 30%;">
|
|
99
|
+
<col style="width: 45%;">
|
|
100
|
+
</colgroup>
|
|
101
|
+
<thead>
|
|
102
|
+
<tr style="background-color: rgba(128, 128, 128, 0.05);">
|
|
103
|
+
<th style="text-align: left; padding: 16px; border-bottom: 2px solid rgba(128, 128, 128, 0.2); font-size: 1.3em;">Dimension</th>
|
|
104
|
+
<th style="text-align: left; padding: 16px; border-bottom: 2px solid rgba(128, 128, 128, 0.2); font-size: 1.3em; opacity: 0.7;">Traditional RAG</th>
|
|
105
|
+
<th style="text-align: left; padding: 16px; border-bottom: 2px solid rgba(58, 134, 255, 0.5); color: #3a86ff; font-weight: 800; font-size: 1.3em;">✨Sirchmunk</th>
|
|
106
|
+
</tr>
|
|
107
|
+
</thead>
|
|
108
|
+
<tbody>
|
|
109
|
+
<tr>
|
|
110
|
+
<td style="padding: 16px; font-weight: 600; border-bottom: 1px solid rgba(128, 128, 128, 0.1);">💰 Setup Cost</td>
|
|
111
|
+
<td style="padding: 16px; opacity: 0.6; border-bottom: 1px solid rgba(128, 128, 128, 0.1);">High Overhead <br/> (VectorDB, GraphDB, Complex Document Parser...)</td>
|
|
112
|
+
<td style="padding: 16px; background-color: rgba(58, 134, 255, 0.08); color: #4895ef; border-bottom: 1px solid rgba(128, 128, 128, 0.1);">
|
|
113
|
+
✅ Zero Infrastructure <br/>
|
|
114
|
+
<small style="opacity: 0.8; font-size: 0.85em;">Direct-to-data retrieval without vector silos</small>
|
|
115
|
+
</td>
|
|
116
|
+
</tr>
|
|
117
|
+
<tr>
|
|
118
|
+
<td style="padding: 16px; font-weight: 600; border-bottom: 1px solid rgba(128, 128, 128, 0.1);">🕒 Data Freshness</td>
|
|
119
|
+
<td style="padding: 16px; opacity: 0.6; border-bottom: 1px solid rgba(128, 128, 128, 0.1);">Stale (Batch Re-indexing)</td>
|
|
120
|
+
<td style="padding: 16px; background-color: rgba(58, 134, 255, 0.08); color: #4895ef; border-bottom: 1px solid rgba(128, 128, 128, 0.1);">
|
|
121
|
+
✅ Instant & Dynamic <br/>
|
|
122
|
+
<small style="opacity: 0.8; font-size: 0.85em;">Self-evolving index that reflects live changes</small>
|
|
123
|
+
</td>
|
|
124
|
+
</tr>
|
|
125
|
+
<tr>
|
|
126
|
+
<td style="padding: 16px; font-weight: 600; border-bottom: 1px solid rgba(128, 128, 128, 0.1);">📈 Scalability</td>
|
|
127
|
+
<td style="padding: 16px; opacity: 0.6; border-bottom: 1px solid rgba(128, 128, 128, 0.1);">Linear Cost Growth</td>
|
|
128
|
+
<td style="padding: 16px; background-color: rgba(58, 134, 255, 0.08); color: #4895ef; border-bottom: 1px solid rgba(128, 128, 128, 0.1);">
|
|
129
|
+
✅ Extremely low RAM/CPU consumption <br/>
|
|
130
|
+
<small style="opacity: 0.8; font-size: 0.85em;">Native Elastic Support, efficiently handles large-scale datasets</small>
|
|
131
|
+
</td>
|
|
132
|
+
</tr>
|
|
133
|
+
<tr>
|
|
134
|
+
<td style="padding: 16px; font-weight: 600; border-bottom: 1px solid rgba(128, 128, 128, 0.1);">🎯 Accuracy</td>
|
|
135
|
+
<td style="padding: 16px; opacity: 0.6; border-bottom: 1px solid rgba(128, 128, 128, 0.1);">Approximate Vector Matches</td>
|
|
136
|
+
<td style="padding: 16px; background-color: rgba(58, 134, 255, 0.08); color: #4895ef; border-bottom: 1px solid rgba(128, 128, 128, 0.1);">
|
|
137
|
+
✅ Deterministic & Contextual <br/>
|
|
138
|
+
<small style="opacity: 0.8; font-size: 0.85em;">Hybrid logic ensuring semantic precision</small>
|
|
139
|
+
</td>
|
|
140
|
+
</tr>
|
|
141
|
+
<tr>
|
|
142
|
+
<td style="padding: 16px; font-weight: 600;">⚙️ Workflow</td>
|
|
143
|
+
<td style="padding: 16px; opacity: 0.6;">Complex ETL Pipelines</td>
|
|
144
|
+
<td style="padding: 16px; background-color: rgba(58, 134, 255, 0.08); color: #4895ef;">
|
|
145
|
+
✅ Drop-and-Search <br/>
|
|
146
|
+
<small style="opacity: 0.8; font-size: 0.85em;">Zero-config integration for rapid deployment</small>
|
|
147
|
+
</td>
|
|
148
|
+
</tr>
|
|
149
|
+
</tbody>
|
|
150
|
+
</table>
|
|
151
|
+
</div>
|
|
152
|
+
|
|
153
|
+
---
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
## Demonstration
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
<div align="center">
|
|
160
|
+
<img src="assets/gif/Sirchmunk_Web.gif" alt="Sirchmunk WebUI" width="100%">
|
|
161
|
+
<p style="font-size: 1.1em; font-weight: 600; margin-top: 8px; color: #00bcd4;">
|
|
162
|
+
Access files directly to start chatting
|
|
163
|
+
</p>
|
|
164
|
+
</div>
|
|
165
|
+
|
|
166
|
+
---
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
## 🚀 Quick Start
|
|
170
|
+
|
|
171
|
+
### Prerequisites
|
|
172
|
+
|
|
173
|
+
- **Python** 3.10+
|
|
174
|
+
- **LLM API Key** (OpenAI-compatible endpoint, local or remote)
|
|
175
|
+
- **Node.js** 18+ (Optional, for web interface)
|
|
176
|
+
|
|
177
|
+
### Installation
|
|
178
|
+
|
|
179
|
+
```bash
|
|
180
|
+
# Create virtual environment (recommended)
|
|
181
|
+
conda create -n sirchmunk python=3.13 -y && conda activate sirchmunk
|
|
182
|
+
|
|
183
|
+
pip install sirchmunk
|
|
184
|
+
|
|
185
|
+
# Or via UV:
|
|
186
|
+
uv pip install sirchmunk
|
|
187
|
+
|
|
188
|
+
# Alternatively, install from source:
|
|
189
|
+
git clone https://github.com/modelscope/sirchmunk.git && cd sirchmunk
|
|
190
|
+
pip install -e .
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
### Python SDK Usage
|
|
194
|
+
|
|
195
|
+
```python
|
|
196
|
+
import asyncio
|
|
197
|
+
|
|
198
|
+
from sirchmunk import AgenticSearch
|
|
199
|
+
from sirchmunk.llm import OpenAIChat
|
|
200
|
+
|
|
201
|
+
llm = OpenAIChat(
|
|
202
|
+
api_key="your-api-key",
|
|
203
|
+
base_url="your-base-url", # e.g., https://api.openai.com/v1
|
|
204
|
+
model="your-model-name" # e.g., gpt-4o
|
|
205
|
+
)
|
|
206
|
+
|
|
207
|
+
async def main():
|
|
208
|
+
|
|
209
|
+
agent_search = AgenticSearch(llm=llm)
|
|
210
|
+
|
|
211
|
+
result: str = await agent_search.search(
|
|
212
|
+
query="How does transformer attention work?",
|
|
213
|
+
search_paths=["/path/to/documents"],
|
|
214
|
+
)
|
|
215
|
+
|
|
216
|
+
print(result)
|
|
217
|
+
|
|
218
|
+
asyncio.run(main())
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
**⚠️ Notes:**
|
|
222
|
+
- Upon initialization, AgenticSearch automatically checks if ripgrep-all and ripgrep are installed. If they are missing, it will attempt to install them automatically. If the automatic installation fails, please install them manually.
|
|
223
|
+
- References: https://github.com/BurntSushi/ripgrep | https://github.com/phiresky/ripgrep-all
|
|
224
|
+
- Replace `"your-api-key"`, `"your-base-url"`, `"your-model-name"` and `/path/to/documents` with your actual values.
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
---
|
|
228
|
+
|
|
229
|
+
## 🖥️ Web UI
|
|
230
|
+
|
|
231
|
+
The web UI is built for fast, transparent workflows: chat, knowledge analytics, and system monitoring in one place.
|
|
232
|
+
|
|
233
|
+
<div align="center">
|
|
234
|
+
<img src="assets/pic/Sirchmunk_Home.png" alt="Sirchmunk Home" width="85%">
|
|
235
|
+
<p><sub>Home — Chat with streaming logs, file-based RAG, and session management.</sub></p>
|
|
236
|
+
</div>
|
|
237
|
+
|
|
238
|
+
<div align="center">
|
|
239
|
+
<img src="assets/pic/Sirchmunk_Monitor.png" alt="Sirchmunk Monitor" width="85%">
|
|
240
|
+
<p><sub>Monitor — System health, chat activity, knowledge analytics, and LLM usage.</sub></p>
|
|
241
|
+
</div>
|
|
242
|
+
|
|
243
|
+
### Installation
|
|
244
|
+
|
|
245
|
+
```bash
|
|
246
|
+
git clone https://github.com/modelscope/sirchmunk.git && cd sirchmunk
|
|
247
|
+
|
|
248
|
+
pip install ".[web]"
|
|
249
|
+
|
|
250
|
+
npm install --prefix web
|
|
251
|
+
```
|
|
252
|
+
- Note: Node.js 18+ is required for the web interface.
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
### Running the Web UI
|
|
256
|
+
|
|
257
|
+
```bash
|
|
258
|
+
# Start frontend and backend
|
|
259
|
+
python scripts/start_web.py
|
|
260
|
+
|
|
261
|
+
# Stop frontend and backend
|
|
262
|
+
python scripts/stop_web.py
|
|
263
|
+
```
|
|
264
|
+
|
|
265
|
+
**Access the web UI at (By default):**
|
|
266
|
+
- Backend APIs: http://localhost:8584/docs
|
|
267
|
+
- Frontend: http://localhost:8585
|
|
268
|
+
|
|
269
|
+
**Configuration:**
|
|
270
|
+
|
|
271
|
+
- Access `Settings` → `Environment Variables` to configure the LLM API and other parameters.
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
---
|
|
275
|
+
|
|
276
|
+
## 🏗️ How it Works
|
|
277
|
+
|
|
278
|
+
### Sirchmunk Framework
|
|
279
|
+
|
|
280
|
+
<div align="center">
|
|
281
|
+
<img src="assets/pic/Sirchmunk_Architecture.png" alt="Sirchmunk Architecture" width="85%">
|
|
282
|
+
</div>
|
|
283
|
+
|
|
284
|
+
### Core Components
|
|
285
|
+
|
|
286
|
+
| Component | Description |
|
|
287
|
+
|:----------------------|:-------------------------------------------------------------------------|
|
|
288
|
+
| **AgenticSearch** | Search orchestrator with LLM-enhanced retrieval capabilities |
|
|
289
|
+
| **KnowledgeBase** | Transforms raw results into structured knowledge clusters with evidences |
|
|
290
|
+
| **EvidenceProcessor** | Evidence processing based on Monte Carlo importance sampling          |
|
|
291
|
+
| **GrepRetriever** | High-performance _indexless_ file search with parallel processing |
|
|
292
|
+
| **OpenAIChat** | Unified LLM interface supporting streaming and usage tracking |
|
|
293
|
+
| **MonitorTracker** | Real-time system and application metrics collection |
|
|
294
|
+
|
|
295
|
+
---
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
### Data Storage
|
|
299
|
+
|
|
300
|
+
All persistent data is stored in the configured `WORK_PATH` (default: `~/.sirchmunk/`):
|
|
301
|
+
|
|
302
|
+
```
|
|
303
|
+
{WORK_PATH}/
|
|
304
|
+
├── .cache/
|
|
305
|
+
├── history/ # Chat session history (DuckDB)
|
|
306
|
+
│ └── chat_history.db
|
|
307
|
+
├── knowledge/ # Knowledge clusters (Parquet)
|
|
308
|
+
│ └── knowledge_clusters.parquet
|
|
309
|
+
└── settings/ # User settings (DuckDB)
|
|
310
|
+
└── settings.db
|
|
311
|
+
|
|
312
|
+
```
|
|
313
|
+
|
|
314
|
+
---
|
|
315
|
+
|
|
316
|
+
## ❓ FAQ
|
|
317
|
+
|
|
318
|
+
<details>
|
|
319
|
+
<summary><b>How is this different from traditional RAG systems?</b></summary>
|
|
320
|
+
|
|
321
|
+
Sirchmunk takes an **indexless approach**:
|
|
322
|
+
|
|
323
|
+
1. **No pre-indexing**: Direct file search without vector database setup
|
|
324
|
+
2. **Self-evolving**: Knowledge clusters evolve based on search patterns
|
|
325
|
+
3. **Multi-level retrieval**: Adaptive keyword granularity for better recall
|
|
326
|
+
4. **Evidence-based**: Monte Carlo sampling for precise content extraction
|
|
327
|
+
|
|
328
|
+
</details>
|
|
329
|
+
|
|
330
|
+
<details>
|
|
331
|
+
<summary><b>What LLM providers are supported?</b></summary>
|
|
332
|
+
|
|
333
|
+
Any OpenAI-compatible API endpoint, including (but not limited to):
|
|
334
|
+
- OpenAI (GPT-4, GPT-4o, GPT-3.5)
|
|
335
|
+
- Local models served via Ollama, llama.cpp, vLLM, SGLang etc.
|
|
336
|
+
- Claude via API proxy
|
|
337
|
+
|
|
338
|
+
</details>
|
|
339
|
+
|
|
340
|
+
<details>
|
|
341
|
+
<summary><b>How do I add documents to search?</b></summary>
|
|
342
|
+
|
|
343
|
+
Simply specify the path in your search query:
|
|
344
|
+
|
|
345
|
+
```python
|
|
346
|
+
result = await search.search(
|
|
347
|
+
query="Your question",
|
|
348
|
+
search_paths=["/path/to/folder", "/path/to/file.pdf"]
|
|
349
|
+
)
|
|
350
|
+
```
|
|
351
|
+
|
|
352
|
+
No pre-processing or indexing required!
|
|
353
|
+
|
|
354
|
+
</details>
|
|
355
|
+
|
|
356
|
+
<details>
|
|
357
|
+
<summary><b>Where are knowledge clusters stored?</b></summary>
|
|
358
|
+
|
|
359
|
+
Knowledge clusters are persisted in Parquet format at:
|
|
360
|
+
```
|
|
361
|
+
{WORK_PATH}/.cache/knowledge/knowledge_clusters.parquet
|
|
362
|
+
```
|
|
363
|
+
|
|
364
|
+
You can query them using DuckDB or the `KnowledgeManager` API.
|
|
365
|
+
|
|
366
|
+
</details>
|
|
367
|
+
|
|
368
|
+
<details>
|
|
369
|
+
<summary><b>How do I monitor LLM token usage?</b></summary>
|
|
370
|
+
|
|
371
|
+
1. **Web Dashboard**: Visit the Monitor page for real-time statistics
|
|
372
|
+
2. **API**: `GET /api/v1/monitor/llm` returns usage metrics
|
|
373
|
+
3. **Code**: Access `search.llm_usages` after search completion
|
|
374
|
+
|
|
375
|
+
</details>
|
|
376
|
+
|
|
377
|
+
---
|
|
378
|
+
|
|
379
|
+
## 📋 Roadmap
|
|
380
|
+
|
|
381
|
+
- [x] Text-retrieval from raw files
|
|
382
|
+
- [x] Knowledge structuring & persistence
|
|
383
|
+
- [x] Real-time chat with RAG
|
|
384
|
+
- [x] Web UI support
|
|
385
|
+
- [ ] Web search integration
|
|
386
|
+
- [ ] Multi-modal support (images, videos)
|
|
387
|
+
- [ ] Distributed search across nodes
|
|
388
|
+
- [ ] Knowledge visualization and deep analytics
|
|
389
|
+
- [ ] More file type support
|
|
390
|
+
|
|
391
|
+
---
|
|
392
|
+
|
|
393
|
+
## 🤝 Contributing
|
|
394
|
+
|
|
395
|
+
We welcome [contributions](https://github.com/modelscope/sirchmunk/pulls)!
|
|
396
|
+
|
|
397
|
+
---
|
|
398
|
+
|
|
399
|
+
## 📄 License
|
|
400
|
+
|
|
401
|
+
This project is licensed under the [Apache License 2.0](LICENSE).
|
|
402
|
+
|
|
403
|
+
---
|
|
404
|
+
|
|
405
|
+
<div align="center">
|
|
406
|
+
|
|
407
|
+
**[ModelScope](https://github.com/modelscope)** · [⭐ Star us](https://github.com/modelscope/sirchmunk/stargazers) · [🐛 Report a bug](https://github.com/modelscope/sirchmunk/issues) · [💬 Discussions](https://github.com/modelscope/sirchmunk/discussions)
|
|
408
|
+
|
|
409
|
+
*✨ Sirchmunk: Raw data to self-evolving intelligence, real-time.*
|
|
410
|
+
|
|
411
|
+
</div>
|
|
412
|
+
|
|
413
|
+
<p align="center">
|
|
414
|
+
<em> ❤️ Thanks for Visiting ✨ Sirchmunk !</em><br><br>
|
|
415
|
+
<img src="https://visitor-badge.laobi.icu/badge?page_id=modelscope.sirchmunk&style=for-the-badge&color=00d4ff" alt="Views">
|
|
416
|
+
</p>
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
sirchmunk/__init__.py,sha256=5sdppELUcjnTofJtwZ2ACuUscmLYIPhCwj-G-MMSl-M,184
|
|
2
|
+
sirchmunk/base.py,sha256=qVQ63QfEWhEvOJl3OxQvC2rOUNTZCD5weXRn-1vvEkU,439
|
|
3
|
+
sirchmunk/search.py,sha256=NROpV39oklDT4wusE8EnkQPABq9LiAa0kQrtyR6viHQ,18210
|
|
4
|
+
sirchmunk/version.py,sha256=sXLh7g3KC4QCFxcZGBTpG2scR7hmmBsMjq6LqRptkRg,22
|
|
5
|
+
sirchmunk/insight/__init__.py,sha256=7sQeT0fSg5-b9jrwthA9_fSCR1Q5qSvm33L9l7kHjLY,144
|
|
6
|
+
sirchmunk/insight/text_insights.py,sha256=BwFqmDNG8FmOz4Lv3qO1Mz2xRv_LVg4b3wj6YUhVLVE,9075
|
|
7
|
+
sirchmunk/learnings/__init__.py,sha256=310L84MdAIw4THnzf5YsLiUhW_oaxgJHHcZZeMso3jY,61
|
|
8
|
+
sirchmunk/learnings/evidence_processor.py,sha256=QDg-qReSte8R8I2BrRRC-d-Glyfjcnqazbe57-PSDHU,17922
|
|
9
|
+
sirchmunk/learnings/knowledge_base.py,sha256=8szliKLFKu_BTIVf0vhy2mSe7Tix4145Umdd5OQ2rww,8343
|
|
10
|
+
sirchmunk/llm/__init__.py,sha256=4ynF6R63afMWW1d9T21C8JqfVc9xJTiOFXZNzDwPLww,98
|
|
11
|
+
sirchmunk/llm/openai_chat.py,sha256=ET7HqEoFTbbvhUTlAUZNOoJwxF9hA73qeH3xFTsNK-w,8138
|
|
12
|
+
sirchmunk/llm/prompts.py,sha256=8Wpugif43EhCaDuRF4U6xRgbv6EB_UdU-2NdDFuglM0,8755
|
|
13
|
+
sirchmunk/retrieve/__init__.py,sha256=310L84MdAIw4THnzf5YsLiUhW_oaxgJHHcZZeMso3jY,61
|
|
14
|
+
sirchmunk/retrieve/base.py,sha256=VDpCwdwhjVYuj0mbe78qg_FhbcQkasbCS0cd66hQ4hk,618
|
|
15
|
+
sirchmunk/retrieve/text_retriever.py,sha256=1ryDjqecXJz2bwjeoJhaUvMxCAdoNzPMMOAJYH3x7g0,39484
|
|
16
|
+
sirchmunk/scan/__init__.py,sha256=310L84MdAIw4THnzf5YsLiUhW_oaxgJHHcZZeMso3jY,61
|
|
17
|
+
sirchmunk/scan/base.py,sha256=3Jqvn0W9KJ00u2oy8U-qD16KksV7taAx4s9qhCpRD-c,496
|
|
18
|
+
sirchmunk/scan/file_scanner.py,sha256=GtGFP9ZwqemOSkHjFcEDaD2WCnw5qMB_bCeiVUaIEQg,14645
|
|
19
|
+
sirchmunk/scan/web_scanner.py,sha256=YQRBQ5JnATwNdek2o1_Y8GB0eG0IWQvx4YhqW4Zc5Tw,531
|
|
20
|
+
sirchmunk/scheduler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
21
|
+
sirchmunk/schema/__init__.py,sha256=kvzKU8Rt1t91Rm5tuy6KqgZXj8lfYQ2q1XvLYGNLUh4,103
|
|
22
|
+
sirchmunk/schema/cognition.py,sha256=NDgCOjMpqyUBxd_ymeMQrjSr3Ri-nJeMUNRuWfLUhBI,4031
|
|
23
|
+
sirchmunk/schema/context.py,sha256=wwsSBE5EJXmrvh-RCluf3nJm7hcxkWaiNWfJ10coGY4,628
|
|
24
|
+
sirchmunk/schema/knowledge.py,sha256=mPKuhqXzLFIfcz5yHRcNe7B4WifUVlHDNnzeevnFKQM,11108
|
|
25
|
+
sirchmunk/schema/metadata.py,sha256=VdUD6GPCB_jtyBkowkGEpCPMYL1c-sFVSUloKPSZRwE,22810
|
|
26
|
+
sirchmunk/schema/request.py,sha256=EPum-IzmN15EgFmwN12oL-CNoI-KvTcHN4nLuO-c09c,7702
|
|
27
|
+
sirchmunk/schema/response.py,sha256=6xc5tvAnqL_zpUhQtchVGwbrt1btzy4QArCeS81DrIU,488
|
|
28
|
+
sirchmunk/schema/snapshot.py,sha256=zZSKDRN8jMtpOIH3TL0FCrAn7AlUc_5zO8qFX2f2u_s,12341
|
|
29
|
+
sirchmunk/storage/__init__.py,sha256=9mF1JdbVrgC0mt9uvBYU9RwiLGRdrmwGAyTj7EA21Pg,231
|
|
30
|
+
sirchmunk/storage/duckdb.py,sha256=Jw_EK9i589YyKXYhi-yjAmX7zIM7txDsoxy0Gx-XY8o,23092
|
|
31
|
+
sirchmunk/storage/knowledge_manager.py,sha256=YYlZa4CnbCXzFWjOleztil2hohqrvrOZs-8UcV0DTEM,27746
|
|
32
|
+
sirchmunk/utils/__init__.py,sha256=33bVrhpUfPXpM85r_SEB07QJnGDjD-4BE5p9vpF-fXw,265
|
|
33
|
+
sirchmunk/utils/constants.py,sha256=D6RP2mtRH9uA3ZI1SpFD77_QPtYkxIKIi7zZY8-do1w,551
|
|
34
|
+
sirchmunk/utils/deps.py,sha256=QTL0k7CN1t0_r-CrZ4TM__pdK8U2X1d0OKvJUtQe_xA,563
|
|
35
|
+
sirchmunk/utils/file_utils.py,sha256=9OtYNffXbo1Pz6XuJsOECQS_mYRBg1NpGUsglNgfWnU,2257
|
|
36
|
+
sirchmunk/utils/install_rga.py,sha256=i7sWYi6u2A32dc0mq5LB_OtqcMWqKezSz6WILWaK1Oc,5215
|
|
37
|
+
sirchmunk/utils/log_utils.py,sha256=HujosgEXV9fpSVd6JjRh7KEQuskhrcpC4Kit6aIpnW0,14195
|
|
38
|
+
sirchmunk/utils/tokenizer_util.py,sha256=JIg4FylB6o8AIe71q8Uv83PKmPF1funX3ffhb9cPX3c,1624
|
|
39
|
+
sirchmunk/utils/utils.py,sha256=qnwZ8R9xLqsMooW0IdcWoPKto7q4AQ49-SOR33rCy1g,3394
|
|
40
|
+
sirchmunk-0.0.1.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
41
|
+
sirchmunk-0.0.1.dist-info/METADATA,sha256=MnMXApUAaWluApy7zthN2yGhhT-t2-OhXxiG9VM__lw,15724
|
|
42
|
+
sirchmunk-0.0.1.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
|
|
43
|
+
sirchmunk-0.0.1.dist-info/entry_points.txt,sha256=lpnP-Ll2CUY0P1wYm6kutcBMrxwG67astmgY-vVhF14,56
|
|
44
|
+
sirchmunk-0.0.1.dist-info/top_level.txt,sha256=8MiQvqjFkqiGJ7m4xqxsfxwxFHtH2mIXlrkV9PfX-aM,10
|
|
45
|
+
sirchmunk-0.0.1.dist-info/RECORD,,
|
|
@@ -1,26 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: sirchmunk
|
|
3
|
-
Version: 0.0.0
|
|
4
|
-
Summary: sirchmunk
|
|
5
|
-
Author: sirchmunk team
|
|
6
|
-
Author-email: xx@yy.com
|
|
7
|
-
License: Apache License 2.0
|
|
8
|
-
Project-URL: Homepage, https://github.com
|
|
9
|
-
Keywords: sirchmunk,Software
|
|
10
|
-
Classifier: Development Status :: 4 - Beta
|
|
11
|
-
Classifier: Operating System :: OS Independent
|
|
12
|
-
Classifier: Programming Language :: Python :: 3
|
|
13
|
-
Classifier: Programming Language :: Python :: 3.10
|
|
14
|
-
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
-
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
-
Classifier: Programming Language :: Python :: 3.13
|
|
17
|
-
Classifier: Programming Language :: Python :: 3.14
|
|
18
|
-
Classifier: License :: OSI Approved :: Apache Software License
|
|
19
|
-
Requires-Python: >=3.10
|
|
20
|
-
Description-Content-Type: text/markdown
|
|
21
|
-
License-File: LICENSE
|
|
22
|
-
Provides-Extra: docs
|
|
23
|
-
Provides-Extra: tests
|
|
24
|
-
Dynamic: license-file
|
|
25
|
-
|
|
26
|
-
# sentis
|
sirchmunk-0.0.0.dist-info/RECORD
DELETED
|
@@ -1,8 +0,0 @@
|
|
|
1
|
-
sirchmunk/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
-
sirchmunk/version.py,sha256=qwX_VmtTcxitUuo61-lzyWhU70ydr0gDejMn5eqC3Dk,21
|
|
3
|
-
sirchmunk-0.0.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
4
|
-
sirchmunk-0.0.0.dist-info/METADATA,sha256=d1AhiO-kUXkKBo3JSgZ2wS-WzHvz9NP6D52EghbMi0c,835
|
|
5
|
-
sirchmunk-0.0.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
6
|
-
sirchmunk-0.0.0.dist-info/entry_points.txt,sha256=lpnP-Ll2CUY0P1wYm6kutcBMrxwG67astmgY-vVhF14,56
|
|
7
|
-
sirchmunk-0.0.0.dist-info/top_level.txt,sha256=8MiQvqjFkqiGJ7m4xqxsfxwxFHtH2mIXlrkV9PfX-aM,10
|
|
8
|
-
sirchmunk-0.0.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|