sirchmunk 0.0.0__tar.gz → 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. sirchmunk-0.0.1/PKG-INFO +493 -0
  2. sirchmunk-0.0.1/README.md +386 -0
  3. {sirchmunk-0.0.0 → sirchmunk-0.0.1}/pyproject.toml +12 -17
  4. sirchmunk-0.0.1/setup.py +179 -0
  5. sirchmunk-0.0.1/src/sirchmunk/__init__.py +8 -0
  6. sirchmunk-0.0.1/src/sirchmunk/base.py +17 -0
  7. sirchmunk-0.0.1/src/sirchmunk/insight/__init__.py +4 -0
  8. sirchmunk-0.0.1/src/sirchmunk/insight/text_insights.py +292 -0
  9. sirchmunk-0.0.1/src/sirchmunk/learnings/__init__.py +1 -0
  10. sirchmunk-0.0.1/src/sirchmunk/learnings/evidence_processor.py +525 -0
  11. sirchmunk-0.0.1/src/sirchmunk/learnings/knowledge_base.py +232 -0
  12. sirchmunk-0.0.1/src/sirchmunk/llm/__init__.py +2 -0
  13. sirchmunk-0.0.1/src/sirchmunk/llm/openai_chat.py +247 -0
  14. sirchmunk-0.0.1/src/sirchmunk/llm/prompts.py +216 -0
  15. sirchmunk-0.0.1/src/sirchmunk/retrieve/__init__.py +1 -0
  16. sirchmunk-0.0.1/src/sirchmunk/retrieve/base.py +25 -0
  17. sirchmunk-0.0.1/src/sirchmunk/retrieve/text_retriever.py +1026 -0
  18. sirchmunk-0.0.1/src/sirchmunk/scan/__init__.py +1 -0
  19. sirchmunk-0.0.1/src/sirchmunk/scan/base.py +18 -0
  20. sirchmunk-0.0.1/src/sirchmunk/scan/file_scanner.py +373 -0
  21. sirchmunk-0.0.1/src/sirchmunk/scan/web_scanner.py +18 -0
  22. sirchmunk-0.0.1/src/sirchmunk/schema/__init__.py +2 -0
  23. sirchmunk-0.0.1/src/sirchmunk/schema/cognition.py +106 -0
  24. sirchmunk-0.0.1/src/sirchmunk/schema/context.py +25 -0
  25. sirchmunk-0.0.1/src/sirchmunk/schema/knowledge.py +318 -0
  26. sirchmunk-0.0.1/src/sirchmunk/schema/metadata.py +658 -0
  27. sirchmunk-0.0.1/src/sirchmunk/schema/request.py +221 -0
  28. sirchmunk-0.0.1/src/sirchmunk/schema/response.py +20 -0
  29. sirchmunk-0.0.1/src/sirchmunk/schema/snapshot.py +346 -0
  30. sirchmunk-0.0.1/src/sirchmunk/search.py +475 -0
  31. sirchmunk-0.0.1/src/sirchmunk/storage/__init__.py +7 -0
  32. sirchmunk-0.0.1/src/sirchmunk/storage/duckdb.py +676 -0
  33. sirchmunk-0.0.1/src/sirchmunk/storage/knowledge_manager.py +720 -0
  34. sirchmunk-0.0.1/src/sirchmunk/utils/__init__.py +15 -0
  35. sirchmunk-0.0.1/src/sirchmunk/utils/constants.py +15 -0
  36. sirchmunk-0.0.1/src/sirchmunk/utils/deps.py +23 -0
  37. sirchmunk-0.0.1/src/sirchmunk/utils/file_utils.py +70 -0
  38. sirchmunk-0.0.1/src/sirchmunk/utils/install_rga.py +124 -0
  39. sirchmunk-0.0.1/src/sirchmunk/utils/log_utils.py +360 -0
  40. sirchmunk-0.0.1/src/sirchmunk/utils/tokenizer_util.py +55 -0
  41. sirchmunk-0.0.1/src/sirchmunk/utils/utils.py +108 -0
  42. sirchmunk-0.0.1/src/sirchmunk/version.py +1 -0
  43. sirchmunk-0.0.1/src/sirchmunk.egg-info/PKG-INFO +493 -0
  44. sirchmunk-0.0.1/src/sirchmunk.egg-info/SOURCES.txt +50 -0
  45. sirchmunk-0.0.1/src/sirchmunk.egg-info/not-zip-safe +1 -0
  46. sirchmunk-0.0.1/src/sirchmunk.egg-info/requires.txt +84 -0
  47. sirchmunk-0.0.0/PKG-INFO +0 -26
  48. sirchmunk-0.0.0/README.md +0 -1
  49. sirchmunk-0.0.0/requirements/docs.txt +0 -0
  50. sirchmunk-0.0.0/requirements/framework.txt +0 -0
  51. sirchmunk-0.0.0/requirements/tests.txt +0 -0
  52. sirchmunk-0.0.0/sirchmunk/version.py +0 -1
  53. sirchmunk-0.0.0/sirchmunk.egg-info/PKG-INFO +0 -26
  54. sirchmunk-0.0.0/sirchmunk.egg-info/SOURCES.txt +0 -14
  55. sirchmunk-0.0.0/sirchmunk.egg-info/requires.txt +0 -4
  56. {sirchmunk-0.0.0 → sirchmunk-0.0.1}/LICENSE +0 -0
  57. {sirchmunk-0.0.0 → sirchmunk-0.0.1}/setup.cfg +0 -0
  58. {sirchmunk-0.0.0/sirchmunk → sirchmunk-0.0.1/src/sirchmunk/scheduler}/__init__.py +0 -0
  59. {sirchmunk-0.0.0 → sirchmunk-0.0.1/src}/sirchmunk.egg-info/dependency_links.txt +0 -0
  60. {sirchmunk-0.0.0 → sirchmunk-0.0.1/src}/sirchmunk.egg-info/entry_points.txt +0 -0
  61. {sirchmunk-0.0.0 → sirchmunk-0.0.1/src}/sirchmunk.egg-info/top_level.txt +0 -0
@@ -0,0 +1,493 @@
1
+ Metadata-Version: 2.4
2
+ Name: sirchmunk
3
+ Version: 0.0.1
4
+ Summary: Sirchmunk: From raw data to self-evolving real-time intelligence.
5
+ Home-page: https://github.com/modelscope/sirchmunk
6
+ Author: ModelScope Team
7
+ Author-email: contact@modelscope.cn
8
+ License: Apache License 2.0
9
+ Project-URL: Homepage, https://github.com/modelscope/sirchmunk
10
+ Keywords: LLM,Agentic,Search,RAG,Indexless,Self-evolving,Real-time Intelligence,Multi-modal
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Operating System :: OS Independent
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Programming Language :: Python :: 3.13
18
+ Classifier: Programming Language :: Python :: 3.14
19
+ Classifier: License :: OSI Approved :: Apache Software License
20
+ Requires-Python: >=3.10
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Requires-Dist: loguru
24
+ Requires-Dist: fastapi
25
+ Requires-Dist: openai
26
+ Requires-Dist: genson
27
+ Requires-Dist: pillow
28
+ Requires-Dist: pypdf
29
+ Requires-Dist: pandas
30
+ Requires-Dist: parquet
31
+ Requires-Dist: numpy
32
+ Requires-Dist: msgpack
33
+ Requires-Dist: sentencepiece
34
+ Requires-Dist: tqdm
35
+ Requires-Dist: rapidfuzz
36
+ Requires-Dist: duckdb
37
+ Requires-Dist: kreuzberg>=4.0.0rc1
38
+ Provides-Extra: docs
39
+ Requires-Dist: docutils>=0.16.0; extra == "docs"
40
+ Requires-Dist: myst_parser; extra == "docs"
41
+ Requires-Dist: recommonmark; extra == "docs"
42
+ Requires-Dist: sphinx>=5.3.0; extra == "docs"
43
+ Requires-Dist: sphinx-book-theme; extra == "docs"
44
+ Requires-Dist: sphinx-copybutton; extra == "docs"
45
+ Requires-Dist: sphinx-design; extra == "docs"
46
+ Requires-Dist: sphinx_markdown_tables; extra == "docs"
47
+ Requires-Dist: sphinxawesome-theme; extra == "docs"
48
+ Requires-Dist: sphinxcontrib-mermaid; extra == "docs"
49
+ Provides-Extra: tests
50
+ Requires-Dist: pytest; extra == "tests"
51
+ Requires-Dist: pytest-asyncio; extra == "tests"
52
+ Provides-Extra: web
53
+ Requires-Dist: fastapi>=0.100.0; extra == "web"
54
+ Requires-Dist: uvicorn[standard]>=0.24.0; extra == "web"
55
+ Requires-Dist: websockets>=12.0; extra == "web"
56
+ Requires-Dist: python-multipart>=0.0.6; extra == "web"
57
+ Requires-Dist: pydantic>=2.0.0; extra == "web"
58
+ Requires-Dist: requests>=2.32.2; extra == "web"
59
+ Requires-Dist: aiohttp>=3.9.4; extra == "web"
60
+ Requires-Dist: urllib3>=2.2.1; extra == "web"
61
+ Requires-Dist: pydantic>=2.0; extra == "web"
62
+ Requires-Dist: python-dotenv>=1.0.0; extra == "web"
63
+ Requires-Dist: psutil; extra == "web"
64
+ Provides-Extra: all
65
+ Requires-Dist: loguru; extra == "all"
66
+ Requires-Dist: fastapi; extra == "all"
67
+ Requires-Dist: openai; extra == "all"
68
+ Requires-Dist: genson; extra == "all"
69
+ Requires-Dist: pillow; extra == "all"
70
+ Requires-Dist: pypdf; extra == "all"
71
+ Requires-Dist: pandas; extra == "all"
72
+ Requires-Dist: parquet; extra == "all"
73
+ Requires-Dist: numpy; extra == "all"
74
+ Requires-Dist: msgpack; extra == "all"
75
+ Requires-Dist: sentencepiece; extra == "all"
76
+ Requires-Dist: tqdm; extra == "all"
77
+ Requires-Dist: rapidfuzz; extra == "all"
78
+ Requires-Dist: duckdb; extra == "all"
79
+ Requires-Dist: kreuzberg>=4.0.0rc1; extra == "all"
80
+ Requires-Dist: docutils>=0.16.0; extra == "all"
81
+ Requires-Dist: myst_parser; extra == "all"
82
+ Requires-Dist: recommonmark; extra == "all"
83
+ Requires-Dist: sphinx>=5.3.0; extra == "all"
84
+ Requires-Dist: sphinx-book-theme; extra == "all"
85
+ Requires-Dist: sphinx-copybutton; extra == "all"
86
+ Requires-Dist: sphinx-design; extra == "all"
87
+ Requires-Dist: sphinx_markdown_tables; extra == "all"
88
+ Requires-Dist: sphinxawesome-theme; extra == "all"
89
+ Requires-Dist: sphinxcontrib-mermaid; extra == "all"
90
+ Requires-Dist: pytest; extra == "all"
91
+ Requires-Dist: pytest-asyncio; extra == "all"
92
+ Requires-Dist: fastapi>=0.100.0; extra == "all"
93
+ Requires-Dist: uvicorn[standard]>=0.24.0; extra == "all"
94
+ Requires-Dist: websockets>=12.0; extra == "all"
95
+ Requires-Dist: python-multipart>=0.0.6; extra == "all"
96
+ Requires-Dist: pydantic>=2.0.0; extra == "all"
97
+ Requires-Dist: requests>=2.32.2; extra == "all"
98
+ Requires-Dist: aiohttp>=3.9.4; extra == "all"
99
+ Requires-Dist: urllib3>=2.2.1; extra == "all"
100
+ Requires-Dist: pydantic>=2.0; extra == "all"
101
+ Requires-Dist: python-dotenv>=1.0.0; extra == "all"
102
+ Requires-Dist: psutil; extra == "all"
103
+ Dynamic: home-page
104
+ Dynamic: license-file
105
+ Dynamic: provides-extra
106
+ Dynamic: requires-dist
107
+
108
+ <div align="center">
109
+
110
+ <img src="web/public/logo-v2.png" alt="Sirchmunk Logo" width="250" style="border-radius: 15px;">
111
+
112
+ # Sirchmunk: Raw data to self-evolving intelligence, real-time.
113
+
114
+ [![Python](https://img.shields.io/badge/Python-3.10%2B-3776AB?style=flat-square&logo=python&logoColor=white)](https://www.python.org/downloads/)
115
+ [![FastAPI](https://img.shields.io/badge/FastAPI-0.100%2B-009688?style=flat-square&logo=fastapi&logoColor=white)](https://fastapi.tiangolo.com/)
116
+ [![Next.js](https://img.shields.io/badge/Next.js-14-000000?style=flat-square&logo=next.js&logoColor=white)](https://nextjs.org/)
117
+ [![TailwindCSS](https://img.shields.io/badge/Tailwind-3.4-06B6D4?style=flat-square&logo=tailwindcss&logoColor=white)](https://tailwindcss.com/)
118
+ [![DuckDB](https://img.shields.io/badge/DuckDB-OLAP-FFF000?style=flat-square&logo=duckdb&logoColor=black)](https://duckdb.org/)
119
+ [![License](https://img.shields.io/badge/License-Apache%202.0-blue?style=flat-square)](LICENSE)
120
+ [![ripgrep-all](https://img.shields.io/badge/ripgrep--all-Search-E67E22?style=flat-square&logo=rust&logoColor=white)](https://github.com/phiresky/ripgrep-all)
121
+ [![OpenAI](https://img.shields.io/badge/OpenAI-API-412991?style=flat-square&logo=openai&logoColor=white)](https://github.com/openai/openai-python)
122
+ [![Kreuzberg](https://img.shields.io/badge/Kreuzberg-Text_Extraction-4CAF50?style=flat-square)](https://github.com/kreuzberg-dev/kreuzberg)
123
+
124
+
125
+ [**Quick Start**](#-quick-start) · [**Key Features**](#-key-features) · [**Web UI**](#-web-ui) · [**How it Works**](#-how-it-works) · [**FAQ**](#-faq)
126
+
127
+ [🇨🇳 中文](README_zh.md)
128
+
129
+ </div>
130
+
131
+ <div align="center">
132
+
133
+ 🔍 **Agentic Search** &nbsp;•&nbsp; 🧠 **Knowledge Clustering** &nbsp;•&nbsp; 📊 **Monte Carlo Evidence Sampling**<br>
134
+ ⚡ **Indexless Retrieval** &nbsp;•&nbsp; 🔄 **Self-Evolving Knowledge Base** &nbsp;•&nbsp; 💬 **Real-time Chat**
135
+
136
+ </div>
137
+
138
+ ---
139
+
140
+ ## 🌰 Why “Sirchmunk”?
141
+
142
+ Intelligence pipelines built upon vector-based retrieval can be _rigid and brittle_. They rely on static vector embeddings that are **expensive to compute, blind to real-time changes, and detached from the raw context**. We introduce **Sirchmunk** to usher in a more agile paradigm, where data is no longer treated as a snapshot, and insights can evolve together with the data.
143
+
144
+ ---
145
+
146
+ ## ✨ Key Features
147
+
148
+ ### 1. Embedding-Free: Data in its Purest Form
149
+
150
+ **Sirchmunk** works directly with **raw data** -- bypassing the heavy overhead of squeezing your rich files into fixed-dimensional vectors.
151
+
152
+ * **Instant Search:** Eliminates complex pre-processing pipelines and hours-long indexing; just drop your files and search immediately.
153
+ * **Full Fidelity:** Zero information loss — stay true to your data without vector approximation.
154
+
155
+ ### 2. Self-Evolving: A Living Index
156
+
157
+ Data is a stream, not a snapshot. **Sirchmunk** is **dynamic by design**, whereas a vector DB can become obsolete the moment your data changes.
158
+
159
+ * **Context-Aware:** Evolves in real-time with your data context.
160
+ * **LLM-Powered Autonomy:** Designed for Agents that perceive data as it lives, utilizing **token-efficient** reasoning that triggers LLM inference only when necessary to maximize intelligence while minimizing cost.
161
+
162
+ ### 3. Intelligence at Scale: Real-Time & Massive
163
+
164
+ **Sirchmunk** bridges massive local repositories and the web with **high-scale throughput** and **real-time awareness**. <br/>
165
+ It serves as a unified intelligent hub for AI agents, delivering deep insights across vast datasets at the speed of thought.
166
+
167
+ ---
168
+
169
+ ### Traditional RAG vs. Sirchmunk
170
+
171
+ <div style="display: flex; justify-content: center; width: 100%;">
172
+ <table style="width: 100%; max-width: 900px; border-collapse: separate; border-spacing: 0; overflow: hidden; border-radius: 12px; font-family: sans-serif; border: 1px solid rgba(128, 128, 128, 0.2); margin: 0 auto;">
173
+ <colgroup>
174
+ <col style="width: 25%;">
175
+ <col style="width: 30%;">
176
+ <col style="width: 45%;">
177
+ </colgroup>
178
+ <thead>
179
+ <tr style="background-color: rgba(128, 128, 128, 0.05);">
180
+ <th style="text-align: left; padding: 16px; border-bottom: 2px solid rgba(128, 128, 128, 0.2); font-size: 1.3em;">Dimension</th>
181
+ <th style="text-align: left; padding: 16px; border-bottom: 2px solid rgba(128, 128, 128, 0.2); font-size: 1.3em; opacity: 0.7;">Traditional RAG</th>
182
+ <th style="text-align: left; padding: 16px; border-bottom: 2px solid rgba(58, 134, 255, 0.5); color: #3a86ff; font-weight: 800; font-size: 1.3em;">✨Sirchmunk</th>
183
+ </tr>
184
+ </thead>
185
+ <tbody>
186
+ <tr>
187
+ <td style="padding: 16px; font-weight: 600; border-bottom: 1px solid rgba(128, 128, 128, 0.1);">💰 Setup Cost</td>
188
+ <td style="padding: 16px; opacity: 0.6; border-bottom: 1px solid rgba(128, 128, 128, 0.1);">High Overhead <br/> (VectorDB, GraphDB, Complex Document Parser...)</td>
189
+ <td style="padding: 16px; background-color: rgba(58, 134, 255, 0.08); color: #4895ef; border-bottom: 1px solid rgba(128, 128, 128, 0.1);">
190
+ ✅ Zero Infrastructure <br/>
191
+ <small style="opacity: 0.8; font-size: 0.85em;">Direct-to-data retrieval without vector silos</small>
192
+ </td>
193
+ </tr>
194
+ <tr>
195
+ <td style="padding: 16px; font-weight: 600; border-bottom: 1px solid rgba(128, 128, 128, 0.1);">🕒 Data Freshness</td>
196
+ <td style="padding: 16px; opacity: 0.6; border-bottom: 1px solid rgba(128, 128, 128, 0.1);">Stale (Batch Re-indexing)</td>
197
+ <td style="padding: 16px; background-color: rgba(58, 134, 255, 0.08); color: #4895ef; border-bottom: 1px solid rgba(128, 128, 128, 0.1);">
198
+ ✅ Instant &amp; Dynamic <br/>
199
+ <small style="opacity: 0.8; font-size: 0.85em;">Self-evolving index that reflects live changes</small>
200
+ </td>
201
+ </tr>
202
+ <tr>
203
+ <td style="padding: 16px; font-weight: 600; border-bottom: 1px solid rgba(128, 128, 128, 0.1);">📈 Scalability</td>
204
+ <td style="padding: 16px; opacity: 0.6; border-bottom: 1px solid rgba(128, 128, 128, 0.1);">Linear Cost Growth</td>
205
+ <td style="padding: 16px; background-color: rgba(58, 134, 255, 0.08); color: #4895ef; border-bottom: 1px solid rgba(128, 128, 128, 0.1);">
206
+ ✅ Extremely low RAM/CPU consumption <br/>
207
+ <small style="opacity: 0.8; font-size: 0.85em;">Native Elastic Support, efficiently handles large-scale datasets</small>
208
+ </td>
209
+ </tr>
210
+ <tr>
211
+ <td style="padding: 16px; font-weight: 600; border-bottom: 1px solid rgba(128, 128, 128, 0.1);">🎯 Accuracy</td>
212
+ <td style="padding: 16px; opacity: 0.6; border-bottom: 1px solid rgba(128, 128, 128, 0.1);">Approximate Vector Matches</td>
213
+ <td style="padding: 16px; background-color: rgba(58, 134, 255, 0.08); color: #4895ef; border-bottom: 1px solid rgba(128, 128, 128, 0.1);">
214
+ ✅ Deterministic &amp; Contextual <br/>
215
+ <small style="opacity: 0.8; font-size: 0.85em;">Hybrid logic ensuring semantic precision</small>
216
+ </td>
217
+ </tr>
218
+ <tr>
219
+ <td style="padding: 16px; font-weight: 600;">⚙️ Workflow</td>
220
+ <td style="padding: 16px; opacity: 0.6;">Complex ETL Pipelines</td>
221
+ <td style="padding: 16px; background-color: rgba(58, 134, 255, 0.08); color: #4895ef;">
222
+ ✅ Drop-and-Search <br/>
223
+ <small style="opacity: 0.8; font-size: 0.85em;">Zero-config integration for rapid deployment</small>
224
+ </td>
225
+ </tr>
226
+ </tbody>
227
+ </table>
228
+ </div>
229
+
230
+ ---
231
+
232
+
233
+ ## Demonstration
234
+
235
+
236
+ <div align="center">
237
+ <img src="assets/gif/Sirchmunk_Web.gif" alt="Sirchmunk WebUI" width="100%">
238
+ <p style="font-size: 1.1em; font-weight: 600; margin-top: 8px; color: #00bcd4;">
239
+ Access files directly to start chatting
240
+ </p>
241
+ </div>
242
+
243
+ ---
244
+
245
+
246
+ ## 🚀 Quick Start
247
+
248
+ ### Prerequisites
249
+
250
+ - **Python** 3.10+
251
+ - **LLM API Key** (OpenAI-compatible endpoint, local or remote)
252
+ - **Node.js** 18+ (Optional, for web interface)
253
+
254
+ ### Installation
255
+
256
+ ```bash
257
+ # Create virtual environment (recommended)
258
+ conda create -n sirchmunk python=3.13 -y && conda activate sirchmunk
259
+
260
+ pip install sirchmunk
261
+
262
+ # Or via UV:
263
+ uv pip install sirchmunk
264
+
265
+ # Alternatively, install from source:
266
+ git clone https://github.com/modelscope/sirchmunk.git && cd sirchmunk
267
+ pip install -e .
268
+ ```
269
+
270
+ ### Python SDK Usage
271
+
272
+ ```python
273
+ import asyncio
274
+
275
+ from sirchmunk import AgenticSearch
276
+ from sirchmunk.llm import OpenAIChat
277
+
278
+ llm = OpenAIChat(
279
+ api_key="your-api-key",
280
+ base_url="your-base-url", # e.g., https://api.openai.com/v1
281
+ model="your-model-name" # e.g., gpt-4o
282
+ )
283
+
284
+ async def main():
285
+
286
+ agent_search = AgenticSearch(llm=llm)
287
+
288
+ result: str = await agent_search.search(
289
+ query="How does transformer attention work?",
290
+ search_paths=["/path/to/documents"],
291
+ )
292
+
293
+ print(result)
294
+
295
+ asyncio.run(main())
296
+ ```
297
+
298
+ **⚠️ Notes:**
299
+ - Upon initialization, AgenticSearch automatically checks if ripgrep-all and ripgrep are installed. If they are missing, it will attempt to install them automatically. If the automatic installation fails, please install them manually.
300
+ - References: https://github.com/BurntSushi/ripgrep | https://github.com/phiresky/ripgrep-all
301
+ - Replace `"your-api-key"`, `"your-base-url"`, `"your-model-name"` and `/path/to/documents` with your actual values.
302
+
303
+
304
+ ---
305
+
306
+ ## 🖥️ Web UI
307
+
308
+ The web UI is built for fast, transparent workflows: chat, knowledge analytics, and system monitoring in one place.
309
+
310
+ <div align="center">
311
+ <img src="assets/pic/Sirchmunk_Home.png" alt="Sirchmunk Home" width="85%">
312
+ <p><sub>Home — Chat with streaming logs, file-based RAG, and session management.</sub></p>
313
+ </div>
314
+
315
+ <div align="center">
316
+ <img src="assets/pic/Sirchmunk_Monitor.png" alt="Sirchmunk Monitor" width="85%">
317
+ <p><sub>Monitor — System health, chat activity, knowledge analytics, and LLM usage.</sub></p>
318
+ </div>
319
+
320
+ ### Installation
321
+
322
+ ```bash
323
+ git clone https://github.com/modelscope/sirchmunk.git && cd sirchmunk
324
+
325
+ pip install ".[web]"
326
+
327
+ npm install --prefix web
328
+ ```
329
+ - Note: Node.js 18+ is required for the web interface.
330
+
331
+
332
+ ### Running the Web UI
333
+
334
+ ```bash
335
+ # Start frontend and backend
336
+ python scripts/start_web.py
337
+
338
+ # Stop frontend and backend
339
+ python scripts/stop_web.py
340
+ ```
341
+
342
+ **Access the web UI at (by default):**
343
+ - Backend APIs: http://localhost:8584/docs
344
+ - Frontend: http://localhost:8585
345
+
346
+ **Configuration:**
347
+
348
+ - Access `Settings` → `Environment Variables` to configure the LLM API and other parameters.
349
+
350
+
351
+ ---
352
+
353
+ ## 🏗️ How it Works
354
+
355
+ ### Sirchmunk Framework
356
+
357
+ <div align="center">
358
+ <img src="assets/pic/Sirchmunk_Architecture.png" alt="Sirchmunk Architecture" width="85%">
359
+ </div>
360
+
361
+ ### Core Components
362
+
363
+ | Component | Description |
364
+ |:----------------------|:-------------------------------------------------------------------------|
365
+ | **AgenticSearch** | Search orchestrator with LLM-enhanced retrieval capabilities |
366
+ | **KnowledgeBase** | Transforms raw results into structured knowledge clusters with evidences |
367
+ | **EvidenceProcessor** | Evidence processing based on Monte Carlo importance sampling             |
368
+ | **GrepRetriever** | High-performance _indexless_ file search with parallel processing |
369
+ | **OpenAIChat** | Unified LLM interface supporting streaming and usage tracking |
370
+ | **MonitorTracker** | Real-time system and application metrics collection |
371
+
372
+ ---
373
+
374
+
375
+ ### Data Storage
376
+
377
+ All persistent data is stored in the configured `WORK_PATH` (default: `~/.sirchmunk/`):
378
+
379
+ ```
380
+ {WORK_PATH}/
381
+ ├── .cache/
382
+ ├── history/ # Chat session history (DuckDB)
383
+ │ └── chat_history.db
384
+ ├── knowledge/ # Knowledge clusters (Parquet)
385
+ │ └── knowledge_clusters.parquet
386
+ └── settings/ # User settings (DuckDB)
387
+ └── settings.db
388
+
389
+ ```
390
+
391
+ ---
392
+
393
+ ## ❓ FAQ
394
+
395
+ <details>
396
+ <summary><b>How is this different from traditional RAG systems?</b></summary>
397
+
398
+ Sirchmunk takes an **indexless approach**:
399
+
400
+ 1. **No pre-indexing**: Direct file search without vector database setup
401
+ 2. **Self-evolving**: Knowledge clusters evolve based on search patterns
402
+ 3. **Multi-level retrieval**: Adaptive keyword granularity for better recall
403
+ 4. **Evidence-based**: Monte Carlo sampling for precise content extraction
404
+
405
+ </details>
406
+
407
+ <details>
408
+ <summary><b>What LLM providers are supported?</b></summary>
409
+
410
+ Any OpenAI-compatible API endpoint, including (but not limited to):
411
+ - OpenAI (GPT-4, GPT-4o, GPT-3.5)
412
+ - Local models served via Ollama, llama.cpp, vLLM, SGLang, etc.
413
+ - Claude via API proxy
414
+
415
+ </details>
416
+
417
+ <details>
418
+ <summary><b>How do I add documents to search?</b></summary>
419
+
420
+ Simply specify the path in your search query:
421
+
422
+ ```python
423
+ result = await search.search(
424
+ query="Your question",
425
+ search_paths=["/path/to/folder", "/path/to/file.pdf"]
426
+ )
427
+ ```
428
+
429
+ No pre-processing or indexing required!
430
+
431
+ </details>
432
+
433
+ <details>
434
+ <summary><b>Where are knowledge clusters stored?</b></summary>
435
+
436
+ Knowledge clusters are persisted in Parquet format at:
437
+ ```
438
+ {WORK_PATH}/.cache/knowledge/knowledge_clusters.parquet
439
+ ```
440
+
441
+ You can query them using DuckDB or the `KnowledgeManager` API.
442
+
443
+ </details>
444
+
445
+ <details>
446
+ <summary><b>How do I monitor LLM token usage?</b></summary>
447
+
448
+ 1. **Web Dashboard**: Visit the Monitor page for real-time statistics
449
+ 2. **API**: `GET /api/v1/monitor/llm` returns usage metrics
450
+ 3. **Code**: Access `search.llm_usages` after search completion
451
+
452
+ </details>
453
+
454
+ ---
455
+
456
+ ## 📋 Roadmap
457
+
458
+ - [x] Text-retrieval from raw files
459
+ - [x] Knowledge structuring & persistence
460
+ - [x] Real-time chat with RAG
461
+ - [x] Web UI support
462
+ - [ ] Web search integration
463
+ - [ ] Multi-modal support (images, videos)
464
+ - [ ] Distributed search across nodes
465
+ - [ ] Knowledge visualization and deep analytics
466
+ - [ ] More file type support
467
+
468
+ ---
469
+
470
+ ## 🤝 Contributing
471
+
472
+ We welcome [contributions](https://github.com/modelscope/sirchmunk/pulls)!
473
+
474
+ ---
475
+
476
+ ## 📄 License
477
+
478
+ This project is licensed under the [Apache License 2.0](LICENSE).
479
+
480
+ ---
481
+
482
+ <div align="center">
483
+
484
+ **[ModelScope](https://github.com/modelscope)** · [⭐ Star us](https://github.com/modelscope/sirchmunk/stargazers) · [🐛 Report a bug](https://github.com/modelscope/sirchmunk/issues) · [💬 Discussions](https://github.com/modelscope/sirchmunk/discussions)
485
+
486
+ *✨ Sirchmunk: Raw data to self-evolving intelligence, real-time.*
487
+
488
+ </div>
489
+
490
+ <p align="center">
491
+ <em> ❤️ Thanks for Visiting ✨ Sirchmunk !</em><br><br>
492
+ <img src="https://visitor-badge.laobi.icu/badge?page_id=modelscope.sirchmunk&style=for-the-badge&color=00d4ff" alt="Views">
493
+ </p>