sirchmunk 0.0.0__py3-none-any.whl → 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. sirchmunk/__init__.py +8 -0
  2. sirchmunk/base.py +17 -0
  3. sirchmunk/insight/__init__.py +4 -0
  4. sirchmunk/insight/text_insights.py +292 -0
  5. sirchmunk/learnings/__init__.py +1 -0
  6. sirchmunk/learnings/evidence_processor.py +525 -0
  7. sirchmunk/learnings/knowledge_base.py +232 -0
  8. sirchmunk/llm/__init__.py +2 -0
  9. sirchmunk/llm/openai_chat.py +247 -0
  10. sirchmunk/llm/prompts.py +216 -0
  11. sirchmunk/retrieve/__init__.py +1 -0
  12. sirchmunk/retrieve/base.py +25 -0
  13. sirchmunk/retrieve/text_retriever.py +1026 -0
  14. sirchmunk/scan/__init__.py +1 -0
  15. sirchmunk/scan/base.py +18 -0
  16. sirchmunk/scan/file_scanner.py +373 -0
  17. sirchmunk/scan/web_scanner.py +18 -0
  18. sirchmunk/scheduler/__init__.py +0 -0
  19. sirchmunk/schema/__init__.py +2 -0
  20. sirchmunk/schema/cognition.py +106 -0
  21. sirchmunk/schema/context.py +25 -0
  22. sirchmunk/schema/knowledge.py +318 -0
  23. sirchmunk/schema/metadata.py +658 -0
  24. sirchmunk/schema/request.py +221 -0
  25. sirchmunk/schema/response.py +20 -0
  26. sirchmunk/schema/snapshot.py +346 -0
  27. sirchmunk/search.py +475 -0
  28. sirchmunk/storage/__init__.py +7 -0
  29. sirchmunk/storage/duckdb.py +676 -0
  30. sirchmunk/storage/knowledge_manager.py +720 -0
  31. sirchmunk/utils/__init__.py +15 -0
  32. sirchmunk/utils/constants.py +15 -0
  33. sirchmunk/utils/deps.py +23 -0
  34. sirchmunk/utils/file_utils.py +70 -0
  35. sirchmunk/utils/install_rga.py +124 -0
  36. sirchmunk/utils/log_utils.py +360 -0
  37. sirchmunk/utils/tokenizer_util.py +55 -0
  38. sirchmunk/utils/utils.py +108 -0
  39. sirchmunk/version.py +1 -1
  40. sirchmunk-0.0.1.dist-info/METADATA +416 -0
  41. sirchmunk-0.0.1.dist-info/RECORD +45 -0
  42. {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.dist-info}/WHEEL +1 -1
  43. sirchmunk-0.0.0.dist-info/METADATA +0 -26
  44. sirchmunk-0.0.0.dist-info/RECORD +0 -8
  45. {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.dist-info}/entry_points.txt +0 -0
  46. {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.dist-info}/licenses/LICENSE +0 -0
  47. {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,108 @@
1
+ # Copyright (c) ModelScope Contributors. All rights reserved.
2
+ import math
3
+ import re
4
+ from typing import Dict, List, LiteralString, Optional
5
+
6
+ from pydantic import RootModel, model_validator
7
+
8
+
9
class KeywordValidation(RootModel):
    # Root model whose payload is a mapping of keyword -> score.
    # Once the model has been validated, every score is clamped into the
    # closed interval [1.0, 10.0] by ``validate_values`` below.
    root: Dict[str, float]

    @model_validator(mode="after")
    def validate_values(self) -> "KeywordValidation":
        """Clamp every keyword score into [1.0, 10.0] in place and return self."""
        lowest, highest = 1.0, 10.0
        scores = self.root
        # Only existing keys are reassigned, so the dict never changes size
        # while it is being iterated.
        for keyword in scores:
            capped = min(highest, scores[keyword])
            scores[keyword] = max(lowest, capped)
        return self
18
+
19
+
20
def log_tf_norm(count: int) -> float:
    """Return the log-normalized term frequency.

    Args:
        count: Raw number of occurrences of a term.

    Returns:
        ``1 + math.log(count)`` when ``count`` is positive, otherwise ``0.0``.
        The result is always a float so callers get a consistent type.
    """
    if count > 0:
        return 1 + math.log(count)
    # Non-positive counts have no logarithm; treat them as "term absent".
    return 0.0
23
+
24
+
25
def log_tf_norm_penalty(count, ideal_range=(1, 5), penalty_alpha=0.2):
    """Log-scaled term frequency that is damped outside an ideal range.

    Args:
        count: Raw number of occurrences of a term.
        ideal_range: ``(low, high)`` bounds; counts inside them are not penalized.
        penalty_alpha: Strength of the exponential decay applied above ``high``.

    Returns:
        ``0.0`` for non-positive counts; otherwise ``log(count + 1)``, scaled
        down linearly below ``low`` and exponentially (in the square root of
        the overage) above ``high``.
    """
    if count <= 0:
        return 0.0

    lower, upper = ideal_range
    result = math.log(count + 1)

    # Too rare: shrink proportionally to how far below the lower bound we are.
    if count < lower:
        result = result * (count / lower)

    # Too frequent: decay with the square root of the excess over the upper bound.
    if count > upper:
        excess = count - upper
        result = result * math.exp(-penalty_alpha * (excess**0.5))

    return result
46
+
47
+
48
def extract_fields(
    content: str, tags: Optional[List[str]] = None
) -> Dict[str, Optional[str]]:
    """
    Extract the text enclosed by XML-like tags from LLM output.
    e.g. <DESCRIPTION>xxx</DESCRIPTION>, <NAME>xxx</NAME>, <CONTENT>xxx</CONTENT>.

    Only the first occurrence of each tag is used. Tag names are matched
    literally (regex metacharacters in a tag are escaped) and case-sensitively.

    Args:
        content (str): The raw output content from the LLM.
        tags (Optional[List[str]]): Tags to extract. Defaults to
            ["DESCRIPTION", "NAME", "CONTENT"] when omitted or empty.

    Returns:
        Dict[str, Optional[str]]: Keys are the lowercase tag names; values are
            the stripped inner text, or None when the tag pair is not found.
    """
    tags = tags or ["DESCRIPTION", "NAME", "CONTENT"]
    extracted_data: Dict[str, Optional[str]] = {}

    for tag in tags:
        # Escape the tag so characters such as "+" or "." are taken literally
        # instead of being interpreted as regex syntax.
        escaped = re.escape(tag)
        # Non-greedy capture between the tags; DOTALL lets "." span newlines
        # so multi-line content is handled.
        match = re.search(rf"<{escaped}>(.*?)</{escaped}>", content, re.DOTALL)
        # A missing tag (the LLM may omit one) is reported as None.
        extracted_data[tag.lower()] = match.group(1).strip() if match else None

    return extracted_data
84
+
85
+
86
if __name__ == "__main__":
    # Manual smoke test: run the extractor on a representative LLM reply and
    # print each recovered field.
    sample_reply = """
    Some irrelevant preamble from the LLM...
    <DESCRIPTION>
    This document set provides a detailed overview of the installation steps for Open-Agentic-Search.
    It covers environment configuration and core dependencies.
    </DESCRIPTION>
    <NAME>Environment Setup Guide</NAME>
    <CONTENT>
    1. Install Python 3.10+
    2. Run pip install -r requirements.txt
    3. Configure the .env environment variables.
    </CONTENT>
    """

    fields = extract_fields(sample_reply)

    # Show the three default fields in a fixed order.
    print(f"Name: {fields['name']}")
    print(f"Description: {fields['description']}")
    print(f"Content: \n{fields['content']}")
sirchmunk/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.0.0"
1
+ __version__ = "0.0.1"
@@ -0,0 +1,416 @@
1
+ Metadata-Version: 2.4
2
+ Name: sirchmunk
3
+ Version: 0.0.1
4
+ Summary: Sirchmunk: From raw data to self-evolving real-time intelligence.
5
+ Home-page: https://github.com/modelscope/sirchmunk
6
+ Author: ModelScope Team
7
+ Author-email: contact@modelscope.cn
8
+ License: Apache License 2.0
9
+ Project-URL: Homepage, https://github.com/modelscope/sirchmunk
10
+ Keywords: LLM,Agentic,Search,RAG,Indexless,Self-evolving,Real-time Intelligence,Multi-modal
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Operating System :: OS Independent
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Programming Language :: Python :: 3.13
18
+ Classifier: Programming Language :: Python :: 3.14
19
+ Classifier: License :: OSI Approved :: Apache Software License
20
+ Requires-Python: >=3.10
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Provides-Extra: docs
24
+ Provides-Extra: tests
25
+ Provides-Extra: web
26
+ Provides-Extra: all
27
+ Dynamic: home-page
28
+ Dynamic: license-file
29
+ Dynamic: provides-extra
30
+
31
+ <div align="center">
32
+
33
+ <img src="web/public/logo-v2.png" alt="Sirchmunk Logo" width="250" style="border-radius: 15px;">
34
+
35
+ # Sirchmunk: Raw data to self-evolving intelligence, real-time.
36
+
37
+ [![Python](https://img.shields.io/badge/Python-3.10%2B-3776AB?style=flat-square&logo=python&logoColor=white)](https://www.python.org/downloads/)
38
+ [![FastAPI](https://img.shields.io/badge/FastAPI-0.100%2B-009688?style=flat-square&logo=fastapi&logoColor=white)](https://fastapi.tiangolo.com/)
39
+ [![Next.js](https://img.shields.io/badge/Next.js-14-000000?style=flat-square&logo=next.js&logoColor=white)](https://nextjs.org/)
40
+ [![TailwindCSS](https://img.shields.io/badge/Tailwind-3.4-06B6D4?style=flat-square&logo=tailwindcss&logoColor=white)](https://tailwindcss.com/)
41
+ [![DuckDB](https://img.shields.io/badge/DuckDB-OLAP-FFF000?style=flat-square&logo=duckdb&logoColor=black)](https://duckdb.org/)
42
+ [![License](https://img.shields.io/badge/License-Apache%202.0-blue?style=flat-square)](LICENSE)
43
+ [![ripgrep-all](https://img.shields.io/badge/ripgrep--all-Search-E67E22?style=flat-square&logo=rust&logoColor=white)](https://github.com/phiresky/ripgrep-all)
44
+ [![OpenAI](https://img.shields.io/badge/OpenAI-API-412991?style=flat-square&logo=openai&logoColor=white)](https://github.com/openai/openai-python)
45
+ [![Kreuzberg](https://img.shields.io/badge/Kreuzberg-Text_Extraction-4CAF50?style=flat-square)](https://github.com/kreuzberg-dev/kreuzberg)
46
+
47
+
48
+ [**Quick Start**](#-quick-start) · [**Key Features**](#-key-features) · [**Web UI**](#-web-ui) · [**How it Works**](#-how-it-works) · [**FAQ**](#-faq)
49
+
50
+ [🇨🇳 中文](README_zh.md)
51
+
52
+ </div>
53
+
54
+ <div align="center">
55
+
56
+ 🔍 **Agentic Search** &nbsp;•&nbsp; 🧠 **Knowledge Clustering** &nbsp;•&nbsp; 📊 **Monte Carlo Evidence Sampling**<br>
57
+ ⚡ **Indexless Retrieval** &nbsp;•&nbsp; 🔄 **Self-Evolving Knowledge Base** &nbsp;•&nbsp; 💬 **Real-time Chat**
58
+
59
+ </div>
60
+
61
+ ---
62
+
63
+ ## 🌰 Why “Sirchmunk”?
64
+
65
+ Intelligence pipelines built upon vector-based retrieval can be _rigid and brittle_. They rely on static vector embeddings that are **expensive to compute, blind to real-time changes, and detached from the raw context**. We introduce **Sirchmunk** to usher in a more agile paradigm, where data is no longer treated as a snapshot, and insights can evolve together with the data.
66
+
67
+ ---
68
+
69
+ ## ✨ Key Features
70
+
71
+ ### 1. Embedding-Free: Data in its Purest Form
72
+
73
+ **Sirchmunk** works directly with **raw data** -- bypassing the heavy overhead of squeezing your rich files into fixed-dimensional vectors.
74
+
75
+ * **Instant Search:** Eliminates complex pre-processing pipelines and hours-long indexing; just drop your files and search immediately.
76
+ * **Full Fidelity:** Zero information loss — stay true to your data without vector approximation.
77
+
78
+ ### 2. Self-Evolving: A Living Index
79
+
80
+ Data is a stream, not a snapshot. **Sirchmunk** is **dynamic by design**, whereas a vector DB can become obsolete the moment your data changes.
81
+
82
+ * **Context-Aware:** Evolves in real-time with your data context.
83
+ * **LLM-Powered Autonomy:** Designed for Agents that perceive data as it lives, utilizing **token-efficient** reasoning that triggers LLM inference only when necessary to maximize intelligence while minimizing cost.
84
+
85
+ ### 3. Intelligence at Scale: Real-Time & Massive
86
+
87
+ **Sirchmunk** bridges massive local repositories and the web with **high-scale throughput** and **real-time awareness**. <br/>
88
+ It serves as a unified intelligent hub for AI agents, delivering deep insights across vast datasets at the speed of thought.
89
+
90
+ ---
91
+
92
+ ### Traditional RAG vs. Sirchmunk
93
+
94
+ <div style="display: flex; justify-content: center; width: 100%;">
95
+ <table style="width: 100%; max-width: 900px; border-collapse: separate; border-spacing: 0; overflow: hidden; border-radius: 12px; font-family: sans-serif; border: 1px solid rgba(128, 128, 128, 0.2); margin: 0 auto;">
96
+ <colgroup>
97
+ <col style="width: 25%;">
98
+ <col style="width: 30%;">
99
+ <col style="width: 45%;">
100
+ </colgroup>
101
+ <thead>
102
+ <tr style="background-color: rgba(128, 128, 128, 0.05);">
103
+ <th style="text-align: left; padding: 16px; border-bottom: 2px solid rgba(128, 128, 128, 0.2); font-size: 1.3em;">Dimension</th>
104
+ <th style="text-align: left; padding: 16px; border-bottom: 2px solid rgba(128, 128, 128, 0.2); font-size: 1.3em; opacity: 0.7;">Traditional RAG</th>
105
+ <th style="text-align: left; padding: 16px; border-bottom: 2px solid rgba(58, 134, 255, 0.5); color: #3a86ff; font-weight: 800; font-size: 1.3em;">✨Sirchmunk</th>
106
+ </tr>
107
+ </thead>
108
+ <tbody>
109
+ <tr>
110
+ <td style="padding: 16px; font-weight: 600; border-bottom: 1px solid rgba(128, 128, 128, 0.1);">💰 Setup Cost</td>
111
+ <td style="padding: 16px; opacity: 0.6; border-bottom: 1px solid rgba(128, 128, 128, 0.1);">High Overhead <br/> (VectorDB, GraphDB, Complex Document Parser...)</td>
112
+ <td style="padding: 16px; background-color: rgba(58, 134, 255, 0.08); color: #4895ef; border-bottom: 1px solid rgba(128, 128, 128, 0.1);">
113
+ ✅ Zero Infrastructure <br/>
114
+ <small style="opacity: 0.8; font-size: 0.85em;">Direct-to-data retrieval without vector silos</small>
115
+ </td>
116
+ </tr>
117
+ <tr>
118
+ <td style="padding: 16px; font-weight: 600; border-bottom: 1px solid rgba(128, 128, 128, 0.1);">🕒 Data Freshness</td>
119
+ <td style="padding: 16px; opacity: 0.6; border-bottom: 1px solid rgba(128, 128, 128, 0.1);">Stale (Batch Re-indexing)</td>
120
+ <td style="padding: 16px; background-color: rgba(58, 134, 255, 0.08); color: #4895ef; border-bottom: 1px solid rgba(128, 128, 128, 0.1);">
121
+ ✅ Instant &amp; Dynamic <br/>
122
+ <small style="opacity: 0.8; font-size: 0.85em;">Self-evolving index that reflects live changes</small>
123
+ </td>
124
+ </tr>
125
+ <tr>
126
+ <td style="padding: 16px; font-weight: 600; border-bottom: 1px solid rgba(128, 128, 128, 0.1);">📈 Scalability</td>
127
+ <td style="padding: 16px; opacity: 0.6; border-bottom: 1px solid rgba(128, 128, 128, 0.1);">Linear Cost Growth</td>
128
+ <td style="padding: 16px; background-color: rgba(58, 134, 255, 0.08); color: #4895ef; border-bottom: 1px solid rgba(128, 128, 128, 0.1);">
129
+ ✅ Extremely low RAM/CPU consumption <br/>
130
+ <small style="opacity: 0.8; font-size: 0.85em;">Native Elastic Support, efficiently handles large-scale datasets</small>
131
+ </td>
132
+ </tr>
133
+ <tr>
134
+ <td style="padding: 16px; font-weight: 600; border-bottom: 1px solid rgba(128, 128, 128, 0.1);">🎯 Accuracy</td>
135
+ <td style="padding: 16px; opacity: 0.6; border-bottom: 1px solid rgba(128, 128, 128, 0.1);">Approximate Vector Matches</td>
136
+ <td style="padding: 16px; background-color: rgba(58, 134, 255, 0.08); color: #4895ef; border-bottom: 1px solid rgba(128, 128, 128, 0.1);">
137
+ ✅ Deterministic &amp; Contextual <br/>
138
+ <small style="opacity: 0.8; font-size: 0.85em;">Hybrid logic ensuring semantic precision</small>
139
+ </td>
140
+ </tr>
141
+ <tr>
142
+ <td style="padding: 16px; font-weight: 600;">⚙️ Workflow</td>
143
+ <td style="padding: 16px; opacity: 0.6;">Complex ETL Pipelines</td>
144
+ <td style="padding: 16px; background-color: rgba(58, 134, 255, 0.08); color: #4895ef;">
145
+ ✅ Drop-and-Search <br/>
146
+ <small style="opacity: 0.8; font-size: 0.85em;">Zero-config integration for rapid deployment</small>
147
+ </td>
148
+ </tr>
149
+ </tbody>
150
+ </table>
151
+ </div>
152
+
153
+ ---
154
+
155
+
156
+ ## Demonstration
157
+
158
+
159
+ <div align="center">
160
+ <img src="assets/gif/Sirchmunk_Web.gif" alt="Sirchmunk WebUI" width="100%">
161
+ <p style="font-size: 1.1em; font-weight: 600; margin-top: 8px; color: #00bcd4;">
162
+ Access files directly to start chatting
163
+ </p>
164
+ </div>
165
+
166
+ ---
167
+
168
+
169
+ ## 🚀 Quick Start
170
+
171
+ ### Prerequisites
172
+
173
+ - **Python** 3.10+
174
+ - **LLM API Key** (OpenAI-compatible endpoint, local or remote)
175
+ - **Node.js** 18+ (Optional, for web interface)
176
+
177
+ ### Installation
178
+
179
+ ```bash
180
+ # Create virtual environment (recommended)
181
+ conda create -n sirchmunk python=3.13 -y && conda activate sirchmunk
182
+
183
+ pip install sirchmunk
184
+
185
+ # Or via UV:
186
+ uv pip install sirchmunk
187
+
188
+ # Alternatively, install from source:
189
+ git clone https://github.com/modelscope/sirchmunk.git && cd sirchmunk
190
+ pip install -e .
191
+ ```
192
+
193
+ ### Python SDK Usage
194
+
195
+ ```python
196
+ import asyncio
197
+
198
+ from sirchmunk import AgenticSearch
199
+ from sirchmunk.llm import OpenAIChat
200
+
201
+ llm = OpenAIChat(
202
+ api_key="your-api-key",
203
+ base_url="your-base-url", # e.g., https://api.openai.com/v1
204
+ model="your-model-name" # e.g., gpt-4o
205
+ )
206
+
207
+ async def main():
208
+
209
+ agent_search = AgenticSearch(llm=llm)
210
+
211
+ result: str = await agent_search.search(
212
+ query="How does transformer attention work?",
213
+ search_paths=["/path/to/documents"],
214
+ )
215
+
216
+ print(result)
217
+
218
+ asyncio.run(main())
219
+ ```
220
+
221
+ **⚠️ Notes:**
222
+ - Upon initialization, AgenticSearch automatically checks if ripgrep-all and ripgrep are installed. If they are missing, it will attempt to install them automatically. If the automatic installation fails, please install them manually.
223
+ - References: https://github.com/BurntSushi/ripgrep | https://github.com/phiresky/ripgrep-all
224
+ - Replace `"your-api-key"`, `"your-base-url"`, `"your-model-name"` and `/path/to/documents` with your actual values.
225
+
226
+
227
+ ---
228
+
229
+ ## 🖥️ Web UI
230
+
231
+ The web UI is built for fast, transparent workflows: chat, knowledge analytics, and system monitoring in one place.
232
+
233
+ <div align="center">
234
+ <img src="assets/pic/Sirchmunk_Home.png" alt="Sirchmunk Home" width="85%">
235
+ <p><sub>Home — Chat with streaming logs, file-based RAG, and session management.</sub></p>
236
+ </div>
237
+
238
+ <div align="center">
239
+ <img src="assets/pic/Sirchmunk_Monitor.png" alt="Sirchmunk Monitor" width="85%">
240
+ <p><sub>Monitor — System health, chat activity, knowledge analytics, and LLM usage.</sub></p>
241
+ </div>
242
+
243
+ ### Installation
244
+
245
+ ```bash
246
+ git clone https://github.com/modelscope/sirchmunk.git && cd sirchmunk
247
+
248
+ pip install ".[web]"
249
+
250
+ npm install --prefix web
251
+ ```
252
+ - Note: Node.js 18+ is required for the web interface.
253
+
254
+
255
+ ### Running the Web UI
256
+
257
+ ```bash
258
+ # Start frontend and backend
259
+ python scripts/start_web.py
260
+
261
+ # Stop frontend and backend
262
+ python scripts/stop_web.py
263
+ ```
264
+
265
+ **Access the web UI at (By default):**
266
+ - Backend APIs: http://localhost:8584/docs
267
+ - Frontend: http://localhost:8585
268
+
269
+ **Configuration:**
270
+
271
+ - Access `Settings` → `Environment Variables` to configure the LLM API and other parameters.
272
+
273
+
274
+ ---
275
+
276
+ ## 🏗️ How it Works
277
+
278
+ ### Sirchmunk Framework
279
+
280
+ <div align="center">
281
+ <img src="assets/pic/Sirchmunk_Architecture.png" alt="Sirchmunk Architecture" width="85%">
282
+ </div>
283
+
284
+ ### Core Components
285
+
286
+ | Component | Description |
287
+ |:----------------------|:-------------------------------------------------------------------------|
288
+ | **AgenticSearch** | Search orchestrator with LLM-enhanced retrieval capabilities |
289
+ | **KnowledgeBase** | Transforms raw results into structured knowledge clusters with evidences |
290
+ | **EvidenceProcessor** | Evidence processing based on Monte Carlo importance sampling |
291
+ | **GrepRetriever** | High-performance _indexless_ file search with parallel processing |
292
+ | **OpenAIChat** | Unified LLM interface supporting streaming and usage tracking |
293
+ | **MonitorTracker** | Real-time system and application metrics collection |
294
+
295
+ ---
296
+
297
+
298
+ ### Data Storage
299
+
300
+ All persistent data is stored in the configured `WORK_PATH` (default: `~/.sirchmunk/`):
301
+
302
+ ```
303
+ {WORK_PATH}/
304
+ ├── .cache/
305
+ ├── history/ # Chat session history (DuckDB)
306
+ │ └── chat_history.db
307
+ ├── knowledge/ # Knowledge clusters (Parquet)
308
+ │ └── knowledge_clusters.parquet
309
+ └── settings/ # User settings (DuckDB)
310
+ └── settings.db
311
+
312
+ ```
313
+
314
+ ---
315
+
316
+ ## ❓ FAQ
317
+
318
+ <details>
319
+ <summary><b>How is this different from traditional RAG systems?</b></summary>
320
+
321
+ Sirchmunk takes an **indexless approach**:
322
+
323
+ 1. **No pre-indexing**: Direct file search without vector database setup
324
+ 2. **Self-evolving**: Knowledge clusters evolve based on search patterns
325
+ 3. **Multi-level retrieval**: Adaptive keyword granularity for better recall
326
+ 4. **Evidence-based**: Monte Carlo sampling for precise content extraction
327
+
328
+ </details>
329
+
330
+ <details>
331
+ <summary><b>What LLM providers are supported?</b></summary>
332
+
333
+ Any OpenAI-compatible API endpoint, including (but not limited to):
334
+ - OpenAI (GPT-4, GPT-4o, GPT-3.5)
335
+ - Local models served via Ollama, llama.cpp, vLLM, SGLang etc.
336
+ - Claude via API proxy
337
+
338
+ </details>
339
+
340
+ <details>
341
+ <summary><b>How do I add documents to search?</b></summary>
342
+
343
+ Simply specify the path in your search query:
344
+
345
+ ```python
346
+ result = await search.search(
347
+ query="Your question",
348
+ search_paths=["/path/to/folder", "/path/to/file.pdf"]
349
+ )
350
+ ```
351
+
352
+ No pre-processing or indexing required!
353
+
354
+ </details>
355
+
356
+ <details>
357
+ <summary><b>Where are knowledge clusters stored?</b></summary>
358
+
359
+ Knowledge clusters are persisted in Parquet format at:
360
+ ```
361
+ {WORK_PATH}/.cache/knowledge/knowledge_clusters.parquet
362
+ ```
363
+
364
+ You can query them using DuckDB or the `KnowledgeManager` API.
365
+
366
+ </details>
367
+
368
+ <details>
369
+ <summary><b>How do I monitor LLM token usage?</b></summary>
370
+
371
+ 1. **Web Dashboard**: Visit the Monitor page for real-time statistics
372
+ 2. **API**: `GET /api/v1/monitor/llm` returns usage metrics
373
+ 3. **Code**: Access `search.llm_usages` after search completion
374
+
375
+ </details>
376
+
377
+ ---
378
+
379
+ ## 📋 Roadmap
380
+
381
+ - [x] Text-retrieval from raw files
382
+ - [x] Knowledge structuring & persistence
383
+ - [x] Real-time chat with RAG
384
+ - [x] Web UI support
385
+ - [ ] Web search integration
386
+ - [ ] Multi-modal support (images, videos)
387
+ - [ ] Distributed search across nodes
388
+ - [ ] Knowledge visualization and deep analytics
389
+ - [ ] More file type support
390
+
391
+ ---
392
+
393
+ ## 🤝 Contributing
394
+
395
+ We welcome [contributions](https://github.com/modelscope/sirchmunk/pulls)!
396
+
397
+ ---
398
+
399
+ ## 📄 License
400
+
401
+ This project is licensed under the [Apache License 2.0](LICENSE).
402
+
403
+ ---
404
+
405
+ <div align="center">
406
+
407
+ **[ModelScope](https://github.com/modelscope)** · [⭐ Star us](https://github.com/modelscope/sirchmunk/stargazers) · [🐛 Report a bug](https://github.com/modelscope/sirchmunk/issues) · [💬 Discussions](https://github.com/modelscope/sirchmunk/discussions)
408
+
409
+ *✨ Sirchmunk: Raw data to self-evolving intelligence, real-time.*
410
+
411
+ </div>
412
+
413
+ <p align="center">
414
+ <em> ❤️ Thanks for Visiting ✨ Sirchmunk !</em><br><br>
415
+ <img src="https://visitor-badge.laobi.icu/badge?page_id=modelscope.sirchmunk&style=for-the-badge&color=00d4ff" alt="Views">
416
+ </p>
@@ -0,0 +1,45 @@
1
+ sirchmunk/__init__.py,sha256=5sdppELUcjnTofJtwZ2ACuUscmLYIPhCwj-G-MMSl-M,184
2
+ sirchmunk/base.py,sha256=qVQ63QfEWhEvOJl3OxQvC2rOUNTZCD5weXRn-1vvEkU,439
3
+ sirchmunk/search.py,sha256=NROpV39oklDT4wusE8EnkQPABq9LiAa0kQrtyR6viHQ,18210
4
+ sirchmunk/version.py,sha256=sXLh7g3KC4QCFxcZGBTpG2scR7hmmBsMjq6LqRptkRg,22
5
+ sirchmunk/insight/__init__.py,sha256=7sQeT0fSg5-b9jrwthA9_fSCR1Q5qSvm33L9l7kHjLY,144
6
+ sirchmunk/insight/text_insights.py,sha256=BwFqmDNG8FmOz4Lv3qO1Mz2xRv_LVg4b3wj6YUhVLVE,9075
7
+ sirchmunk/learnings/__init__.py,sha256=310L84MdAIw4THnzf5YsLiUhW_oaxgJHHcZZeMso3jY,61
8
+ sirchmunk/learnings/evidence_processor.py,sha256=QDg-qReSte8R8I2BrRRC-d-Glyfjcnqazbe57-PSDHU,17922
9
+ sirchmunk/learnings/knowledge_base.py,sha256=8szliKLFKu_BTIVf0vhy2mSe7Tix4145Umdd5OQ2rww,8343
10
+ sirchmunk/llm/__init__.py,sha256=4ynF6R63afMWW1d9T21C8JqfVc9xJTiOFXZNzDwPLww,98
11
+ sirchmunk/llm/openai_chat.py,sha256=ET7HqEoFTbbvhUTlAUZNOoJwxF9hA73qeH3xFTsNK-w,8138
12
+ sirchmunk/llm/prompts.py,sha256=8Wpugif43EhCaDuRF4U6xRgbv6EB_UdU-2NdDFuglM0,8755
13
+ sirchmunk/retrieve/__init__.py,sha256=310L84MdAIw4THnzf5YsLiUhW_oaxgJHHcZZeMso3jY,61
14
+ sirchmunk/retrieve/base.py,sha256=VDpCwdwhjVYuj0mbe78qg_FhbcQkasbCS0cd66hQ4hk,618
15
+ sirchmunk/retrieve/text_retriever.py,sha256=1ryDjqecXJz2bwjeoJhaUvMxCAdoNzPMMOAJYH3x7g0,39484
16
+ sirchmunk/scan/__init__.py,sha256=310L84MdAIw4THnzf5YsLiUhW_oaxgJHHcZZeMso3jY,61
17
+ sirchmunk/scan/base.py,sha256=3Jqvn0W9KJ00u2oy8U-qD16KksV7taAx4s9qhCpRD-c,496
18
+ sirchmunk/scan/file_scanner.py,sha256=GtGFP9ZwqemOSkHjFcEDaD2WCnw5qMB_bCeiVUaIEQg,14645
19
+ sirchmunk/scan/web_scanner.py,sha256=YQRBQ5JnATwNdek2o1_Y8GB0eG0IWQvx4YhqW4Zc5Tw,531
20
+ sirchmunk/scheduler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
21
+ sirchmunk/schema/__init__.py,sha256=kvzKU8Rt1t91Rm5tuy6KqgZXj8lfYQ2q1XvLYGNLUh4,103
22
+ sirchmunk/schema/cognition.py,sha256=NDgCOjMpqyUBxd_ymeMQrjSr3Ri-nJeMUNRuWfLUhBI,4031
23
+ sirchmunk/schema/context.py,sha256=wwsSBE5EJXmrvh-RCluf3nJm7hcxkWaiNWfJ10coGY4,628
24
+ sirchmunk/schema/knowledge.py,sha256=mPKuhqXzLFIfcz5yHRcNe7B4WifUVlHDNnzeevnFKQM,11108
25
+ sirchmunk/schema/metadata.py,sha256=VdUD6GPCB_jtyBkowkGEpCPMYL1c-sFVSUloKPSZRwE,22810
26
+ sirchmunk/schema/request.py,sha256=EPum-IzmN15EgFmwN12oL-CNoI-KvTcHN4nLuO-c09c,7702
27
+ sirchmunk/schema/response.py,sha256=6xc5tvAnqL_zpUhQtchVGwbrt1btzy4QArCeS81DrIU,488
28
+ sirchmunk/schema/snapshot.py,sha256=zZSKDRN8jMtpOIH3TL0FCrAn7AlUc_5zO8qFX2f2u_s,12341
29
+ sirchmunk/storage/__init__.py,sha256=9mF1JdbVrgC0mt9uvBYU9RwiLGRdrmwGAyTj7EA21Pg,231
30
+ sirchmunk/storage/duckdb.py,sha256=Jw_EK9i589YyKXYhi-yjAmX7zIM7txDsoxy0Gx-XY8o,23092
31
+ sirchmunk/storage/knowledge_manager.py,sha256=YYlZa4CnbCXzFWjOleztil2hohqrvrOZs-8UcV0DTEM,27746
32
+ sirchmunk/utils/__init__.py,sha256=33bVrhpUfPXpM85r_SEB07QJnGDjD-4BE5p9vpF-fXw,265
33
+ sirchmunk/utils/constants.py,sha256=D6RP2mtRH9uA3ZI1SpFD77_QPtYkxIKIi7zZY8-do1w,551
34
+ sirchmunk/utils/deps.py,sha256=QTL0k7CN1t0_r-CrZ4TM__pdK8U2X1d0OKvJUtQe_xA,563
35
+ sirchmunk/utils/file_utils.py,sha256=9OtYNffXbo1Pz6XuJsOECQS_mYRBg1NpGUsglNgfWnU,2257
36
+ sirchmunk/utils/install_rga.py,sha256=i7sWYi6u2A32dc0mq5LB_OtqcMWqKezSz6WILWaK1Oc,5215
37
+ sirchmunk/utils/log_utils.py,sha256=HujosgEXV9fpSVd6JjRh7KEQuskhrcpC4Kit6aIpnW0,14195
38
+ sirchmunk/utils/tokenizer_util.py,sha256=JIg4FylB6o8AIe71q8Uv83PKmPF1funX3ffhb9cPX3c,1624
39
+ sirchmunk/utils/utils.py,sha256=qnwZ8R9xLqsMooW0IdcWoPKto7q4AQ49-SOR33rCy1g,3394
40
+ sirchmunk-0.0.1.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
41
+ sirchmunk-0.0.1.dist-info/METADATA,sha256=MnMXApUAaWluApy7zthN2yGhhT-t2-OhXxiG9VM__lw,15724
42
+ sirchmunk-0.0.1.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
43
+ sirchmunk-0.0.1.dist-info/entry_points.txt,sha256=lpnP-Ll2CUY0P1wYm6kutcBMrxwG67astmgY-vVhF14,56
44
+ sirchmunk-0.0.1.dist-info/top_level.txt,sha256=8MiQvqjFkqiGJ7m4xqxsfxwxFHtH2mIXlrkV9PfX-aM,10
45
+ sirchmunk-0.0.1.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.9.0)
2
+ Generator: setuptools (80.10.1)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,26 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: sirchmunk
3
- Version: 0.0.0
4
- Summary: sirchmunk
5
- Author: sirchmunk team
6
- Author-email: xx@yy.com
7
- License: Apache License 2.0
8
- Project-URL: Homepage, https://github.com
9
- Keywords: sirchmunk,Software
10
- Classifier: Development Status :: 4 - Beta
11
- Classifier: Operating System :: OS Independent
12
- Classifier: Programming Language :: Python :: 3
13
- Classifier: Programming Language :: Python :: 3.10
14
- Classifier: Programming Language :: Python :: 3.11
15
- Classifier: Programming Language :: Python :: 3.12
16
- Classifier: Programming Language :: Python :: 3.13
17
- Classifier: Programming Language :: Python :: 3.14
18
- Classifier: License :: OSI Approved :: Apache Software License
19
- Requires-Python: >=3.10
20
- Description-Content-Type: text/markdown
21
- License-File: LICENSE
22
- Provides-Extra: docs
23
- Provides-Extra: tests
24
- Dynamic: license-file
25
-
26
- # sentis
@@ -1,8 +0,0 @@
1
- sirchmunk/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- sirchmunk/version.py,sha256=qwX_VmtTcxitUuo61-lzyWhU70ydr0gDejMn5eqC3Dk,21
3
- sirchmunk-0.0.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
4
- sirchmunk-0.0.0.dist-info/METADATA,sha256=d1AhiO-kUXkKBo3JSgZ2wS-WzHvz9NP6D52EghbMi0c,835
5
- sirchmunk-0.0.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
6
- sirchmunk-0.0.0.dist-info/entry_points.txt,sha256=lpnP-Ll2CUY0P1wYm6kutcBMrxwG67astmgY-vVhF14,56
7
- sirchmunk-0.0.0.dist-info/top_level.txt,sha256=8MiQvqjFkqiGJ7m4xqxsfxwxFHtH2mIXlrkV9PfX-aM,10
8
- sirchmunk-0.0.0.dist-info/RECORD,,