sheaf-ai 0.4.0a0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sheaf_ai-0.4.0a0/LICENSE +21 -0
- sheaf_ai-0.4.0a0/PKG-INFO +254 -0
- sheaf_ai-0.4.0a0/README.md +206 -0
- sheaf_ai-0.4.0a0/prompts/__init__.py +0 -0
- sheaf_ai-0.4.0a0/prompts/classify.md +53 -0
- sheaf_ai-0.4.0a0/prompts/crystallize.md +38 -0
- sheaf_ai-0.4.0a0/prompts/summarize.md +50 -0
- sheaf_ai-0.4.0a0/pyproject.toml +87 -0
- sheaf_ai-0.4.0a0/setup.cfg +4 -0
- sheaf_ai-0.4.0a0/sheaf_ai/__init__.py +12 -0
- sheaf_ai-0.4.0a0/sheaf_ai/api.py +345 -0
- sheaf_ai-0.4.0a0/sheaf_ai/cli.py +271 -0
- sheaf_ai-0.4.0a0/sheaf_ai/config.py +74 -0
- sheaf_ai-0.4.0a0/sheaf_ai/crystallize.py +549 -0
- sheaf_ai-0.4.0a0/sheaf_ai/display.py +337 -0
- sheaf_ai-0.4.0a0/sheaf_ai/embedding_bridge.py +203 -0
- sheaf_ai-0.4.0a0/sheaf_ai/exceptions.py +45 -0
- sheaf_ai-0.4.0a0/sheaf_ai/feedback.py +171 -0
- sheaf_ai-0.4.0a0/sheaf_ai/fetch_article.py +709 -0
- sheaf_ai-0.4.0a0/sheaf_ai/gamification.py +708 -0
- sheaf_ai-0.4.0a0/sheaf_ai/insights.py +325 -0
- sheaf_ai-0.4.0a0/sheaf_ai/llm_client.py +166 -0
- sheaf_ai-0.4.0a0/sheaf_ai/mcp_server.py +379 -0
- sheaf_ai-0.4.0a0/sheaf_ai/onboarding.py +145 -0
- sheaf_ai-0.4.0a0/sheaf_ai/pipeline.py +405 -0
- sheaf_ai-0.4.0a0/sheaf_ai/quality.py +214 -0
- sheaf_ai-0.4.0a0/sheaf_ai/query.py +199 -0
- sheaf_ai-0.4.0a0/sheaf_ai/renderer.py +419 -0
- sheaf_ai-0.4.0a0/sheaf_ai/search.py +170 -0
- sheaf_ai-0.4.0a0/sheaf_ai/storage.py +291 -0
- sheaf_ai-0.4.0a0/sheaf_ai/utils.py +109 -0
- sheaf_ai-0.4.0a0/sheaf_ai.egg-info/PKG-INFO +254 -0
- sheaf_ai-0.4.0a0/sheaf_ai.egg-info/SOURCES.txt +48 -0
- sheaf_ai-0.4.0a0/sheaf_ai.egg-info/dependency_links.txt +1 -0
- sheaf_ai-0.4.0a0/sheaf_ai.egg-info/entry_points.txt +5 -0
- sheaf_ai-0.4.0a0/sheaf_ai.egg-info/requires.txt +30 -0
- sheaf_ai-0.4.0a0/sheaf_ai.egg-info/top_level.txt +3 -0
- sheaf_ai-0.4.0a0/sheaf_cards/__init__.py +19 -0
- sheaf_ai-0.4.0a0/sheaf_cards/base.py +267 -0
- sheaf_ai-0.4.0a0/sheaf_cards/embeddings.py +264 -0
- sheaf_ai-0.4.0a0/sheaf_cards/generator.py +276 -0
- sheaf_ai-0.4.0a0/tests/test_api.py +209 -0
- sheaf_ai-0.4.0a0/tests/test_crystallize.py +500 -0
- sheaf_ai-0.4.0a0/tests/test_gamification.py +541 -0
- sheaf_ai-0.4.0a0/tests/test_install.py +184 -0
- sheaf_ai-0.4.0a0/tests/test_mcp.py +137 -0
- sheaf_ai-0.4.0a0/tests/test_quality.py +314 -0
- sheaf_ai-0.4.0a0/tests/test_renderer.py +446 -0
- sheaf_ai-0.4.0a0/tests/test_storage.py +169 -0
- sheaf_ai-0.4.0a0/tests/test_unit.py +113 -0
sheaf_ai-0.4.0a0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 zhelunSun
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,254 @@
|
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
|
+
Name: sheaf-ai
|
|
3
|
+
Version: 0.4.0a0
|
|
4
|
+
Summary: Sheaf — Your personal knowledge layer. Paste a link, AI does the rest.
|
|
5
|
+
Author: zhelunSun
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/zhelunSun/sheaf-ai
|
|
8
|
+
Project-URL: Repository, https://github.com/zhelunSun/sheaf-ai
|
|
9
|
+
Project-URL: Issues, https://github.com/zhelunSun/sheaf-ai/issues
|
|
10
|
+
Keywords: knowledge-management,ai,mcp,cli,agent
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
18
|
+
Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
|
|
19
|
+
Classifier: Framework :: Pytest
|
|
20
|
+
Requires-Python: >=3.10
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Requires-Dist: openai>=2.32.0
|
|
24
|
+
Requires-Dist: requests>=2.32.0
|
|
25
|
+
Requires-Dist: beautifulsoup4>=4.12.0
|
|
26
|
+
Requires-Dist: numpy>=1.24.0
|
|
27
|
+
Provides-Extra: browser
|
|
28
|
+
Requires-Dist: playwright>=1.59.0; extra == "browser"
|
|
29
|
+
Provides-Extra: cards
|
|
30
|
+
Requires-Dist: numpy>=1.24.0; extra == "cards"
|
|
31
|
+
Provides-Extra: server
|
|
32
|
+
Requires-Dist: fastapi>=0.100.0; extra == "server"
|
|
33
|
+
Requires-Dist: uvicorn>=0.20.0; extra == "server"
|
|
34
|
+
Requires-Dist: httpx>=0.24.0; extra == "server"
|
|
35
|
+
Provides-Extra: all
|
|
36
|
+
Requires-Dist: playwright>=1.59.0; extra == "all"
|
|
37
|
+
Requires-Dist: numpy>=1.24.0; extra == "all"
|
|
38
|
+
Requires-Dist: fastapi>=0.100.0; extra == "all"
|
|
39
|
+
Requires-Dist: uvicorn>=0.20.0; extra == "all"
|
|
40
|
+
Requires-Dist: httpx>=0.24.0; extra == "all"
|
|
41
|
+
Provides-Extra: dev
|
|
42
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
43
|
+
Requires-Dist: ruff>=0.4; extra == "dev"
|
|
44
|
+
Requires-Dist: build>=1.0; extra == "dev"
|
|
45
|
+
Requires-Dist: fastapi>=0.100.0; extra == "dev"
|
|
46
|
+
Requires-Dist: uvicorn>=0.20.0; extra == "dev"
|
|
47
|
+
Requires-Dist: httpx>=0.24.0; extra == "dev"
|
|
48
|
+
|
|
49
|
+
# Sheaf
|
|
50
|
+
|
|
51
|
+
> **Harvest your knowledge. Bundle it. Share it.**
|
|
52
|
+
|
|
53
|
+
[Python 3.10+](https://www.python.org/downloads/)
|
|
54
|
+
[License: MIT](LICENSE)
|
|
55
|
+
[Tests](tests/)
|
|
56
|
+
[PyPI](https://pypi.org/project/sheaf-ai/)
|
|
57
|
+
[PyPI downloads](https://pypi.org/project/sheaf-ai/)
|
|
58
|
+
|
|
59
|
+
A **sheaf** is a bundle of grain — the basic unit a farmer brings to market. Sheaf does the same for knowledge: gather what you read, crystallize it into structured bundles, and make it tradable. Your AI agents can search, cite, and reason over everything you've collected.
|
|
60
|
+
|
|
61
|
+
## Quick Start
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
# Install from PyPI
|
|
65
|
+
pip install sheaf-ai
|
|
66
|
+
|
|
67
|
+
# Or install from source
|
|
68
|
+
git clone https://github.com/zhelunSun/sheaf-ai.git
|
|
69
|
+
cd sheaf-ai
|
|
70
|
+
pip install -e .
|
|
71
|
+
|
|
72
|
+
# Set your LLM API key (any OpenAI-compatible endpoint)
|
|
73
|
+
export OPENAI_API_KEY=sk-...
|
|
74
|
+
|
|
75
|
+
# First-time onboarding (collects 3 sample articles)
|
|
76
|
+
sheaf init
|
|
77
|
+
|
|
78
|
+
# Collect a link
|
|
79
|
+
sheaf collect https://arxiv.org/abs/2401.00000
|
|
80
|
+
|
|
81
|
+
# Search your collection
|
|
82
|
+
sheaf search "transformer architecture"
|
|
83
|
+
|
|
84
|
+
# Crystallize knowledge cards from collected articles
|
|
85
|
+
sheaf crystallize AI
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
No accounts. No cloud. Your data lives in `./data/` as Markdown + JSONL.
|
|
89
|
+
|
|
90
|
+
## The Problem
|
|
91
|
+
|
|
92
|
+
You save links every day — articles, repos, papers, tutorials. **95% never get opened again.**
|
|
93
|
+
|
|
94
|
+
Not because you're lazy. Because bookmarks serve *human reading*, not *agent workflows*. When you ask your coding agent "what did I read about MCP last week?", it has no idea.
|
|
95
|
+
|
|
96
|
+
Sheaf fixes this. Every link you save becomes a **structured entry** — a single stalk of grain. Crystallize enough of them, and you get a **bundle**: a portable, searchable knowledge pack any agent can consume.
|
|
97
|
+
|
|
98
|
+
## What It Does
|
|
99
|
+
|
|
100
|
+
1. **Harvest** — paste a link, Sheaf fetches, classifies, and summarizes it
|
|
101
|
+
2. **Crystallize** — distill 3+ related entries into structured knowledge cards with evidence tracing
|
|
102
|
+
3. **Bundle** — package cards into a portable `.sheaf` unit (coming soon)
|
|
103
|
+
4. **Agent-ready** — built-in MCP server lets any LLM agent query your knowledge base
|
|
104
|
+
|
|
105
|
+
## Core Commands
|
|
106
|
+
|
|
107
|
+
```bash
|
|
108
|
+
sheaf collect <url> # Collect an article, paper, or webpage
|
|
109
|
+
sheaf search <query> # Full-text search across your collection
|
|
110
|
+
sheaf stats # Collection statistics with topic trends
|
|
111
|
+
sheaf crystallize <topic> # Crystallize knowledge cards from a topic
|
|
112
|
+
sheaf crystallize --list # List all crystallized cards
|
|
113
|
+
sheaf crystallize --semantic <q> # Semantic vector search across cards
|
|
114
|
+
sheaf tags # Tag statistics
|
|
115
|
+
sheaf weekly # Weekly summary report
|
|
116
|
+
sheaf insights # Cross-topic association discovery
|
|
117
|
+
sheaf urgent # Show entries with upcoming deadlines
|
|
118
|
+
sheaf mcp # Start MCP server (stdio transport)
|
|
119
|
+
sheaf init # First-time onboarding with demo
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
## Crystallize: Your Second Brain
|
|
123
|
+
|
|
124
|
+
This is Sheaf's killer feature. Instead of leaving your bookmarks to rot, `sheaf crystallize` synthesizes insights across multiple entries:
|
|
125
|
+
|
|
126
|
+
```bash
|
|
127
|
+
$ sheaf crystallize AI
|
|
128
|
+
Crystallizing 'AI'...
|
|
129
|
+
✨ 5 knowledge cards crystallized:
|
|
130
|
+
📌 RAG faces retrieval relevance challenges (90%)
|
|
131
|
+
RAG systems heavily depend on retrieval quality; errors degrade LLM output reliability.
|
|
132
|
+
📌 CRAG framework improves RAG robustness (95%)
|
|
133
|
+
CRAG introduces a retrieval evaluator, web search augmentation, and document decomposition.
|
|
134
|
+
📌 Retrieval granularity significantly impacts performance (90%)
|
|
135
|
+
Finer-grained units like propositions outperform traditional passage-level retrieval.
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
Each card includes:
|
|
139
|
+
- **Confidence score** (0-100%)
|
|
140
|
+
- **Evidence tracing** — which source entries contributed
|
|
141
|
+
- **Topic provenance** — what topic this card belongs to
|
|
142
|
+
- **Tags** — for filtering and cross-referencing
|
|
143
|
+
|
|
144
|
+
Use `sheaf crystallize --semantic "query"` for vector-based semantic search across all your cards.
|
|
145
|
+
|
|
146
|
+
## MCP Server
|
|
147
|
+
|
|
148
|
+
Sheaf ships with a built-in [Model Context Protocol](https://modelcontextprotocol.io/) server. Any MCP-compatible agent can query your knowledge base:
|
|
149
|
+
|
|
150
|
+
```bash
|
|
151
|
+
sheaf mcp
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
**Available tools (9 total):**
|
|
155
|
+
|
|
156
|
+
| Tool | Description |
|
|
157
|
+
|------|-------------|
|
|
158
|
+
| `sheaf_search` | Full-text search across all entries |
|
|
159
|
+
| `sheaf_list` | List recent entries with filtering |
|
|
160
|
+
| `sheaf_get` | Get full entry details by ID |
|
|
161
|
+
| `sheaf_urgent` | Find time-sensitive entries (deadlines, CFPs) |
|
|
162
|
+
| `sheaf_collect` | Add a new URL to your collection |
|
|
163
|
+
| `sheaf_correct` | Correct a classification error |
|
|
164
|
+
| `sheaf_crystallize` | Crystallize knowledge cards from a topic |
|
|
165
|
+
| `sheaf_list_cards` | List crystallized cards (optional topic filter) |
|
|
166
|
+
| `sheaf_get_card` | Get full card details by ID |
|
|
167
|
+
|
|
168
|
+
## What You Can Collect
|
|
169
|
+
|
|
170
|
+
Sheaf handles more than just web articles:
|
|
171
|
+
|
|
172
|
+
| Input | Example | What Sheaf does |
|
|
173
|
+
|-------|---------|-----------------|
|
|
174
|
+
| **Web articles** | `sheaf collect https://arxiv.org/abs/2401.00000` | Fetches full text, extracts title/author/abstract, classifies topic |
|
|
175
|
+
| **AI chat shares** | `sheaf collect https://chatgpt.com/share/...` | Extracts the Q&A conversation, structures it as reusable knowledge |
|
|
176
|
+
| **WeChat / Zhihu posts** | `sheaf collect https://mp.weixin.qq.com/s/...` | Handles paywalls and dynamic rendering via Playwright fallback |
|
|
177
|
+
| **Pasted text** | `sheaf collect --text "Key insight..."` | Wraps freeform text into a structured entry with auto-classification |
|
|
178
|
+
|
|
179
|
+
Under the hood, every input goes through the same pipeline: **fetch → classify → summarize → store**. The output is always a structured entry your agents can search and cite.
|
|
180
|
+
|
|
181
|
+
## Architecture
|
|
182
|
+
|
|
183
|
+
```
|
|
184
|
+
URL → fetch → classify → summarize → store → query
|
|
185
|
+
↓ ↓ ↓ ↓
|
|
186
|
+
3-strategy LLM tags summary JSONL + MD
|
|
187
|
+
fallback + topics + deadline index
|
|
188
|
+
|
|
189
|
+
↓
|
|
190
|
+
crystallize → KnowledgeCard → EmbeddingEngine
|
|
191
|
+
↓ ↓
|
|
192
|
+
CLI/MCP semantic search
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
| Module | Purpose |
|
|
196
|
+
|--------|---------|
|
|
197
|
+
| `sheaf_ai/` | Core — pipeline, storage, search, CLI, MCP server, crystallize engine |
|
|
198
|
+
| `sheaf_cards/` | Knowledge card engine — base types, embeddings, generation |
|
|
199
|
+
| `prompts/` | LLM prompt templates (classify, summarize, crystallize) |
|
|
200
|
+
| `data/` | Local knowledge base (JSONL + Markdown, gitignored) |
|
|
201
|
+
|
|
202
|
+
## Privacy & Local-First
|
|
203
|
+
|
|
204
|
+
**Your data never leaves your machine unless you choose to.**
|
|
205
|
+
|
|
206
|
+
- All content stored locally in `./data/` (configurable via `SHEAF_DATA_DIR`)
|
|
207
|
+
- LLM calls go to **your** chosen API provider
|
|
208
|
+
- No telemetry, no analytics, no accounts
|
|
209
|
+
- Markdown + JSONL format — fully portable, zero lock-in
|
|
210
|
+
|
|
211
|
+
## Configuration
|
|
212
|
+
|
|
213
|
+
Sheaf works with any OpenAI-compatible API:
|
|
214
|
+
|
|
215
|
+
```bash
|
|
216
|
+
# OpenAI
|
|
217
|
+
export OPENAI_API_KEY=sk-...
|
|
218
|
+
|
|
219
|
+
# Or any compatible endpoint (Together, Groq, DeepSeek, etc.)
|
|
220
|
+
export OPENAI_API_KEY=sk-...
|
|
221
|
+
export OPENAI_BASE_URL=https://api.together.xyz/v1
|
|
222
|
+
```
|
|
223
|
+
|
|
224
|
+
Optional: create a `.env` file in your working directory. See [.env.example](.env.example) for all options.
|
|
225
|
+
|
|
226
|
+
## Requirements
|
|
227
|
+
|
|
228
|
+
- **Python 3.10+**
|
|
229
|
+
- **An LLM API key** — any OpenAI-compatible endpoint
|
|
230
|
+
- **Playwright Chromium** (optional, for JS-heavy sites): `pip install -e ".[browser]" && playwright install chromium`
|
|
231
|
+
|
|
232
|
+
## Development
|
|
233
|
+
|
|
234
|
+
```bash
|
|
235
|
+
git clone https://github.com/zhelunSun/sheaf-ai.git
|
|
236
|
+
cd sheaf-ai
|
|
237
|
+
pip install -e ".[dev]"
|
|
238
|
+
pytest tests/ -v # 104 tests
|
|
239
|
+
ruff check sheaf_ai/ tests/ sheaf_cards/
|
|
240
|
+
```
|
|
241
|
+
|
|
242
|
+
## Alpha Status
|
|
243
|
+
|
|
244
|
+
Sheaf is in early alpha. The core collect → search → crystallize → MCP pipeline works and is tested with 104 tests. We're validating with real users before beta.
|
|
245
|
+
|
|
246
|
+
**Try it:** save 20+ links, run `sheaf crystallize <topic>`, then ask your agent to find them. Whether or not it works for you, open an issue or discussion and tell us what you'd change.
|
|
247
|
+
|
|
248
|
+
## License
|
|
249
|
+
|
|
250
|
+
[MIT](LICENSE)
|
|
251
|
+
|
|
252
|
+
---
|
|
253
|
+
|
|
254
|
+
*A **sheaf** is a bundle of harvested grain — the unit a farmer brings to market. In mathematics, a [sheaf](https://en.wikipedia.org/wiki/Sheaf_(mathematics)) attaches local data to open sets and glues them into a global picture. Sheaf the tool does both: gather scattered knowledge into coherent bundles, ready for your agents to consume or for you to share.*
|
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
# Sheaf
|
|
2
|
+
|
|
3
|
+
> **Harvest your knowledge. Bundle it. Share it.**
|
|
4
|
+
|
|
5
|
+
[Python 3.10+](https://www.python.org/downloads/)
|
|
6
|
+
[License: MIT](LICENSE)
|
|
7
|
+
[Tests](tests/)
|
|
8
|
+
[PyPI](https://pypi.org/project/sheaf-ai/)
|
|
9
|
+
[PyPI downloads](https://pypi.org/project/sheaf-ai/)
|
|
10
|
+
|
|
11
|
+
A **sheaf** is a bundle of grain — the basic unit a farmer brings to market. Sheaf does the same for knowledge: gather what you read, crystallize it into structured bundles, and make it tradable. Your AI agents can search, cite, and reason over everything you've collected.
|
|
12
|
+
|
|
13
|
+
## Quick Start
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
# Install from PyPI
|
|
17
|
+
pip install sheaf-ai
|
|
18
|
+
|
|
19
|
+
# Or install from source
|
|
20
|
+
git clone https://github.com/zhelunSun/sheaf-ai.git
|
|
21
|
+
cd sheaf-ai
|
|
22
|
+
pip install -e .
|
|
23
|
+
|
|
24
|
+
# Set your LLM API key (any OpenAI-compatible endpoint)
|
|
25
|
+
export OPENAI_API_KEY=sk-...
|
|
26
|
+
|
|
27
|
+
# First-time onboarding (collects 3 sample articles)
|
|
28
|
+
sheaf init
|
|
29
|
+
|
|
30
|
+
# Collect a link
|
|
31
|
+
sheaf collect https://arxiv.org/abs/2401.00000
|
|
32
|
+
|
|
33
|
+
# Search your collection
|
|
34
|
+
sheaf search "transformer architecture"
|
|
35
|
+
|
|
36
|
+
# Crystallize knowledge cards from collected articles
|
|
37
|
+
sheaf crystallize AI
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
No accounts. No cloud. Your data lives in `./data/` as Markdown + JSONL.
|
|
41
|
+
|
|
42
|
+
## The Problem
|
|
43
|
+
|
|
44
|
+
You save links every day — articles, repos, papers, tutorials. **95% never get opened again.**
|
|
45
|
+
|
|
46
|
+
Not because you're lazy. Because bookmarks serve *human reading*, not *agent workflows*. When you ask your coding agent "what did I read about MCP last week?", it has no idea.
|
|
47
|
+
|
|
48
|
+
Sheaf fixes this. Every link you save becomes a **structured entry** — a single stalk of grain. Crystallize enough of them, and you get a **bundle**: a portable, searchable knowledge pack any agent can consume.
|
|
49
|
+
|
|
50
|
+
## What It Does
|
|
51
|
+
|
|
52
|
+
1. **Harvest** — paste a link, Sheaf fetches, classifies, and summarizes it
|
|
53
|
+
2. **Crystallize** — distill 3+ related entries into structured knowledge cards with evidence tracing
|
|
54
|
+
3. **Bundle** — package cards into a portable `.sheaf` unit (coming soon)
|
|
55
|
+
4. **Agent-ready** — built-in MCP server lets any LLM agent query your knowledge base
|
|
56
|
+
|
|
57
|
+
## Core Commands
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
sheaf collect <url> # Collect an article, paper, or webpage
|
|
61
|
+
sheaf search <query> # Full-text search across your collection
|
|
62
|
+
sheaf stats # Collection statistics with topic trends
|
|
63
|
+
sheaf crystallize <topic> # Crystallize knowledge cards from a topic
|
|
64
|
+
sheaf crystallize --list # List all crystallized cards
|
|
65
|
+
sheaf crystallize --semantic <q> # Semantic vector search across cards
|
|
66
|
+
sheaf tags # Tag statistics
|
|
67
|
+
sheaf weekly # Weekly summary report
|
|
68
|
+
sheaf insights # Cross-topic association discovery
|
|
69
|
+
sheaf urgent # Show entries with upcoming deadlines
|
|
70
|
+
sheaf mcp # Start MCP server (stdio transport)
|
|
71
|
+
sheaf init # First-time onboarding with demo
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
## Crystallize: Your Second Brain
|
|
75
|
+
|
|
76
|
+
This is Sheaf's killer feature. Instead of leaving your bookmarks to rot, `sheaf crystallize` synthesizes insights across multiple entries:
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
$ sheaf crystallize AI
|
|
80
|
+
Crystallizing 'AI'...
|
|
81
|
+
✨ 5 knowledge cards crystallized:
|
|
82
|
+
📌 RAG faces retrieval relevance challenges (90%)
|
|
83
|
+
RAG systems heavily depend on retrieval quality; errors degrade LLM output reliability.
|
|
84
|
+
📌 CRAG framework improves RAG robustness (95%)
|
|
85
|
+
CRAG introduces a retrieval evaluator, web search augmentation, and document decomposition.
|
|
86
|
+
📌 Retrieval granularity significantly impacts performance (90%)
|
|
87
|
+
Finer-grained units like propositions outperform traditional passage-level retrieval.
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
Each card includes:
|
|
91
|
+
- **Confidence score** (0-100%)
|
|
92
|
+
- **Evidence tracing** — which source entries contributed
|
|
93
|
+
- **Topic provenance** — what topic this card belongs to
|
|
94
|
+
- **Tags** — for filtering and cross-referencing
|
|
95
|
+
|
|
96
|
+
Use `sheaf crystallize --semantic "query"` for vector-based semantic search across all your cards.
|
|
97
|
+
|
|
98
|
+
## MCP Server
|
|
99
|
+
|
|
100
|
+
Sheaf ships with a built-in [Model Context Protocol](https://modelcontextprotocol.io/) server. Any MCP-compatible agent can query your knowledge base:
|
|
101
|
+
|
|
102
|
+
```bash
|
|
103
|
+
sheaf mcp
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
**Available tools (9 total):**
|
|
107
|
+
|
|
108
|
+
| Tool | Description |
|
|
109
|
+
|------|-------------|
|
|
110
|
+
| `sheaf_search` | Full-text search across all entries |
|
|
111
|
+
| `sheaf_list` | List recent entries with filtering |
|
|
112
|
+
| `sheaf_get` | Get full entry details by ID |
|
|
113
|
+
| `sheaf_urgent` | Find time-sensitive entries (deadlines, CFPs) |
|
|
114
|
+
| `sheaf_collect` | Add a new URL to your collection |
|
|
115
|
+
| `sheaf_correct` | Correct a classification error |
|
|
116
|
+
| `sheaf_crystallize` | Crystallize knowledge cards from a topic |
|
|
117
|
+
| `sheaf_list_cards` | List crystallized cards (optional topic filter) |
|
|
118
|
+
| `sheaf_get_card` | Get full card details by ID |
|
|
119
|
+
|
|
120
|
+
## What You Can Collect
|
|
121
|
+
|
|
122
|
+
Sheaf handles more than just web articles:
|
|
123
|
+
|
|
124
|
+
| Input | Example | What Sheaf does |
|
|
125
|
+
|-------|---------|-----------------|
|
|
126
|
+
| **Web articles** | `sheaf collect https://arxiv.org/abs/2401.00000` | Fetches full text, extracts title/author/abstract, classifies topic |
|
|
127
|
+
| **AI chat shares** | `sheaf collect https://chatgpt.com/share/...` | Extracts the Q&A conversation, structures it as reusable knowledge |
|
|
128
|
+
| **WeChat / Zhihu posts** | `sheaf collect https://mp.weixin.qq.com/s/...` | Handles paywalls and dynamic rendering via Playwright fallback |
|
|
129
|
+
| **Pasted text** | `sheaf collect --text "Key insight..."` | Wraps freeform text into a structured entry with auto-classification |
|
|
130
|
+
|
|
131
|
+
Under the hood, every input goes through the same pipeline: **fetch → classify → summarize → store**. The output is always a structured entry your agents can search and cite.
|
|
132
|
+
|
|
133
|
+
## Architecture
|
|
134
|
+
|
|
135
|
+
```
|
|
136
|
+
URL → fetch → classify → summarize → store → query
|
|
137
|
+
↓ ↓ ↓ ↓
|
|
138
|
+
3-strategy LLM tags summary JSONL + MD
|
|
139
|
+
fallback + topics + deadline index
|
|
140
|
+
|
|
141
|
+
↓
|
|
142
|
+
crystallize → KnowledgeCard → EmbeddingEngine
|
|
143
|
+
↓ ↓
|
|
144
|
+
CLI/MCP semantic search
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
| Module | Purpose |
|
|
148
|
+
|--------|---------|
|
|
149
|
+
| `sheaf_ai/` | Core — pipeline, storage, search, CLI, MCP server, crystallize engine |
|
|
150
|
+
| `sheaf_cards/` | Knowledge card engine — base types, embeddings, generation |
|
|
151
|
+
| `prompts/` | LLM prompt templates (classify, summarize, crystallize) |
|
|
152
|
+
| `data/` | Local knowledge base (JSONL + Markdown, gitignored) |
|
|
153
|
+
|
|
154
|
+
## Privacy & Local-First
|
|
155
|
+
|
|
156
|
+
**Your data never leaves your machine unless you choose to.**
|
|
157
|
+
|
|
158
|
+
- All content stored locally in `./data/` (configurable via `SHEAF_DATA_DIR`)
|
|
159
|
+
- LLM calls go to **your** chosen API provider
|
|
160
|
+
- No telemetry, no analytics, no accounts
|
|
161
|
+
- Markdown + JSONL format — fully portable, zero lock-in
|
|
162
|
+
|
|
163
|
+
## Configuration
|
|
164
|
+
|
|
165
|
+
Sheaf works with any OpenAI-compatible API:
|
|
166
|
+
|
|
167
|
+
```bash
|
|
168
|
+
# OpenAI
|
|
169
|
+
export OPENAI_API_KEY=sk-...
|
|
170
|
+
|
|
171
|
+
# Or any compatible endpoint (Together, Groq, DeepSeek, etc.)
|
|
172
|
+
export OPENAI_API_KEY=sk-...
|
|
173
|
+
export OPENAI_BASE_URL=https://api.together.xyz/v1
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
Optional: create a `.env` file in your working directory. See [.env.example](.env.example) for all options.
|
|
177
|
+
|
|
178
|
+
## Requirements
|
|
179
|
+
|
|
180
|
+
- **Python 3.10+**
|
|
181
|
+
- **An LLM API key** — any OpenAI-compatible endpoint
|
|
182
|
+
- **Playwright Chromium** (optional, for JS-heavy sites): `pip install -e ".[browser]" && playwright install chromium`
|
|
183
|
+
|
|
184
|
+
## Development
|
|
185
|
+
|
|
186
|
+
```bash
|
|
187
|
+
git clone https://github.com/zhelunSun/sheaf-ai.git
|
|
188
|
+
cd sheaf-ai
|
|
189
|
+
pip install -e ".[dev]"
|
|
190
|
+
pytest tests/ -v # 104 tests
|
|
191
|
+
ruff check sheaf_ai/ tests/ sheaf_cards/
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
## Alpha Status
|
|
195
|
+
|
|
196
|
+
Sheaf is in early alpha. The core collect → search → crystallize → MCP pipeline works and is tested with 104 tests. We're validating with real users before beta.
|
|
197
|
+
|
|
198
|
+
**Try it:** save 20+ links, run `sheaf crystallize <topic>`, then ask your agent to find them. Whether or not it works for you, open an issue or discussion and tell us what you'd change.
|
|
199
|
+
|
|
200
|
+
## License
|
|
201
|
+
|
|
202
|
+
[MIT](LICENSE)
|
|
203
|
+
|
|
204
|
+
---
|
|
205
|
+
|
|
206
|
+
*A **sheaf** is a bundle of harvested grain — the unit a farmer brings to market. In mathematics, a [sheaf](https://en.wikipedia.org/wiki/Sheaf_(mathematics)) attaches local data to open sets and glues them into a global picture. Sheaf the tool does both: gather scattered knowledge into coherent bundles, ready for your agents to consume or for you to share.*
|
|
File without changes
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
## Sheaf — 动态分类 Prompt
|
|
2
|
+
|
|
3
|
+
给定一篇网页文章(标题 + 正文),提取文章的主题和标签。
|
|
4
|
+
|
|
5
|
+
**核心原则:不做硬分类,让内容自然归类。**
|
|
6
|
+
|
|
7
|
+
### 输出格式 (JSON)
|
|
8
|
+
|
|
9
|
+
```json
|
|
10
|
+
{
|
|
11
|
+
"topics": [
|
|
12
|
+
{"name": "主题名称", "confidence": 0.9},
|
|
13
|
+
{"name": "次要主题", "confidence": 0.4}
|
|
14
|
+
],
|
|
15
|
+
"tags": ["标签1", "标签2", "标签3"],
|
|
16
|
+
"importance": "high | medium | low",
|
|
17
|
+
"content_type": "news | analysis | research | tutorial | opinion | event | product | reference | ai_conversation",
|
|
18
|
+
"relevance_note": "一句话解释为什么提取这些主题和标签"
|
|
19
|
+
}
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
### 主题提取规则
|
|
23
|
+
|
|
24
|
+
1. **topics 是领域维度的归类**,不是固定分类表。LLM 自由提取,例如:
|
|
25
|
+
- "AI Agent", "Remote Sensing", "Web3", "Climate", "Investing", "International Relations", "Open Source", "LLM", "Computer Vision"...
|
|
26
|
+
- 一篇文章可以有 1-3 个主题,每个带 confidence(0-1)
|
|
27
|
+
- confidence > 0.7 的视为主要主题
|
|
28
|
+
|
|
29
|
+
2. **tags 是更细粒度的关键词**,用于交叉检索。例如:
|
|
30
|
+
- 文章讨论 GPT-5 发布 → tags: ["GPT-5", "OpenAI", "大模型", "产品发布"]
|
|
31
|
+
- 文章讨论 RAG 在遥感中的应用 → tags: ["RAG", "remote sensing", "knowledge retrieval", "geospatial"]
|
|
32
|
+
|
|
33
|
+
3. **content_type 描述文章体裁**:
|
|
34
|
+
- `news` — 新闻快讯、动态报道
|
|
35
|
+
- `analysis` — 深度分析、行业洞察
|
|
36
|
+
- `research` — 学术论文、研究方法
|
|
37
|
+
- `tutorial` — 教程、实操指南
|
|
38
|
+
- `opinion` — 观点评论、博客
|
|
39
|
+
- `event` — 活动预告、会议征稿
|
|
40
|
+
- `product` — 产品发布、评测
|
|
41
|
+
- `reference` — 参考资料、工具清单
|
|
42
|
+
- `ai_conversation` — AI 对话记录(ChatGPT、Claude 等多轮对话归档)
|
|
43
|
+
|
|
44
|
+
4. **importance 判断**:
|
|
45
|
+
- `high` — 突破性信息、与用户研究领域直接相关、重大行业变化
|
|
46
|
+
- `medium` — 值得了解、有参考价值
|
|
47
|
+
- `low` — 边缘兴趣、信息增量较小
|
|
48
|
+
|
|
49
|
+
5. **标签提取原则**:
|
|
50
|
+
- 具体 > 抽象("RAG" > "AI技术")
|
|
51
|
+
- 保留专有名词原文(DeepSeek、OpenAI、Transformer)
|
|
52
|
+
- 每篇文章 3-8 个标签
|
|
53
|
+
- 尽量复用已有常见标签(如 "大模型"、"Agent"、"LLM"、"投资")
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
## Sheaf — Crystallize 知识结晶 Prompt
|
|
2
|
+
|
|
3
|
+
你是一个知识合成引擎。给定多篇关于同一主题的文章,从中提炼结构化的知识卡片。
|
|
4
|
+
|
|
5
|
+
**重要:所有文本输出使用中文。** 保留专有名词的英文原文(如模型名、公司名、框架名),但解释和描述用中文。
|
|
6
|
+
|
|
7
|
+
### 核心原则
|
|
8
|
+
|
|
9
|
+
1. **跨源合成** — 优先提取多篇来源共同支持的模式和洞察
|
|
10
|
+
2. **证据追溯** — 每个结论必须标注具体来源(用 [Source 0], [Source 1] 格式)
|
|
11
|
+
3. **矛盾标注** — 如果来源之间有矛盾,注明分歧并降低 confidence
|
|
12
|
+
4. **置信度校准** — confidence 应反映证据质量和来源一致性
|
|
13
|
+
|
|
14
|
+
### 输出格式 (JSON)
|
|
15
|
+
|
|
16
|
+
```json
|
|
17
|
+
[
|
|
18
|
+
{
|
|
19
|
+
"title": "简洁标题(5-15词)",
|
|
20
|
+
"claim": "核心知识陈述(1-3句话)",
|
|
21
|
+
"evidence": "具体证据,标注来源 [Source 0]",
|
|
22
|
+
"tags": ["tag1", "tag2", "tag3"],
|
|
23
|
+
"confidence": 0.85,
|
|
24
|
+
"source_indices": [0, 2]
|
|
25
|
+
}
|
|
26
|
+
]
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
### 指导原则
|
|
30
|
+
|
|
31
|
+
- 每个卡片代表一个独立的知识洞察
|
|
32
|
+
- confidence 评分标准:
|
|
33
|
+
- 0.9+: 多源一致 + 具体数据支撑
|
|
34
|
+
- 0.7-0.9: 多源支持或单源强证据
|
|
35
|
+
- 0.5-0.7: 单源支持,需更多验证
|
|
36
|
+
- <0.5: 推测性结论,源之间存在矛盾
|
|
37
|
+
- 如果来源内容太薄无法提取有意义的模式,返回空数组 []
|
|
38
|
+
- 只输出 JSON,不要解释文字
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
## Sheaf — 摘要 Prompt
|
|
2
|
+
|
|
3
|
+
给定一篇网页文章(标题 + 正文),生成两个层级的摘要。
|
|
4
|
+
|
|
5
|
+
**重要:所有文本输出必须使用中文。** 保留专有名词的英文原文(如模型名、公司名、框架名),但解释和描述用中文。
|
|
6
|
+
|
|
7
|
+
### 输出格式 (JSON)
|
|
8
|
+
|
|
9
|
+
```json
|
|
10
|
+
{
|
|
11
|
+
"one_liner": "一句话概括核心要点(≤50字)",
|
|
12
|
+
"structured": {
|
|
13
|
+
"core_argument": "主要论点或关键洞察(1-3句话)",
|
|
14
|
+
"key_data": "文章中提到的重要数字、统计数据或具体证据",
|
|
15
|
+
"relevance_to_user": "为什么这对一位关注 AI 技术趋势的科研或工程人员很重要",
|
|
16
|
+
"action_items": "可以采取的行动(例如'试用这个框架'、'关注这个趋势'、'在论文中引用')",
|
|
17
|
+
"deadline_or_timing": "从文章中提取的时间敏感信息,如果没有则为 null"
|
|
18
|
+
},
|
|
19
|
+
"original_title": "文章标题",
|
|
20
|
+
"source_author": "作者 / 发布方名称(如果能识别的话)"
|
|
21
|
+
}
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
### 指导原则
|
|
25
|
+
- 简洁。结构化摘要总计控制在 ~200 字以内。
|
|
26
|
+
- 对于 "relevance_to_user",假设读者是关注 AI 技术趋势的科研或工程人员。
|
|
27
|
+
- 如果文章与用户已知兴趣明显无关,请诚实说明。
|
|
28
|
+
- 保留所有专有名词(人名、公司名、模型名、框架名、论文名)。
|
|
29
|
+
|
|
30
|
+
### Deadline/Time-sensitive Information Extraction
|
|
31
|
+
|
|
32
|
+
**CRITICAL**: Pay special attention to any time-sensitive information in the article. This includes:
|
|
33
|
+
|
|
34
|
+
1. **Conference/Journal deadlines**: CFP deadlines, submission dates, notification dates
|
|
35
|
+
2. **Event dates**: Workshop dates, meetup dates, hackathon dates, forum dates
|
|
36
|
+
3. **Release dates**: Product launches, model releases, version updates
|
|
37
|
+
4. **Registration deadlines**: Sign-up deadlines, early bird deadlines
|
|
38
|
+
5. **Policy/regulation dates**: Compliance deadlines, effective dates
|
|
39
|
+
|
|
40
|
+
**Extraction format for `deadline_or_timing`**:
|
|
41
|
+
- If there IS time-sensitive info: Write a natural language sentence that ends with the date in ISO 8601 format (`YYYY-MM-DD`). Examples:
|
|
42
|
+
- "交叉学科论坛征稿,截止日期为 2026-05-30"
|
|
43
|
+
- "GPT-5 发布会定于 2026-06-15"
|
|
44
|
+
- "ACL 2027 投稿截止 2027-01-15"
|
|
45
|
+
- If there is NO time-sensitive info: set to `null`
|
|
46
|
+
|
|
47
|
+
**Important**:
|
|
48
|
+
- Always extract the SPECIFIC date, not vague references like "next month"
|
|
49
|
+
- If multiple dates exist, list the most important/urgent one
|
|
50
|
+
- Convert relative dates (e.g., "明天", "下周五") to absolute ISO dates based on current context if possible
|