sheaf-ai 0.4.0a0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. sheaf_ai-0.4.0a0/LICENSE +21 -0
  2. sheaf_ai-0.4.0a0/PKG-INFO +254 -0
  3. sheaf_ai-0.4.0a0/README.md +206 -0
  4. sheaf_ai-0.4.0a0/prompts/__init__.py +0 -0
  5. sheaf_ai-0.4.0a0/prompts/classify.md +53 -0
  6. sheaf_ai-0.4.0a0/prompts/crystallize.md +38 -0
  7. sheaf_ai-0.4.0a0/prompts/summarize.md +50 -0
  8. sheaf_ai-0.4.0a0/pyproject.toml +87 -0
  9. sheaf_ai-0.4.0a0/setup.cfg +4 -0
  10. sheaf_ai-0.4.0a0/sheaf_ai/__init__.py +12 -0
  11. sheaf_ai-0.4.0a0/sheaf_ai/api.py +345 -0
  12. sheaf_ai-0.4.0a0/sheaf_ai/cli.py +271 -0
  13. sheaf_ai-0.4.0a0/sheaf_ai/config.py +74 -0
  14. sheaf_ai-0.4.0a0/sheaf_ai/crystallize.py +549 -0
  15. sheaf_ai-0.4.0a0/sheaf_ai/display.py +337 -0
  16. sheaf_ai-0.4.0a0/sheaf_ai/embedding_bridge.py +203 -0
  17. sheaf_ai-0.4.0a0/sheaf_ai/exceptions.py +45 -0
  18. sheaf_ai-0.4.0a0/sheaf_ai/feedback.py +171 -0
  19. sheaf_ai-0.4.0a0/sheaf_ai/fetch_article.py +709 -0
  20. sheaf_ai-0.4.0a0/sheaf_ai/gamification.py +708 -0
  21. sheaf_ai-0.4.0a0/sheaf_ai/insights.py +325 -0
  22. sheaf_ai-0.4.0a0/sheaf_ai/llm_client.py +166 -0
  23. sheaf_ai-0.4.0a0/sheaf_ai/mcp_server.py +379 -0
  24. sheaf_ai-0.4.0a0/sheaf_ai/onboarding.py +145 -0
  25. sheaf_ai-0.4.0a0/sheaf_ai/pipeline.py +405 -0
  26. sheaf_ai-0.4.0a0/sheaf_ai/quality.py +214 -0
  27. sheaf_ai-0.4.0a0/sheaf_ai/query.py +199 -0
  28. sheaf_ai-0.4.0a0/sheaf_ai/renderer.py +419 -0
  29. sheaf_ai-0.4.0a0/sheaf_ai/search.py +170 -0
  30. sheaf_ai-0.4.0a0/sheaf_ai/storage.py +291 -0
  31. sheaf_ai-0.4.0a0/sheaf_ai/utils.py +109 -0
  32. sheaf_ai-0.4.0a0/sheaf_ai.egg-info/PKG-INFO +254 -0
  33. sheaf_ai-0.4.0a0/sheaf_ai.egg-info/SOURCES.txt +48 -0
  34. sheaf_ai-0.4.0a0/sheaf_ai.egg-info/dependency_links.txt +1 -0
  35. sheaf_ai-0.4.0a0/sheaf_ai.egg-info/entry_points.txt +5 -0
  36. sheaf_ai-0.4.0a0/sheaf_ai.egg-info/requires.txt +30 -0
  37. sheaf_ai-0.4.0a0/sheaf_ai.egg-info/top_level.txt +3 -0
  38. sheaf_ai-0.4.0a0/sheaf_cards/__init__.py +19 -0
  39. sheaf_ai-0.4.0a0/sheaf_cards/base.py +267 -0
  40. sheaf_ai-0.4.0a0/sheaf_cards/embeddings.py +264 -0
  41. sheaf_ai-0.4.0a0/sheaf_cards/generator.py +276 -0
  42. sheaf_ai-0.4.0a0/tests/test_api.py +209 -0
  43. sheaf_ai-0.4.0a0/tests/test_crystallize.py +500 -0
  44. sheaf_ai-0.4.0a0/tests/test_gamification.py +541 -0
  45. sheaf_ai-0.4.0a0/tests/test_install.py +184 -0
  46. sheaf_ai-0.4.0a0/tests/test_mcp.py +137 -0
  47. sheaf_ai-0.4.0a0/tests/test_quality.py +314 -0
  48. sheaf_ai-0.4.0a0/tests/test_renderer.py +446 -0
  49. sheaf_ai-0.4.0a0/tests/test_storage.py +169 -0
  50. sheaf_ai-0.4.0a0/tests/test_unit.py +113 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 zhelunSun
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,254 @@
1
+ Metadata-Version: 2.2
2
+ Name: sheaf-ai
3
+ Version: 0.4.0a0
4
+ Summary: Sheaf — Your personal knowledge layer. Paste a link, AI does the rest.
5
+ Author: zhelunSun
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/zhelunSun/sheaf-ai
8
+ Project-URL: Repository, https://github.com/zhelunSun/sheaf-ai
9
+ Project-URL: Issues, https://github.com/zhelunSun/sheaf-ai/issues
10
+ Keywords: knowledge-management,ai,mcp,cli,agent
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
18
+ Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
19
+ Classifier: Framework :: Pytest
20
+ Requires-Python: >=3.10
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Requires-Dist: openai>=2.32.0
24
+ Requires-Dist: requests>=2.32.0
25
+ Requires-Dist: beautifulsoup4>=4.12.0
26
+ Requires-Dist: numpy>=1.24.0
27
+ Provides-Extra: browser
28
+ Requires-Dist: playwright>=1.59.0; extra == "browser"
29
+ Provides-Extra: cards
30
+ Requires-Dist: numpy>=1.24.0; extra == "cards"
31
+ Provides-Extra: server
32
+ Requires-Dist: fastapi>=0.100.0; extra == "server"
33
+ Requires-Dist: uvicorn>=0.20.0; extra == "server"
34
+ Requires-Dist: httpx>=0.24.0; extra == "server"
35
+ Provides-Extra: all
36
+ Requires-Dist: playwright>=1.59.0; extra == "all"
37
+ Requires-Dist: numpy>=1.24.0; extra == "all"
38
+ Requires-Dist: fastapi>=0.100.0; extra == "all"
39
+ Requires-Dist: uvicorn>=0.20.0; extra == "all"
40
+ Requires-Dist: httpx>=0.24.0; extra == "all"
41
+ Provides-Extra: dev
42
+ Requires-Dist: pytest>=8.0; extra == "dev"
43
+ Requires-Dist: ruff>=0.4; extra == "dev"
44
+ Requires-Dist: build>=1.0; extra == "dev"
45
+ Requires-Dist: fastapi>=0.100.0; extra == "dev"
46
+ Requires-Dist: uvicorn>=0.20.0; extra == "dev"
47
+ Requires-Dist: httpx>=0.24.0; extra == "dev"
48
+
49
+ # Sheaf
50
+
51
+ > **Harvest your knowledge. Bundle it. Share it.**
52
+
53
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10%2B-blue.svg)](https://www.python.org/downloads/)
54
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
55
+ [![Tests](https://img.shields.io/badge/tests-104%20pass-brightgreen)](tests/)
56
+ [![PyPI](https://img.shields.io/pypi/v/sheaf-ai.svg)](https://pypi.org/project/sheaf-ai/)
57
+ [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/sheaf-ai.svg)](https://pypi.org/project/sheaf-ai/)
58
+
59
+ A **sheaf** is a bundle of grain — the basic unit a farmer brings to market. Sheaf does the same for knowledge: gather what you read, crystallize it into structured bundles, and make it tradable. Your AI agents can search, cite, and reason over everything you've collected.
60
+
61
+ ## Quick Start
62
+
63
+ ```bash
64
+ # Install from PyPI
65
+ pip install sheaf-ai
66
+
67
+ # Or install from source
68
+ git clone https://github.com/zhelunSun/sheaf-ai.git
69
+ cd sheaf-ai
70
+ pip install -e .
71
+
72
+ # Set your LLM API key (any OpenAI-compatible endpoint)
73
+ export OPENAI_API_KEY=sk-...
74
+
75
+ # First-time onboarding (collects 3 sample articles)
76
+ sheaf init
77
+
78
+ # Collect a link
79
+ sheaf collect https://arxiv.org/abs/2401.00000
80
+
81
+ # Search your collection
82
+ sheaf search "transformer architecture"
83
+
84
+ # Crystallize knowledge cards from collected articles
85
+ sheaf crystallize AI
86
+ ```
87
+
88
+ No accounts. No cloud. Your data lives in `./data/` as Markdown + JSONL.
89
+
90
+ ## The Problem
91
+
92
+ You save links every day — articles, repos, papers, tutorials. **95% never get opened again.**
93
+
94
+ Not because you're lazy. Because bookmarks serve *human reading*, not *agent workflows*. When you ask your coding agent "what did I read about MCP last week?", it has no idea.
95
+
96
+ Sheaf fixes this. Every link you save becomes a **structured entry** — a single stalk of grain. Crystallize enough of them, and you get a **bundle**: a portable, searchable knowledge pack any agent can consume.
97
+
98
+ ## What It Does
99
+
100
+ 1. **Harvest** — paste a link, Sheaf fetches, classifies, and summarizes it
101
+ 2. **Crystallize** — distill 3+ related entries into structured knowledge cards with evidence tracing
102
+ 3. **Bundle** — package cards into a portable `.sheaf` unit (coming soon)
103
+ 4. **Agent-ready** — built-in MCP server lets any LLM agent query your knowledge base
104
+
105
+ ## Core Commands
106
+
107
+ ```bash
108
+ sheaf collect <url> # Collect an article, paper, or webpage
109
+ sheaf search <query> # Full-text search across your collection
110
+ sheaf stats # Collection statistics with topic trends
111
+ sheaf crystallize <topic> # Crystallize knowledge cards from a topic
112
+ sheaf crystallize --list # List all crystallized cards
113
+ sheaf crystallize --semantic <q> # Semantic vector search across cards
114
+ sheaf tags # Tag statistics
115
+ sheaf weekly # Weekly summary report
116
+ sheaf insights # Cross-topic association discovery
117
+ sheaf urgent # Show entries with upcoming deadlines
118
+ sheaf mcp # Start MCP server (stdio transport)
119
+ sheaf init # First-time onboarding with demo
120
+ ```
121
+
122
+ ## Crystallize: Your Second Brain
123
+
124
+ This is Sheaf's killer feature. Instead of leaving your bookmarks to rot, `sheaf crystallize` synthesizes insights across multiple entries:
125
+
126
+ ```bash
127
+ $ sheaf crystallize AI
128
+ Crystallizing 'AI'...
129
+ ✨ 5 knowledge cards crystallized:
130
+ 📌 RAG faces retrieval relevance challenges (90%)
131
+ RAG systems heavily depend on retrieval quality; errors degrade LLM output reliability.
132
+ 📌 CRAG framework improves RAG robustness (95%)
133
+ CRAG introduces a retrieval evaluator, web search augmentation, and document decomposition.
134
+ 📌 Retrieval granularity significantly impacts performance (90%)
135
+ Finer-grained units like propositions outperform traditional passage-level retrieval.
136
+ ```
137
+
138
+ Each card includes:
139
+ - **Confidence score** (0-100%)
140
+ - **Evidence tracing** — which source entries contributed
141
+ - **Topic provenance** — what topic this card belongs to
142
+ - **Tags** — for filtering and cross-referencing
143
+
144
+ Use `sheaf crystallize --semantic "query"` for vector-based semantic search across all your cards.
145
+
146
+ ## MCP Server
147
+
148
+ Sheaf ships with a built-in [Model Context Protocol](https://modelcontextprotocol.io/) server. Any MCP-compatible agent can query your knowledge base:
149
+
150
+ ```bash
151
+ sheaf mcp
152
+ ```
153
+
154
+ **Available tools (9 total):**
155
+
156
+ | Tool | Description |
157
+ |------|-------------|
158
+ | `sheaf_search` | Full-text search across all entries |
159
+ | `sheaf_list` | List recent entries with filtering |
160
+ | `sheaf_get` | Get full entry details by ID |
161
+ | `sheaf_urgent` | Find time-sensitive entries (deadlines, CFPs) |
162
+ | `sheaf_collect` | Add a new URL to your collection |
163
+ | `sheaf_correct` | Correct a classification error |
164
+ | `sheaf_crystallize` | Crystallize knowledge cards from a topic |
165
+ | `sheaf_list_cards` | List crystallized cards (optional topic filter) |
166
+ | `sheaf_get_card` | Get full card details by ID |
167
+
168
+ ## What You Can Collect
169
+
170
+ Sheaf handles more than just web articles:
171
+
172
+ | Input | Example | What Sheaf does |
173
+ |-------|---------|-----------------|
174
+ | **Web articles** | `sheaf collect https://arxiv.org/abs/2401.00000` | Fetches full text, extracts title/author/abstract, classifies topic |
175
+ | **AI chat shares** | `sheaf collect https://chatgpt.com/share/...` | Extracts the Q&A conversation, structures it as reusable knowledge |
176
+ | **WeChat / Zhihu posts** | `sheaf collect https://mp.weixin.qq.com/s/...` | Handles paywalls and dynamic rendering via Playwright fallback |
177
+ | **Pasted text** | `sheaf collect --text "Key insight..."` | Wraps freeform text into a structured entry with auto-classification |
178
+
179
+ Under the hood, every input goes through the same pipeline: **fetch → classify → summarize → store**. The output is always a structured entry your agents can search and cite.
180
+
181
+ ## Architecture
182
+
183
+ ```
184
+ URL → fetch → classify → summarize → store → query
185
+ ↓ ↓ ↓ ↓
186
+ 3-strategy LLM tags summary JSONL + MD
187
+ fallback + topics + deadline index
188
+
189
+
190
+ crystallize → KnowledgeCard → EmbeddingEngine
191
+ ↓ ↓
192
+ CLI/MCP semantic search
193
+ ```
194
+
195
+ | Module | Purpose |
196
+ |--------|---------|
197
+ | `sheaf_ai/` | Core — pipeline, storage, search, CLI, MCP server, crystallize engine |
198
+ | `sheaf_cards/` | Knowledge card engine — base types, embeddings, generation |
199
+ | `prompts/` | LLM prompt templates (classify, summarize, crystallize) |
200
+ | `data/` | Local knowledge base (JSONL + Markdown, gitignored) |
201
+
202
+ ## Privacy & Local-First
203
+
204
+ **Your data never leaves your machine unless you choose to.**
205
+
206
+ - All content stored locally in `./data/` (configurable via `SHEAF_DATA_DIR`)
207
+ - LLM calls go to **your** chosen API provider
208
+ - No telemetry, no analytics, no accounts
209
+ - Markdown + JSONL format — fully portable, zero lock-in
210
+
211
+ ## Configuration
212
+
213
+ Sheaf works with any OpenAI-compatible API:
214
+
215
+ ```bash
216
+ # OpenAI
217
+ export OPENAI_API_KEY=sk-...
218
+
219
+ # Or any compatible endpoint (Together, Groq, DeepSeek, etc.)
220
+ export OPENAI_API_KEY=sk-...
221
+ export OPENAI_BASE_URL=https://api.together.xyz/v1
222
+ ```
223
+
224
+ Optional: create a `.env` file in your working directory. See [.env.example](.env.example) for all options.
225
+
226
+ ## Requirements
227
+
228
+ - **Python 3.10+**
229
+ - **An LLM API key** — any OpenAI-compatible endpoint
230
+ - **Playwright Chromium** (optional, for JS-heavy sites): `pip install "sheaf-ai[browser]" && playwright install chromium` (from a source checkout, use `pip install -e ".[browser]"` instead)
231
+
232
+ ## Development
233
+
234
+ ```bash
235
+ git clone https://github.com/zhelunSun/sheaf-ai.git
236
+ cd sheaf-ai
237
+ pip install -e ".[dev]"
238
+ pytest tests/ -v # 104 tests
239
+ ruff check sheaf_ai/ tests/ sheaf_cards/
240
+ ```
241
+
242
+ ## Alpha Status
243
+
244
+ Sheaf is in early alpha. The core collect → search → crystallize → MCP pipeline works and is tested with 104 tests. We're validating with real users before beta.
245
+
246
+ **Try it:** save 20+ links, run `sheaf crystallize <topic>`, then ask your agent to find them. If it works for you, open an issue or discussion to tell us what you'd change.
247
+
248
+ ## License
249
+
250
+ [MIT](LICENSE)
251
+
252
+ ---
253
+
254
+ *A **sheaf** is a bundle of harvested grain — the unit a farmer brings to market. In mathematics, a [sheaf](https://en.wikipedia.org/wiki/Sheaf_(mathematics)) attaches local data to open sets and glues them into a global picture. Sheaf the tool does both: gather scattered knowledge into coherent bundles, ready for your agents to consume or for you to share.*
@@ -0,0 +1,206 @@
1
+ # Sheaf
2
+
3
+ > **Harvest your knowledge. Bundle it. Share it.**
4
+
5
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10%2B-blue.svg)](https://www.python.org/downloads/)
6
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
7
+ [![Tests](https://img.shields.io/badge/tests-104%20pass-brightgreen)](tests/)
8
+ [![PyPI](https://img.shields.io/pypi/v/sheaf-ai.svg)](https://pypi.org/project/sheaf-ai/)
9
+ [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/sheaf-ai.svg)](https://pypi.org/project/sheaf-ai/)
10
+
11
+ A **sheaf** is a bundle of grain — the basic unit a farmer brings to market. Sheaf does the same for knowledge: gather what you read, crystallize it into structured bundles, and make it tradable. Your AI agents can search, cite, and reason over everything you've collected.
12
+
13
+ ## Quick Start
14
+
15
+ ```bash
16
+ # Install from PyPI
17
+ pip install sheaf-ai
18
+
19
+ # Or install from source
20
+ git clone https://github.com/zhelunSun/sheaf-ai.git
21
+ cd sheaf-ai
22
+ pip install -e .
23
+
24
+ # Set your LLM API key (any OpenAI-compatible endpoint)
25
+ export OPENAI_API_KEY=sk-...
26
+
27
+ # First-time onboarding (collects 3 sample articles)
28
+ sheaf init
29
+
30
+ # Collect a link
31
+ sheaf collect https://arxiv.org/abs/2401.00000
32
+
33
+ # Search your collection
34
+ sheaf search "transformer architecture"
35
+
36
+ # Crystallize knowledge cards from collected articles
37
+ sheaf crystallize AI
38
+ ```
39
+
40
+ No accounts. No cloud. Your data lives in `./data/` as Markdown + JSONL.
41
+
42
+ ## The Problem
43
+
44
+ You save links every day — articles, repos, papers, tutorials. **95% never get opened again.**
45
+
46
+ Not because you're lazy. Because bookmarks serve *human reading*, not *agent workflows*. When you ask your coding agent "what did I read about MCP last week?", it has no idea.
47
+
48
+ Sheaf fixes this. Every link you save becomes a **structured entry** — a single stalk of grain. Crystallize enough of them, and you get a **bundle**: a portable, searchable knowledge pack any agent can consume.
49
+
50
+ ## What It Does
51
+
52
+ 1. **Harvest** — paste a link, Sheaf fetches, classifies, and summarizes it
53
+ 2. **Crystallize** — distill 3+ related entries into structured knowledge cards with evidence tracing
54
+ 3. **Bundle** — package cards into a portable `.sheaf` unit (coming soon)
55
+ 4. **Agent-ready** — built-in MCP server lets any LLM agent query your knowledge base
56
+
57
+ ## Core Commands
58
+
59
+ ```bash
60
+ sheaf collect <url> # Collect an article, paper, or webpage
61
+ sheaf search <query> # Full-text search across your collection
62
+ sheaf stats # Collection statistics with topic trends
63
+ sheaf crystallize <topic> # Crystallize knowledge cards from a topic
64
+ sheaf crystallize --list # List all crystallized cards
65
+ sheaf crystallize --semantic <q> # Semantic vector search across cards
66
+ sheaf tags # Tag statistics
67
+ sheaf weekly # Weekly summary report
68
+ sheaf insights # Cross-topic association discovery
69
+ sheaf urgent # Show entries with upcoming deadlines
70
+ sheaf mcp # Start MCP server (stdio transport)
71
+ sheaf init # First-time onboarding with demo
72
+ ```
73
+
74
+ ## Crystallize: Your Second Brain
75
+
76
+ This is Sheaf's killer feature. Instead of leaving your bookmarks to rot, `sheaf crystallize` synthesizes insights across multiple entries:
77
+
78
+ ```bash
79
+ $ sheaf crystallize AI
80
+ Crystallizing 'AI'...
81
+ ✨ 5 knowledge cards crystallized:
82
+ 📌 RAG faces retrieval relevance challenges (90%)
83
+ RAG systems heavily depend on retrieval quality; errors degrade LLM output reliability.
84
+ 📌 CRAG framework improves RAG robustness (95%)
85
+ CRAG introduces a retrieval evaluator, web search augmentation, and document decomposition.
86
+ 📌 Retrieval granularity significantly impacts performance (90%)
87
+ Finer-grained units like propositions outperform traditional passage-level retrieval.
88
+ ```
89
+
90
+ Each card includes:
91
+ - **Confidence score** (0-100%)
92
+ - **Evidence tracing** — which source entries contributed
93
+ - **Topic provenance** — what topic this card belongs to
94
+ - **Tags** — for filtering and cross-referencing
95
+
96
+ Use `sheaf crystallize --semantic "query"` for vector-based semantic search across all your cards.
97
+
98
+ ## MCP Server
99
+
100
+ Sheaf ships with a built-in [Model Context Protocol](https://modelcontextprotocol.io/) server. Any MCP-compatible agent can query your knowledge base:
101
+
102
+ ```bash
103
+ sheaf mcp
104
+ ```
105
+
106
+ **Available tools (9 total):**
107
+
108
+ | Tool | Description |
109
+ |------|-------------|
110
+ | `sheaf_search` | Full-text search across all entries |
111
+ | `sheaf_list` | List recent entries with filtering |
112
+ | `sheaf_get` | Get full entry details by ID |
113
+ | `sheaf_urgent` | Find time-sensitive entries (deadlines, CFPs) |
114
+ | `sheaf_collect` | Add a new URL to your collection |
115
+ | `sheaf_correct` | Correct a classification error |
116
+ | `sheaf_crystallize` | Crystallize knowledge cards from a topic |
117
+ | `sheaf_list_cards` | List crystallized cards (optional topic filter) |
118
+ | `sheaf_get_card` | Get full card details by ID |
119
+
120
+ ## What You Can Collect
121
+
122
+ Sheaf handles more than just web articles:
123
+
124
+ | Input | Example | What Sheaf does |
125
+ |-------|---------|-----------------|
126
+ | **Web articles** | `sheaf collect https://arxiv.org/abs/2401.00000` | Fetches full text, extracts title/author/abstract, classifies topic |
127
+ | **AI chat shares** | `sheaf collect https://chatgpt.com/share/...` | Extracts the Q&A conversation, structures it as reusable knowledge |
128
+ | **WeChat / Zhihu posts** | `sheaf collect https://mp.weixin.qq.com/s/...` | Handles paywalls and dynamic rendering via Playwright fallback |
129
+ | **Pasted text** | `sheaf collect --text "Key insight..."` | Wraps freeform text into a structured entry with auto-classification |
130
+
131
+ Under the hood, every input goes through the same pipeline: **fetch → classify → summarize → store**. The output is always a structured entry your agents can search and cite.
132
+
133
+ ## Architecture
134
+
135
+ ```
136
+ URL → fetch → classify → summarize → store → query
137
+ ↓ ↓ ↓ ↓
138
+ 3-strategy LLM tags summary JSONL + MD
139
+ fallback + topics + deadline index
140
+
141
+
142
+ crystallize → KnowledgeCard → EmbeddingEngine
143
+ ↓ ↓
144
+ CLI/MCP semantic search
145
+ ```
146
+
147
+ | Module | Purpose |
148
+ |--------|---------|
149
+ | `sheaf_ai/` | Core — pipeline, storage, search, CLI, MCP server, crystallize engine |
150
+ | `sheaf_cards/` | Knowledge card engine — base types, embeddings, generation |
151
+ | `prompts/` | LLM prompt templates (classify, summarize, crystallize) |
152
+ | `data/` | Local knowledge base (JSONL + Markdown, gitignored) |
153
+
154
+ ## Privacy & Local-First
155
+
156
+ **Your data never leaves your machine unless you choose to.**
157
+
158
+ - All content stored locally in `./data/` (configurable via `SHEAF_DATA_DIR`)
159
+ - LLM calls go to **your** chosen API provider
160
+ - No telemetry, no analytics, no accounts
161
+ - Markdown + JSONL format — fully portable, zero lock-in
162
+
163
+ ## Configuration
164
+
165
+ Sheaf works with any OpenAI-compatible API:
166
+
167
+ ```bash
168
+ # OpenAI
169
+ export OPENAI_API_KEY=sk-...
170
+
171
+ # Or any compatible endpoint (Together, Groq, DeepSeek, etc.)
172
+ export OPENAI_API_KEY=sk-...
173
+ export OPENAI_BASE_URL=https://api.together.xyz/v1
174
+ ```
175
+
176
+ Optional: create a `.env` file in your working directory. See [.env.example](.env.example) for all options.
177
+
178
+ ## Requirements
179
+
180
+ - **Python 3.10+**
181
+ - **An LLM API key** — any OpenAI-compatible endpoint
182
+ - **Playwright Chromium** (optional, for JS-heavy sites): `pip install "sheaf-ai[browser]" && playwright install chromium` (from a source checkout, use `pip install -e ".[browser]"` instead)
183
+
184
+ ## Development
185
+
186
+ ```bash
187
+ git clone https://github.com/zhelunSun/sheaf-ai.git
188
+ cd sheaf-ai
189
+ pip install -e ".[dev]"
190
+ pytest tests/ -v # 104 tests
191
+ ruff check sheaf_ai/ tests/ sheaf_cards/
192
+ ```
193
+
194
+ ## Alpha Status
195
+
196
+ Sheaf is in early alpha. The core collect → search → crystallize → MCP pipeline works and is tested with 104 tests. We're validating with real users before beta.
197
+
198
+ **Try it:** save 20+ links, run `sheaf crystallize <topic>`, then ask your agent to find them. If it works for you, open an issue or discussion to tell us what you'd change.
199
+
200
+ ## License
201
+
202
+ [MIT](LICENSE)
203
+
204
+ ---
205
+
206
+ *A **sheaf** is a bundle of harvested grain — the unit a farmer brings to market. In mathematics, a [sheaf](https://en.wikipedia.org/wiki/Sheaf_(mathematics)) attaches local data to open sets and glues them into a global picture. Sheaf the tool does both: gather scattered knowledge into coherent bundles, ready for your agents to consume or for you to share.*
File without changes
@@ -0,0 +1,53 @@
1
+ ## Sheaf — 动态分类 Prompt
2
+
3
+ 给定一篇网页文章(标题 + 正文),提取文章的主题和标签。
4
+
5
+ **核心原则:不做硬分类,让内容自然归类。**
6
+
7
+ ### 输出格式 (JSON)
8
+
9
+ ```json
10
+ {
11
+ "topics": [
12
+ {"name": "主题名称", "confidence": 0.9},
13
+ {"name": "次要主题", "confidence": 0.4}
14
+ ],
15
+ "tags": ["标签1", "标签2", "标签3"],
16
+ "importance": "high | medium | low",
17
+ "content_type": "news | analysis | research | tutorial | opinion | event | product | reference | ai_conversation",
18
+ "relevance_note": "一句话解释为什么提取这些主题和标签"
19
+ }
20
+ ```
21
+
22
+ ### 主题提取规则
23
+
24
+ 1. **topics 是领域维度的归类**,不是固定分类表。LLM 自由提取,例如:
25
+ - "AI Agent", "Remote Sensing", "Web3", "Climate", "Investing", "International Relations", "Open Source", "LLM", "Computer Vision"...
26
+ - 一篇文章可以有 1-3 个主题,每个带 confidence(0-1)
27
+ - confidence > 0.7 的视为主要主题
28
+
29
+ 2. **tags 是更细粒度的关键词**,用于交叉检索。例如:
30
+ - 文章讨论 GPT-5 发布 → tags: ["GPT-5", "OpenAI", "大模型", "产品发布"]
31
+ - 文章讨论 RAG 在遥感中的应用 → tags: ["RAG", "remote sensing", "knowledge retrieval", "geospatial"]
32
+
33
+ 3. **content_type 描述文章体裁**:
34
+ - `news` — 新闻快讯、动态报道
35
+ - `analysis` — 深度分析、行业洞察
36
+ - `research` — 学术论文、研究方法
37
+ - `tutorial` — 教程、实操指南
38
+ - `opinion` — 观点评论、博客
39
+ - `event` — 活动预告、会议征稿
40
+ - `product` — 产品发布、评测
41
+ - `reference` — 参考资料、工具清单
42
+ - `ai_conversation` — AI 对话记录(ChatGPT、Claude 等多轮对话归档)
43
+
44
+ 4. **importance 判断**:
45
+ - `high` — 突破性信息、与用户研究领域直接相关、重大行业变化
46
+ - `medium` — 值得了解、有参考价值
47
+ - `low` — 边缘兴趣、信息增量较小
48
+
49
+ 5. **标签提取原则**:
50
+ - 具体 > 抽象("RAG" > "AI技术")
51
+ - 保留专有名词原文(DeepSeek、OpenAI、Transformer)
52
+ - 每篇文章 3-8 个标签
53
+ - 尽量复用已有常见标签(如 "大模型"、"Agent"、"LLM"、"投资")
@@ -0,0 +1,38 @@
1
+ ## Sheaf — Crystallize 知识结晶 Prompt
2
+
3
+ 你是一个知识合成引擎。给定多篇关于同一主题的文章,从中提炼结构化的知识卡片。
4
+
5
+ **重要:所有文本输出使用中文。** 保留专有名词的英文原文(如模型名、公司名、框架名),但解释和描述用中文。
6
+
7
+ ### 核心原则
8
+
9
+ 1. **跨源合成** — 优先提取多篇来源共同支持的模式和洞察
10
+ 2. **证据追溯** — 每个结论必须标注具体来源(用 [Source 0], [Source 1] 格式)
11
+ 3. **矛盾标注** — 如果来源之间有矛盾,注明分歧并降低 confidence
12
+ 4. **置信度校准** — confidence 应反映证据质量和来源一致性
13
+
14
+ ### 输出格式 (JSON)
15
+
16
+ ```json
17
+ [
18
+ {
19
+ "title": "简洁标题(5-15词)",
20
+ "claim": "核心知识陈述(1-3句话)",
21
+ "evidence": "具体证据,标注来源 [Source 0]",
22
+ "tags": ["tag1", "tag2", "tag3"],
23
+ "confidence": 0.85,
24
+ "source_indices": [0, 2]
25
+ }
26
+ ]
27
+ ```
28
+
29
+ ### 指导原则
30
+
31
+ - 每个卡片代表一个独立的知识洞察
32
+ - confidence 评分标准:
33
+ - 0.9+: 多源一致 + 具体数据支撑
34
+ - 0.7-0.9: 多源支持或单源强证据
35
+ - 0.5-0.7: 单源支持,需更多验证
36
+ - <0.5: 推测性结论,或来源之间存在矛盾
37
+ - 如果来源内容太薄无法提取有意义的模式,返回空数组 []
38
+ - 只输出 JSON,不要解释文字
@@ -0,0 +1,50 @@
1
+ ## Sheaf — 摘要 Prompt
2
+
3
+ 给定一篇网页文章(标题 + 正文),生成两个层级的摘要。
4
+
5
+ **重要:所有文本输出必须使用中文。** 保留专有名词的英文原文(如模型名、公司名、框架名),但解释和描述用中文。
6
+
7
+ ### 输出格式 (JSON)
8
+
9
+ ```json
10
+ {
11
+ "one_liner": "一句话概括核心要点(≤50字)",
12
+ "structured": {
13
+ "core_argument": "主要论点或关键洞察(1-3句话)",
14
+ "key_data": "文章中提到的重要数字、统计数据或具体证据",
15
+ "relevance_to_user": "为什么这对一位关注 AI 技术趋势的科研或工程人员很重要",
16
+ "action_items": "可以采取的行动(例如'试用这个框架'、'关注这个趋势'、'在论文中引用')",
17
+ "deadline_or_timing": "从文章中提取的时间敏感信息,如果没有则为 null"
18
+ },
19
+ "original_title": "文章标题",
20
+ "source_author": "作者 / 发布方名称(如果能识别的话)"
21
+ }
22
+ ```
23
+
24
+ ### 指导原则
25
+ - 简洁。结构化摘要总计控制在 ~200 字以内。
26
+ - 对于 "relevance_to_user",假设读者是关注 AI 技术趋势的科研或工程人员。
27
+ - 如果文章与用户已知兴趣明显无关,请诚实说明。
28
+ - 保留所有专有名词(人名、公司名、模型名、框架名、论文名)。
29
+
30
+ ### Deadline/Time-sensitive Information Extraction
31
+
32
+ **CRITICAL**: Pay special attention to any time-sensitive information in the article. This includes:
33
+
34
+ 1. **Conference/Journal deadlines**: CFP deadlines, submission dates, notification dates
35
+ 2. **Event dates**: Workshop dates, meetup dates, hackathon dates, forum dates
36
+ 3. **Release dates**: Product launches, model releases, version updates
37
+ 4. **Registration deadlines**: Sign-up deadlines, early bird deadlines
38
+ 5. **Policy/regulation dates**: Compliance deadlines, effective dates
39
+
40
+ **Extraction format for `deadline_or_timing`**:
41
+ - If there IS time-sensitive info: Write a natural language sentence that ENDS with the date in ISO 8601 format (`YYYY-MM-DD`). Examples:
42
+ - "交叉学科论坛征稿,截止日期为 2026-05-30"
43
+ - "GPT-5 发布会定于 2026-06-15"
44
+ - "ACL 2027 投稿截止 2027-01-15"
45
+ - If there is NO time-sensitive info: set to `null`
46
+
47
+ **Important**:
48
+ - Always extract the SPECIFIC date, not vague references like "next month"
49
+ - If multiple dates exist, list the most important/urgent one
50
+ - Convert relative dates (e.g., "明天", "下周五") to absolute ISO dates, using the article's publication date (or the current date, when known) as the reference point, if possible