distillr 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- distillr-0.1.0/LICENSE +21 -0
- distillr-0.1.0/PKG-INFO +257 -0
- distillr-0.1.0/README.md +202 -0
- distillr-0.1.0/distill/__init__.py +1 -0
- distillr-0.1.0/distill/accordion.py +829 -0
- distillr-0.1.0/distill/analysis.py +215 -0
- distillr-0.1.0/distill/banner.py +298 -0
- distillr-0.1.0/distill/briefing.py +74 -0
- distillr-0.1.0/distill/browser_search.py +259 -0
- distillr-0.1.0/distill/cli.py +6426 -0
- distillr-0.1.0/distill/cli_shared.py +463 -0
- distillr-0.1.0/distill/config.py +136 -0
- distillr-0.1.0/distill/corpus_analysis.py +52 -0
- distillr-0.1.0/distill/costs.py +180 -0
- distillr-0.1.0/distill/dashboard_data.py +760 -0
- distillr-0.1.0/distill/discovery.py +377 -0
- distillr-0.1.0/distill/docx_export.py +477 -0
- distillr-0.1.0/distill/file_search.py +413 -0
- distillr-0.1.0/distill/library.py +359 -0
- distillr-0.1.0/distill/mcp_server.py +1072 -0
- distillr-0.1.0/distill/net.py +34 -0
- distillr-0.1.0/distill/paper_analysis.py +81 -0
- distillr-0.1.0/distill/paper_ingest.py +255 -0
- distillr-0.1.0/distill/preflight.py +164 -0
- distillr-0.1.0/distill/prompts.py +880 -0
- distillr-0.1.0/distill/prompts_accordion.py +536 -0
- distillr-0.1.0/distill/ranking.py +647 -0
- distillr-0.1.0/distill/research.py +293 -0
- distillr-0.1.0/distill/research_brief.py +298 -0
- distillr-0.1.0/distill/site_analysis.py +165 -0
- distillr-0.1.0/distill/site_attachments.py +204 -0
- distillr-0.1.0/distill/site_scraper.py +516 -0
- distillr-0.1.0/distill/state.py +62 -0
- distillr-0.1.0/distill/summary.py +415 -0
- distillr-0.1.0/distill/synthesis.py +162 -0
- distillr-0.1.0/distill/synthesize.py +103 -0
- distillr-0.1.0/distill/transcripts.py +153 -0
- distillr-0.1.0/distill/web/__init__.py +0 -0
- distillr-0.1.0/distill/web/routes/__init__.py +0 -0
- distillr-0.1.0/distill/web/routes/channels.py +69 -0
- distillr-0.1.0/distill/web/routes/costs.py +36 -0
- distillr-0.1.0/distill/web/routes/dashboard.py +27 -0
- distillr-0.1.0/distill/web/routes/topics.py +122 -0
- distillr-0.1.0/distill/web/routes/videos.py +45 -0
- distillr-0.1.0/distill/web/routes/watchlist.py +43 -0
- distillr-0.1.0/distill/web/server.py +64 -0
- distillr-0.1.0/distill/web/static/htmx.min.js +1 -0
- distillr-0.1.0/distill/web/static/style.css +360 -0
- distillr-0.1.0/distill/web/templates/base.html +38 -0
- distillr-0.1.0/distill/web/templates/channel_detail.html +66 -0
- distillr-0.1.0/distill/web/templates/costs.html +69 -0
- distillr-0.1.0/distill/web/templates/dashboard.html +153 -0
- distillr-0.1.0/distill/web/templates/topic_detail.html +77 -0
- distillr-0.1.0/distill/web/templates/topic_list.html +29 -0
- distillr-0.1.0/distill/web/templates/video_detail.html +54 -0
- distillr-0.1.0/distill/web/templates/watchlist.html +70 -0
- distillr-0.1.0/distillr.egg-info/PKG-INFO +257 -0
- distillr-0.1.0/distillr.egg-info/SOURCES.txt +101 -0
- distillr-0.1.0/distillr.egg-info/dependency_links.txt +1 -0
- distillr-0.1.0/distillr.egg-info/entry_points.txt +3 -0
- distillr-0.1.0/distillr.egg-info/requires.txt +28 -0
- distillr-0.1.0/distillr.egg-info/top_level.txt +1 -0
- distillr-0.1.0/pyproject.toml +168 -0
- distillr-0.1.0/setup.cfg +4 -0
- distillr-0.1.0/tests/test_accordion.py +885 -0
- distillr-0.1.0/tests/test_analysis.py +160 -0
- distillr-0.1.0/tests/test_banner.py +278 -0
- distillr-0.1.0/tests/test_briefing.py +79 -0
- distillr-0.1.0/tests/test_browser_search.py +298 -0
- distillr-0.1.0/tests/test_cli.py +2647 -0
- distillr-0.1.0/tests/test_cli_shared.py +600 -0
- distillr-0.1.0/tests/test_cli_support_discover.py +167 -0
- distillr-0.1.0/tests/test_cli_support_learning.py +299 -0
- distillr-0.1.0/tests/test_cli_support_learning_flow.py +275 -0
- distillr-0.1.0/tests/test_config.py +156 -0
- distillr-0.1.0/tests/test_corpus_analysis.py +25 -0
- distillr-0.1.0/tests/test_costs.py +80 -0
- distillr-0.1.0/tests/test_dashboard_data.py +384 -0
- distillr-0.1.0/tests/test_discovery.py +626 -0
- distillr-0.1.0/tests/test_docx_export.py +197 -0
- distillr-0.1.0/tests/test_file_search.py +359 -0
- distillr-0.1.0/tests/test_integration.py +63 -0
- distillr-0.1.0/tests/test_library.py +170 -0
- distillr-0.1.0/tests/test_mcp_server.py +1185 -0
- distillr-0.1.0/tests/test_paper_analysis.py +57 -0
- distillr-0.1.0/tests/test_paper_ingest.py +172 -0
- distillr-0.1.0/tests/test_preflight.py +165 -0
- distillr-0.1.0/tests/test_prompts.py +223 -0
- distillr-0.1.0/tests/test_ranking.py +464 -0
- distillr-0.1.0/tests/test_research.py +315 -0
- distillr-0.1.0/tests/test_research_brief_module.py +169 -0
- distillr-0.1.0/tests/test_site_analysis.py +160 -0
- distillr-0.1.0/tests/test_site_attachments.py +205 -0
- distillr-0.1.0/tests/test_site_scraper.py +384 -0
- distillr-0.1.0/tests/test_state.py +198 -0
- distillr-0.1.0/tests/test_summary.py +269 -0
- distillr-0.1.0/tests/test_synthesis.py +160 -0
- distillr-0.1.0/tests/test_synthesize_module.py +37 -0
- distillr-0.1.0/tests/test_topic_watch.py +691 -0
- distillr-0.1.0/tests/test_transcripts.py +337 -0
- distillr-0.1.0/tests/test_watchlist.py +138 -0
- distillr-0.1.0/tests/test_web_server.py +160 -0
- distillr-0.1.0/tests/test_ytdlp_contract.py +78 -0
distillr-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Nick Seal
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
distillr-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,257 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: distillr
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Source-to-intelligence platform: turn YouTube, websites, and arXiv papers into a structured, reusable corpus with per-source insights, cross-source synthesis, and Deep Research reports.
|
|
5
|
+
Author: Nick Seal
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/blisspixel/distillr
|
|
8
|
+
Project-URL: Repository, https://github.com/blisspixel/distillr
|
|
9
|
+
Project-URL: Issues, https://github.com/blisspixel/distillr/issues
|
|
10
|
+
Keywords: research,intelligence,synthesis,arxiv,youtube,rag,knowledge-base,mcp
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Environment :: Console
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Intended Audience :: Information Technology
|
|
15
|
+
Classifier: Intended Audience :: Science/Research
|
|
16
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
17
|
+
Classifier: Operating System :: OS Independent
|
|
18
|
+
Classifier: Programming Language :: Python :: 3
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
22
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
23
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
24
|
+
Requires-Python: >=3.10
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
License-File: LICENSE
|
|
27
|
+
Requires-Dist: yt-dlp>=2025.1.0
|
|
28
|
+
Requires-Dist: openai>=1.0.0
|
|
29
|
+
Requires-Dist: google-genai>=1.50.0
|
|
30
|
+
Requires-Dist: typer>=0.9.0
|
|
31
|
+
Requires-Dist: pydantic>=2.0
|
|
32
|
+
Requires-Dist: pydantic-settings>=2.0
|
|
33
|
+
Requires-Dist: rich>=13.0.0
|
|
34
|
+
Requires-Dist: python-dotenv>=1.0.0
|
|
35
|
+
Requires-Dist: python-docx>=1.0.0
|
|
36
|
+
Requires-Dist: playwright>=1.52.0
|
|
37
|
+
Requires-Dist: mcp>=1.0.0
|
|
38
|
+
Requires-Dist: fastapi>=0.115.0
|
|
39
|
+
Requires-Dist: uvicorn[standard]>=0.30.0
|
|
40
|
+
Requires-Dist: jinja2>=3.1.0
|
|
41
|
+
Requires-Dist: markdown>=3.5.0
|
|
42
|
+
Requires-Dist: pypdf>=4.0.0
|
|
43
|
+
Requires-Dist: requests>=2.32.0
|
|
44
|
+
Requires-Dist: defusedxml>=0.7.1
|
|
45
|
+
Provides-Extra: dev
|
|
46
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
47
|
+
Requires-Dist: pytest-cov>=4.0; extra == "dev"
|
|
48
|
+
Requires-Dist: ruff>=0.6.0; extra == "dev"
|
|
49
|
+
Requires-Dist: bandit[toml]>=1.7; extra == "dev"
|
|
50
|
+
Requires-Dist: pip-audit>=2.7; extra == "dev"
|
|
51
|
+
Requires-Dist: pre-commit>=3.5; extra == "dev"
|
|
52
|
+
Requires-Dist: build>=1.0; extra == "dev"
|
|
53
|
+
Requires-Dist: twine>=5.0; extra == "dev"
|
|
54
|
+
Dynamic: license-file
|
|
55
|
+
|
|
56
|
+
# Distill
|
|
57
|
+
|
|
58
|
+
[CI](https://github.com/blisspixel/distillr/actions/workflows/ci.yml)
|
|
59
|
+
[PyPI version](https://pypi.org/project/distillr/)
|
|
60
|
+
[Python versions](https://pypi.org/project/distillr/)
|
|
61
|
+
[License: MIT](LICENSE)
|
|
62
|
+
[Ruff](https://github.com/astral-sh/ruff)
|
|
63
|
+
[pre-commit](https://github.com/pre-commit/pre-commit)
|
|
64
|
+
|
|
65
|
+
> Turn YouTube, websites, and arXiv papers into a structured, reusable corpus of insights, syntheses, and reports — all plain markdown on your disk.
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
pip install distillr
|
|
69
|
+
distill papers "temporal knowledge graph" --topic tkg --limit 20
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
That one command searches arXiv, downloads 20 PDFs, extracts full text, runs structured analysis on each, and writes a cross-paper synthesis. For a 20-paper run like the example below, expect single-digit minutes and roughly $1 in model spend. Terminal output during the run looks like this:
|
|
73
|
+
|
|
74
|
+
```
|
|
75
|
+
Papers: temporal knowledge graph
|
|
76
|
+
Topic: tkg | Selected papers: 20
|
|
77
|
+
|
|
78
|
+
[1/20] Time is Not a Label: Continuous Phase Rotation for Temporal Knowledge
|
|
79
|
+
Graphs and Agentic Memory
|
|
80
|
+
[2/20] Inductive Reasoning for Temporal Knowledge Graphs with Emerging Entities
|
|
81
|
+
...
|
|
82
|
+
|
|
83
|
+
6m 47s ~$1.01 (391,278 in / 38,117 out)
|
|
84
|
+
|
|
85
|
+
paper.md 90.4 KB
|
|
86
|
+
insights.md 8.1 KB
|
|
87
|
+
...
|
|
88
|
+
paper_synthesis.md 11.8 KB
|
|
89
|
+
corpus_synthesis.md 10.5 KB
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
## What you get
|
|
93
|
+
|
|
94
|
+
One local `library/` directory of plain markdown. No database, no cloud lock-in, no proprietary format. Open it in any text editor, Obsidian, VS Code, or feed it into another tool.
|
|
95
|
+
|
|
96
|
+
Three source types, same pipeline shape (capture → analyze → synthesize → report):
|
|
97
|
+
|
|
98
|
+
- **YouTube** — channels, topic searches, videos, Shorts
|
|
99
|
+
- **Websites** — vendor sites, research hubs, curated URL sets (browser-first crawl with PDF/embedded-video ingestion)
|
|
100
|
+
- **arXiv papers** — phrase-matched search, full-PDF extraction, structured per-paper insights, cross-paper synthesis
|
|
101
|
+
|
|
102
|
+
Plus an MCP server so AI assistants and agent systems can query the library directly.
|
|
103
|
+
|
|
104
|
+
## Quick start
|
|
105
|
+
|
|
106
|
+
```bash
|
|
107
|
+
pip install distillr
|
|
108
|
+
playwright install chromium # for YouTube search + website capture
|
|
109
|
+
distill doctor # verify API keys + system health
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
Set two keys in `.env` (copy from `.env.example`):
|
|
113
|
+
|
|
114
|
+
```bash
|
|
115
|
+
XAI_API_KEY=xai-... # Grok models
|
|
116
|
+
GEMINI_API_KEY=AIza... # Gemini Deep Research (reports + briefings)
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
Then try any of:
|
|
120
|
+
|
|
121
|
+
```bash
|
|
122
|
+
# Goal-aware cross-source discovery (papers + videos, reranked against a goal)
|
|
123
|
+
distill discover "help an AI become a great music composer" --topic music --preview
|
|
124
|
+
distill discover --goal-file private/my-goal.md --topic research --yes
|
|
125
|
+
|
|
126
|
+
# Get smart on a YouTube topic, fast
|
|
127
|
+
distill latest "Microsoft Fabric best practices" --limit 10 --report
|
|
128
|
+
|
|
129
|
+
# Discover and ingest arXiv papers — expands the query, LLM-reranks candidates,
|
|
130
|
+
# picks the top N (use --preview to see the shortlist without ingesting)
|
|
131
|
+
distill papers "agent memory systems" --topic memory --limit 20
|
|
132
|
+
distill papers "agent memory systems" --topic memory --limit 20 --preview
|
|
133
|
+
|
|
134
|
+
# Distill a vendor/research site
|
|
135
|
+
distill site-batch configs/example_seeds.json --topic example --seed-only
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
The full command reference lives in [`docs/usage.md`](docs/usage.md).
|
|
139
|
+
|
|
140
|
+
## Mental model
|
|
141
|
+
|
|
142
|
+
```
|
|
143
|
+
library/
|
|
144
|
+
└── topics/<topic>/
|
|
145
|
+
├── channels/<creator>/videos/<video>/
|
|
146
|
+
│ ├── transcript.txt
|
|
147
|
+
│ └── insights.md
|
|
148
|
+
├── sites/<hostname>/pages/<page>/
|
|
149
|
+
│ ├── content.md
|
|
150
|
+
│ └── insights.md
|
|
151
|
+
├── papers/<paper>/
|
|
152
|
+
│ ├── paper.md
|
|
153
|
+
│ └── insights.md
|
|
154
|
+
├── topic_synthesis.md # cross-source
|
|
155
|
+
└── corpus_synthesis.md # mixed-source view
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
You build a topic library over time. Ingest once, refresh on a cadence, generate a report or briefing when you need one.
|
|
159
|
+
|
|
160
|
+
See [`docs/outputs.md`](docs/outputs.md) for what every artifact contains.
|
|
161
|
+
|
|
162
|
+
## Sample output
|
|
163
|
+
|
|
164
|
+
A per-paper `insights.md` (excerpt):
|
|
165
|
+
|
|
166
|
+
```markdown
|
|
167
|
+
---
|
|
168
|
+
paper_title: "Time is Not a Label: Continuous Phase Rotation for Temporal Knowledge Graphs"
|
|
169
|
+
paper_id: 2604.11544v1
|
|
170
|
+
analyzed_by: grok-4.20-0309-reasoning
|
|
171
|
+
source_mode: full_pdf
|
|
172
|
+
---
|
|
173
|
+
|
|
174
|
+
### Core Contribution
|
|
175
|
+
1. Continuous functional rotation θ_r(τ) = s · α_r · τ · ω instead of discrete
|
|
176
|
+
timestamp lookup tables. Zero-shot interpolation of unseen dates.
|
|
177
|
+
2. Semantic Speed Gate: MLP that reads only text embedding ϕ(r) and outputs α_r.
|
|
178
|
+
Learns relational volatility from data.
|
|
179
|
+
3. Geometric shadowing in complex space: obsolete facts rotated out of phase so
|
|
180
|
+
the correct fact outranks contradictions via the scoring function alone.
|
|
181
|
+
|
|
182
|
+
### Methods and Evidence
|
|
183
|
+
- On ICEWS05-15, RoMem-ChronoR reaches 72.6 MRR (vs vanilla ChronoR 68.4).
|
|
184
|
+
- Zero-shot domain transfer to FinTMMBench: 0.728 MRR, 0.673 R@5.
|
|
185
|
+
- All baselines use identical answer LLM and judge for fairness.
|
|
186
|
+
|
|
187
|
+
### Limits and Open Questions
|
|
188
|
+
- Computational cost at millions-of-facts scale is motivation but no latency,
|
|
189
|
+
memory, or throughput numbers are reported.
|
|
190
|
+
- Gate pretrained only on ICEWS05-15 political events; generalization to
|
|
191
|
+
highly ambiguous relations is not quantified.
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
A cross-paper `paper_synthesis.md` (excerpt):
|
|
195
|
+
|
|
196
|
+
```markdown
|
|
197
|
+
## Strongest Research Signals
|
|
198
|
+
|
|
199
|
+
- Append-only temporal representations improve long-horizon extrapolation:
|
|
200
|
+
RoMem (arXiv:2604.11544), EST (arXiv:2602.12389v3), and CID-TKG converge on
|
|
201
|
+
persistent or dual-view entity state over destructive overwriting, with
|
|
202
|
+
consistent MRR/Hits@K gains on ICEWS and GDELT.
|
|
203
|
+
|
|
204
|
+
- Semantic gating scales better than manual relation tagging: RoMem's Semantic
|
|
205
|
+
Speed Gate and EST's energy-barrier gate both learn relational volatility
|
|
206
|
+
from text embeddings rather than schema tags…
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
For **multi-topic** literature reviews, stakeholder briefings, or agent grounding, `distill research-brief` (Gemini Deep Research, web-augmented) and `distill synthesize` (Grok 4.20 single-call, corpus-only) take a user-written context file that shapes the output. See [`docs/usage.md#research-briefings-and-deep-synthesis`](docs/usage.md#research-briefings-and-deep-synthesis).
|
|
210
|
+
|
|
211
|
+
## Dashboard
|
|
212
|
+
|
|
213
|
+
```bash
|
|
214
|
+
distill # terminal home screen
|
|
215
|
+
distill serve # local web dashboard at http://127.0.0.1:8899
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
The terminal home screen shows tracked topics, channel and topic watches, recent runs, failures, and rolling spend. The web dashboard adds clickable drill-downs to per-topic, per-channel, and per-video views with rendered markdown, plus cost history and watchlist status. Both auto-refresh and read directly from library files — no database.
|
|
219
|
+
|
|
220
|
+
## MCP server
|
|
221
|
+
|
|
222
|
+
Claude Desktop / Claude Code config:
|
|
223
|
+
|
|
224
|
+
```json
|
|
225
|
+
{ "mcpServers": { "distill": { "command": "distill-mcp" } } }
|
|
226
|
+
```
|
|
227
|
+
|
|
228
|
+
Distill exposes 8 tools, 12 resources, and 4 prompts. See [`docs/mcp.md`](docs/mcp.md) for the list.
|
|
229
|
+
|
|
230
|
+
## Cost
|
|
231
|
+
|
|
232
|
+
Bulk video analysis is essentially free (~$0.006/video). Gemini Deep Research dominates paid reports (~$2–3/report). `distill synthesize` is ~$0.50 for a multi-topic corpus pass. Every run logs actual vs estimated cost to `library/cost_log.jsonl`; `distill costs` shows the history.
|
|
233
|
+
|
|
234
|
+
Full cost model in [`docs/cost.md`](docs/cost.md).
|
|
235
|
+
|
|
236
|
+
## Docs
|
|
237
|
+
|
|
238
|
+
- [`docs/usage.md`](docs/usage.md) — full command reference
|
|
239
|
+
- [`docs/architecture.md`](docs/architecture.md) — data flow, 4-phase report pipeline, model routing, security hardening
|
|
240
|
+
- [`docs/outputs.md`](docs/outputs.md) — what every artifact contains
|
|
241
|
+
- [`docs/cost.md`](docs/cost.md) — cost model, examples, guardrails
|
|
242
|
+
- [`docs/mcp.md`](docs/mcp.md) — MCP tools, resources, prompts
|
|
243
|
+
- [`docs/briefing-contexts/TEMPLATE.md`](docs/briefing-contexts/TEMPLATE.md) — starting point for `--context-file` prompts
|
|
244
|
+
- [`private/README.md`](private/README.md) — where personal/client-specific files go (git-ignored)
|
|
245
|
+
|
|
246
|
+
## Roadmap and changelog
|
|
247
|
+
|
|
248
|
+
- [`docs/CHANGELOG.md`](docs/CHANGELOG.md) — what shipped in `0.1.0`
|
|
249
|
+
- [`ROADMAP.md`](ROADMAP.md) — what's next
|
|
250
|
+
|
|
251
|
+
## Contributing
|
|
252
|
+
|
|
253
|
+
See [`docs/CONTRIBUTING.md`](docs/CONTRIBUTING.md) for dev setup, quality gates, and scope. Security disclosures go through [`docs/SECURITY.md`](docs/SECURITY.md).
|
|
254
|
+
|
|
255
|
+
## License
|
|
256
|
+
|
|
257
|
+
MIT — see [`LICENSE`](LICENSE).
|
distillr-0.1.0/README.md
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
# Distill
|
|
2
|
+
|
|
3
|
+
[CI](https://github.com/blisspixel/distillr/actions/workflows/ci.yml)
|
|
4
|
+
[PyPI version](https://pypi.org/project/distillr/)
|
|
5
|
+
[Python versions](https://pypi.org/project/distillr/)
|
|
6
|
+
[License: MIT](LICENSE)
|
|
7
|
+
[Ruff](https://github.com/astral-sh/ruff)
|
|
8
|
+
[pre-commit](https://github.com/pre-commit/pre-commit)
|
|
9
|
+
|
|
10
|
+
> Turn YouTube, websites, and arXiv papers into a structured, reusable corpus of insights, syntheses, and reports — all plain markdown on your disk.
|
|
11
|
+
|
|
12
|
+
```bash
|
|
13
|
+
pip install distillr
|
|
14
|
+
distill papers "temporal knowledge graph" --topic tkg --limit 20
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
That one command searches arXiv, downloads 20 PDFs, extracts full text, runs structured analysis on each, and writes a cross-paper synthesis. For a 20-paper run like the example below, expect single-digit minutes and roughly $1 in model spend. Terminal output during the run looks like this:
|
|
18
|
+
|
|
19
|
+
```
|
|
20
|
+
Papers: temporal knowledge graph
|
|
21
|
+
Topic: tkg | Selected papers: 20
|
|
22
|
+
|
|
23
|
+
[1/20] Time is Not a Label: Continuous Phase Rotation for Temporal Knowledge
|
|
24
|
+
Graphs and Agentic Memory
|
|
25
|
+
[2/20] Inductive Reasoning for Temporal Knowledge Graphs with Emerging Entities
|
|
26
|
+
...
|
|
27
|
+
|
|
28
|
+
6m 47s ~$1.01 (391,278 in / 38,117 out)
|
|
29
|
+
|
|
30
|
+
paper.md 90.4 KB
|
|
31
|
+
insights.md 8.1 KB
|
|
32
|
+
...
|
|
33
|
+
paper_synthesis.md 11.8 KB
|
|
34
|
+
corpus_synthesis.md 10.5 KB
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## What you get
|
|
38
|
+
|
|
39
|
+
One local `library/` directory of plain markdown. No database, no cloud lock-in, no proprietary format. Open it in any text editor, Obsidian, VS Code, or feed it into another tool.
|
|
40
|
+
|
|
41
|
+
Three source types, same pipeline shape (capture → analyze → synthesize → report):
|
|
42
|
+
|
|
43
|
+
- **YouTube** — channels, topic searches, videos, Shorts
|
|
44
|
+
- **Websites** — vendor sites, research hubs, curated URL sets (browser-first crawl with PDF/embedded-video ingestion)
|
|
45
|
+
- **arXiv papers** — phrase-matched search, full-PDF extraction, structured per-paper insights, cross-paper synthesis
|
|
46
|
+
|
|
47
|
+
Plus an MCP server so AI assistants and agent systems can query the library directly.
|
|
48
|
+
|
|
49
|
+
## Quick start
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
pip install distillr
|
|
53
|
+
playwright install chromium # for YouTube search + website capture
|
|
54
|
+
distill doctor # verify API keys + system health
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
Set two keys in `.env` (copy from `.env.example`):
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
XAI_API_KEY=xai-... # Grok models
|
|
61
|
+
GEMINI_API_KEY=AIza... # Gemini Deep Research (reports + briefings)
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
Then try any of:
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
# Goal-aware cross-source discovery (papers + videos, reranked against a goal)
|
|
68
|
+
distill discover "help an AI become a great music composer" --topic music --preview
|
|
69
|
+
distill discover --goal-file private/my-goal.md --topic research --yes
|
|
70
|
+
|
|
71
|
+
# Get smart on a YouTube topic, fast
|
|
72
|
+
distill latest "Microsoft Fabric best practices" --limit 10 --report
|
|
73
|
+
|
|
74
|
+
# Discover and ingest arXiv papers — expands the query, LLM-reranks candidates,
|
|
75
|
+
# picks the top N (use --preview to see the shortlist without ingesting)
|
|
76
|
+
distill papers "agent memory systems" --topic memory --limit 20
|
|
77
|
+
distill papers "agent memory systems" --topic memory --limit 20 --preview
|
|
78
|
+
|
|
79
|
+
# Distill a vendor/research site
|
|
80
|
+
distill site-batch configs/example_seeds.json --topic example --seed-only
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
The full command reference lives in [`docs/usage.md`](docs/usage.md).
|
|
84
|
+
|
|
85
|
+
## Mental model
|
|
86
|
+
|
|
87
|
+
```
|
|
88
|
+
library/
|
|
89
|
+
└── topics/<topic>/
|
|
90
|
+
├── channels/<creator>/videos/<video>/
|
|
91
|
+
│ ├── transcript.txt
|
|
92
|
+
│ └── insights.md
|
|
93
|
+
├── sites/<hostname>/pages/<page>/
|
|
94
|
+
│ ├── content.md
|
|
95
|
+
│ └── insights.md
|
|
96
|
+
├── papers/<paper>/
|
|
97
|
+
│ ├── paper.md
|
|
98
|
+
│ └── insights.md
|
|
99
|
+
├── topic_synthesis.md # cross-source
|
|
100
|
+
└── corpus_synthesis.md # mixed-source view
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
You build a topic library over time. Ingest once, refresh on a cadence, generate a report or briefing when you need one.
|
|
104
|
+
|
|
105
|
+
See [`docs/outputs.md`](docs/outputs.md) for what every artifact contains.
|
|
106
|
+
|
|
107
|
+
## Sample output
|
|
108
|
+
|
|
109
|
+
A per-paper `insights.md` (excerpt):
|
|
110
|
+
|
|
111
|
+
```markdown
|
|
112
|
+
---
|
|
113
|
+
paper_title: "Time is Not a Label: Continuous Phase Rotation for Temporal Knowledge Graphs"
|
|
114
|
+
paper_id: 2604.11544v1
|
|
115
|
+
analyzed_by: grok-4.20-0309-reasoning
|
|
116
|
+
source_mode: full_pdf
|
|
117
|
+
---
|
|
118
|
+
|
|
119
|
+
### Core Contribution
|
|
120
|
+
1. Continuous functional rotation θ_r(τ) = s · α_r · τ · ω instead of discrete
|
|
121
|
+
timestamp lookup tables. Zero-shot interpolation of unseen dates.
|
|
122
|
+
2. Semantic Speed Gate: MLP that reads only text embedding ϕ(r) and outputs α_r.
|
|
123
|
+
Learns relational volatility from data.
|
|
124
|
+
3. Geometric shadowing in complex space: obsolete facts rotated out of phase so
|
|
125
|
+
the correct fact outranks contradictions via the scoring function alone.
|
|
126
|
+
|
|
127
|
+
### Methods and Evidence
|
|
128
|
+
- On ICEWS05-15, RoMem-ChronoR reaches 72.6 MRR (vs vanilla ChronoR 68.4).
|
|
129
|
+
- Zero-shot domain transfer to FinTMMBench: 0.728 MRR, 0.673 R@5.
|
|
130
|
+
- All baselines use identical answer LLM and judge for fairness.
|
|
131
|
+
|
|
132
|
+
### Limits and Open Questions
|
|
133
|
+
- Computational cost at millions-of-facts scale is motivation but no latency,
|
|
134
|
+
memory, or throughput numbers are reported.
|
|
135
|
+
- Gate pretrained only on ICEWS05-15 political events; generalization to
|
|
136
|
+
highly ambiguous relations is not quantified.
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
A cross-paper `paper_synthesis.md` (excerpt):
|
|
140
|
+
|
|
141
|
+
```markdown
|
|
142
|
+
## Strongest Research Signals
|
|
143
|
+
|
|
144
|
+
- Append-only temporal representations improve long-horizon extrapolation:
|
|
145
|
+
RoMem (arXiv:2604.11544), EST (arXiv:2602.12389v3), and CID-TKG converge on
|
|
146
|
+
persistent or dual-view entity state over destructive overwriting, with
|
|
147
|
+
consistent MRR/Hits@K gains on ICEWS and GDELT.
|
|
148
|
+
|
|
149
|
+
- Semantic gating scales better than manual relation tagging: RoMem's Semantic
|
|
150
|
+
Speed Gate and EST's energy-barrier gate both learn relational volatility
|
|
151
|
+
from text embeddings rather than schema tags…
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
For **multi-topic** literature reviews, stakeholder briefings, or agent grounding, `distill research-brief` (Gemini Deep Research, web-augmented) and `distill synthesize` (Grok 4.20 single-call, corpus-only) take a user-written context file that shapes the output. See [`docs/usage.md#research-briefings-and-deep-synthesis`](docs/usage.md#research-briefings-and-deep-synthesis).
|
|
155
|
+
|
|
156
|
+
## Dashboard
|
|
157
|
+
|
|
158
|
+
```bash
|
|
159
|
+
distill # terminal home screen
|
|
160
|
+
distill serve # local web dashboard at http://127.0.0.1:8899
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
The terminal home screen shows tracked topics, channel and topic watches, recent runs, failures, and rolling spend. The web dashboard adds clickable drill-downs to per-topic, per-channel, and per-video views with rendered markdown, plus cost history and watchlist status. Both auto-refresh and read directly from library files — no database.
|
|
164
|
+
|
|
165
|
+
## MCP server
|
|
166
|
+
|
|
167
|
+
Claude Desktop / Claude Code config:
|
|
168
|
+
|
|
169
|
+
```json
|
|
170
|
+
{ "mcpServers": { "distill": { "command": "distill-mcp" } } }
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
Distill exposes 8 tools, 12 resources, and 4 prompts. See [`docs/mcp.md`](docs/mcp.md) for the list.
|
|
174
|
+
|
|
175
|
+
## Cost
|
|
176
|
+
|
|
177
|
+
Bulk video analysis is essentially free (~$0.006/video). Gemini Deep Research dominates paid reports (~$2–3/report). `distill synthesize` is ~$0.50 for a multi-topic corpus pass. Every run logs actual vs estimated cost to `library/cost_log.jsonl`; `distill costs` shows the history.
|
|
178
|
+
|
|
179
|
+
Full cost model in [`docs/cost.md`](docs/cost.md).
|
|
180
|
+
|
|
181
|
+
## Docs
|
|
182
|
+
|
|
183
|
+
- [`docs/usage.md`](docs/usage.md) — full command reference
|
|
184
|
+
- [`docs/architecture.md`](docs/architecture.md) — data flow, 4-phase report pipeline, model routing, security hardening
|
|
185
|
+
- [`docs/outputs.md`](docs/outputs.md) — what every artifact contains
|
|
186
|
+
- [`docs/cost.md`](docs/cost.md) — cost model, examples, guardrails
|
|
187
|
+
- [`docs/mcp.md`](docs/mcp.md) — MCP tools, resources, prompts
|
|
188
|
+
- [`docs/briefing-contexts/TEMPLATE.md`](docs/briefing-contexts/TEMPLATE.md) — starting point for `--context-file` prompts
|
|
189
|
+
- [`private/README.md`](private/README.md) — where personal/client-specific files go (git-ignored)
|
|
190
|
+
|
|
191
|
+
## Roadmap and changelog
|
|
192
|
+
|
|
193
|
+
- [`docs/CHANGELOG.md`](docs/CHANGELOG.md) — what shipped in `0.1.0`
|
|
194
|
+
- [`ROADMAP.md`](ROADMAP.md) — what's next
|
|
195
|
+
|
|
196
|
+
## Contributing
|
|
197
|
+
|
|
198
|
+
See [`docs/CONTRIBUTING.md`](docs/CONTRIBUTING.md) for dev setup, quality gates, and scope. Security disclosures go through [`docs/SECURITY.md`](docs/SECURITY.md).
|
|
199
|
+
|
|
200
|
+
## License
|
|
201
|
+
|
|
202
|
+
MIT — see [`LICENSE`](LICENSE).
|
distillr-0.1.0/distill/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Distill — Turn YouTube, websites, and arXiv papers into a structured, reusable corpus of insights."""
|