distillr 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103) hide show
  1. distillr-0.1.0/LICENSE +21 -0
  2. distillr-0.1.0/PKG-INFO +257 -0
  3. distillr-0.1.0/README.md +202 -0
  4. distillr-0.1.0/distill/__init__.py +1 -0
  5. distillr-0.1.0/distill/accordion.py +829 -0
  6. distillr-0.1.0/distill/analysis.py +215 -0
  7. distillr-0.1.0/distill/banner.py +298 -0
  8. distillr-0.1.0/distill/briefing.py +74 -0
  9. distillr-0.1.0/distill/browser_search.py +259 -0
  10. distillr-0.1.0/distill/cli.py +6426 -0
  11. distillr-0.1.0/distill/cli_shared.py +463 -0
  12. distillr-0.1.0/distill/config.py +136 -0
  13. distillr-0.1.0/distill/corpus_analysis.py +52 -0
  14. distillr-0.1.0/distill/costs.py +180 -0
  15. distillr-0.1.0/distill/dashboard_data.py +760 -0
  16. distillr-0.1.0/distill/discovery.py +377 -0
  17. distillr-0.1.0/distill/docx_export.py +477 -0
  18. distillr-0.1.0/distill/file_search.py +413 -0
  19. distillr-0.1.0/distill/library.py +359 -0
  20. distillr-0.1.0/distill/mcp_server.py +1072 -0
  21. distillr-0.1.0/distill/net.py +34 -0
  22. distillr-0.1.0/distill/paper_analysis.py +81 -0
  23. distillr-0.1.0/distill/paper_ingest.py +255 -0
  24. distillr-0.1.0/distill/preflight.py +164 -0
  25. distillr-0.1.0/distill/prompts.py +880 -0
  26. distillr-0.1.0/distill/prompts_accordion.py +536 -0
  27. distillr-0.1.0/distill/ranking.py +647 -0
  28. distillr-0.1.0/distill/research.py +293 -0
  29. distillr-0.1.0/distill/research_brief.py +298 -0
  30. distillr-0.1.0/distill/site_analysis.py +165 -0
  31. distillr-0.1.0/distill/site_attachments.py +204 -0
  32. distillr-0.1.0/distill/site_scraper.py +516 -0
  33. distillr-0.1.0/distill/state.py +62 -0
  34. distillr-0.1.0/distill/summary.py +415 -0
  35. distillr-0.1.0/distill/synthesis.py +162 -0
  36. distillr-0.1.0/distill/synthesize.py +103 -0
  37. distillr-0.1.0/distill/transcripts.py +153 -0
  38. distillr-0.1.0/distill/web/__init__.py +0 -0
  39. distillr-0.1.0/distill/web/routes/__init__.py +0 -0
  40. distillr-0.1.0/distill/web/routes/channels.py +69 -0
  41. distillr-0.1.0/distill/web/routes/costs.py +36 -0
  42. distillr-0.1.0/distill/web/routes/dashboard.py +27 -0
  43. distillr-0.1.0/distill/web/routes/topics.py +122 -0
  44. distillr-0.1.0/distill/web/routes/videos.py +45 -0
  45. distillr-0.1.0/distill/web/routes/watchlist.py +43 -0
  46. distillr-0.1.0/distill/web/server.py +64 -0
  47. distillr-0.1.0/distill/web/static/htmx.min.js +1 -0
  48. distillr-0.1.0/distill/web/static/style.css +360 -0
  49. distillr-0.1.0/distill/web/templates/base.html +38 -0
  50. distillr-0.1.0/distill/web/templates/channel_detail.html +66 -0
  51. distillr-0.1.0/distill/web/templates/costs.html +69 -0
  52. distillr-0.1.0/distill/web/templates/dashboard.html +153 -0
  53. distillr-0.1.0/distill/web/templates/topic_detail.html +77 -0
  54. distillr-0.1.0/distill/web/templates/topic_list.html +29 -0
  55. distillr-0.1.0/distill/web/templates/video_detail.html +54 -0
  56. distillr-0.1.0/distill/web/templates/watchlist.html +70 -0
  57. distillr-0.1.0/distillr.egg-info/PKG-INFO +257 -0
  58. distillr-0.1.0/distillr.egg-info/SOURCES.txt +101 -0
  59. distillr-0.1.0/distillr.egg-info/dependency_links.txt +1 -0
  60. distillr-0.1.0/distillr.egg-info/entry_points.txt +3 -0
  61. distillr-0.1.0/distillr.egg-info/requires.txt +28 -0
  62. distillr-0.1.0/distillr.egg-info/top_level.txt +1 -0
  63. distillr-0.1.0/pyproject.toml +168 -0
  64. distillr-0.1.0/setup.cfg +4 -0
  65. distillr-0.1.0/tests/test_accordion.py +885 -0
  66. distillr-0.1.0/tests/test_analysis.py +160 -0
  67. distillr-0.1.0/tests/test_banner.py +278 -0
  68. distillr-0.1.0/tests/test_briefing.py +79 -0
  69. distillr-0.1.0/tests/test_browser_search.py +298 -0
  70. distillr-0.1.0/tests/test_cli.py +2647 -0
  71. distillr-0.1.0/tests/test_cli_shared.py +600 -0
  72. distillr-0.1.0/tests/test_cli_support_discover.py +167 -0
  73. distillr-0.1.0/tests/test_cli_support_learning.py +299 -0
  74. distillr-0.1.0/tests/test_cli_support_learning_flow.py +275 -0
  75. distillr-0.1.0/tests/test_config.py +156 -0
  76. distillr-0.1.0/tests/test_corpus_analysis.py +25 -0
  77. distillr-0.1.0/tests/test_costs.py +80 -0
  78. distillr-0.1.0/tests/test_dashboard_data.py +384 -0
  79. distillr-0.1.0/tests/test_discovery.py +626 -0
  80. distillr-0.1.0/tests/test_docx_export.py +197 -0
  81. distillr-0.1.0/tests/test_file_search.py +359 -0
  82. distillr-0.1.0/tests/test_integration.py +63 -0
  83. distillr-0.1.0/tests/test_library.py +170 -0
  84. distillr-0.1.0/tests/test_mcp_server.py +1185 -0
  85. distillr-0.1.0/tests/test_paper_analysis.py +57 -0
  86. distillr-0.1.0/tests/test_paper_ingest.py +172 -0
  87. distillr-0.1.0/tests/test_preflight.py +165 -0
  88. distillr-0.1.0/tests/test_prompts.py +223 -0
  89. distillr-0.1.0/tests/test_ranking.py +464 -0
  90. distillr-0.1.0/tests/test_research.py +315 -0
  91. distillr-0.1.0/tests/test_research_brief_module.py +169 -0
  92. distillr-0.1.0/tests/test_site_analysis.py +160 -0
  93. distillr-0.1.0/tests/test_site_attachments.py +205 -0
  94. distillr-0.1.0/tests/test_site_scraper.py +384 -0
  95. distillr-0.1.0/tests/test_state.py +198 -0
  96. distillr-0.1.0/tests/test_summary.py +269 -0
  97. distillr-0.1.0/tests/test_synthesis.py +160 -0
  98. distillr-0.1.0/tests/test_synthesize_module.py +37 -0
  99. distillr-0.1.0/tests/test_topic_watch.py +691 -0
  100. distillr-0.1.0/tests/test_transcripts.py +337 -0
  101. distillr-0.1.0/tests/test_watchlist.py +138 -0
  102. distillr-0.1.0/tests/test_web_server.py +160 -0
  103. distillr-0.1.0/tests/test_ytdlp_contract.py +78 -0
distillr-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Nick Seal
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,257 @@
1
+ Metadata-Version: 2.4
2
+ Name: distillr
3
+ Version: 0.1.0
4
+ Summary: Source-to-intelligence platform: turn YouTube, websites, and arXiv papers into a structured, reusable corpus with per-source insights, cross-source synthesis, and Deep Research reports.
5
+ Author: Nick Seal
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/blisspixel/distillr
8
+ Project-URL: Repository, https://github.com/blisspixel/distillr
9
+ Project-URL: Issues, https://github.com/blisspixel/distillr/issues
10
+ Keywords: research,intelligence,synthesis,arxiv,youtube,rag,knowledge-base,mcp
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Environment :: Console
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Intended Audience :: Information Technology
15
+ Classifier: Intended Audience :: Science/Research
16
+ Classifier: License :: OSI Approved :: MIT License
17
+ Classifier: Operating System :: OS Independent
18
+ Classifier: Programming Language :: Python :: 3
19
+ Classifier: Programming Language :: Python :: 3.10
20
+ Classifier: Programming Language :: Python :: 3.11
21
+ Classifier: Programming Language :: Python :: 3.12
22
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
23
+ Classifier: Topic :: Text Processing :: Linguistic
24
+ Requires-Python: >=3.10
25
+ Description-Content-Type: text/markdown
26
+ License-File: LICENSE
27
+ Requires-Dist: yt-dlp>=2025.1.0
28
+ Requires-Dist: openai>=1.0.0
29
+ Requires-Dist: google-genai>=1.50.0
30
+ Requires-Dist: typer>=0.9.0
31
+ Requires-Dist: pydantic>=2.0
32
+ Requires-Dist: pydantic-settings>=2.0
33
+ Requires-Dist: rich>=13.0.0
34
+ Requires-Dist: python-dotenv>=1.0.0
35
+ Requires-Dist: python-docx>=1.0.0
36
+ Requires-Dist: playwright>=1.52.0
37
+ Requires-Dist: mcp>=1.0.0
38
+ Requires-Dist: fastapi>=0.115.0
39
+ Requires-Dist: uvicorn[standard]>=0.30.0
40
+ Requires-Dist: jinja2>=3.1.0
41
+ Requires-Dist: markdown>=3.5.0
42
+ Requires-Dist: pypdf>=4.0.0
43
+ Requires-Dist: requests>=2.32.0
44
+ Requires-Dist: defusedxml>=0.7.1
45
+ Provides-Extra: dev
46
+ Requires-Dist: pytest>=7.0; extra == "dev"
47
+ Requires-Dist: pytest-cov>=4.0; extra == "dev"
48
+ Requires-Dist: ruff>=0.6.0; extra == "dev"
49
+ Requires-Dist: bandit[toml]>=1.7; extra == "dev"
50
+ Requires-Dist: pip-audit>=2.7; extra == "dev"
51
+ Requires-Dist: pre-commit>=3.5; extra == "dev"
52
+ Requires-Dist: build>=1.0; extra == "dev"
53
+ Requires-Dist: twine>=5.0; extra == "dev"
54
+ Dynamic: license-file
55
+
56
+ # Distill
57
+
58
+ [![CI](https://github.com/blisspixel/distillr/actions/workflows/ci.yml/badge.svg)](https://github.com/blisspixel/distillr/actions/workflows/ci.yml)
59
+ [![PyPI](https://img.shields.io/pypi/v/distillr.svg)](https://pypi.org/project/distillr/)
60
+ [![Python](https://img.shields.io/pypi/pyversions/distillr.svg)](https://pypi.org/project/distillr/)
61
+ [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
62
+ [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
63
+ [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit)](https://github.com/pre-commit/pre-commit)
64
+
65
+ > Turn YouTube, websites, and arXiv papers into a structured, reusable corpus of insights, syntheses, and reports — all plain markdown on your disk.
66
+
67
+ ```bash
68
+ pip install distillr
69
+ distill papers "temporal knowledge graph" --topic tkg --limit 20
70
+ ```
71
+
72
+ That one command searches arXiv, downloads 20 PDFs, extracts full text, runs structured analysis on each, and writes a cross-paper synthesis. For a 20-paper run like the example below, expect single-digit minutes and roughly $1 in model spend. Terminal output during the run looks like this:
73
+
74
+ ```
75
+ Papers: temporal knowledge graph
76
+ Topic: tkg | Selected papers: 20
77
+
78
+ [1/20] Time is Not a Label: Continuous Phase Rotation for Temporal Knowledge
79
+ Graphs and Agentic Memory
80
+ [2/20] Inductive Reasoning for Temporal Knowledge Graphs with Emerging Entities
81
+ ...
82
+
83
+ 6m 47s ~$1.01 (391,278 in / 38,117 out)
84
+
85
+ paper.md 90.4 KB
86
+ insights.md 8.1 KB
87
+ ...
88
+ paper_synthesis.md 11.8 KB
89
+ corpus_synthesis.md 10.5 KB
90
+ ```
91
+
92
+ ## What you get
93
+
94
+ One local `library/` directory of plain markdown. No database, no cloud lock-in, no proprietary format. Open it in any text editor, Obsidian, VS Code, or feed it into another tool.
95
+
96
+ Three source types, same pipeline shape (capture → analyze → synthesize → report):
97
+
98
+ - **YouTube** — channels, topic searches, videos, Shorts
99
+ - **Websites** — vendor sites, research hubs, curated URL sets (browser-first crawl with PDF/embedded-video ingestion)
100
+ - **arXiv papers** — phrase-matched search, full-PDF extraction, structured per-paper insights, cross-paper synthesis
101
+
102
+ Plus an MCP server so AI assistants and agent systems can query the library directly.
103
+
104
+ ## Quick start
105
+
106
+ ```bash
107
+ pip install distillr
108
+ playwright install chromium # for YouTube search + website capture
109
+ distill doctor # verify API keys + system health
110
+ ```
111
+
112
+ Set two keys in `.env` (copy from `.env.example`):
113
+
114
+ ```bash
115
+ XAI_API_KEY=xai-... # Grok models
116
+ GEMINI_API_KEY=AIza... # Gemini Deep Research (reports + briefings)
117
+ ```
118
+
119
+ Then try any of:
120
+
121
+ ```bash
122
+ # Goal-aware cross-source discovery (papers + videos, reranked against a goal)
123
+ distill discover "help an AI become a great music composer" --topic music --preview
124
+ distill discover --goal-file private/my-goal.md --topic research --yes
125
+
126
+ # Get smart on a YouTube topic, fast
127
+ distill latest "Microsoft Fabric best practices" --limit 10 --report
128
+
129
+ # Discover and ingest arXiv papers — expands the query, LLM-reranks candidates,
130
+ # picks the top N (use --preview to see the shortlist without ingesting)
131
+ distill papers "agent memory systems" --topic memory --limit 20
132
+ distill papers "agent memory systems" --topic memory --limit 20 --preview
133
+
134
+ # Distill a vendor/research site
135
+ distill site-batch configs/example_seeds.json --topic example --seed-only
136
+ ```
137
+
138
+ The full command reference lives in [`docs/usage.md`](docs/usage.md).
139
+
140
+ ## Mental model
141
+
142
+ ```
143
+ library/
144
+ └── topics/<topic>/
145
+ ├── channels/<creator>/videos/<video>/
146
+ │ ├── transcript.txt
147
+ │ └── insights.md
148
+ ├── sites/<hostname>/pages/<page>/
149
+ │ ├── content.md
150
+ │ └── insights.md
151
+ ├── papers/<paper>/
152
+ │ ├── paper.md
153
+ │ └── insights.md
154
+ ├── topic_synthesis.md # cross-source
155
+ └── corpus_synthesis.md # mixed-source view
156
+ ```
157
+
158
+ You build a topic library over time. Ingest once, refresh on a cadence, generate a report or briefing when you need one.
159
+
160
+ See [`docs/outputs.md`](docs/outputs.md) for what every artifact contains.
161
+
162
+ ## Sample output
163
+
164
+ A per-paper `insights.md` (excerpt):
165
+
166
+ ```markdown
167
+ ---
168
+ paper_title: "Time is Not a Label: Continuous Phase Rotation for Temporal Knowledge Graphs and Agentic Memory"
169
+ paper_id: 2604.11544v1
170
+ analyzed_by: grok-4.20-0309-reasoning
171
+ source_mode: full_pdf
172
+ ---
173
+
174
+ ### Core Contribution
175
+ 1. Continuous functional rotation θ_r(τ) = s · α_r · τ · ω instead of discrete
176
+ timestamp lookup tables. Zero-shot interpolation of unseen dates.
177
+ 2. Semantic Speed Gate: MLP that reads only text embedding ϕ(r) and outputs α_r.
178
+ Learns relational volatility from data.
179
+ 3. Geometric shadowing in complex space: obsolete facts rotated out of phase so
180
+ the correct fact outranks contradictions via the scoring function alone.
181
+
182
+ ### Methods and Evidence
183
+ - On ICEWS05-15, RoMem-ChronoR reaches 72.6 MRR (vs vanilla ChronoR 68.4).
184
+ - Zero-shot domain transfer to FinTMMBench: 0.728 MRR, 0.673 R@5.
185
+ - All baselines use identical answer LLM and judge for fairness.
186
+
187
+ ### Limits and Open Questions
188
+ - Computational cost at millions-of-facts scale is the stated motivation, but no latency,
189
+ memory, or throughput numbers are reported.
190
+ - Gate pretrained only on ICEWS05-15 political events; generalization to
191
+ highly ambiguous relations is not quantified.
192
+ ```
193
+
194
+ A cross-paper `paper_synthesis.md` (excerpt):
195
+
196
+ ```markdown
197
+ ## Strongest Research Signals
198
+
199
+ - Append-only temporal representations improve long-horizon extrapolation:
200
+ RoMem (arXiv:2604.11544), EST (arXiv:2602.12389v3), and CID-TKG converge on
201
+ persistent or dual-view entity state over destructive overwriting, with
202
+ consistent MRR/Hits@K gains on ICEWS and GDELT.
203
+
204
+ - Semantic gating scales better than manual relation tagging: RoMem's Semantic
205
+ Speed Gate and EST's energy-barrier gate both learn relational volatility
206
+ from text embeddings rather than schema tags…
207
+ ```
208
+
209
+ For **multi-topic** literature reviews, stakeholder briefings, or agent grounding, `distill research-brief` (Gemini Deep Research, web-augmented) and `distill synthesize` (Grok 4.20 single-call, corpus-only) take a user-written context file that shapes the output. See [`docs/usage.md#research-briefings-and-deep-synthesis`](docs/usage.md#research-briefings-and-deep-synthesis).
210
+
211
+ ## Dashboard
212
+
213
+ ```bash
214
+ distill # terminal home screen
215
+ distill serve # local web dashboard at http://127.0.0.1:8899
216
+ ```
217
+
218
+ The terminal home screen shows tracked topics, channel and topic watches, recent runs, failures, and rolling spend. The web dashboard adds clickable drill-downs to per-topic, per-channel, and per-video views with rendered markdown, plus cost history and watchlist status. Both auto-refresh and read directly from library files — no database.
219
+
220
+ ## MCP server
221
+
222
+ Claude Desktop / Claude Code config:
223
+
224
+ ```json
225
+ { "mcpServers": { "distill": { "command": "distill-mcp" } } }
226
+ ```
227
+
228
+ Distill exposes 8 tools, 12 resources, and 4 prompts. See [`docs/mcp.md`](docs/mcp.md) for the list.
229
+
230
+ ## Cost
231
+
232
+ Bulk video analysis is essentially free (~$0.006/video). Gemini Deep Research dominates paid reports (~$2–3/report). `distill synthesize` is ~$0.50 for a multi-topic corpus pass. Every run logs actual vs estimated cost to `library/cost_log.jsonl`; `distill costs` shows the history.
233
+
234
+ Full cost model in [`docs/cost.md`](docs/cost.md).
235
+
236
+ ## Docs
237
+
238
+ - [`docs/usage.md`](docs/usage.md) — full command reference
239
+ - [`docs/architecture.md`](docs/architecture.md) — data flow, 4-phase report pipeline, model routing, security hardening
240
+ - [`docs/outputs.md`](docs/outputs.md) — what every artifact contains
241
+ - [`docs/cost.md`](docs/cost.md) — cost model, examples, guardrails
242
+ - [`docs/mcp.md`](docs/mcp.md) — MCP tools, resources, prompts
243
+ - [`docs/briefing-contexts/TEMPLATE.md`](docs/briefing-contexts/TEMPLATE.md) — starting point for `--context-file` prompts
244
+ - [`private/README.md`](private/README.md) — where personal/client-specific files go (git-ignored)
245
+
246
+ ## Roadmap and changelog
247
+
248
+ - [`docs/CHANGELOG.md`](docs/CHANGELOG.md) — what shipped in `0.1.0`
249
+ - [`ROADMAP.md`](ROADMAP.md) — what's next
250
+
251
+ ## Contributing
252
+
253
+ See [`docs/CONTRIBUTING.md`](docs/CONTRIBUTING.md) for dev setup, quality gates, and scope. Security disclosures go through [`docs/SECURITY.md`](docs/SECURITY.md).
254
+
255
+ ## License
256
+
257
+ MIT — see [`LICENSE`](LICENSE).
@@ -0,0 +1,202 @@
1
+ # Distill
2
+
3
+ [![CI](https://github.com/blisspixel/distillr/actions/workflows/ci.yml/badge.svg)](https://github.com/blisspixel/distillr/actions/workflows/ci.yml)
4
+ [![PyPI](https://img.shields.io/pypi/v/distillr.svg)](https://pypi.org/project/distillr/)
5
+ [![Python](https://img.shields.io/pypi/pyversions/distillr.svg)](https://pypi.org/project/distillr/)
6
+ [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
7
+ [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
8
+ [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit)](https://github.com/pre-commit/pre-commit)
9
+
10
+ > Turn YouTube, websites, and arXiv papers into a structured, reusable corpus of insights, syntheses, and reports — all plain markdown on your disk.
11
+
12
+ ```bash
13
+ pip install distillr
14
+ distill papers "temporal knowledge graph" --topic tkg --limit 20
15
+ ```
16
+
17
+ That one command searches arXiv, downloads 20 PDFs, extracts full text, runs structured analysis on each, and writes a cross-paper synthesis. For a 20-paper run like the example below, expect single-digit minutes and roughly $1 in model spend. Terminal output during the run looks like this:
18
+
19
+ ```
20
+ Papers: temporal knowledge graph
21
+ Topic: tkg | Selected papers: 20
22
+
23
+ [1/20] Time is Not a Label: Continuous Phase Rotation for Temporal Knowledge
24
+ Graphs and Agentic Memory
25
+ [2/20] Inductive Reasoning for Temporal Knowledge Graphs with Emerging Entities
26
+ ...
27
+
28
+ 6m 47s ~$1.01 (391,278 in / 38,117 out)
29
+
30
+ paper.md 90.4 KB
31
+ insights.md 8.1 KB
32
+ ...
33
+ paper_synthesis.md 11.8 KB
34
+ corpus_synthesis.md 10.5 KB
35
+ ```
36
+
37
+ ## What you get
38
+
39
+ One local `library/` directory of plain markdown. No database, no cloud lock-in, no proprietary format. Open it in any text editor, Obsidian, VS Code, or feed it into another tool.
40
+
41
+ Three source types, same pipeline shape (capture → analyze → synthesize → report):
42
+
43
+ - **YouTube** — channels, topic searches, videos, Shorts
44
+ - **Websites** — vendor sites, research hubs, curated URL sets (browser-first crawl with PDF/embedded-video ingestion)
45
+ - **arXiv papers** — phrase-matched search, full-PDF extraction, structured per-paper insights, cross-paper synthesis
46
+
47
+ Plus an MCP server so AI assistants and agent systems can query the library directly.
48
+
49
+ ## Quick start
50
+
51
+ ```bash
52
+ pip install distillr
53
+ playwright install chromium # for YouTube search + website capture
54
+ distill doctor # verify API keys + system health
55
+ ```
56
+
57
+ Set two keys in `.env` (copy from `.env.example`):
58
+
59
+ ```bash
60
+ XAI_API_KEY=xai-... # Grok models
61
+ GEMINI_API_KEY=AIza... # Gemini Deep Research (reports + briefings)
62
+ ```
63
+
64
+ Then try any of:
65
+
66
+ ```bash
67
+ # Goal-aware cross-source discovery (papers + videos, reranked against a goal)
68
+ distill discover "help an AI become a great music composer" --topic music --preview
69
+ distill discover --goal-file private/my-goal.md --topic research --yes
70
+
71
+ # Get smart on a YouTube topic, fast
72
+ distill latest "Microsoft Fabric best practices" --limit 10 --report
73
+
74
+ # Discover and ingest arXiv papers — expands the query, LLM-reranks candidates,
75
+ # picks the top N (use --preview to see the shortlist without ingesting)
76
+ distill papers "agent memory systems" --topic memory --limit 20
77
+ distill papers "agent memory systems" --topic memory --limit 20 --preview
78
+
79
+ # Distill a vendor/research site
80
+ distill site-batch configs/example_seeds.json --topic example --seed-only
81
+ ```
82
+
83
+ The full command reference lives in [`docs/usage.md`](docs/usage.md).
84
+
85
+ ## Mental model
86
+
87
+ ```
88
+ library/
89
+ └── topics/<topic>/
90
+ ├── channels/<creator>/videos/<video>/
91
+ │ ├── transcript.txt
92
+ │ └── insights.md
93
+ ├── sites/<hostname>/pages/<page>/
94
+ │ ├── content.md
95
+ │ └── insights.md
96
+ ├── papers/<paper>/
97
+ │ ├── paper.md
98
+ │ └── insights.md
99
+ ├── topic_synthesis.md # cross-source
100
+ └── corpus_synthesis.md # mixed-source view
101
+ ```
102
+
103
+ You build a topic library over time. Ingest once, refresh on a cadence, generate a report or briefing when you need one.
104
+
105
+ See [`docs/outputs.md`](docs/outputs.md) for what every artifact contains.
106
+
107
+ ## Sample output
108
+
109
+ A per-paper `insights.md` (excerpt):
110
+
111
+ ```markdown
112
+ ---
113
+ paper_title: "Time is Not a Label: Continuous Phase Rotation for Temporal Knowledge Graphs and Agentic Memory"
114
+ paper_id: 2604.11544v1
115
+ analyzed_by: grok-4.20-0309-reasoning
116
+ source_mode: full_pdf
117
+ ---
118
+
119
+ ### Core Contribution
120
+ 1. Continuous functional rotation θ_r(τ) = s · α_r · τ · ω instead of discrete
121
+ timestamp lookup tables. Zero-shot interpolation of unseen dates.
122
+ 2. Semantic Speed Gate: MLP that reads only text embedding ϕ(r) and outputs α_r.
123
+ Learns relational volatility from data.
124
+ 3. Geometric shadowing in complex space: obsolete facts rotated out of phase so
125
+ the correct fact outranks contradictions via the scoring function alone.
126
+
127
+ ### Methods and Evidence
128
+ - On ICEWS05-15, RoMem-ChronoR reaches 72.6 MRR (vs vanilla ChronoR 68.4).
129
+ - Zero-shot domain transfer to FinTMMBench: 0.728 MRR, 0.673 R@5.
130
+ - All baselines use identical answer LLM and judge for fairness.
131
+
132
+ ### Limits and Open Questions
133
+ - Computational cost at millions-of-facts scale is the stated motivation, but no latency,
134
+ memory, or throughput numbers are reported.
135
+ - Gate pretrained only on ICEWS05-15 political events; generalization to
136
+ highly ambiguous relations is not quantified.
137
+ ```
138
+
139
+ A cross-paper `paper_synthesis.md` (excerpt):
140
+
141
+ ```markdown
142
+ ## Strongest Research Signals
143
+
144
+ - Append-only temporal representations improve long-horizon extrapolation:
145
+ RoMem (arXiv:2604.11544), EST (arXiv:2602.12389v3), and CID-TKG converge on
146
+ persistent or dual-view entity state over destructive overwriting, with
147
+ consistent MRR/Hits@K gains on ICEWS and GDELT.
148
+
149
+ - Semantic gating scales better than manual relation tagging: RoMem's Semantic
150
+ Speed Gate and EST's energy-barrier gate both learn relational volatility
151
+ from text embeddings rather than schema tags…
152
+ ```
153
+
154
+ For **multi-topic** literature reviews, stakeholder briefings, or agent grounding, `distill research-brief` (Gemini Deep Research, web-augmented) and `distill synthesize` (Grok 4.20 single-call, corpus-only) take a user-written context file that shapes the output. See [`docs/usage.md#research-briefings-and-deep-synthesis`](docs/usage.md#research-briefings-and-deep-synthesis).
155
+
156
+ ## Dashboard
157
+
158
+ ```bash
159
+ distill # terminal home screen
160
+ distill serve # local web dashboard at http://127.0.0.1:8899
161
+ ```
162
+
163
+ The terminal home screen shows tracked topics, channel and topic watches, recent runs, failures, and rolling spend. The web dashboard adds clickable drill-downs to per-topic, per-channel, and per-video views with rendered markdown, plus cost history and watchlist status. Both auto-refresh and read directly from library files — no database.
164
+
165
+ ## MCP server
166
+
167
+ Claude Desktop / Claude Code config:
168
+
169
+ ```json
170
+ { "mcpServers": { "distill": { "command": "distill-mcp" } } }
171
+ ```
172
+
173
+ Distill exposes 8 tools, 12 resources, and 4 prompts. See [`docs/mcp.md`](docs/mcp.md) for the list.
174
+
175
+ ## Cost
176
+
177
+ Bulk video analysis is essentially free (~$0.006/video). Gemini Deep Research dominates paid reports (~$2–3/report). `distill synthesize` is ~$0.50 for a multi-topic corpus pass. Every run logs actual vs estimated cost to `library/cost_log.jsonl`; `distill costs` shows the history.
178
+
179
+ Full cost model in [`docs/cost.md`](docs/cost.md).
180
+
181
+ ## Docs
182
+
183
+ - [`docs/usage.md`](docs/usage.md) — full command reference
184
+ - [`docs/architecture.md`](docs/architecture.md) — data flow, 4-phase report pipeline, model routing, security hardening
185
+ - [`docs/outputs.md`](docs/outputs.md) — what every artifact contains
186
+ - [`docs/cost.md`](docs/cost.md) — cost model, examples, guardrails
187
+ - [`docs/mcp.md`](docs/mcp.md) — MCP tools, resources, prompts
188
+ - [`docs/briefing-contexts/TEMPLATE.md`](docs/briefing-contexts/TEMPLATE.md) — starting point for `--context-file` prompts
189
+ - [`private/README.md`](private/README.md) — where personal/client-specific files go (git-ignored)
190
+
191
+ ## Roadmap and changelog
192
+
193
+ - [`docs/CHANGELOG.md`](docs/CHANGELOG.md) — what shipped in `0.1.0`
194
+ - [`ROADMAP.md`](ROADMAP.md) — what's next
195
+
196
+ ## Contributing
197
+
198
+ See [`docs/CONTRIBUTING.md`](docs/CONTRIBUTING.md) for dev setup, quality gates, and scope. Security disclosures go through [`docs/SECURITY.md`](docs/SECURITY.md).
199
+
200
+ ## License
201
+
202
+ MIT — see [`LICENSE`](LICENSE).
@@ -0,0 +1 @@
1
+ """Distill — Turn YouTube, websites, and arXiv papers into a structured, reusable corpus."""