@seanyao/roll 0.5.0 → 2.602.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +736 -0
- package/LICENSE +21 -0
- package/README.md +65 -165
- package/bin/dream-test-quality-scan +110 -0
- package/bin/roll +15030 -814
- package/conventions/config.yaml +17 -1
- package/conventions/global/AGENTS.md +146 -100
- package/conventions/global/CLAUDE.md +1 -21
- package/conventions/global/GEMINI.md +8 -22
- package/conventions/global/project_rules.md +9 -0
- package/conventions/templates/backend-service/AGENTS.md +30 -81
- package/conventions/templates/backend-service/GEMINI.md +3 -3
- package/conventions/templates/backend-service/project_rules.md +16 -0
- package/conventions/templates/cli/AGENTS.md +31 -58
- package/conventions/templates/cli/CLAUDE.md +3 -5
- package/conventions/templates/cli/GEMINI.md +3 -3
- package/conventions/templates/cli/project_rules.md +16 -0
- package/conventions/templates/frontend-only/AGENTS.md +29 -64
- package/conventions/templates/frontend-only/GEMINI.md +3 -3
- package/conventions/templates/frontend-only/project_rules.md +14 -0
- package/conventions/templates/fullstack/AGENTS.md +31 -79
- package/conventions/templates/fullstack/CLAUDE.md +1 -1
- package/conventions/templates/fullstack/GEMINI.md +3 -3
- package/conventions/templates/fullstack/project_rules.md +15 -0
- package/lib/README.md +42 -0
- package/lib/__pycache__/github_sync.cpython-314.pyc +0 -0
- package/lib/__pycache__/loop-fmt.cpython-314.pyc +0 -0
- package/lib/__pycache__/loop_result_eval.cpython-314.pyc +0 -0
- package/lib/__pycache__/loop_unstick.cpython-314.pyc +0 -0
- package/lib/__pycache__/model_prices.cpython-314.pyc +0 -0
- package/lib/__pycache__/prices_fetcher.cpython-314.pyc +0 -0
- package/lib/__pycache__/roll-home.cpython-314.pyc +0 -0
- package/lib/__pycache__/roll-loop-status.cpython-314.pyc +0 -0
- package/lib/__pycache__/roll_git.cpython-314.pyc +0 -0
- package/lib/__pycache__/roll_render.cpython-314.pyc +0 -0
- package/lib/__pycache__/slides-render.cpython-314.pyc +0 -0
- package/lib/agent_usage/README.md +49 -0
- package/lib/agent_usage/__init__.py +108 -0
- package/lib/agent_usage/__pycache__/__init__.cpython-314.pyc +0 -0
- package/lib/agent_usage/__pycache__/gemini.cpython-314.pyc +0 -0
- package/lib/agent_usage/__pycache__/kimi.cpython-314.pyc +0 -0
- package/lib/agent_usage/__pycache__/openai.cpython-314.pyc +0 -0
- package/lib/agent_usage/__pycache__/pi.cpython-314.pyc +0 -0
- package/lib/agent_usage/__pycache__/pi_emit.cpython-314.pyc +0 -0
- package/lib/agent_usage/__pycache__/qwen.cpython-314.pyc +0 -0
- package/lib/agent_usage/gemini.py +127 -0
- package/lib/agent_usage/kimi.py +278 -0
- package/lib/agent_usage/kimi_emit.py +123 -0
- package/lib/agent_usage/openai.py +126 -0
- package/lib/agent_usage/pi.py +200 -0
- package/lib/agent_usage/pi_emit.py +135 -0
- package/lib/agent_usage/qwen.py +128 -0
- package/lib/backfill-pi-usage.py +243 -0
- package/lib/changelog_audit.py +155 -0
- package/lib/changelog_generate.py +263 -0
- package/lib/context_feed_budget.sh +194 -0
- package/lib/github_sync.py +876 -0
- package/lib/i18n/README.md +54 -0
- package/lib/i18n/agent.sh +75 -0
- package/lib/i18n/alert.sh +20 -0
- package/lib/i18n/backlog.sh +96 -0
- package/lib/i18n/brief.sh +5 -0
- package/lib/i18n/changelog.sh +5 -0
- package/lib/i18n/ci.sh +15 -0
- package/lib/i18n/debug.sh +0 -0
- package/lib/i18n/doctor.sh +44 -0
- package/lib/i18n/dream.sh +0 -0
- package/lib/i18n/init.sh +91 -0
- package/lib/i18n/lang.sh +10 -0
- package/lib/i18n/loop.sh +140 -0
- package/lib/i18n/migrate.sh +74 -0
- package/lib/i18n/offboard.sh +31 -0
- package/lib/i18n/onboard.sh +0 -0
- package/lib/i18n/peer.sh +41 -0
- package/lib/i18n/peer_help.sh +25 -0
- package/lib/i18n/peer_reset.sh +7 -0
- package/lib/i18n/peer_status.sh +5 -0
- package/lib/i18n/prices.sh +3 -0
- package/lib/i18n/prices_refresh.sh +17 -0
- package/lib/i18n/prices_show.sh +7 -0
- package/lib/i18n/propose.sh +0 -0
- package/lib/i18n/release.sh +0 -0
- package/lib/i18n/research.sh +0 -0
- package/lib/i18n/review_pr.sh +0 -0
- package/lib/i18n/sentinel.sh +0 -0
- package/lib/i18n/setup.sh +3 -0
- package/lib/i18n/shared.sh +157 -0
- package/lib/i18n/skills/roll-brief.sh +47 -0
- package/lib/i18n/skills/roll-build.sh +97 -0
- package/lib/i18n/skills/roll-design.sh +18 -0
- package/lib/i18n/skills/roll-fix.sh +53 -0
- package/lib/i18n/skills/roll-loop.sh +28 -0
- package/lib/i18n/skills/roll-onboard.sh +33 -0
- package/lib/i18n/skills_catalog.sh +30 -0
- package/lib/i18n/slides.sh +3 -0
- package/lib/i18n/slides_build.sh +38 -0
- package/lib/i18n/slides_delete.sh +19 -0
- package/lib/i18n/slides_list.sh +14 -0
- package/lib/i18n/slides_logs.sh +12 -0
- package/lib/i18n/slides_new.sh +15 -0
- package/lib/i18n/slides_preview.sh +14 -0
- package/lib/i18n/slides_templates.sh +7 -0
- package/lib/i18n/status.sh +21 -0
- package/lib/i18n/update.sh +24 -0
- package/lib/i18n.sh +211 -0
- package/lib/loop-exit-summary.py +393 -0
- package/lib/loop-fmt.py +589 -0
- package/lib/loop_pick_agent.py +316 -0
- package/lib/loop_result_eval.py +469 -0
- package/lib/loop_unstick.py +180 -0
- package/lib/model_prices.py +194 -0
- package/lib/prices/README.md +35 -0
- package/lib/prices/snapshot-2026-05-22.json +22 -0
- package/lib/prices/snapshot-2026-05-23-deepseek.json +15 -0
- package/lib/prices/snapshot-2026-05-23-kimi.json +15 -0
- package/lib/prices_fetcher.py +285 -0
- package/lib/roll-backlog.py +225 -0
- package/lib/roll-brief.py +286 -0
- package/lib/roll-help.py +158 -0
- package/lib/roll-home.py +556 -0
- package/lib/roll-init.py +156 -0
- package/lib/roll-loop-status.py +1683 -0
- package/lib/roll-loop-story.py +191 -0
- package/lib/roll-onboard-render.py +378 -0
- package/lib/roll-peer.py +252 -0
- package/lib/roll-plan-validate.py +386 -0
- package/lib/roll-setup.py +102 -0
- package/lib/roll-status.py +367 -0
- package/lib/roll_git.py +41 -0
- package/lib/roll_render.py +414 -0
- package/lib/slides/components/README.md +123 -0
- package/lib/slides/components/cards-2.html +9 -0
- package/lib/slides/components/cards-3.html +9 -0
- package/lib/slides/components/cards-4.html +9 -0
- package/lib/slides/components/compare.html +22 -0
- package/lib/slides/components/highlight.html +9 -0
- package/lib/slides/components/pipeline.html +12 -0
- package/lib/slides/components/plain.html +7 -0
- package/lib/slides/components/quote.html +4 -0
- package/lib/slides/components/timeline.html +9 -0
- package/lib/slides/templates/introduction-v3.html +571 -0
- package/lib/slides/templates/pitch.html +0 -0
- package/lib/slides-render.py +778 -0
- package/lib/slides-validate.py +357 -0
- package/lib/test_quality_gate.py +143 -0
- package/package.json +8 -7
- package/skills/roll-.changelog/SKILL.md +406 -33
- package/skills/roll-.clarify/SKILL.md +5 -2
- package/skills/roll-.dream/SKILL.md +374 -0
- package/skills/roll-.echo/SKILL.md +5 -2
- package/skills/roll-.qa/SKILL.md +57 -3
- package/skills/roll-.review/SKILL.md +42 -3
- package/skills/roll-brief/SKILL.md +209 -0
- package/skills/roll-build/SKILL.md +308 -63
- package/skills/roll-debug/SKILL.md +341 -162
- package/skills/roll-debug/injectable-bb.js +263 -0
- package/skills/roll-deck/SKILL.md +296 -0
- package/skills/roll-design/ENGINEERING_CHECKLIST.md +1 -1
- package/skills/roll-design/SKILL.md +733 -94
- package/skills/roll-doc/SKILL.md +595 -0
- package/skills/roll-doctor/SKILL.md +192 -0
- package/skills/roll-fix/SKILL.md +149 -32
- package/skills/{roll-jot → roll-idea}/SKILL.md +18 -10
- package/skills/roll-loop/SKILL.md +579 -0
- package/skills/roll-notes/SKILL.md +103 -0
- package/skills/roll-onboard/SKILL.md +234 -0
- package/skills/roll-peer/SKILL.md +336 -0
- package/skills/roll-propose/SKILL.md +157 -0
- package/skills/roll-review-pr/SKILL.md +58 -0
- package/skills/roll-sentinel/SKILL.md +11 -2
- package/skills/roll-spar/SKILL.md +8 -6
- package/template/.github/workflows/ci.yml +5 -2
- package/template/AGENTS.md +20 -74
- package/skills/roll-research/SKILL.md +0 -307
- package/skills/roll-research/references/schema.json +0 -162
- package/skills/roll-research/scripts/md_to_pdf.py +0 -289
- package/tools/roll-fetch/SKILL.md +0 -182
- package/tools/roll-fetch/package.json +0 -15
- package/tools/roll-fetch/smart-web-fetch.js +0 -558
- package/tools/roll-probe/SKILL.md +0 -84
- /package/template/{BACKLOG.md → .roll/backlog.md} +0 -0
|
@@ -1,289 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
"""
|
|
3
|
-
Deep Research Report: Markdown to PDF converter (WeasyPrint)
|
|
4
|
-
Usage: python md_to_pdf.py input.md output.pdf [--title "Report Title"] [--author "Author"]
|
|
5
|
-
|
|
6
|
-
Dependencies: pip install weasyprint markdown --break-system-packages
|
|
7
|
-
"""
|
|
8
|
-
|
|
9
|
-
import sys
|
|
10
|
-
import os
|
|
11
|
-
import re
|
|
12
|
-
import argparse
|
|
13
|
-
import markdown
|
|
14
|
-
|
|
15
|
-
# ── CSS Styles ──
|
|
16
|
-
CSS_TEMPLATE = """
|
|
17
|
-
@page {
|
|
18
|
-
size: A4;
|
|
19
|
-
margin: 25mm 20mm 20mm 20mm;
|
|
20
|
-
|
|
21
|
-
@top-center {
|
|
22
|
-
content: "HEADER_TEXT";
|
|
23
|
-
font-family: "Droid Sans Fallback", Helvetica, Arial, sans-serif;
|
|
24
|
-
font-size: 8pt;
|
|
25
|
-
color: #95a5a6;
|
|
26
|
-
border-bottom: 0.5pt solid #ecf0f1;
|
|
27
|
-
padding-bottom: 3mm;
|
|
28
|
-
}
|
|
29
|
-
|
|
30
|
-
@bottom-center {
|
|
31
|
-
content: "Page " counter(page);
|
|
32
|
-
font-family: "Droid Sans Fallback", Helvetica, Arial, sans-serif;
|
|
33
|
-
font-size: 8pt;
|
|
34
|
-
color: #95a5a6;
|
|
35
|
-
border-top: 0.8pt solid #1a5276;
|
|
36
|
-
padding-top: 2mm;
|
|
37
|
-
}
|
|
38
|
-
}
|
|
39
|
-
|
|
40
|
-
@page :first {
|
|
41
|
-
@top-center { content: none; }
|
|
42
|
-
@bottom-center { content: none; }
|
|
43
|
-
}
|
|
44
|
-
|
|
45
|
-
body {
|
|
46
|
-
font-family: "Droid Sans Fallback", Helvetica, Arial, sans-serif;
|
|
47
|
-
font-size: 10.5pt;
|
|
48
|
-
line-height: 1.75;
|
|
49
|
-
color: #2c3e50;
|
|
50
|
-
text-align: justify;
|
|
51
|
-
}
|
|
52
|
-
|
|
53
|
-
/* Cover page */
|
|
54
|
-
.cover {
|
|
55
|
-
page-break-after: always;
|
|
56
|
-
text-align: center;
|
|
57
|
-
padding-top: 45%;
|
|
58
|
-
}
|
|
59
|
-
.cover h1 {
|
|
60
|
-
font-size: 28pt;
|
|
61
|
-
color: #1a5276;
|
|
62
|
-
margin-bottom: 8mm;
|
|
63
|
-
font-weight: bold;
|
|
64
|
-
letter-spacing: 2pt;
|
|
65
|
-
}
|
|
66
|
-
.cover .subtitle {
|
|
67
|
-
font-size: 14pt;
|
|
68
|
-
color: #95a5a6;
|
|
69
|
-
margin-bottom: 6mm;
|
|
70
|
-
}
|
|
71
|
-
.cover .meta {
|
|
72
|
-
font-size: 11pt;
|
|
73
|
-
color: #95a5a6;
|
|
74
|
-
margin-bottom: 4mm;
|
|
75
|
-
}
|
|
76
|
-
.cover .divider {
|
|
77
|
-
width: 60%;
|
|
78
|
-
margin: 8mm auto;
|
|
79
|
-
border: none;
|
|
80
|
-
border-top: 1.5pt solid #1a5276;
|
|
81
|
-
}
|
|
82
|
-
|
|
83
|
-
/* H1 */
|
|
84
|
-
h1 {
|
|
85
|
-
font-size: 20pt;
|
|
86
|
-
color: #1a5276;
|
|
87
|
-
margin-top: 16mm;
|
|
88
|
-
margin-bottom: 6mm;
|
|
89
|
-
padding-bottom: 3mm;
|
|
90
|
-
border-bottom: 2pt solid #1a5276;
|
|
91
|
-
page-break-before: always;
|
|
92
|
-
font-weight: bold;
|
|
93
|
-
}
|
|
94
|
-
|
|
95
|
-
/* H2 */
|
|
96
|
-
h2 {
|
|
97
|
-
font-size: 14pt;
|
|
98
|
-
color: #1e8449;
|
|
99
|
-
margin-top: 10mm;
|
|
100
|
-
margin-bottom: 5mm;
|
|
101
|
-
font-weight: bold;
|
|
102
|
-
}
|
|
103
|
-
|
|
104
|
-
/* H3 */
|
|
105
|
-
h3 {
|
|
106
|
-
font-size: 12pt;
|
|
107
|
-
color: #2e86c1;
|
|
108
|
-
margin-top: 6mm;
|
|
109
|
-
margin-bottom: 3mm;
|
|
110
|
-
font-weight: bold;
|
|
111
|
-
}
|
|
112
|
-
|
|
113
|
-
h4 {
|
|
114
|
-
font-size: 11pt;
|
|
115
|
-
color: #5b2c6f;
|
|
116
|
-
margin-top: 5mm;
|
|
117
|
-
margin-bottom: 2mm;
|
|
118
|
-
font-weight: bold;
|
|
119
|
-
}
|
|
120
|
-
|
|
121
|
-
/* Paragraphs */
|
|
122
|
-
p {
|
|
123
|
-
margin-top: 1.5mm;
|
|
124
|
-
margin-bottom: 1.5mm;
|
|
125
|
-
orphans: 3;
|
|
126
|
-
widows: 3;
|
|
127
|
-
}
|
|
128
|
-
|
|
129
|
-
/* Blockquotes */
|
|
130
|
-
blockquote {
|
|
131
|
-
margin: 4mm 0;
|
|
132
|
-
padding: 4mm 4mm 4mm 10mm;
|
|
133
|
-
background: #f8f9fa;
|
|
134
|
-
border-left: 3pt solid #1a5276;
|
|
135
|
-
color: #5d6d7e;
|
|
136
|
-
font-size: 10pt;
|
|
137
|
-
}
|
|
138
|
-
blockquote p {
|
|
139
|
-
margin: 1mm 0;
|
|
140
|
-
}
|
|
141
|
-
|
|
142
|
-
/* Bold */
|
|
143
|
-
strong, b {
|
|
144
|
-
font-weight: bold;
|
|
145
|
-
color: #1a252f;
|
|
146
|
-
}
|
|
147
|
-
|
|
148
|
-
/* Inline code */
|
|
149
|
-
code {
|
|
150
|
-
font-family: "Courier New", Courier, monospace;
|
|
151
|
-
background: #fdf2e9;
|
|
152
|
-
color: #c0392b;
|
|
153
|
-
padding: 0.5mm 1.5mm;
|
|
154
|
-
border-radius: 2pt;
|
|
155
|
-
font-size: 9.5pt;
|
|
156
|
-
}
|
|
157
|
-
|
|
158
|
-
/* Tables */
|
|
159
|
-
table {
|
|
160
|
-
width: 100%;
|
|
161
|
-
border-collapse: collapse;
|
|
162
|
-
margin: 4mm 0;
|
|
163
|
-
font-size: 9.5pt;
|
|
164
|
-
}
|
|
165
|
-
thead th {
|
|
166
|
-
background: #1a5276;
|
|
167
|
-
color: white;
|
|
168
|
-
padding: 3mm;
|
|
169
|
-
text-align: left;
|
|
170
|
-
font-weight: bold;
|
|
171
|
-
}
|
|
172
|
-
tbody td {
|
|
173
|
-
padding: 2.5mm 3mm;
|
|
174
|
-
border-bottom: 0.5pt solid #bdc3c7;
|
|
175
|
-
}
|
|
176
|
-
tbody tr:nth-child(even) {
|
|
177
|
-
background: #f8f9fa;
|
|
178
|
-
}
|
|
179
|
-
|
|
180
|
-
/* Horizontal rule */
|
|
181
|
-
hr {
|
|
182
|
-
border: none;
|
|
183
|
-
border-top: 0.5pt solid #bdc3c7;
|
|
184
|
-
margin: 4mm 0;
|
|
185
|
-
}
|
|
186
|
-
|
|
187
|
-
/* Lists */
|
|
188
|
-
ul, ol {
|
|
189
|
-
margin: 2mm 0;
|
|
190
|
-
padding-left: 8mm;
|
|
191
|
-
}
|
|
192
|
-
li {
|
|
193
|
-
margin-bottom: 1mm;
|
|
194
|
-
}
|
|
195
|
-
|
|
196
|
-
/* Links */
|
|
197
|
-
a {
|
|
198
|
-
color: #2e86c1;
|
|
199
|
-
text-decoration: none;
|
|
200
|
-
}
|
|
201
|
-
"""
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
def _css_string_escape(text):
    """Escape text for embedding in a double-quoted CSS string inside <style>."""
    # Backslash first so the escapes added below are not themselves re-escaped.
    text = text.replace("\\", "\\\\").replace('"', '\\"')
    # A raw newline terminates a CSS string; "<" is hex-escaped so the text can
    # never spell "</style>" and close the element early.
    text = text.replace("\r", " ").replace("\n", " ")
    return text.replace("<", "\\3C ")


def md_to_html(md_text, title="Deep Research Report", subtitle="Let's roll",
               meta_line="", author="roll"):
    """Convert Markdown text to a standalone HTML document with a cover page.

    Args:
        md_text: Markdown source of the report.
        title: Plain-text report title. When empty or left at the default
            "Deep Research Report", the first <h1> of the rendered body is
            used as the title instead.
        subtitle: Plain-text subtitle shown on the cover.
        meta_line: Optional plain-text metadata line shown on the cover;
            omitted when empty.
        author: Plain-text author name shown on the cover.

    Returns:
        The full HTML document as a string: the stylesheet built from
        CSS_TEMPLATE, the cover page, then the rendered body.

    The first <h1> of the rendered body is always removed from the body,
    because the cover page carries the title.
    """
    # Local import: the module name "html" is commonly shadowed by local
    # variables in this script, so only the two functions are pulled in.
    from html import escape, unescape

    # Convert body with markdown library
    html_body = markdown.markdown(
        md_text,
        extensions=['tables', 'fenced_code', 'nl2br'],
        output_format='html5'
    )

    # Caller-supplied values are plain text: escape them for element content.
    # quote=False keeps apostrophes (e.g. "Let's roll") byte-identical.
    cover_title = escape(title, quote=False) if title else ""
    header_title = title or ""

    # Extract first H1 for cover (remove from body)
    first_h1_match = re.search(r'<h1>(.*?)</h1>', html_body)
    if first_h1_match:
        if not title or title == "Deep Research Report":
            # The extracted title is already an HTML fragment: use it as-is on
            # the cover, but reduce it to plain text for the CSS page header,
            # where tags and entities would otherwise be printed literally.
            cover_title = first_h1_match.group(1)
            header_title = unescape(re.sub(r'<[^>]+>', '', cover_title))
        html_body = html_body.replace(first_h1_match.group(0), '', 1)

    # Replace header placeholder in CSS; the text lands inside a quoted CSS
    # string, so quotes and backslashes must be escaped.
    css = CSS_TEMPLATE.replace(
        "HEADER_TEXT",
        _css_string_escape(f"{header_title} | Deep Research Report"),
    )

    # Build cover page
    meta_html = ""
    if meta_line:
        meta_html = "<div class='meta'>" + escape(meta_line, quote=False) + "</div>"
    safe_subtitle = escape(subtitle, quote=False) if subtitle else ""
    safe_author = escape(author, quote=False) if author else ""

    cover_html = f"""
<div class="cover">
    <h1 style="page-break-before: avoid; border: none;">{cover_title}</h1>
    <div class="subtitle">{safe_subtitle}</div>
    {meta_html}
    <hr class="divider">
    <div class="meta">Author: {safe_author}</div>
</div>
"""

    full_html = f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<style>{css}</style>
</head>
<body>
{cover_html}
{html_body}
</body>
</html>"""

    return full_html
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
def main():
    """Command-line entry point: render a Markdown report to PDF.

    Reads the input Markdown file, builds the HTML document with
    md_to_html, writes that HTML next to the requested output (for
    debugging), then renders the PDF with WeasyPrint and prints the
    resulting file size.
    """
    parser = argparse.ArgumentParser(description="Deep Research Report: Markdown to PDF")
    parser.add_argument("input", help="Input Markdown file path")
    parser.add_argument("output", help="Output PDF file path")
    parser.add_argument("--title", default=None, help="Report title")
    parser.add_argument("--author", default="roll", help="Author name")
    parser.add_argument("--subtitle", default="Let's roll", help="Report subtitle")
    args = parser.parse_args()

    with open(args.input, "r", encoding="utf-8") as f:
        md_text = f.read()

    # Extract metadata line: the first line mentioning one of the known
    # metadata markers, with any blockquote ">" prefix removed.
    meta_line = ""
    for line in md_text.split("\n"):
        stripped = line.strip().lstrip(">").strip()
        lowered = stripped.lower()
        if "research date" in lowered or "field:" in lowered or "subject type" in lowered:
            meta_line = stripped
            break

    html = md_to_html(md_text, title=args.title or "Deep Research Report",
                      subtitle=args.subtitle, meta_line=meta_line, author=args.author)

    # Save intermediate HTML (for debugging).
    # Swap only the final extension: a plain str.replace('.pdf', '.html')
    # would also rewrite ".pdf" inside directory names, and would return the
    # output path unchanged for names like "report" or "report.PDF", so the
    # debug HTML would be written to the PDF path and then overwritten.
    root, ext = os.path.splitext(args.output)
    if ext.lower() == ".html":
        # Output itself is named *.html; keep the two files distinct.
        html_path = args.output + ".html"
    else:
        html_path = root + ".html"
    with open(html_path, 'w', encoding='utf-8') as f:
        f.write(html)
    print(f"[OK] HTML generated: {html_path}")

    # Convert to PDF. Imported here so the HTML is still produced when
    # WeasyPrint is not installed.
    from weasyprint import HTML
    HTML(string=html).write_pdf(args.output)
    size_kb = os.path.getsize(args.output) / 1024
    print(f"[OK] PDF generated: {args.output} ({size_kb:.1f} KB)")


if __name__ == "__main__":
    main()
|
|
@@ -1,182 +0,0 @@
|
|
|
1
|
-
---
|
|
2
|
-
hidden: true
|
|
3
|
-
name: roll-fetch
|
|
4
|
-
description: Web page fetching and crawling for AI agents. Extract content from URLs for research, documentation, and competitive analysis.
|
|
5
|
-
---
|
|
6
|
-
|
|
7
|
-
# Roll Fetch - Web Content Extraction
|
|
8
|
-
|
|
9
|
-
Extract content from web pages for research and analysis.
|
|
10
|
-
|
|
11
|
-
## When to Use
|
|
12
|
-
|
|
13
|
-
- Product research (competitor analysis)
|
|
14
|
-
- Technical documentation gathering
|
|
15
|
-
- Code examples and best practices
|
|
16
|
-
- Full site crawling for backup/analysis
|
|
17
|
-
|
|
18
|
-
## Environment Setup
|
|
19
|
-
|
|
20
|
-
Configure API keys per machine:
|
|
21
|
-
|
|
22
|
-
```bash
|
|
23
|
-
# Required for Tavily
|
|
24
|
-
export TAVILY_API_KEY=tvly-dev-...
|
|
25
|
-
|
|
26
|
-
# Optional for cloud browser fallback
|
|
27
|
-
export BROWSER_USE_API_KEY=bu-...
|
|
28
|
-
```
|
|
29
|
-
|
|
30
|
-
Or create `.env` file in project root:
|
|
31
|
-
```
|
|
32
|
-
TAVILY_API_KEY=tvly-dev-...
|
|
33
|
-
BROWSER_USE_API_KEY=bu-...
|
|
34
|
-
```
|
|
35
|
-
|
|
36
|
-
## Methods
|
|
37
|
-
|
|
38
|
-
### 1. Tavily API (Recommended)
|
|
39
|
-
|
|
40
|
-
Best quality extraction, requires `TAVILY_API_KEY`.
|
|
41
|
-
|
|
42
|
-
```bash
|
|
43
|
-
# Using Tavily CLI or API
|
|
44
|
-
curl -X POST https://api.tavily.com/extract \
|
|
45
|
-
-H "Content-Type: application/json" \
|
|
46
|
-
-d '{
|
|
47
|
-
"urls": ["https://example.com"],
|
|
48
|
-
"api_key": "your_tavily_api_key"
|
|
49
|
-
}'
|
|
50
|
-
```
|
|
51
|
-
|
|
52
|
-
**Pros**: AI-optimized extraction, handles complex layouts
|
|
53
|
-
**Cons**: Requires API key, rate limited
|
|
54
|
-
|
|
55
|
-
### 2. LLM Native Fetch (Default)
|
|
56
|
-
|
|
57
|
-
Use your built-in URL fetching capability directly.
|
|
58
|
-
|
|
59
|
-
**When to use**: When Tavily is unavailable or for quick checks.
|
|
60
|
-
|
|
61
|
-
**Note**: Most modern AI agents (Kimi, Codex, Claude) have native URL fetching. Use `FetchURL` tool or equivalent.
|
|
62
|
-
|
|
63
|
-
### 3. Browser Automation (Fallback)
|
|
64
|
-
|
|
65
|
-
Local browser automation for stubborn pages using **[browser-use](https://github.com/browser-use/browser-use)**.
|
|
66
|
-
|
|
67
|
-
**How to Choose:**
|
|
68
|
-
|
|
69
|
-
| If | Then Use | Why |
|
|
70
|
-
|----|---------|-----|
|
|
71
|
-
| `BROWSER_USE_API_KEY` in env | **Cloud** | Managed browsers, less setup |
|
|
72
|
-
| No API key, but `browser-use` installed | **Local** | Free, no external dependency |
|
|
73
|
-
| Neither | Skip to manual extraction | Tell user "Need browser automation setup" |
|
|
74
|
-
|
|
75
|
-
**Option A: Local (Free, No API Key)**
|
|
76
|
-
```python
|
|
77
|
-
from browser_use import Agent, Browser, BrowserConfig
|
|
78
|
-
import asyncio
|
|
79
|
-
|
|
80
|
-
async def fetch_page(url):
|
|
81
|
-
# Pure local, no API key needed
|
|
82
|
-
browser = Browser(config=BrowserConfig(headless=True))
|
|
83
|
-
await browser.start()
|
|
84
|
-
page = await browser.get_current_page()
|
|
85
|
-
await page.goto(url)
|
|
86
|
-
content = await page.content()
|
|
87
|
-
await browser.stop()
|
|
88
|
-
return content
|
|
89
|
-
|
|
90
|
-
# Run
|
|
91
|
-
content = asyncio.run(fetch_page("https://example.com"))
|
|
92
|
-
```
|
|
93
|
-
|
|
94
|
-
**Option B: Cloud API**
|
|
95
|
-
```python
|
|
96
|
-
from browser_use import Agent
|
|
97
|
-
|
|
98
|
-
agent = Agent(
|
|
99
|
-
task=f"Extract the main content from {url} and return as markdown",
|
|
100
|
-
llm="moonshot" # or openai, anthropic
|
|
101
|
-
)
|
|
102
|
-
result = await agent.run()
|
|
103
|
-
```
|
|
104
|
-
|
|
105
|
-
**Setup** (Local):
|
|
106
|
-
```bash
|
|
107
|
-
pip install browser-use
|
|
108
|
-
playwright install chromium
|
|
109
|
-
```
|
|
110
|
-
|
|
111
|
-
## Usage
|
|
112
|
-
|
|
113
|
-
### CLI Usage (via smart-web-fetch.js)
|
|
114
|
-
|
|
115
|
-
```bash
|
|
116
|
-
# Auto mode (Tavily → Native → Browser)
|
|
117
|
-
node smart-web-fetch.js fetch https://example.com
|
|
118
|
-
|
|
119
|
-
# Explicit method
|
|
120
|
-
node smart-web-fetch.js fetch https://example.com tavily
|
|
121
|
-
node smart-web-fetch.js fetch https://example.com native
|
|
122
|
-
node smart-web-fetch.js fetch https://example.com browser
|
|
123
|
-
|
|
124
|
-
# Search
|
|
125
|
-
node smart-web-fetch.js search "Python async" 5
|
|
126
|
-
```
|
|
127
|
-
|
|
128
|
-
### Programmatic Usage
|
|
129
|
-
|
|
130
|
-
```javascript
|
|
131
|
-
const { smartFetch, smartSearch } = require('./smart-web-fetch.js');
|
|
132
|
-
|
|
133
|
-
// Fetch a page
|
|
134
|
-
const result = await smartFetch('https://example.com');
|
|
135
|
-
console.log(result.content);
|
|
136
|
-
|
|
137
|
-
// Search
|
|
138
|
-
const searchResult = await smartSearch('OpenAI GPT-5', 5);
|
|
139
|
-
console.log(searchResult.results);
|
|
140
|
-
```
|
|
141
|
-
|
|
142
|
-
### Single Page Fetch
|
|
143
|
-
|
|
144
|
-
```
|
|
145
|
-
User: "Fetch https://docs.example.com/api"
|
|
146
|
-
→ Use smart-web-fetch.js with auto mode
|
|
147
|
-
→ Return clean markdown content
|
|
148
|
-
```
|
|
149
|
-
|
|
150
|
-
### Full Site Crawl
|
|
151
|
-
|
|
152
|
-
```
|
|
153
|
-
User: "Crawl https://docs.example.com"
|
|
154
|
-
→ Use smart-web-fetch.js recursively
|
|
155
|
-
→ Extract all internal links
|
|
156
|
-
→ Recursively fetch up to max depth (default: 2)
|
|
157
|
-
→ Save each page as separate markdown file
|
|
158
|
-
```
|
|
159
|
-
|
|
160
|
-
## Output Format
|
|
161
|
-
|
|
162
|
-
Always return clean Markdown:
|
|
163
|
-
- Extract main content only (remove nav, ads, footers)
|
|
164
|
-
- Preserve code blocks and tables
|
|
165
|
-
- Include source URL as header
|
|
166
|
-
|
|
167
|
-
## Quality Check
|
|
168
|
-
|
|
169
|
-
Validate extracted content:
|
|
170
|
-
- Min length: 500 chars (reject if shorter)
|
|
171
|
-
- Check for captcha/error messages
|
|
172
|
-
- Verify main content structure (headings, paragraphs)
|
|
173
|
-
|
|
174
|
-
## Examples
|
|
175
|
-
|
|
176
|
-
| Task | Method | Command |
|
|
177
|
-
|------|--------|---------|
|
|
178
|
-
| Quick article | Auto | `node smart-web-fetch.js fetch https://blog.example.com` |
|
|
179
|
-
| API docs | Tavily | `node smart-web-fetch.js fetch https://docs.example.com tavily` |
|
|
180
|
-
| SPA site | Browser | `node smart-web-fetch.js fetch https://spa.example.com browser` |
|
|
181
|
-
| Search | Tavily | `node smart-web-fetch.js search "Python async" 5` |
|
|
182
|
-
| Fallback test | Native | `node smart-web-fetch.js fetch https://example.com native` |
|
|
@@ -1,15 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"name": "smart-web-fetch",
|
|
3
|
-
"version": "1.0.0",
|
|
4
|
-
"description": "Intelligent web fetching with automatic Tavily → Scrapling fallback",
|
|
5
|
-
"main": "smart-web-fetch.js",
|
|
6
|
-
"bin": {
|
|
7
|
-
"smart-web-fetch": "./smart-web-fetch.js"
|
|
8
|
-
},
|
|
9
|
-
"scripts": {
|
|
10
|
-
"test": "node smart-web-fetch.js fetch https://example.com"
|
|
11
|
-
},
|
|
12
|
-
"keywords": ["web-scraping", "tavily", "scrapling", "fallback"],
|
|
13
|
-
"author": "R0_lobster",
|
|
14
|
-
"license": "MIT"
|
|
15
|
-
}
|