@veyralabs/skills 0.4.1 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,194 @@
1
+ """
2
+ Scrapling wrapper for venture-analyst.
3
+ Handles competitor pages, pricing pages, and review sites.
4
+ Falls back gracefully on blocked sites.
5
+ """
6
+ import re
7
+ from typing import Optional
8
+
9
+
10
def scrape_competitor(url: str) -> Optional[dict]:
    """
    Fetch and parse a competitor website.

    The plain ``Fetcher`` is tried first. When it raises, or when the page it
    returns looks empty, the request is retried through ``_scrape_stealthy``.
    Returns the parsed record, or whatever the stealthy retry returns.
    """
    try:
        from scrapling import Fetcher

        fetcher = Fetcher(auto_match=False)
        response = fetcher.get(url, timeout=15, stealthy_headers=True)
        if not _is_empty(response):
            return _parse_competitor_page(url, response)
    except Exception:
        # Import, network and parsing failures all take the same fallback below.
        pass
    return _scrape_stealthy(url)
23
+
24
+
25
def _scrape_stealthy(url: str) -> Optional[dict]:
    """Retry ``url`` with StealthyFetcher (JS-rendered or Cloudflare-protected sites).

    Returns the parsed record, or ``None`` when the fetch raises or the page
    looks empty.
    """
    try:
        from scrapling import StealthyFetcher

        response = StealthyFetcher(auto_match=False).get(url, timeout=20, network_idle=True)
        return None if _is_empty(response) else _parse_competitor_page(url, response)
    except Exception:
        return None
35
+
36
+
37
def _is_empty(page) -> bool:
    """Return True when ``page`` is None, its text cannot be read, or the text is under 100 characters."""
    if page is not None:
        try:
            text_length = len(page.get_all_text())
        except Exception:
            # An unreadable page is treated the same as an empty one.
            return True
        return text_length < 100
    return True
44
+
45
+
46
def _parse_competitor_page(url: str, page) -> dict:
    """Build the competitor record for ``page`` by running each field extractor in turn."""
    record = {"url": url}
    record["title"] = _get_title(page)
    record["tagline"] = _get_tagline(page)
    record["description"] = _get_meta_description(page)
    record["pricing"] = _get_pricing(page)
    record["features"] = _get_features(page)
    record["tech_stack"] = _get_tech_signals(page)
    return record
56
+
57
+
58
def _get_title(page) -> str:
    """Return the stripped text of the first ``title`` match, capped at 120 characters; "" on any failure."""
    try:
        title_node = page.css("title").first
        cleaned = title_node.text.strip()
        return cleaned[:120]
    except Exception:
        return ""
63
+
64
+
65
def _get_tagline(page) -> str:
    """Extract the main hero headline.

    Selectors are tried in order; the first match whose stripped text is
    longer than 5 and shorter than 200 characters wins. Returns "" when no
    selector yields such text.
    """
    candidates = (
        "h1",
        "[class*='hero'] h1",
        "header h1",
        "[class*='headline']",
        "[class*='tagline']",
    )
    for selector in candidates:
        try:
            node = page.css(selector).first
            if not node:
                continue
            headline = node.text.strip()
            if len(headline) > 5 and len(headline) < 200:
                return headline
        except Exception:
            # A selector that fails is skipped; the next candidate is tried.
            continue
    return ""
78
+
79
+
80
def _get_meta_description(page) -> str:
    """Return the ``content`` attribute of the first description meta tag, capped at 300 characters; "" on failure."""
    try:
        meta = page.css('meta[name="description"]').first
        content = meta.attrs.get("content")
        if not content:
            content = ""
        return content[:300]
    except Exception:
        return ""
86
+
87
+
88
def _get_pricing(page) -> dict:
    """Extract pricing signals from page text.

    Returns ``{}`` when the page text cannot be read. Otherwise returns a dict
    with:
      - ``prices``: up to 6 numeric strings found after a ``$``, ``€`` or ``£``
      - ``model_signals``: up to 4 pricing-model keywords found in the text
      - ``has_free_tier``: True when "free", "freemium" or "free trial" was found

    Keywords are matched case-insensitively as whole words/phrases, so that
    words such as "freedom" or "carefree" do not count as a free tier.
    """
    try:
        text = page.get_all_text()
    except Exception:
        return {}

    prices = re.findall(
        r'[\$€£]\s*(\d+(?:[.,]\d+)?)\s*(?:/\s*(?:mo|month|year|yr|user|seat))?',
        text,
        re.IGNORECASE,
    )
    model_keywords = [
        "free", "freemium", "free trial", "per user", "per month", "per year",
        "enterprise", "custom pricing", "contact us", "flat rate",
    ]
    # \b on both sides keeps "free" from matching inside longer words.
    detected_model = [
        kw
        for kw in model_keywords
        if re.search(rf"\b{re.escape(kw)}\b", text, re.IGNORECASE)
    ]
    free_keywords = ("free", "freemium", "free trial")

    return {
        "prices": prices[:6],
        "model_signals": detected_model[:4],
        # Computed from the full list, not the truncated one.
        "has_free_tier": any(kw in free_keywords for kw in detected_model),
    }
111
+
112
+
113
def _get_features(page) -> list[str]:
    """Extract feature descriptions from bullets and feature sections.

    Looks at the first 30 elements matching ``li`` or a class containing
    "feature" or "benefit". Keeps stripped text longer than 10 and shorter
    than 150 characters that does not start with "©", "Terms" or "Privacy".
    Repeated text is kept only once, in first-seen order, so duplicates do not
    use up the 12-item cap. Returns at most 12 strings; extraction is
    best-effort and never raises.
    """
    features: list[str] = []
    seen: set[str] = set()
    try:
        for el in page.css("li, [class*='feature'], [class*='benefit']")[:30]:
            try:
                text = el.text.strip()
            except Exception:
                # Skip elements whose text cannot be read; keep scanning the rest.
                continue
            if not 10 < len(text) < 150:
                continue
            if text.startswith(("©", "Terms", "Privacy")):
                continue
            if text in seen:
                continue
            seen.add(text)
            features.append(text)
    except Exception:
        # Selector failure: return whatever was collected so far.
        pass
    return features[:12]
127
+
128
+
129
def _get_tech_signals(page) -> list[str]:
    """Detect tech stack by regex-searching ``str(page)``.

    Each technology has one case-insensitive pattern. Names are returned in
    table order for every pattern that matches. Any failure while stringifying
    or searching yields the names collected so far.
    """
    tech_patterns = (
        ("React", r"react"),
        ("Vue", r"vue\.js|vuejs"),
        ("Angular", r"angular"),
        ("Next.js", r"_next/"),
        ("Stripe", r"js\.stripe\.com"),
        ("Intercom", r"intercom"),
        ("Segment", r"segment\.com"),
        ("HubSpot", r"hubspot"),
        ("Webflow", r"webflow"),
        ("Shopify", r"shopify"),
    )
    detected = []
    try:
        markup = str(page)
        for name, regex in tech_patterns:
            if re.search(regex, markup, re.IGNORECASE) is not None:
                detected.append(name)
    except Exception:
        pass
    return detected
152
+
153
+
154
def scrape_g2_reviews(product_url: str, max_pages: int = 2) -> list[dict]:
    """
    Scrape G2 reviews. Requires StealthyFetcher + Playwright.
    G2 is Cloudflare-protected — basic Fetcher will fail.

    Pages ``1..max_pages`` are requested by setting the ``page`` query
    parameter on ``product_url``. Any existing query string or fragment is
    preserved, and an existing ``page`` parameter is replaced. Scraping stops
    at the first page that looks empty. Best-effort: on any error the reviews
    collected so far are returned.

    Each review is a dict with keys ``source``, ``pros``, ``cons``, ``rating``
    and ``title``.
    """
    from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

    reviews = []
    try:
        from scrapling import StealthyFetcher

        parts = urlsplit(product_url)
        # Keep caller-supplied parameters, but own the "page" parameter ourselves.
        base_query = [
            (key, value)
            for key, value in parse_qsl(parts.query, keep_blank_values=True)
            if key != "page"
        ]
        for page_num in range(1, max_pages + 1):
            query = urlencode(base_query + [("page", str(page_num))])
            url = urlunsplit(parts._replace(query=query))
            page = StealthyFetcher(auto_match=False).get(url, timeout=25, network_idle=True)
            if _is_empty(page):
                break
            for review in page.css("[itemprop='review'], .paper--white, [class*='review-card']"):
                try:
                    reviews.append({
                        "source": "g2",
                        "pros": _safe_text(review, "[class*='pros']"),
                        "cons": _safe_text(review, "[class*='cons']"),
                        "rating": _safe_attr(review, "[itemprop='ratingValue']", "content"),
                        "title": _safe_text(review, "h3, [class*='title']"),
                    })
                except Exception:
                    # One malformed review card must not discard the rest of the page.
                    continue
    except Exception:
        # Import/fetch failure: fall through with whatever was gathered.
        pass
    return reviews
181
+
182
+
183
def _safe_text(parent, selector: str) -> str:
    """Return the stripped text of the first ``selector`` match under ``parent`` (max 300 chars); "" on failure."""
    try:
        node = parent.css(selector).first
        return node.text.strip()[:300]
    except Exception:
        return ""
188
+
189
+
190
def _safe_attr(parent, selector: str, attr: str) -> str:
    """Return attribute ``attr`` of the first ``selector`` match under ``parent``; "" when missing or on failure."""
    try:
        attributes = parent.css(selector).first.attrs
        return attributes.get(attr, "")
    except Exception:
        return ""
@@ -0,0 +1,288 @@
1
+ """
2
+ Level 1 data sources for venture-analyst.
3
+ Zero API keys required. Works immediately after install.
4
+ """
5
+ import time
6
+ import requests
7
+ from typing import Optional
8
+
9
+ REDDIT_UA = "venture-analyst/1.0 (open-source research tool; github.com/veyralabsgroup/veyraskills)"
10
+
11
+
12
+ # ── HN Algolia (no auth, 10k req/hour) ────────────────────────────────────────
13
+
14
def search_hn(query: str, limit: int = 20) -> list[dict]:
    """Search Hacker News via Algolia API. Best zero-key source.

    Searches stories, Ask HN and Show HN posts. Returns one dict per hit with
    keys ``source``, ``title``, ``url``, ``points``, ``comments``, ``text``,
    ``author`` and ``date`` (first 10 characters of ``created_at``). Returns
    ``[]`` on any request or parsing error.

    Fields that arrive as JSON null are treated as empty strings, so a single
    hit without story text or timestamp cannot discard the whole result set.
    """
    url = "https://hn.algolia.com/api/v1/search"
    params = {
        "query": query,
        "hitsPerPage": limit,
        "tags": "(story,ask_hn,show_hn)",
    }
    try:
        r = requests.get(url, params=params, timeout=10)
        r.raise_for_status()
        hits = r.json().get("hits", [])
        stories = []
        for h in hits:
            story_text = h.get("story_text") or ""
            stories.append({
                "source": "hackernews",
                # Untitled hits fall back to a truncated excerpt of the story text.
                "title": h.get("title") or (story_text[:80] + "..."),
                "url": h.get("url") or f"https://news.ycombinator.com/item?id={h.get('objectID')}",
                "points": h.get("points", 0),
                "comments": h.get("num_comments", 0),
                "text": story_text[:400],
                "author": h.get("author", ""),
                "date": (h.get("created_at") or "")[:10],
            })
        return stories
    except Exception:
        return []
41
+
42
+
43
def search_hn_comments(query: str, min_points: int = 5, limit: int = 30) -> list[dict]:
    """Search HN comments — great for finding raw pain and opinions.

    Queries the Algolia endpoint with the ``comment`` tag and a
    ``points>min_points`` numeric filter. Hits without comment text are
    dropped. Returns ``[]`` on any request or parsing error.
    """
    endpoint = "https://hn.algolia.com/api/v1/search"
    query_params = {
        "query": query,
        "hitsPerPage": limit,
        "tags": "comment",
        "numericFilters": f"points>{min_points}",
    }
    try:
        response = requests.get(endpoint, params=query_params, timeout=10)
        response.raise_for_status()
        comments = []
        for hit in response.json().get("hits", []):
            if not hit.get("comment_text"):
                continue
            comments.append({
                "source": "hackernews_comment",
                "text": (hit.get("comment_text") or "")[:500],
                "url": f"https://news.ycombinator.com/item?id={hit.get('objectID')}",
                "points": hit.get("points", 0),
                "author": hit.get("author", ""),
            })
        return comments
    except Exception:
        return []
68
+
69
+
70
+ # ── Reddit (no auth, custom UA required) ──────────────────────────────────────
71
+
72
def search_reddit(
    query: str,
    subreddits: Optional[list[str]] = None,
    limit: int = 25,
    timeframe: str = "year",
) -> list[dict]:
    """Search Reddit via public .json endpoint. Custom UA avoids 429s.

    With ``subreddits``, only the first four are searched (top posts,
    restricted to each subreddit). Each is asked for
    ``max(5, limit // number_of_subreddits)`` posts, so the combined result
    can hold more than ``limit`` entries. Without ``subreddits``, a single
    site-wide relevance search is made.

    Requests are spaced 1.2 s apart; there is no delay after the final
    request. Failed requests contribute no results.
    """
    results = []
    headers = {"User-Agent": REDDIT_UA}

    if subreddits:
        targets = subreddits[:4]
        per_sub = max(5, limit // len(targets))
        for index, sub in enumerate(targets):
            if index:
                # Pause between consecutive requests only, never after the last one.
                time.sleep(1.2)
            url = f"https://www.reddit.com/r/{sub}/search.json"
            params = {"q": query, "sort": "top", "limit": per_sub, "t": timeframe, "restrict_sr": 1}
            _reddit_fetch(url, params, headers, results)
    else:
        url = "https://www.reddit.com/search.json"
        params = {"q": query, "sort": "relevance", "limit": limit, "t": timeframe}
        _reddit_fetch(url, params, headers, results)

    return results
95
+
96
+
97
def _reddit_fetch(url: str, params: dict, headers: dict, results: list) -> None:
    """Fetch one Reddit listing and append its posts to ``results`` in place.

    Non-200 responses and any request/parsing error add nothing.
    """
    try:
        response = requests.get(url, params=params, headers=headers, timeout=12)
        if response.status_code != 200:
            return
        children = response.json().get("data", {}).get("children", [])
        for child in children:
            data = child.get("data", {})
            entry = {
                "source": "reddit",
                "title": data.get("title", ""),
                "url": f"https://reddit.com{data.get('permalink', '')}",
                "upvotes": data.get("score", 0),
                "comments": data.get("num_comments", 0),
                "text": (data.get("selftext") or "")[:500],
                "subreddit": data.get("subreddit", ""),
            }
            results.append(entry)
    except Exception:
        pass
114
+
115
+
116
+ # ── GitHub (no auth needed; search endpoints are limited per minute) ──────────
117
+
118
def search_github_issues(
    query: str,
    limit: int = 20,
    token: Optional[str] = None,
) -> list[dict]:
    """
    Search GitHub issues for pain points and feature requests.

    Results are sorted by reactions, descending; at most ``min(limit, 30)``
    issues are requested. Pass a personal access token as ``token`` to send
    an ``Authorization`` header.

    Rate limits: GitHub's search endpoints have their own per-minute limit,
    lower for unauthenticated requests (about 10/min unauthenticated versus
    30/min with a token, per the GitHub REST docs at the time of writing).
    This differs from the 60/hour and 5,000/hour limits of the core API.

    Returns ``[]`` on a non-200 response or any request/parsing error.
    """
    url = "https://api.github.com/search/issues"
    params = {
        "q": f"{query} type:issue",
        "sort": "reactions",
        "order": "desc",
        "per_page": min(limit, 30),
    }
    headers = {"Accept": "application/vnd.github.v3+json"}
    if token:
        headers["Authorization"] = f"token {token}"

    try:
        r = requests.get(url, params=params, headers=headers, timeout=12)
        if r.status_code != 200:
            return []
        return [
            {
                "source": "github",
                "title": i.get("title", ""),
                "url": i.get("html_url", ""),
                # `or` guards against explicit nulls, which .get() defaults do not cover.
                "reactions": (i.get("reactions") or {}).get("total_count", 0),
                "comments": i.get("comments", 0),
                "text": (i.get("body") or "")[:400],
                "repo": (i.get("repository_url") or "").split("/")[-1],
                "state": i.get("state", ""),
            }
            for i in r.json().get("items", [])
        ]
    except Exception:
        return []
158
+
159
+
160
def search_github_repos(
    query: str,
    limit: int = 10,
    token: Optional[str] = None,
) -> list[dict]:
    """Find existing repos/tools in the space.

    Results are sorted by stars, descending; at most ``min(limit, 20)``
    repositories are requested. Pass ``token`` to send an ``Authorization``
    header. Returns ``[]`` on a non-200 response or any request/parsing error.

    ``description``, ``language`` and ``updated`` are always strings: a JSON
    null from the API is reported as "".
    """
    url = "https://api.github.com/search/repositories"
    params = {
        "q": query,
        "sort": "stars",
        "order": "desc",
        "per_page": min(limit, 20),
    }
    headers = {"Accept": "application/vnd.github.v3+json"}
    if token:
        headers["Authorization"] = f"token {token}"

    try:
        r = requests.get(url, params=params, headers=headers, timeout=12)
        if r.status_code != 200:
            return []
        return [
            {
                "source": "github_repo",
                "name": repo.get("full_name", ""),
                "url": repo.get("html_url", ""),
                "stars": repo.get("stargazers_count", 0),
                # `or ""` normalises explicit nulls, which .get() defaults do not cover.
                "description": repo.get("description") or "",
                "language": repo.get("language") or "",
                "updated": (repo.get("updated_at") or "")[:10],
            }
            for repo in r.json().get("items", [])
        ]
    except Exception:
        return []
195
+
196
+
197
+ # ── Web search (ddgs, no key) ──────────────────────────────────────────────────
198
+
199
def search_web(query: str, limit: int = 10) -> list[dict]:
    """Web search via ddgs. No API key. May rate-limit on heavy use.

    Returns one dict per hit with keys ``source``, ``title``, ``url`` and
    ``text``; ``[]`` when ddgs is unavailable or the search raises.
    """
    try:
        from ddgs import DDGS

        with DDGS() as client:
            hits = [
                {
                    "source": "web",
                    "title": hit.get("title", ""),
                    "url": hit.get("href", ""),
                    "text": hit.get("body", ""),
                }
                for hit in client.text(query, max_results=limit)
            ]
        return hits
    except Exception:
        return []
215
+
216
+
217
+ # ── Google Trends (no key) ─────────────────────────────────────────────────────
218
+
219
def get_trends(keyword: str) -> dict:
    """Google Trends via trendspyg. Fragile but free.

    Compares the mean of the last 8 data points with the 12-month mean:
    above 125% is "rising", below 75% is "declining", otherwise "stable".
    Returns ``{"trend": "no_data", ...}`` for an empty result and
    ``{"trend": "unavailable", ...}`` on any error.
    """
    # NOTE(review): assumes trendspyg exposes a pytrends-style TrendReq — confirm.
    try:
        from trendspyg import TrendReq

        client = TrendReq(hl="en-US", tz=360)
        client.build_payload([keyword], timeframe="today 12-m")
        interest = client.interest_over_time()
        if interest.empty:
            return {"trend": "no_data", "avg_interest": 0}

        avg = float(interest[keyword].mean())
        recent = float(interest[keyword].iloc[-8:].mean())
        if recent > avg * 1.25:
            direction = "rising"
        else:
            direction = "declining" if recent < avg * 0.75 else "stable"

        # Related queries are optional; failure here leaves the mapping empty.
        related = {}
        try:
            top = client.related_queries().get(keyword, {}).get("top")
            if top is not None and not top.empty:
                related = {row["query"]: row["value"] for _, row in top.head(5).iterrows()}
        except Exception:
            pass

        summary = {"trend": direction, "avg_interest": round(avg, 1)}
        summary["recent_interest"] = round(recent, 1)
        summary["related_queries"] = related
        return summary
    except Exception:
        return {"trend": "unavailable", "avg_interest": 0}
255
+
256
+
257
+ # ── Evidence Score ─────────────────────────────────────────────────────────────
258
+
259
def calculate_evidence_score(results: dict) -> dict:
    """
    Score the evidence quality collected across all sources.
    Returns score 0-100 + breakdown per source.

    ``results`` may contain the keys ``reddit``, ``hackernews``,
    ``hackernews_comment``, ``github``, ``competitors`` (lists) and ``trends``
    (dict with a ``trend`` label). Missing or ``None`` entries count as empty.

    Points per source, each capped: Reddit 1.5/item (max 25), HN 2.5/item
    (max 25), GitHub 2/item (max 20), competitors 3/item (max 20). Trend data
    adds 10 only when a trend label is present and is neither "unavailable"
    nor "no_data". The fractional total is truncated to an int.
    """
    reddit = results.get("reddit") or []
    hn = (results.get("hackernews") or []) + (results.get("hackernews_comment") or [])
    github = results.get("github") or []
    competitors = results.get("competitors") or []
    trends = results.get("trends") or {}

    # A missing label means no trend data was collected: it must not earn points.
    trend_label = trends.get("trend") or "unavailable"

    # Weighted scoring
    reddit_pts = min(len(reddit) * 1.5, 25)
    hn_pts = min(len(hn) * 2.5, 25)
    github_pts = min(len(github) * 2, 20)
    comp_pts = min(len(competitors) * 3, 20)
    trend_pts = 10 if trend_label not in ("unavailable", "no_data") else 0

    score = int(reddit_pts + hn_pts + github_pts + comp_pts + trend_pts)

    return {
        "evidence_score": min(score, 100),
        "breakdown": {
            "reddit_mentions": len(reddit),
            "hn_discussions": len(hn),
            "github_issues": len(github),
            "competitors_found": len(competitors),
            "trend_data": trend_label,
        },
    }
@@ -0,0 +1,119 @@
1
+ # Experiment Spec
2
+
3
+ Use this template when designing a specific validation experiment. Fill every section — vague experiments produce vague results.
4
+
5
+ ---
6
+
7
+ ## Experiment: [name]
8
+
9
+ **Idea being validated:** [one sentence]
10
+ **Hypothesis:** If [target customer] experiences [problem], then [% of them] will [take specific action] when shown [this offering].
11
+ **Type:** discovery / demand_signal / value_validation / willingness_to_pay
12
+
13
+ ---
14
+
15
+ ### Setup
16
+
17
+ **Duration:** [X days/weeks]
18
+ **Budget:** [€0 / €X]
19
+ **Effort:** [hours estimated]
20
+
21
+ **Who:** [exact target customer - be specific. "founders" is not specific. "B2B SaaS founders with 1-10 person team, pre-Series A" is specific.]
22
+
23
+ **Channel:** [where you'll find them]
24
+ - Primary: [Reddit / LinkedIn / Cold email / specific community]
25
+ - Backup: [if primary fails]
26
+
27
+ **What you're showing them:**
28
+ [Landing page URL / message template / demo link / mockup]
29
+
30
+ ---
31
+
32
+ ### Metrics
33
+
34
+ **Primary metric:** [one number]
35
+ **Target:** [specific threshold that determines pass/fail]
36
+
37
+ | Metric | How to measure | Pass threshold | Fail threshold |
38
+ |--------|---------------|----------------|----------------|
39
+ | [primary] | [tool/method] | [number] | [number] |
40
+ | [secondary] | [tool/method] | [number] | [number] |
41
+
42
+ **Data collection method:** [Google Analytics / Tally form / manual tracking / Stripe]
43
+
44
+ ---
45
+
46
+ ### Scripts and materials
47
+
48
+ **Outreach message (cold):**
49
+ ```
50
+ Subject: [subject line - no spam words]
51
+
52
+ [message body - short, no pitch, curiosity-based]
53
+ ```
54
+
55
+ **Interview opener:**
56
+ "I'm researching how [people like you] handle [problem area]. Not selling anything — want 15 minutes to understand your current process."
57
+
58
+ **Landing page headline:** [outcome they want] without [current pain]
59
+ **CTA text:** [Join waitlist / Get early access / Book a call]
60
+
61
+ ---
62
+
63
+ ### Mom Test checklist
64
+
65
+ Before running interviews, verify all questions pass the Mom Test:
66
+
67
+ - [ ] Questions ask about past behavior, not future intentions
68
+ - [ ] No hypotheticals ("would you use X?" is banned)
69
+ - [ ] No leading questions ("don't you find X frustrating?")
70
+ - [ ] No pitching during problem interviews
71
+ - [ ] Success criteria defined before starting (not after seeing results)
72
+
73
+ ---
74
+
75
+ ### Week-by-week plan
76
+
77
+ **Week 1:**
78
+ - [ ] [specific task]
79
+ - [ ] [specific task]
80
+
81
+ **Week 2:**
82
+ - [ ] [specific task]
83
+ - [ ] Review data against thresholds
84
+
85
+ ---
86
+
87
+ ### Results tracking
88
+
89
+ | Date | Channel | Contacts/Views | Actions | Conversion |
90
+ |------|---------|----------------|---------|------------|
91
+ | | | | | |
92
+
93
+ **Running total:**
94
+ - Primary metric: [X / target]
95
+ - Secondary: [X / target]
96
+
97
+ ---
98
+
99
+ ### Decision rules
100
+
101
+ **If primary metric >= target by [date]:** [proceed to next experiment / begin building]
102
+
103
+ **If primary metric < 50% of target by [date]:** [pivot message / pivot target customer / kill experiment]
104
+
105
+ **If qualitative signals contradict quantitative:** [investigate further — don't average them out]
106
+
107
+ ---
108
+
109
+ ### Post-experiment notes
110
+
111
+ **What worked:**
112
+
113
+ **What didn't:**
114
+
115
+ **Biggest surprise:**
116
+
117
+ **Quote from a customer that changed how I think:**
118
+
119
+ **Next experiment:** [or "begin building" if validated]