npm - @veyralabs/skills - Versions diffs - 0.4.1 → 0.5.1 - Mend

@veyralabs/skills 0.4.1 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

package/skills/venture-suite/venture-analyst/scripts/scraper.py ADDED Viewed

@@ -0,0 +1,194 @@
+"""
+Scrapling wrapper for venture-analyst.
+Handles competitor pages, pricing pages, and review sites.
+Falls back gracefully on blocked sites.
+"""
+import re
+from typing import Optional
def scrape_competitor(url: str) -> Optional[dict]:
    """
    Collect a competitor profile from ``url``.

    A plain ``scrapling.Fetcher`` request is attempted first. If that request
    (or parsing its result) raises, or ``_is_empty`` judges the fetched page
    empty, the work is handed to ``_scrape_stealthy``.

    Returns:
        The dict built by ``_parse_competitor_page``, or whatever
        ``_scrape_stealthy`` returns on the fallback path.
    """
    try:
        from scrapling import Fetcher

        fetcher = Fetcher(auto_match=False)
        response = fetcher.get(url, timeout=15, stealthy_headers=True)
        if not _is_empty(response):
            return _parse_competitor_page(url, response)
    except Exception:
        # Best-effort by design: a missing dependency, a network error or a
        # parse failure all lead to the stealthy fallback below.
        pass
    return _scrape_stealthy(url)
def _scrape_stealthy(url: str) -> Optional[dict]:
    """
    Fetch ``url`` with scrapling's ``StealthyFetcher`` and parse the result.

    Intended for JS-rendered or Cloudflare-protected sites where the basic
    fetcher comes back empty.

    Args:
        url: Page to fetch.

    Returns:
        The dict built by ``_parse_competitor_page``, or ``None`` when the
        page is empty or anything goes wrong (missing dependency, browser
        failure, timeout, parse error).
    """
    try:
        from scrapling import StealthyFetcher

        # StealthyFetcher exposes ``fetch`` rather than ``get``, and its
        # ``timeout`` is expressed in milliseconds (20_000 ms == 20 s).
        page = StealthyFetcher(auto_match=False).fetch(
            url, timeout=20_000, network_idle=True
        )
        if _is_empty(page):
            return None
        return _parse_competitor_page(url, page)
    except Exception:
        # Deliberately best-effort: callers treat None as "could not scrape".
        return None
def _is_empty(page) -> bool:
    """
    Report whether ``page`` carries too little text to be worth parsing.

    A page counts as empty when it is ``None``, when ``page.get_all_text()``
    raises, or when the returned text is shorter than 100 characters.
    """
    if page is not None:
        try:
            text_length = len(page.get_all_text())
        except Exception:
            # An object we cannot read text from is treated as empty.
            return True
        return text_length < 100
    return True
def _parse_competitor_page(url: str, page) -> dict:
    """
    Assemble the competitor profile for an already-fetched ``page``.

    Each field is delegated to its own extractor helper; ``url`` is echoed
    back unchanged so the caller can tell which site the profile belongs to.
    """
    profile = {"url": url}
    profile["title"] = _get_title(page)
    profile["tagline"] = _get_tagline(page)
    profile["description"] = _get_meta_description(page)
    profile["pricing"] = _get_pricing(page)
    profile["features"] = _get_features(page)
    profile["tech_stack"] = _get_tech_signals(page)
    return profile
def _get_title(page) -> str:
    """
    Return the text of the first ``<title>`` match, stripped and capped at
    120 characters; ``""`` when it cannot be read.
    """
    try:
        title_node = page.css("title").first
        raw_title = title_node.text
        return raw_title.strip()[:120]
    except Exception:
        return ""
def _get_tagline(page) -> str:
    """
    Find the page's main hero headline.

    Selectors are tried in order; the first one whose first match has
    stripped text strictly between 5 and 200 characters wins. Selectors that
    raise or match nothing are skipped. Returns ``""`` when none qualifies.
    """
    candidate_selectors = (
        "h1",
        "[class*='hero'] h1",
        "header h1",
        "[class*='headline']",
        "[class*='tagline']",
    )
    for selector in candidate_selectors:
        try:
            node = page.css(selector).first
            headline = node.text.strip() if node else None
            if headline is not None and 5 < len(headline) < 200:
                return headline
        except Exception:
            # A selector that blows up just means "try the next one".
            continue
    return ""
def _get_meta_description(page) -> str:
    """
    Return the ``content`` of the first ``<meta name="description">`` match,
    capped at 300 characters; ``""`` when the tag or attribute is missing.
    """
    try:
        meta_tag = page.css('meta[name="description"]').first
        content = meta_tag.attrs.get("content")
        return (content if content else "")[:300]
    except Exception:
        return ""
def _get_pricing(page) -> dict:
    """
    Extract pricing signals from the page's visible text.

    Args:
        page: Object exposing ``get_all_text()``.

    Returns:
        ``{}`` when the text cannot be read (or is not a string); otherwise a
        dict with:

        * ``prices`` - up to 6 numeric amounts found after a ``$``, ``€`` or
          ``£`` sign (the number only, as a string).
        * ``model_signals`` - up to 4 pricing-model keywords found in the text.
        * ``has_free_tier`` - True when any free-tier keyword was found.
    """
    try:
        text = page.get_all_text()
    except Exception:
        return {}
    if not isinstance(text, str):
        # Nothing sensible to scan; mirror the "unreadable page" result.
        return {}

    prices = re.findall(
        r'[\$€£]\s*(\d+(?:[.,]\d+)?)\s*(?:/\s*(?:mo|month|year|yr|user|seat))?',
        text,
        re.IGNORECASE,
    )
    model_keywords = [
        "free", "freemium", "free trial", "per user", "per month", "per year",
        "enterprise", "custom pricing", "contact us", "flat rate",
    ]
    free_tier_keywords = {"free", "freemium", "free trial"}

    # Lower-case once, and match on word boundaries so that words such as
    # "freelancers", "freedom" or "carefree" do not count as a "free" plan.
    lowered = text.lower()
    detected_model = [
        kw for kw in model_keywords
        if re.search(rf"\b{re.escape(kw)}\b", lowered)
    ]
    return {
        "prices": prices[:6],
        "model_signals": detected_model[:4],
        "has_free_tier": any(kw in free_tier_keywords for kw in detected_model),
    }
def _get_features(page) -> list[str]:
    """
    Collect feature-like snippets from list items and feature/benefit blocks.

    Only the first 30 matched nodes are inspected. A snippet is kept when its
    stripped text is strictly between 10 and 150 characters and does not start
    with footer boilerplate ("©", "Terms", "Privacy"). At most 12 snippets are
    returned; nodes whose text cannot be read are skipped.
    """
    collected: list[str] = []
    boilerplate_prefixes = ("©", "Terms", "Privacy")
    try:
        nodes = page.css("li, [class*='feature'], [class*='benefit']")[:30]
        for node in nodes:
            try:
                snippet = node.text.strip()
                if not 10 < len(snippet) < 150:
                    continue
                if snippet.startswith(boilerplate_prefixes):
                    continue
                collected.append(snippet)
            except Exception:
                # One unreadable node must not cost us the rest.
                continue
    except Exception:
        # Selector failure: keep whatever was gathered so far.
        pass
    return collected[:12]
def _get_tech_signals(page) -> list[str]:
    """
    Guess the tech stack by pattern-matching the page's string form.

    Every fingerprint regex is searched case-insensitively against
    ``str(page)`` (presumably the page's HTML - confirm against scrapling).
    Matching technology names are returned in fingerprint order; ``[]`` is
    returned when nothing matches or the page cannot be stringified.
    """
    fingerprints = (
        ("React", r"react"),
        ("Vue", r"vue\.js|vuejs"),
        ("Angular", r"angular"),
        ("Next.js", r"_next/"),
        ("Stripe", r"js\.stripe\.com"),
        ("Intercom", r"intercom"),
        ("Segment", r"segment\.com"),
        ("HubSpot", r"hubspot"),
        ("Webflow", r"webflow"),
        ("Shopify", r"shopify"),
    )
    detected = []
    try:
        markup = str(page)
        for label, regex in fingerprints:
            if re.search(regex, markup, re.IGNORECASE) is not None:
                detected.append(label)
    except Exception:
        pass
    return detected
def scrape_g2_reviews(product_url: str, max_pages: int = 2) -> list[dict]:
    """
    Scrape G2 reviews for a product, page by page.

    G2 is Cloudflare-protected, so scrapling's ``StealthyFetcher`` is used.

    Args:
        product_url: URL of the product's G2 reviews page.
        max_pages: Number of result pages to walk, starting at page 1.

    Returns:
        One dict per review with ``source``, ``pros``, ``cons``, ``rating``
        and ``title``. Scraping is best-effort: on any failure the reviews
        gathered so far (possibly none) are returned.
    """
    reviews = []
    try:
        from scrapling import StealthyFetcher

        fetcher = StealthyFetcher(auto_match=False)
        # Keep an existing query string intact when appending the page number.
        separator = "&" if "?" in product_url else "?"
        for page_num in range(1, max_pages + 1):
            url = f"{product_url}{separator}page={page_num}"
            # StealthyFetcher exposes ``fetch`` rather than ``get``, and its
            # ``timeout`` is expressed in milliseconds (25_000 ms == 25 s).
            page = fetcher.fetch(url, timeout=25_000, network_idle=True)
            if _is_empty(page):
                # An empty page means we ran out of results or got blocked.
                break
            for review in page.css("[itemprop='review'], .paper--white, [class*='review-card']"):
                try:
                    reviews.append({
                        "source": "g2",
                        "pros": _safe_text(review, "[class*='pros']"),
                        "cons": _safe_text(review, "[class*='cons']"),
                        "rating": _safe_attr(review, "[itemprop='ratingValue']", "content"),
                        "title": _safe_text(review, "h3, [class*='title']"),
                    })
                except Exception:
                    # Skip a malformed review card, keep the rest.
                    continue
    except Exception:
        # Deliberately best-effort: return what was collected before failing.
        pass
    return reviews
def _safe_text(parent, selector: str) -> str:
    """
    Return the stripped text of the first ``selector`` match under ``parent``,
    capped at 300 characters; ``""`` when there is no usable match.
    """
    try:
        node = parent.css(selector).first
        return node.text.strip()[:300]
    except Exception:
        return ""
def _safe_attr(parent, selector: str, attr: str) -> str:
    """
    Return attribute ``attr`` of the first ``selector`` match under
    ``parent``; ``""`` when the attribute is absent or the lookup fails.
    """
    try:
        attributes = parent.css(selector).first.attrs
        return attributes.get(attr, "")
    except Exception:
        return ""

package/skills/venture-suite/venture-analyst/scripts/sources.py ADDED Viewed

@@ -0,0 +1,288 @@
+"""
+Level 1 data sources for venture-analyst.
+Zero API keys required. Works immediately after install.
+"""
+import time
+import requests
+from typing import Optional
+REDDIT_UA = "venture-analyst/1.0 (open-source research tool; github.com/veyralabsgroup/veyraskills)"
+# ── HN Algolia (no auth, 10k req/hour) ────────────────────────────────────────
def search_hn(query: str, limit: int = 20) -> list[dict]:
    """
    Search Hacker News stories (incl. Ask HN / Show HN) via the Algolia API.

    Args:
        query: Free-text search query.
        limit: Maximum number of hits to request.

    Returns:
        One dict per hit with ``source``, ``title``, ``url``, ``points``,
        ``comments``, ``text``, ``author`` and ``date`` (``YYYY-MM-DD``).
        Returns ``[]`` on any request or decoding failure.
    """
    url = "https://hn.algolia.com/api/v1/search"
    params = {
        "query": query,
        "hitsPerPage": limit,
        "tags": "(story,ask_hn,show_hn)",
    }
    try:
        r = requests.get(url, params=params, timeout=10)
        r.raise_for_status()
        hits = r.json().get("hits", [])
        stories = []
        for h in hits:
            # Fields may be present with a null value; ``dict.get`` defaults
            # do not cover that, so normalise with ``or`` before slicing.
            story_text = h.get("story_text") or ""
            stories.append({
                "source": "hackernews",
                "title": h.get("title") or (story_text[:80] + "..."),
                "url": h.get("url") or f"https://news.ycombinator.com/item?id={h.get('objectID')}",
                "points": h.get("points") or 0,
                "comments": h.get("num_comments") or 0,
                "text": story_text[:400],
                "author": h.get("author") or "",
                "date": (h.get("created_at") or "")[:10],
            })
        return stories
    except Exception:
        # Best-effort source: a failed lookup yields no evidence, not a crash.
        return []
def search_hn_comments(query: str, min_points: int = 5, limit: int = 30) -> list[dict]:
    """
    Search Hacker News comments via the Algolia API.

    Only hits that carry a non-empty ``comment_text`` are kept; each result
    holds ``source``, ``text`` (first 500 chars), ``url``, ``points`` and
    ``author``. Returns ``[]`` on any request or decoding failure.

    NOTE(review): the request filters on ``points>{min_points}``; confirm that
    Algolia actually populates ``points`` for comment hits, otherwise this
    filter may exclude everything.
    """
    endpoint = "https://hn.algolia.com/api/v1/search"
    search_params = {
        "query": query,
        "hitsPerPage": limit,
        "tags": "comment",
        "numericFilters": f"points>{min_points}",
    }
    try:
        response = requests.get(endpoint, params=search_params, timeout=10)
        response.raise_for_status()
        comments = []
        for hit in response.json().get("hits", []):
            body = hit.get("comment_text")
            if not body:
                continue
            comments.append({
                "source": "hackernews_comment",
                "text": body[:500],
                "url": f"https://news.ycombinator.com/item?id={hit.get('objectID')}",
                "points": hit.get("points", 0),
                "author": hit.get("author", ""),
            })
        return comments
    except Exception:
        return []
+# ── Reddit (no auth, custom UA required) ──────────────────────────────────────
def search_reddit(
    query: str,
    subreddits: Optional[list[str]] = None,
    limit: int = 25,
    timeframe: str = "year",
) -> list[dict]:
    """
    Search Reddit through its public ``.json`` endpoints.

    Without ``subreddits`` a single site-wide search sorted by relevance is
    issued. With ``subreddits``, only the first four are queried (sorted by
    top, restricted to the subreddit), each asked for
    ``max(5, limit // number_of_subreddits)`` posts, pausing 1.2 s after every
    request. All requests send the module's custom User-Agent.

    Returns:
        The posts appended by ``_reddit_fetch``, in request order.
    """
    headers = {"User-Agent": REDDIT_UA}
    collected = []

    if not subreddits:
        site_wide_params = {"q": query, "sort": "relevance", "limit": limit, "t": timeframe}
        _reddit_fetch("https://www.reddit.com/search.json", site_wide_params, headers, collected)
        return collected

    targets = subreddits[:4]
    per_sub = max(5, limit // len(targets))
    for name in targets:
        sub_params = {"q": query, "sort": "top", "limit": per_sub, "t": timeframe, "restrict_sr": 1}
        _reddit_fetch(f"https://www.reddit.com/r/{name}/search.json", sub_params, headers, collected)
        # Space the requests out between subreddits.
        time.sleep(1.2)
    return collected
def _reddit_fetch(url: str, params: dict, headers: dict, results: list) -> None:
    """
    Run one Reddit search request and append its posts to ``results``.

    ``results`` is mutated in place. Non-200 responses are ignored, and any
    exception is swallowed so that one failed request leaves ``results`` with
    whatever had been appended up to that point.
    """
    try:
        response = requests.get(url, params=params, headers=headers, timeout=12)
        if response.status_code != 200:
            return
        listing = response.json().get("data", {}).get("children", [])
        for child in listing:
            post = child.get("data", {})
            record = {
                "source": "reddit",
                "title": post.get("title", ""),
                "url": f"https://reddit.com{post.get('permalink', '')}",
                "upvotes": post.get("score", 0),
                "comments": post.get("num_comments", 0),
                "text": (post.get("selftext") or "")[:500],
                "subreddit": post.get("subreddit", ""),
            }
            results.append(record)
    except Exception:
        pass
+# ── GitHub Issues (no auth, 60 req/hour) ──────────────────────────────────────
def search_github_issues(
    query: str,
    limit: int = 20,
    token: Optional[str] = None,
) -> list[dict]:
    """
    Search GitHub issues for pain points and feature requests.

    Results are sorted by reactions (descending) and capped at 30 per request.
    When ``token`` is given it is sent as an ``Authorization: token ...``
    header; otherwise the request is unauthenticated.

    Returns:
        One dict per issue with ``source``, ``title``, ``url``, ``reactions``,
        ``comments``, ``text`` (first 400 chars of the body), ``repo`` and
        ``state``. Returns ``[]`` for non-200 responses or on any failure.
    """
    request_headers = {"Accept": "application/vnd.github.v3+json"}
    if token:
        request_headers["Authorization"] = f"token {token}"
    query_params = {
        "q": f"{query} type:issue",
        "sort": "reactions",
        "order": "desc",
        "per_page": min(limit, 30),
    }
    try:
        response = requests.get(
            "https://api.github.com/search/issues",
            params=query_params,
            headers=request_headers,
            timeout=12,
        )
        if response.status_code == 200:
            issues = []
            for item in response.json().get("items", []):
                issues.append({
                    "source": "github",
                    "title": item.get("title", ""),
                    "url": item.get("html_url", ""),
                    "reactions": item.get("reactions", {}).get("total_count", 0),
                    "comments": item.get("comments", 0),
                    "text": (item.get("body") or "")[:400],
                    # The repository name is the last path segment of its API URL.
                    "repo": item.get("repository_url", "").split("/")[-1],
                    "state": item.get("state", ""),
                })
            return issues
        return []
    except Exception:
        return []
def search_github_repos(
    query: str,
    limit: int = 10,
    token: Optional[str] = None,
) -> list[dict]:
    """
    Find existing repos/tools in the space, most-starred first.

    Args:
        query: GitHub repository search query.
        limit: Desired number of results (capped at 20 per request).
        token: Optional GitHub token, sent as ``Authorization: token ...``.

    Returns:
        One dict per repository with ``source``, ``name``, ``url``, ``stars``,
        ``description``, ``language`` and ``updated`` (``YYYY-MM-DD``). Text
        fields are always strings, never ``None``. Returns ``[]`` for non-200
        responses or on any failure.
    """
    url = "https://api.github.com/search/repositories"
    params = {
        "q": query,
        "sort": "stars",
        "order": "desc",
        "per_page": min(limit, 20),
    }
    headers = {"Accept": "application/vnd.github.v3+json"}
    if token:
        headers["Authorization"] = f"token {token}"
    try:
        r = requests.get(url, params=params, headers=headers, timeout=12)
        if r.status_code != 200:
            return []
        return [
            {
                "source": "github_repo",
                "name": repo.get("full_name") or "",
                "url": repo.get("html_url") or "",
                "stars": repo.get("stargazers_count") or 0,
                # GitHub sends explicit nulls for repos without a description
                # or detected language; ``dict.get`` defaults do not apply to
                # those, so normalise with ``or``.
                "description": repo.get("description") or "",
                "language": repo.get("language") or "",
                "updated": (repo.get("updated_at") or "")[:10],
            }
            for repo in r.json().get("items", [])
        ]
    except Exception:
        # Best-effort source: a failed lookup yields no evidence, not a crash.
        return []
+# ── Web search (ddgs, no key) ──────────────────────────────────────────────────
def search_web(query: str, limit: int = 10) -> list[dict]:
    """
    Run a keyless web search through the ``ddgs`` package.

    Returns:
        Up to ``limit`` dicts with ``source``, ``title``, ``url`` and ``text``.
        Returns ``[]`` when ``ddgs`` is not installed or the search fails
        (for example because of rate limiting).
    """
    try:
        from ddgs import DDGS

        with DDGS() as client:
            return [
                {
                    "source": "web",
                    "title": hit.get("title", ""),
                    "url": hit.get("href", ""),
                    "text": hit.get("body", ""),
                }
                for hit in client.text(query, max_results=limit)
            ]
    except Exception:
        return []
+# ── Google Trends (no key) ─────────────────────────────────────────────────────
def get_trends(keyword: str) -> dict:
    """
    Summarise 12 months of Google Trends interest for ``keyword``.

    The mean of the last 8 data points is compared with the overall mean:
    more than 25% above is "rising", more than 25% below is "declining",
    anything else is "stable".

    Returns:
        * ``{"trend": "no_data", "avg_interest": 0}`` when the series is empty.
        * ``{"trend": "unavailable", "avg_interest": 0}`` when the library is
          missing or any step fails.
        * Otherwise ``trend``, ``avg_interest``, ``recent_interest`` and
          ``related_queries`` (top 5 related queries mapped to their value,
          ``{}`` when they cannot be fetched).

    NOTE(review): this imports ``TrendReq`` from ``trendspyg``; confirm that
    package really exports it (the name is pytrends' API), otherwise this
    function always reports "unavailable".
    """
    unavailable = {"trend": "unavailable", "avg_interest": 0}
    try:
        from trendspyg import TrendReq

        client = TrendReq(hl="en-US", tz=360)
        client.build_payload([keyword], timeframe="today 12-m")
        frame = client.interest_over_time()
        if frame.empty:
            return {"trend": "no_data", "avg_interest": 0}

        series = frame[keyword]
        avg = float(series.mean())
        recent = float(series.iloc[-8:].mean())
        if recent > avg * 1.25:
            direction = "rising"
        elif recent < avg * 0.75:
            direction = "declining"
        else:
            direction = "stable"

        related = {}
        try:
            top = client.related_queries().get(keyword, {}).get("top")
            if top is not None and not top.empty:
                related = dict(
                    (row["query"], row["value"]) for _, row in top.head(5).iterrows()
                )
        except Exception:
            # Related queries are a nice-to-have; keep the trend result.
            pass

        summary = {"trend": direction}
        summary["avg_interest"] = round(avg, 1)
        summary["recent_interest"] = round(recent, 1)
        summary["related_queries"] = related
        return summary
    except Exception:
        return unavailable
+# ── Evidence Score ─────────────────────────────────────────────────────────────
def calculate_evidence_score(results: dict) -> dict:
    """
    Score the evidence quality collected across all sources.

    Args:
        results: Mapping that may contain ``reddit``, ``hackernews``,
            ``hackernews_comment``, ``github`` and ``competitors`` (lists)
            and ``trends`` (dict with a ``trend`` key). Missing or ``None``
            entries count as empty.

    Returns:
        ``{"evidence_score": int 0-100, "breakdown": {...}}``. Points are
        capped per source (Reddit 25, HN 25, GitHub 20, competitors 20) and
        10 points are added only when real trend data is present.
    """
    reddit = results.get("reddit") or []
    hn = list(results.get("hackernews") or []) + list(results.get("hackernews_comment") or [])
    github = results.get("github") or []
    competitors = results.get("competitors") or []
    trends = results.get("trends") or {}

    # A missing trend label means "unavailable"; it must not earn points.
    trend_label = trends.get("trend") or "unavailable"

    # Weighted scoring, each source capped at its maximum contribution.
    reddit_pts = min(len(reddit) * 1.5, 25)
    hn_pts = min(len(hn) * 2.5, 25)
    github_pts = min(len(github) * 2, 20)
    comp_pts = min(len(competitors) * 3, 20)
    trend_pts = 10 if trend_label not in ("unavailable", "no_data") else 0

    score = int(reddit_pts + hn_pts + github_pts + comp_pts + trend_pts)
    return {
        "evidence_score": min(score, 100),
        "breakdown": {
            "reddit_mentions": len(reddit),
            "hn_discussions": len(hn),
            "github_issues": len(github),
            "competitors_found": len(competitors),
            "trend_data": trend_label,
        },
    }

package/skills/venture-suite/venture-analyst/templates/experiment-spec.md ADDED Viewed

@@ -0,0 +1,119 @@
+# Experiment Spec
+Use this template when designing a specific validation experiment. Fill every section — vague experiments produce vague results.
+---
+## Experiment: [name]
+**Idea being validated:** [one sentence]
+**Hypothesis:** If [target customer] experiences [problem], then [% of them] will [take specific action] when shown [this offering].
+**Type:** discovery / demand_signal / value_validation / willingness_to_pay
+---
+### Setup
+**Duration:** [X days/weeks]
+**Budget:** [€0 / €X]
+**Effort:** [hours estimated]
+**Who:** [exact target customer - be specific. "founders" is not specific. "B2B SaaS founders with 1-10 person team, pre-Series A" is specific.]
+**Channel:** [where you'll find them]
+- Primary: [Reddit / LinkedIn / Cold email / specific community]
+- Backup: [if primary fails]
+**What you're showing them:**
+[Landing page URL / message template / demo link / mockup]
+---
+### Metrics
+**Primary metric:** [one number]
+**Target:** [specific threshold that determines pass/fail]
+| Metric | How to measure | Pass threshold | Fail threshold |
+|--------|---------------|----------------|----------------|
+| [primary] | [tool/method] | [number] | [number] |
+| [secondary] | [tool/method] | [number] | [number] |
+**Data collection method:** [Google Analytics / Tally form / manual tracking / Stripe]
+---
+### Scripts and materials
+**Outreach message (cold):**
+```
+Subject: [subject line - no spam words]
+[message body - short, no pitch, curiosity-based]
+```
+**Interview opener:**
+"I'm researching how [people like you] handle [problem area]. Not selling anything — want 15 minutes to understand your current process."
+**Landing page headline:** [outcome they want] without [current pain]
+**CTA text:** [Join waitlist / Get early access / Book a call]
+---
+### Mom Test checklist
+Before running interviews, verify all questions pass the Mom Test:
+- [ ] Questions ask about past behavior, not future intentions
+- [ ] No hypotheticals ("would you use X?" is banned)
+- [ ] No leading questions ("don't you find X frustrating?")
+- [ ] No pitching during problem interviews
+- [ ] Success criteria defined before starting (not after seeing results)
+---
+### Week-by-week plan
+**Week 1:**
+- [ ] [specific task]
+- [ ] [specific task]
+**Week 2:**
+- [ ] [specific task]
+- [ ] Review data against thresholds
+---
+### Results tracking
+| Date | Channel | Contacts/Views | Actions | Conversion |
+|------|---------|----------------|---------|------------|
+|      |         |                |         |            |
+**Running total:**
+- Primary metric: [X / target]
+- Secondary: [X / target]
+---
+### Decision rules
+**If primary metric >= target by [date]:** [proceed to next experiment / begin building]
+**If primary metric < 50% of target by [date]:** [pivot message / pivot target customer / kill experiment]
+**If qualitative signals contradict quantitative:** [investigate further — don't average them out]
+---
+### Post-experiment notes
+**What worked:**
+**What didn't:**
+**Biggest surprise:**
+**Quote from a customer that changed how I think:**
+**Next experiment:** [or "begin building" if validated]