@opendirectory.dev/skills 0.1.37 → 0.1.38
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/registry.json +8 -0
- package/skills/map-your-market/.env.example +3 -0
- package/skills/map-your-market/README.md +147 -0
- package/skills/map-your-market/SKILL.md +469 -0
- package/skills/map-your-market/evals/evals.json +102 -0
- package/skills/map-your-market/references/icp-signals.md +90 -0
- package/skills/map-your-market/references/pain-scoring.md +85 -0
- package/skills/map-your-market/references/subreddit-map.md +58 -0
- package/skills/map-your-market/scripts/fetch.py +710 -0
@@ -0,0 +1,710 @@
#!/usr/bin/env python3
"""
map-your-market fetch script

Collects pain signals from Reddit, HN, GitHub Issues, G2, and a trend proxy
(HN post frequency standing in for Google Trends).
No API keys required. GITHUB_TOKEN is optional (improves GitHub rate limits).

Usage:
    python3 scripts/fetch.py "developer observability" --competitors "Datadog,Grafana" --output /tmp/mym-raw.json
    python3 scripts/fetch.py "B2B analytics" --output results.json --stdout
    GITHUB_TOKEN=your_token python3 scripts/fetch.py "devops tooling" --competitors "New Relic,Datadog"
"""

import argparse
import json
import os
import re
import ssl
import sys
import time
import urllib.error
import urllib.parse
import urllib.request
from datetime import datetime, timedelta, timezone
from html.parser import HTMLParser

# NOTE: this disables TLS certificate verification (via a private ssl helper)
# so fetches still work behind intercepting proxies. Acceptable for read-only
# public data; do not reuse for anything sensitive.
_ssl_ctx = ssl._create_unverified_context()

TODAY = datetime.now(timezone.utc)
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "")

# Progress-output flag, overwritten from --quiet in main(). Defined up front so
# the HTTP helpers below can reference it safely.
quiet = False

# ---------------------------------------------------------------------------
# HTTP helper
# ---------------------------------------------------------------------------

def fetch_json(url, headers=None, timeout=20):
    req = urllib.request.Request(url, headers=headers or {})
    req.add_header("User-Agent", "map-your-market-skill/1.0")
    try:
        with urllib.request.urlopen(req, context=_ssl_ctx, timeout=timeout) as r:
            return json.loads(r.read().decode("utf-8"))
    except urllib.error.HTTPError as e:
        if not quiet:
            print(f" HTTP {e.code}: {url[:80]}", file=sys.stderr)
        return None
    except Exception as e:
        if not quiet:
            print(f" Error: {e} -- {url[:80]}", file=sys.stderr)
        return None


def fetch_html(url, timeout=20):
    req = urllib.request.Request(url)
    req.add_header("User-Agent", "Mozilla/5.0 (compatible; map-your-market/1.0)")
    req.add_header("Accept", "text/html,application/xhtml+xml")
    try:
        with urllib.request.urlopen(req, context=_ssl_ctx, timeout=timeout) as r:
            return r.read().decode("utf-8", errors="replace")
    except Exception as e:
        if not quiet:
            print(f" HTML fetch error: {e} -- {url[:80]}", file=sys.stderr)
        return ""


def gh_get(path):
    headers = {"Accept": "application/vnd.github+json"}
    if GITHUB_TOKEN:
        headers["Authorization"] = f"Bearer {GITHUB_TOKEN}"
    return fetch_json(f"https://api.github.com{path}", headers=headers)
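
# Usage sketch (illustrative; the repo path is hypothetical):
#   gh_get("/repos/grafana/grafana") -> parsed JSON dict on success.
# Both fetch helpers swallow failures and return None / "" so every source
# downstream can be treated as best-effort.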

# ---------------------------------------------------------------------------
# Subreddit detection
# ---------------------------------------------------------------------------

SUBREDDIT_MAP = {
    "devops": ["devops", "sysadmin", "aws", "kubernetes", "docker"],
    "observability": ["devops", "sysadmin", "dataengineering", "CloudArchitects"],
    "monitoring": ["devops", "sysadmin", "networking", "aws"],
    "analytics": ["analytics", "dataengineering", "datascience", "BusinessIntelligence"],
    "b2b": ["startups", "entrepreneur", "SaaS", "smallbusiness"],
    "saas": ["SaaS", "startups", "entrepreneur", "microsaas"],
    "developer": ["programming", "webdev", "ExperiencedDevs", "devops"],
    "developer tools": ["programming", "webdev", "devops", "ExperiencedDevs"],
    "api": ["webdev", "programming", "devops", "node"],
    "security": ["netsec", "cybersecurity", "devops", "sysadmin"],
    "data": ["dataengineering", "datascience", "analytics", "BusinessIntelligence"],
    "database": ["dataengineering", "Database", "PostgreSQL", "learnprogramming"],
    "auth": ["webdev", "programming", "netsec", "node"],
    "payments": ["webdev", "programming", "entrepreneur", "ecommerce"],
    "ecommerce": ["ecommerce", "entrepreneur", "shopify", "startups"],
    "marketing": ["marketing", "digital_marketing", "entrepreneur", "startups"],
    "crm": ["sales", "salesforce", "entrepreneur", "smallbusiness"],
    "sales": ["sales", "entrepreneur", "startups", "smallbusiness"],
    "hr": ["humanresources", "remotework", "startups", "smallbusiness"],
    "finance": ["personalfinance", "accounting", "startups", "smallbusiness"],
    "healthcare": ["healthIT", "medicine", "startups", "technology"],
    "ai": ["MachineLearning", "artificial", "ChatGPT", "learnmachinelearning"],
    "ml": ["MachineLearning", "learnmachinelearning", "datascience", "artificial"],
    "llm": ["MachineLearning", "artificial", "ChatGPT", "LocalLLaMA"],
}

FALLBACK_SUBREDDITS = ["programming", "webdev", "technology", "startups", "entrepreneur"]


def detect_subreddits(category: str, competitors: list) -> list:
    subs = set()
    cat_lower = category.lower()

    for keyword, subreddit_list in SUBREDDIT_MAP.items():
        if keyword in cat_lower:
            subs.update(subreddit_list)

    # Also infer from competitor names
    for comp in competitors:
        comp_lower = comp.lower()
        if any(w in comp_lower for w in ["data", "log", "metric", "monitor", "trace"]):
            subs.update(SUBREDDIT_MAP.get("observability", []))
        if any(w in comp_lower for w in ["pay", "stripe", "billing"]):
            subs.update(SUBREDDIT_MAP.get("payments", []))
        if any(w in comp_lower for w in ["db", "sql", "postgres", "mongo"]):
            subs.update(SUBREDDIT_MAP.get("database", []))

    if not subs:
        subs.update(FALLBACK_SUBREDDITS)

    # Always include a broad signal subreddit
    subs.add("technology")
    return list(subs)[:8]
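
# Example (illustrative): detect_subreddits("developer observability", ["Datadog"])
# matches the "observability" and "developer" keywords, and "Datadog" trips the
# "data" competitor rule, so the result is up to 8 subs drawn from those lists
# plus the always-added "technology". Because a set is used, the truncation to
# 8 picks an arbitrary subset, not a ranked one.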

# ---------------------------------------------------------------------------
# Pain scoring
# ---------------------------------------------------------------------------

def compute_pain_score(source: str, score_val: int, comments: int, created_at: str) -> float:
    try:
        if created_at:
            dt = datetime.fromisoformat(created_at.replace("Z", "+00:00"))
            days_old = (TODAY - dt).days
        else:
            days_old = 180
    except Exception:
        days_old = 180

    if days_old < 30:
        recency = 1.0
    elif days_old < 90:
        recency = 0.85
    elif days_old < 180:
        recency = 0.7
    else:
        recency = 0.5

    if source == "github_issue":
        base = score_val * 3  # reactions -- most deliberate signal
    elif source == "reddit":
        # Cap Reddit base at 500 to prevent viral off-topic posts dominating
        base = min(score_val, 500) + comments * 0.3
    else:  # hn
        base = score_val + comments * 0.3

    return round(base * recency, 1)
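
# Worked example (illustrative numbers): a 200-day-old Reddit post with 800
# upvotes and 40 comments -> base = min(800, 500) + 40 * 0.3 = 512.0, recency
# 0.5, pain_score 256.0. A week-old GitHub issue with 12 reactions ->
# 12 * 3 * 1.0 = 36.0.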

# ---------------------------------------------------------------------------
# Reddit
# ---------------------------------------------------------------------------

def build_reddit_queries(category: str, competitors: list) -> list:
    queries = [category]
    # Add competitor names as queries to find complaints
    for comp in competitors[:3]:
        queries.append(comp)
    # Add pain-oriented variants
    cat_words = category.split()[:2]
    if cat_words:
        queries.append(" ".join(cat_words) + " alternative")
        queries.append(" ".join(cat_words) + " problem")
    return list(dict.fromkeys(queries))  # deduplicate preserving order
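
# Example (illustrative): build_reddit_queries("developer observability",
# ["Datadog", "Grafana"]) -> ["developer observability", "Datadog", "Grafana",
# "developer observability alternative", "developer observability problem"]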

def search_reddit(queries: list, subreddits: list, time_filter: str = "year") -> list:
    results = []
    seen_ids = set()

    def parse_posts(data):
        posts = []
        if not data or "data" not in data:
            return posts
        for child in data["data"].get("children", []):
            p = child.get("data", {})
            post_id = p.get("id", "")
            if not post_id or post_id in seen_ids:
                continue
            seen_ids.add(post_id)
            score_val = p.get("score", 0)
            num_comments = p.get("num_comments", 0)
            created = datetime.fromtimestamp(p.get("created_utc", 0), tz=timezone.utc).isoformat()
            body = (p.get("selftext", "") or "")[:500]
            posts.append({
                "id": post_id,
                "source": "reddit",
                "title": p.get("title", ""),
                "body_excerpt": body,
                "pain_score": compute_pain_score("reddit", score_val, num_comments, created),
                "url": f"https://www.reddit.com{p.get('permalink', '')}",
                "subreddit": p.get("subreddit", ""),
                "score": score_val,
                "comments": num_comments,
                "created_at": created,
                "matched_query": "",
            })
        return posts

    def is_relevant(post: dict, query: str) -> bool:
        """Require query words to appear in title or body for basic relevance."""
        query_words = [w.lower() for w in query.split() if len(w) > 3]
        if not query_words:
            return True
        text = (post.get("title", "") + " " + post.get("body_excerpt", "")).lower()
        return any(w in text for w in query_words)

    # Subreddit-specific search only (more relevant than global search)
    for sub in subreddits[:6]:
        for query in queries[:3]:  # top 3 queries per subreddit
            encoded = urllib.parse.quote_plus(query)
            url = f"https://www.reddit.com/r/{sub}/search.json?q={encoded}&sort=top&t={time_filter}&restrict_sr=true&limit=25"
            if not quiet:
                print(f" Reddit r/{sub}: {query!r}", file=sys.stderr)
            data = fetch_json(url, headers={"User-Agent": "map-your-market/1.0"})
            posts = parse_posts(data)
            for p in posts:
                p["matched_query"] = query
            # Relevance filter: skip posts where query words don't appear in title/body
            relevant = [p for p in posts if is_relevant(p, query)]
            results.extend(relevant)
            time.sleep(2)

    # Filter noise: min pain score 2.0
    results = [r for r in results if r["pain_score"] >= 2.0]
    return results
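
# Note (assumption): unauthenticated reddit.com *.json endpoints are throttled
# aggressively, hence the 2s sleep. A full run is bounded at 6 subreddits x
# 3 queries = 18 requests, roughly 36 seconds of deliberate pacing.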

# ---------------------------------------------------------------------------
# Hacker News (Algolia API)
# ---------------------------------------------------------------------------

def search_hn(queries: list, days_back: int = 365) -> list:
    results = []
    seen_ids = set()
    cutoff_ts = int((TODAY - timedelta(days=days_back)).timestamp())

    for query in queries:
        encoded = urllib.parse.quote_plus(query)
        # Search stories
        url = f"https://hn.algolia.com/api/v1/search?query={encoded}&tags=story&numericFilters=created_at_i>{cutoff_ts}&hitsPerPage=50"
        if not quiet:
            print(f" HN stories: {query!r}", file=sys.stderr)
        data = fetch_json(url)
        if data:
            for hit in data.get("hits", []):
                obj_id = hit.get("objectID", "")
                if not obj_id or obj_id in seen_ids:
                    continue
                seen_ids.add(obj_id)
                points = hit.get("points") or 0
                num_comments = hit.get("num_comments") or 0
                if points < 3:
                    continue  # noise floor
                created = hit.get("created_at", "")
                results.append({
                    "id": obj_id,
                    "source": "hn",
                    "title": hit.get("title", ""),
                    "body_excerpt": (hit.get("story_text") or "")[:400],
                    "pain_score": compute_pain_score("hn", points, num_comments, created),
                    "url": hit.get("url") or f"https://news.ycombinator.com/item?id={obj_id}",
                    "subreddit": "",
                    "score": points,
                    "comments": num_comments,
                    "created_at": created,
                    "matched_query": query,
                })
        time.sleep(1)

        # Search comments (tags=comment covers all comments, not only Ask HN)
        url = f"https://hn.algolia.com/api/v1/search?query={encoded}&tags=comment&numericFilters=created_at_i>{cutoff_ts}&hitsPerPage=30"
        data = fetch_json(url)
        if data:
            for hit in data.get("hits", []):
                obj_id = hit.get("objectID", "")
                if not obj_id or obj_id in seen_ids:
                    continue
                seen_ids.add(obj_id)
                text = (hit.get("comment_text") or "")[:400]
                if not text or len(text) < 50:
                    continue
                created = hit.get("created_at", "")
                results.append({
                    "id": obj_id,
                    "source": "hn",
                    "title": f"HN comment: {text[:80]}...",
                    "body_excerpt": text,
                    "pain_score": compute_pain_score("hn", 5, 0, created),  # comments get a low fixed base
                    "url": f"https://news.ycombinator.com/item?id={obj_id}",
                    "subreddit": "",
                    "score": 5,
                    "comments": 0,
                    "created_at": created,
                    "matched_query": query,
                })
        time.sleep(1)

    return results
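
# Note: Algolia's HN search API (hn.algolia.com/api/v1/search) is public and
# unauthenticated; the numericFilters clause on created_at_i restricts both
# passes to the trailing days_back window.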

# ---------------------------------------------------------------------------
# GitHub Issues
# ---------------------------------------------------------------------------

def search_github_issues(competitors: list, category: str) -> list:
    results = []
    seen = set()

    # Pain keywords for issue filtering (currently unused in the logic below)
    pain_terms = ["not working", "problem", "issue", "broken", "pricing", "slow", "alternative", "migrate", "annoying", "hate"]

    for comp in competitors[:4]:
        # Try to find the GitHub repo for this competitor
        encoded = urllib.parse.quote_plus(comp)
        search_url = f"/search/repositories?q={encoded}&sort=stars&per_page=3"
        data = gh_get(search_url)
        time.sleep(0.5)
        if not data or "items" not in data:
            continue
        for repo in data["items"][:2]:
            full_name = repo.get("full_name", "")
            if not full_name:
                continue
            if not quiet:
                print(f" GitHub issues: {full_name}", file=sys.stderr)
            # Fetch top issues by reactions
            issues_url = f"/repos/{full_name}/issues?state=open&sort=reactions&direction=desc&per_page=50"
            issues = gh_get(issues_url)
            time.sleep(0.5)
            if not issues or not isinstance(issues, list):
                continue
            for issue in issues:
                if "pull_request" in issue:
                    continue  # skip PRs
                reactions = issue.get("reactions", {}).get("+1", 0) or issue.get("reactions", {}).get("total_count", 0) or 0
                if reactions < 2:
                    continue  # noise floor
                issue_id = str(issue.get("id", ""))
                if issue_id in seen:
                    continue
                seen.add(issue_id)
                body = (issue.get("body") or "")[:500]
                created = issue.get("created_at", "")
                results.append({
                    "id": issue_id,
                    "source": "github_issue",
                    "title": issue.get("title", ""),
                    "body_excerpt": body,
                    "pain_score": compute_pain_score("github_issue", reactions, issue.get("comments", 0), created),
                    "url": issue.get("html_url", ""),
                    "subreddit": f"github/{full_name}",
                    "score": reactions,
                    "comments": issue.get("comments", 0),
                    "created_at": created,
                    "matched_query": comp,
                })

    # Also do category-based GitHub issue search
    if category:
        encoded = urllib.parse.quote_plus(f"{category} is:issue is:open")
        search_url = f"/search/issues?q={encoded}&sort=reactions&order=desc&per_page=30"
        data = gh_get(search_url)
        time.sleep(0.5)
        if data and "items" in data:
            for issue in data["items"]:
                issue_id = str(issue.get("id", ""))
                if issue_id in seen:
                    continue
                seen.add(issue_id)
                reactions = issue.get("reactions", {}).get("+1", 0) or 0
                if reactions < 2:
                    continue
                body = (issue.get("body") or "")[:500]
                created = issue.get("created_at", "")
                results.append({
                    "id": issue_id,
                    "source": "github_issue",
                    "title": issue.get("title", ""),
                    "body_excerpt": body,
                    "pain_score": compute_pain_score("github_issue", reactions, issue.get("comments", 0), created),
                    "url": issue.get("html_url", ""),
                    "subreddit": "github/search",
                    "score": reactions,
                    "comments": issue.get("comments", 0),
                    "created_at": created,
                    "matched_query": category,
                })

    return results
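
# Example (hypothetical): for competitor "Grafana", the stars-sorted repo
# search would typically surface grafana/grafana first; its open issues,
# sorted by reactions, then feed compute_pain_score with the +1 reaction
# count as the base signal.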

# ---------------------------------------------------------------------------
# G2 scraper
# ---------------------------------------------------------------------------

class G2Parser(HTMLParser):
    """DOM-based vendor extractor. Currently unused -- scrape_g2_category
    relies on the regex patterns below instead."""

    def __init__(self):
        super().__init__()
        self.vendors = []
        self._in_product = False
        self._current = {}
        self._capture_name = False

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        cls = attrs.get("class", "")
        if "product-listing" in cls or "product-card" in cls:
            self._in_product = True
            self._current = {}
        if self._in_product and tag == "a" and "product-listing__product-name" in cls:
            self._capture_name = True

    def handle_endtag(self, tag):
        if tag == "div" and self._in_product and self._current.get("name"):
            self.vendors.append(dict(self._current))
            self._in_product = False
        if self._capture_name:
            self._capture_name = False

    def handle_data(self, data):
        if self._capture_name and data.strip():
            self._current["name"] = data.strip()


def scrape_g2_category(category: str) -> dict:
    slug = re.sub(r"[^a-z0-9]+", "-", category.lower()).strip("-")
    urls_to_try = [
        f"https://www.g2.com/categories/{slug}",
        f"https://www.g2.com/software/{slug}/",
    ]
    html = ""
    used_url = urls_to_try[0]
    for url in urls_to_try:
        if not quiet:
            print(f" G2: {url}", file=sys.stderr)
        html = fetch_html(url)
        if html:
            used_url = url
            break
        time.sleep(1)

    if not html:
        # Fallback: search DuckDuckGo for the G2 category to get vendor names
        if not quiet:
            print(" G2 blocked -- trying DuckDuckGo fallback", file=sys.stderr)
        ddg_url = f"https://html.duckduckgo.com/html/?q=site:g2.com+{urllib.parse.quote(category)}+software+reviews"
        html = fetch_html(ddg_url)
        if html:
            # Extract product names from DDG results
            ddg_names = re.findall(r'g2\.com/products/([a-z0-9-]+)/reviews', html)
            vendors = [{"name": n.replace("-", " ").title()} for n in list(dict.fromkeys(ddg_names))[:10]]
            return {"vendor_count_g2": len(vendors), "top_vendors": vendors, "g2_url": "via DuckDuckGo search"}
        return {"vendor_count_g2": 0, "top_vendors": [], "g2_url": used_url}

    # Extract vendor count
    vendor_count = 0
    count_match = re.search(r"(\d[\d,]+)\s+(?:products|software|tools|solutions)", html, re.IGNORECASE)
    if count_match:
        vendor_count = int(count_match.group(1).replace(",", ""))

    # Extract product names
    name_matches = re.findall(r'data-product-name="([^"]+)"', html)
    if not name_matches:
        name_matches = re.findall(r'class="product-listing__product-name[^"]*">([^<]+)<', html)
    if not name_matches:
        name_matches = re.findall(r'"name"\s*:\s*"([^"]{3,50})"', html)

    rating_matches = re.findall(r'"ratingValue"\s*:\s*"?([\d.]+)"?', html)
    review_matches = re.findall(r'"reviewCount"\s*:\s*"?(\d+)"?', html)

    top_vendors = []
    for i, name in enumerate(name_matches[:10]):
        vendor = {"name": name.strip()}
        if i < len(rating_matches):
            vendor["rating"] = rating_matches[i]
        if i < len(review_matches):
            vendor["review_count"] = int(review_matches[i])
        top_vendors.append(vendor)

    if vendor_count == 0 and top_vendors:
        vendor_count = len(top_vendors)

    return {"vendor_count_g2": vendor_count, "top_vendors": top_vendors, "g2_url": used_url}

# ---------------------------------------------------------------------------
# Trend direction (HN post frequency as a Google Trends proxy)
# ---------------------------------------------------------------------------

def get_trends_direction(keyword: str) -> dict:
    """Infer trend direction from HN post frequency as a proxy when Google Trends is unavailable."""
    if not quiet:
        print(f" Trends (via HN frequency): {keyword!r}", file=sys.stderr)
    # Compare HN post counts: older 6 months vs recent 6 months
    try:
        cutoff_old = int((TODAY - timedelta(days=365)).timestamp())
        cutoff_mid = int((TODAY - timedelta(days=180)).timestamp())
        encoded = urllib.parse.quote_plus(keyword)

        url_old = (f"https://hn.algolia.com/api/v1/search?query={encoded}"
                   f"&tags=story&numericFilters=created_at_i>{cutoff_old},created_at_i<{cutoff_mid}&hitsPerPage=1")
        url_new = (f"https://hn.algolia.com/api/v1/search?query={encoded}"
                   f"&tags=story&numericFilters=created_at_i>{cutoff_mid}&hitsPerPage=1")

        data_old = fetch_json(url_old)
        time.sleep(0.5)
        data_new = fetch_json(url_new)

        count_old = data_old.get("nbHits", 0) if data_old else 0
        count_new = data_new.get("nbHits", 0) if data_new else 0

        if count_old == 0 and count_new == 0:
            return {"trends_direction": "unknown", "trends_12mo": [], "trends_note": "insufficient HN data"}

        if count_old == 0:
            direction = "up"
        elif count_new > count_old * 1.2:
            direction = "up"
        elif count_new < count_old * 0.8:
            direction = "down"
        else:
            direction = "flat"

        return {
            "trends_direction": direction,
            "trends_12mo": [],
            "trends_note": f"HN posts: {count_old} (6-12mo ago) vs {count_new} (last 6mo)",
        }
    except Exception:
        return {"trends_direction": "unknown", "trends_12mo": []}

# ---------------------------------------------------------------------------
# ICP signals extractor
# ---------------------------------------------------------------------------

def extract_icp_signals(reddit_results: list) -> list:
    sub_counts = {}
    sub_scores = {}
    for p in reddit_results:
        sub = p.get("subreddit", "")
        if sub:
            sub_counts[sub] = sub_counts.get(sub, 0) + 1
            sub_scores[sub] = sub_scores.get(sub, 0) + p.get("pain_score", 0)

    signals = []
    for sub, count in sorted(sub_counts.items(), key=lambda x: -x[1]):
        avg_score = sub_scores[sub] / count if count > 0 else 0
        signals.append({
            "subreddit": sub,
            "post_count": count,
            "avg_pain_score": round(avg_score, 1),
            "total_pain_score": round(sub_scores[sub], 1),
        })
    return signals
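
# Example (illustrative): three r/devops posts with pain scores 10, 20, 30 ->
# {"subreddit": "devops", "post_count": 3, "avg_pain_score": 20.0,
#  "total_pain_score": 60.0}; signals are ordered by post_count, descending.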

# ---------------------------------------------------------------------------
# Main pipeline
# ---------------------------------------------------------------------------

def main():
    global quiet

    parser = argparse.ArgumentParser(description="Fetch market pain signals for map-your-market skill")
    parser.add_argument("category", help="Market category keywords (e.g. 'developer observability')")
    parser.add_argument("--competitors", "-c", default="", help="Comma-separated competitor names")
    parser.add_argument("--context", default="", help="Product context for output")
    parser.add_argument("--output", "-o", default=None, help="Output JSON file path")
    parser.add_argument("--stdout", action="store_true", help="Print JSON to stdout")
    parser.add_argument("--quiet", "-q", action="store_true", help="Suppress progress output")
    args = parser.parse_args()

    quiet = args.quiet

    if not args.output and not args.stdout:
        slug = re.sub(r"[^a-z0-9]+", "-", args.category.lower()).strip("-")
        args.output = f"market-map-{slug}-{TODAY.strftime('%Y-%m-%d')}.json"

    competitors = [c.strip() for c in args.competitors.split(",") if c.strip()]

    if not quiet:
        print(f"Mapping market: {args.category!r}", file=sys.stderr)
        print(f"Competitors: {competitors or 'none'}", file=sys.stderr)

    # Detect subreddits
    subreddits = detect_subreddits(args.category, competitors)
    if not quiet:
        print(f"Subreddits: {subreddits}", file=sys.stderr)

    # Build search queries
    queries = build_reddit_queries(args.category, competitors)
    if not quiet:
        print(f"Queries: {queries}", file=sys.stderr)

    # --- Reddit ---
    if not quiet:
        print("\n[1/5] Reddit...", file=sys.stderr)
    reddit_results = search_reddit(queries, subreddits)
    if not quiet:
        print(f" Found {len(reddit_results)} Reddit signals", file=sys.stderr)

    # --- HN ---
    if not quiet:
        print("\n[2/5] Hacker News...", file=sys.stderr)
    hn_queries = [args.category] + competitors[:2]
    hn_results = search_hn(hn_queries)
    if not quiet:
        print(f" Found {len(hn_results)} HN signals", file=sys.stderr)

    # --- GitHub Issues ---
    if not quiet:
        print("\n[3/5] GitHub Issues...", file=sys.stderr)
    github_results = search_github_issues(competitors, args.category)
    if not quiet:
        print(f" Found {len(github_results)} GitHub issue signals", file=sys.stderr)

    # --- G2 ---
    if not quiet:
        print("\n[4/5] G2...", file=sys.stderr)
    g2_data = scrape_g2_category(args.category)
    if not quiet:
        print(f" G2 vendors: {g2_data['vendor_count_g2']}", file=sys.stderr)

    # --- Trends ---
    if not quiet:
        print("\n[5/5] Trend direction...", file=sys.stderr)
    trends = get_trends_direction(args.category)
    if not quiet:
        print(f" Trends direction: {trends['trends_direction']}", file=sys.stderr)

    # --- Combine and score ---
    all_pains = reddit_results + hn_results + github_results
    all_pains.sort(key=lambda x: x["pain_score"], reverse=True)

    # ICP signals
    icp_signals = extract_icp_signals(reddit_results)

    # Build summary
    top20 = all_pains[:20]
    total = len(all_pains)

    # Competitor mention counts
    competitor_mentioned = {}
    for comp in competitors:
        count = sum(1 for p in all_pains if comp.lower() in (p["title"] + " " + p["body_excerpt"]).lower())
        if count > 0:
            competitor_mentioned[comp] = count

    output_data = {
        "date": TODAY.strftime("%Y-%m-%d"),
        "category": args.category,
        "competitors": competitors,
        "product_context": args.context,
        "subreddits_searched": subreddits,
        "queries_used": queries,
        "market_signals": {
            "vendor_count_g2": g2_data["vendor_count_g2"],
            "top_vendors": g2_data["top_vendors"],
            "g2_url": g2_data["g2_url"],
            "trends_direction": trends["trends_direction"],
            "trends_12mo": trends["trends_12mo"],
            "hn_signals_found": len(hn_results),
            "reddit_signals_found": len(reddit_results),
            "github_issue_signals": len(github_results),
        },
        "raw_pains": all_pains,
        "icp_signals": icp_signals,
        "summary": {
            "total_pain_signals": total,
            "high_signal": top20,
            "competitor_mentioned": competitor_mentioned,
        },
    }

    if not quiet:
        print(f"\nTotal signals: {total}", file=sys.stderr)
        print(f"Top pain_score: {all_pains[0]['pain_score'] if all_pains else 0}", file=sys.stderr)

    if args.stdout:
        print(json.dumps(output_data, indent=2))
    else:
        with open(args.output, "w") as f:
            json.dump(output_data, f, indent=2)
        if not quiet:
            print(f"Output: {args.output}", file=sys.stderr)


if __name__ == "__main__":
    main()
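
# Output sketch (abridged): {"date": ..., "category": ..., "market_signals":
# {...}, "raw_pains": [sorted by pain_score, descending], "icp_signals": [...],
# "summary": {"total_pain_signals": N, "high_signal": top 20, ...}}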