@opendirectory.dev/skills 0.1.36 → 0.1.38

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,710 @@
+ #!/usr/bin/env python3
+ """
+ map-your-market fetch script
+ Collects pain signals from Reddit, HN, GitHub Issues, and G2; trend direction
+ is approximated from HN post frequency (a Google Trends proxy).
+ No required API keys. GITHUB_TOKEN optional (improves rate limits).
+
+ Usage:
+   python3 scripts/fetch.py "developer observability" --competitors "Datadog,Grafana" --output /tmp/mym-raw.json
+   python3 scripts/fetch.py "B2B analytics" --output results.json --stdout
+   GITHUB_TOKEN=your_token python3 scripts/fetch.py "devops tooling" --competitors "New Relic,Datadog"
+ """
+
+ import argparse
+ import json
+ import os
+ import re
+ import ssl
+ import sys
+ import time
+ import urllib.error
+ import urllib.parse
+ import urllib.request
+ from datetime import datetime, timedelta, timezone
+ from html.parser import HTMLParser
+
+ # Unverified TLS context: certificate verification is disabled for every
+ # request below (ssl._create_unverified_context is a private stdlib API).
+ _ssl_ctx = ssl._create_unverified_context()
+
+ TODAY = datetime.now(timezone.utc)
+ GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "")
+
+ # Set to True by main() via --quiet; defined up front so the HTTP helpers
+ # below can consult it when printing progress to stderr.
+ quiet = False
+
+
+ # ---------------------------------------------------------------------------
+ # HTTP helper
+ # ---------------------------------------------------------------------------
+
+ def fetch_json(url, headers=None, timeout=20):
+     req = urllib.request.Request(url, headers=headers or {})
+     req.add_header("User-Agent", "map-your-market-skill/1.0")
+     try:
+         with urllib.request.urlopen(req, context=_ssl_ctx, timeout=timeout) as r:
+             return json.loads(r.read().decode("utf-8"))
+     except urllib.error.HTTPError as e:
+         if not quiet:
+             print(f" HTTP {e.code}: {url[:80]}", file=sys.stderr)
+         return None
+     except Exception as e:
+         if not quiet:
+             print(f" Error: {e} -- {url[:80]}", file=sys.stderr)
+         return None
+
+
+ def fetch_html(url, timeout=20):
+     req = urllib.request.Request(url)
+     req.add_header("User-Agent", "Mozilla/5.0 (compatible; map-your-market/1.0)")
+     req.add_header("Accept", "text/html,application/xhtml+xml")
+     try:
+         with urllib.request.urlopen(req, context=_ssl_ctx, timeout=timeout) as r:
+             return r.read().decode("utf-8", errors="replace")
+     except Exception as e:
+         if not quiet:
+             print(f" HTML fetch error: {e} -- {url[:80]}", file=sys.stderr)
+         return ""
+
+
+ def gh_get(path):
+     headers = {"Accept": "application/vnd.github+json"}
+     if GITHUB_TOKEN:
+         headers["Authorization"] = f"Bearer {GITHUB_TOKEN}"
+     return fetch_json(f"https://api.github.com{path}", headers=headers)
+
+
+ # ---------------------------------------------------------------------------
+ # Subreddit detection
+ # ---------------------------------------------------------------------------
+
+ SUBREDDIT_MAP = {
+     "devops": ["devops", "sysadmin", "aws", "kubernetes", "docker"],
+     "observability": ["devops", "sysadmin", "dataengineering", "CloudArchitects"],
+     "monitoring": ["devops", "sysadmin", "networking", "aws"],
+     "analytics": ["analytics", "dataengineering", "datascience", "BusinessIntelligence"],
+     "b2b": ["startups", "entrepreneur", "SaaS", "smallbusiness"],
+     "saas": ["SaaS", "startups", "entrepreneur", "microsaas"],
+     "developer": ["programming", "webdev", "ExperiencedDevs", "devops"],
+     "developer tools": ["programming", "webdev", "devops", "ExperiencedDevs"],
+     "api": ["webdev", "programming", "devops", "node"],
+     "security": ["netsec", "cybersecurity", "devops", "sysadmin"],
+     "data": ["dataengineering", "datascience", "analytics", "BusinessIntelligence"],
+     "database": ["dataengineering", "Database", "PostgreSQL", "learnprogramming"],
+     "auth": ["webdev", "programming", "netsec", "node"],
+     "payments": ["webdev", "programming", "entrepreneur", "ecommerce"],
+     "ecommerce": ["ecommerce", "entrepreneur", "shopify", "startups"],
+     "marketing": ["marketing", "digital_marketing", "entrepreneur", "startups"],
+     "crm": ["sales", "salesforce", "entrepreneur", "smallbusiness"],
+     "sales": ["sales", "entrepreneur", "startups", "smallbusiness"],
+     "hr": ["humanresources", "remotework", "startups", "smallbusiness"],
+     "finance": ["personalfinance", "accounting", "startups", "smallbusiness"],
+     "healthcare": ["healthIT", "medicine", "startups", "technology"],
+     "ai": ["MachineLearning", "artificial", "ChatGPT", "learnmachinelearning"],
+     "ml": ["MachineLearning", "learnmachinelearning", "datascience", "artificial"],
+     "llm": ["MachineLearning", "artificial", "ChatGPT", "LocalLLaMA"],
+ }
+
+ FALLBACK_SUBREDDITS = ["programming", "webdev", "technology", "startups", "entrepreneur"]
+
+
+ def detect_subreddits(category: str, competitors: list) -> list:
+     subs = set()
+     cat_lower = category.lower()
+
+     for keyword, subreddit_list in SUBREDDIT_MAP.items():
+         if keyword in cat_lower:
+             subs.update(subreddit_list)
+
+     # Also infer from competitor names
+     for comp in competitors:
+         comp_lower = comp.lower()
+         if any(w in comp_lower for w in ["data", "log", "metric", "monitor", "trace"]):
+             subs.update(SUBREDDIT_MAP.get("observability", []))
+         if any(w in comp_lower for w in ["pay", "stripe", "billing"]):
+             subs.update(SUBREDDIT_MAP.get("payments", []))
+         if any(w in comp_lower for w in ["db", "sql", "postgres", "mongo"]):
+             subs.update(SUBREDDIT_MAP.get("database", []))
+
+     if not subs:
+         subs.update(FALLBACK_SUBREDDITS)
+
+     # Always include a broad signal subreddit
+     subs.add("technology")
+     # Sorted before truncation so the 8-subreddit cap is deterministic
+     # (set iteration order is not).
+     return sorted(subs)[:8]
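+ # Illustrative example (deterministic given the sorted cap above):
+ # detect_subreddits("developer observability", ["Datadog"]) unions the
+ # "developer" and "observability" keyword lists (plus the observability list
+ # again, since "Datadog" contains "data"), adds "technology", and returns
+ # ["CloudArchitects", "ExperiencedDevs", "dataengineering", "devops",
+ #  "programming", "sysadmin", "technology", "webdev"].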
+
+
+ # ---------------------------------------------------------------------------
+ # Pain scoring
+ # ---------------------------------------------------------------------------
+
+ def compute_pain_score(source: str, score_val: int, comments: int, created_at: str) -> float:
+     try:
+         if created_at:
+             dt = datetime.fromisoformat(created_at.replace("Z", "+00:00"))
+             days_old = (TODAY - dt).days
+         else:
+             days_old = 180
+     except Exception:
+         days_old = 180
+
+     if days_old < 30:
+         recency = 1.0
+     elif days_old < 90:
+         recency = 0.85
+     elif days_old < 180:
+         recency = 0.7
+     else:
+         recency = 0.5
+
+     if source == "github_issue":
+         base = score_val * 3  # reactions -- most deliberate signal
+     elif source == "reddit":
+         # Cap Reddit base at 500 to prevent viral off-topic posts dominating
+         base = min(score_val, 500) + comments * 0.3
+     else:  # hn
+         base = score_val + comments * 0.3
+
+     return round(base * recency, 1)
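+ # Worked example: a 20-day-old Reddit post with score 800 and 40 comments has
+ # base = min(800, 500) + 40 * 0.3 = 512 and recency 1.0, so pain_score = 512.0;
+ # at 100 days old the recency multiplier drops to 0.7, giving 358.4.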
+
+
+ # ---------------------------------------------------------------------------
+ # Reddit
+ # ---------------------------------------------------------------------------
+
+ def build_reddit_queries(category: str, competitors: list) -> list:
+     queries = [category]
+     # Add competitor names as queries to find complaints
+     for comp in competitors[:3]:
+         queries.append(comp)
+     # Add pain-oriented variants
+     cat_words = category.split()[:2]
+     if cat_words:
+         queries.append(" ".join(cat_words) + " alternative")
+         queries.append(" ".join(cat_words) + " problem")
+     return list(dict.fromkeys(queries))  # deduplicate preserving order
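+ # Example: build_reddit_queries("developer observability", ["Datadog", "Grafana"])
+ # -> ["developer observability", "Datadog", "Grafana",
+ #     "developer observability alternative", "developer observability problem"]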
+
+
+ def search_reddit(queries: list, subreddits: list, time_filter: str = "year") -> list:
+     results = []
+     seen_ids = set()
+
+     def parse_posts(data):
+         posts = []
+         if not data or "data" not in data:
+             return posts
+         for child in data["data"].get("children", []):
+             p = child.get("data", {})
+             post_id = p.get("id", "")
+             if not post_id or post_id in seen_ids:
+                 continue
+             seen_ids.add(post_id)
+             score_val = p.get("score", 0)
+             num_comments = p.get("num_comments", 0)
+             created = datetime.fromtimestamp(p.get("created_utc", 0), tz=timezone.utc).isoformat()
+             body = (p.get("selftext", "") or "")[:500]
+             posts.append({
+                 "id": post_id,
+                 "source": "reddit",
+                 "title": p.get("title", ""),
+                 "body_excerpt": body,
+                 "pain_score": compute_pain_score("reddit", score_val, num_comments, created),
+                 "url": f"https://www.reddit.com{p.get('permalink', '')}",
+                 "subreddit": p.get("subreddit", ""),
+                 "score": score_val,
+                 "comments": num_comments,
+                 "created_at": created,
+                 "matched_query": "",
+             })
+         return posts
+
+     def is_relevant(post: dict, query: str) -> bool:
+         """Require query words to appear in title or body for basic relevance."""
+         query_words = [w.lower() for w in query.split() if len(w) > 3]
+         if not query_words:
+             return True
+         text = (post.get("title", "") + " " + post.get("body_excerpt", "")).lower()
+         return any(w in text for w in query_words)
+
+     # Subreddit-specific search only (more relevant than global search)
+     for sub in subreddits[:6]:
+         for query in queries[:3]:  # top 3 queries per subreddit
+             encoded = urllib.parse.quote_plus(query)
+             url = f"https://www.reddit.com/r/{sub}/search.json?q={encoded}&sort=top&t={time_filter}&restrict_sr=true&limit=25"
+             if not quiet:
+                 print(f" Reddit r/{sub}: {query!r}", file=sys.stderr)
+             data = fetch_json(url, headers={"User-Agent": "map-your-market/1.0"})
+             posts = parse_posts(data)
+             for p in posts:
+                 p["matched_query"] = query
+             # Relevance filter: skip posts where query words don't appear in title/body
+             relevant = [p for p in posts if is_relevant(p, query)]
+             results.extend(relevant)
+             time.sleep(2)
+
+     # Filter noise: min pain score 2.0
+     results = [r for r in results if r["pain_score"] >= 2.0]
+     return results
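+ # Note: reddit.com's unauthenticated search.json endpoint is rate-limited and
+ # can answer script user agents with HTTP 429; the 2-second sleep between
+ # requests and the subreddit/query caps above keep request volume modest.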
+
+
+ # ---------------------------------------------------------------------------
+ # Hacker News (Algolia API)
+ # ---------------------------------------------------------------------------
+
+ def search_hn(queries: list, days_back: int = 365) -> list:
+     results = []
+     seen_ids = set()
+     cutoff_ts = int((TODAY - timedelta(days=days_back)).timestamp())
+
+     for query in queries:
+         encoded = urllib.parse.quote_plus(query)
+         # Search stories
+         url = f"https://hn.algolia.com/api/v1/search?query={encoded}&tags=story&numericFilters=created_at_i>{cutoff_ts}&hitsPerPage=50"
+         if not quiet:
+             print(f" HN stories: {query!r}", file=sys.stderr)
+         data = fetch_json(url)
+         if data:
+             for hit in data.get("hits", []):
+                 obj_id = hit.get("objectID", "")
+                 if not obj_id or obj_id in seen_ids:
+                     continue
+                 seen_ids.add(obj_id)
+                 points = hit.get("points") or 0
+                 num_comments = hit.get("num_comments") or 0
+                 if points < 3:
+                     continue  # noise floor
+                 created = hit.get("created_at", "")
+                 results.append({
+                     "id": obj_id,
+                     "source": "hn",
+                     "title": hit.get("title", ""),
+                     "body_excerpt": (hit.get("story_text") or "")[:400],
+                     "pain_score": compute_pain_score("hn", points, num_comments, created),
+                     "url": hit.get("url") or f"https://news.ycombinator.com/item?id={obj_id}",
+                     "subreddit": "",
+                     "score": points,
+                     "comments": num_comments,
+                     "created_at": created,
+                     "matched_query": query,
+                 })
+         time.sleep(1)
+
+         # Search HN comments (tags=comment matches all comments, not just Ask HN threads)
+         url = f"https://hn.algolia.com/api/v1/search?query={encoded}&tags=comment&numericFilters=created_at_i>{cutoff_ts}&hitsPerPage=30"
+         data = fetch_json(url)
+         if data:
+             for hit in data.get("hits", []):
+                 obj_id = hit.get("objectID", "")
+                 if not obj_id or obj_id in seen_ids:
+                     continue
+                 seen_ids.add(obj_id)
+                 text = (hit.get("comment_text") or "")[:400]
+                 if not text or len(text) < 50:
+                     continue
+                 created = hit.get("created_at", "")
+                 results.append({
+                     "id": obj_id,
+                     "source": "hn",
+                     "title": f"HN comment: {text[:80]}...",
+                     "body_excerpt": text,
+                     "pain_score": compute_pain_score("hn", 5, 0, created),  # comments get a flat low base score
+                     "url": f"https://news.ycombinator.com/item?id={obj_id}",
+                     "subreddit": "",
+                     "score": 5,
+                     "comments": 0,
+                     "created_at": created,
+                     "matched_query": query,
+                 })
+         time.sleep(1)
+
+     return results
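+ # The Algolia-hosted HN Search API (hn.algolia.com/api/v1) is public and
+ # keyless; the created_at_i numericFilters restrict hits to the lookback
+ # window, and hitsPerPage bounds each response.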
+
+
+ # ---------------------------------------------------------------------------
+ # GitHub Issues
+ # ---------------------------------------------------------------------------
+
+ def search_github_issues(competitors: list, category: str) -> list:
+     results = []
+     seen = set()
+
+     # Pain keywords that tend to mark complaint-style issues.
+     # NOTE: currently unused -- issues below are selected by reaction count instead.
+     pain_terms = ["not working", "problem", "issue", "broken", "pricing", "slow", "alternative", "migrate", "annoying", "hate"]
+
+     for comp in competitors[:4]:
+         # Try to find the GitHub repo for this competitor
+         encoded = urllib.parse.quote_plus(comp)
+         search_url = f"/search/repositories?q={encoded}&sort=stars&per_page=3"
+         data = gh_get(search_url)
+         time.sleep(0.5)
+         if not data or "items" not in data:
+             continue
+         for repo in data["items"][:2]:
+             full_name = repo.get("full_name", "")
+             if not full_name:
+                 continue
+             if not quiet:
+                 print(f" GitHub issues: {full_name}", file=sys.stderr)
+             # Fetch top issues by reactions
+             issues_url = f"/repos/{full_name}/issues?state=open&sort=reactions&direction=desc&per_page=50"
+             issues = gh_get(issues_url)
+             time.sleep(0.5)
+             if not issues or not isinstance(issues, list):
+                 continue
+             for issue in issues:
+                 if "pull_request" in issue:
+                     continue  # skip PRs (the issues endpoint returns both)
+                 reactions = issue.get("reactions", {}).get("+1", 0) or issue.get("reactions", {}).get("total_count", 0) or 0
+                 if reactions < 2:
+                     continue  # noise floor
+                 issue_id = str(issue.get("id", ""))
+                 if issue_id in seen:
+                     continue
+                 seen.add(issue_id)
+                 body = (issue.get("body") or "")[:500]
+                 created = issue.get("created_at", "")
+                 results.append({
+                     "id": issue_id,
+                     "source": "github_issue",
+                     "title": issue.get("title", ""),
+                     "body_excerpt": body,
+                     "pain_score": compute_pain_score("github_issue", reactions, issue.get("comments", 0), created),
+                     "url": issue.get("html_url", ""),
+                     "subreddit": f"github/{full_name}",
+                     "score": reactions,
+                     "comments": issue.get("comments", 0),
+                     "created_at": created,
+                     "matched_query": comp,
+                 })
+
+     # Also do category-based GitHub issue search
+     if category:
+         encoded = urllib.parse.quote_plus(f"{category} is:issue is:open")
+         search_url = f"/search/issues?q={encoded}&sort=reactions&order=desc&per_page=30"
+         data = gh_get(search_url)
+         time.sleep(0.5)
+         if data and "items" in data:
+             for issue in data["items"]:
+                 issue_id = str(issue.get("id", ""))
+                 if issue_id in seen:
+                     continue
+                 seen.add(issue_id)
+                 reactions = issue.get("reactions", {}).get("+1", 0) or 0
+                 if reactions < 2:
+                     continue
+                 body = (issue.get("body") or "")[:500]
+                 created = issue.get("created_at", "")
+                 results.append({
+                     "id": issue_id,
+                     "source": "github_issue",
+                     "title": issue.get("title", ""),
+                     "body_excerpt": body,
+                     "pain_score": compute_pain_score("github_issue", reactions, issue.get("comments", 0), created),
+                     "url": issue.get("html_url", ""),
+                     "subreddit": "github/search",
+                     "score": reactions,
+                     "comments": issue.get("comments", 0),
+                     "created_at": created,
+                     "matched_query": category,
+                 })
+
+     return results
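+ # Rate-limit note: unauthenticated GitHub API calls are capped at 60
+ # requests/hour (the search endpoints lower still), so setting GITHUB_TOKEN
+ # is strongly recommended when scanning several competitors.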
+
+
+ # ---------------------------------------------------------------------------
+ # G2 scraper
+ # ---------------------------------------------------------------------------
+
+ class G2Parser(HTMLParser):
+     """HTMLParser-based vendor extractor for G2 listing pages.
+
+     NOTE: currently unused -- scrape_g2_category() below relies on regex
+     extraction instead.
+     """
+
+     def __init__(self):
+         super().__init__()
+         self.vendors = []
+         self._in_product = False
+         self._current = {}
+         self._capture_name = False
+
+     def handle_starttag(self, tag, attrs):
+         attrs = dict(attrs)
+         cls = attrs.get("class", "")
+         if "product-listing" in cls or "product-card" in cls:
+             self._in_product = True
+             self._current = {}
+         if self._in_product and tag == "a" and "product-listing__product-name" in cls:
+             self._capture_name = True
+
+     def handle_endtag(self, tag):
+         if tag == "div" and self._in_product and self._current.get("name"):
+             self.vendors.append(dict(self._current))
+             self._in_product = False
+         if self._capture_name:
+             self._capture_name = False
+
+     def handle_data(self, data):
+         if self._capture_name and data.strip():
+             self._current["name"] = data.strip()
+
+
+ def scrape_g2_category(category: str) -> dict:
+     slug = re.sub(r"[^a-z0-9]+", "-", category.lower()).strip("-")
+     urls_to_try = [
+         f"https://www.g2.com/categories/{slug}",
+         f"https://www.g2.com/software/{slug}/",
+     ]
+     html = ""
+     used_url = urls_to_try[0]
+     for url in urls_to_try:
+         if not quiet:
+             print(f" G2: {url}", file=sys.stderr)
+         html = fetch_html(url)
+         if html:
+             used_url = url
+             break
+         time.sleep(1)
+
+     if not html:
+         # Fallback: search DuckDuckGo for G2 category to get vendor names
+         if not quiet:
+             print(" G2 blocked -- trying DuckDuckGo fallback", file=sys.stderr)
+         ddg_url = f"https://html.duckduckgo.com/html/?q=site:g2.com+{urllib.parse.quote(category)}+software+reviews"
+         html = fetch_html(ddg_url)
+         if html:
+             # Extract product names from DDG results
+             ddg_names = re.findall(r'g2\.com/products/([a-z0-9-]+)/reviews', html)
+             vendors = [{"name": n.replace("-", " ").title()} for n in list(dict.fromkeys(ddg_names))[:10]]
+             return {"vendor_count_g2": len(vendors), "top_vendors": vendors, "g2_url": "via DuckDuckGo search"}
+         return {"vendor_count_g2": 0, "top_vendors": [], "g2_url": used_url}
+
+     # Extract vendor count
+     vendor_count = 0
+     count_match = re.search(r"(\d[\d,]+)\s+(?:products|software|tools|solutions)", html, re.IGNORECASE)
+     if count_match:
+         vendor_count = int(count_match.group(1).replace(",", ""))
+
+     # Extract product names
+     name_matches = re.findall(r'data-product-name="([^"]+)"', html)
+     if not name_matches:
+         name_matches = re.findall(r'class="product-listing__product-name[^"]*">([^<]+)<', html)
+     if not name_matches:
+         name_matches = re.findall(r'"name"\s*:\s*"([^"]{3,50})"', html)
+
+     rating_matches = re.findall(r'"ratingValue"\s*:\s*"?([\d.]+)"?', html)
+     review_matches = re.findall(r'"reviewCount"\s*:\s*"?(\d+)"?', html)
+
+     top_vendors = []
+     # Ratings/review counts are paired with names by position, so they can
+     # misalign if the page orders them differently; treat as approximate.
+     for i, name in enumerate(name_matches[:10]):
+         vendor = {"name": name.strip()}
+         if i < len(rating_matches):
+             vendor["rating"] = rating_matches[i]
+         if i < len(review_matches):
+             vendor["review_count"] = int(review_matches[i])
+         top_vendors.append(vendor)
+
+     if vendor_count == 0 and top_vendors:
+         vendor_count = len(top_vendors)
+
+     return {"vendor_count_g2": vendor_count, "top_vendors": top_vendors, "g2_url": used_url}
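+ # Illustrative return value (values invented for shape only):
+ # {"vendor_count_g2": 312,
+ #  "top_vendors": [{"name": "Datadog", "rating": "4.3", "review_count": 410}, ...],
+ #  "g2_url": "https://www.g2.com/categories/..."}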
+
+
+ # ---------------------------------------------------------------------------
+ # Trend direction (HN post-frequency proxy for Google Trends)
+ # ---------------------------------------------------------------------------
+
+ def get_trends_direction(keyword: str) -> dict:
+     """Infer trend direction from HN post frequency as a proxy when Google Trends is unavailable."""
+     if not quiet:
+         print(f" Trends (via HN frequency): {keyword!r}", file=sys.stderr)
+     # Compare HN post counts: older 6 months vs recent 6 months
+     try:
+         cutoff_old = int((TODAY - timedelta(days=365)).timestamp())
+         cutoff_mid = int((TODAY - timedelta(days=180)).timestamp())
+         encoded = urllib.parse.quote_plus(keyword)
+
+         url_old = (f"https://hn.algolia.com/api/v1/search?query={encoded}"
+                    f"&tags=story&numericFilters=created_at_i>{cutoff_old},created_at_i<{cutoff_mid}&hitsPerPage=1")
+         url_new = (f"https://hn.algolia.com/api/v1/search?query={encoded}"
+                    f"&tags=story&numericFilters=created_at_i>{cutoff_mid}&hitsPerPage=1")
+
+         data_old = fetch_json(url_old)
+         time.sleep(0.5)
+         data_new = fetch_json(url_new)
+
+         count_old = data_old.get("nbHits", 0) if data_old else 0
+         count_new = data_new.get("nbHits", 0) if data_new else 0
+
+         if count_old == 0 and count_new == 0:
+             return {"trends_direction": "unknown", "trends_12mo": [], "trends_note": "insufficient HN data"}
+
+         if count_old == 0:
+             direction = "up"
+         elif count_new > count_old * 1.2:
+             direction = "up"
+         elif count_new < count_old * 0.8:
+             direction = "down"
+         else:
+             direction = "flat"
+
+         return {
+             "trends_direction": direction,
+             "trends_12mo": [],
+             "trends_note": f"HN posts: {count_old} (6-12mo ago) vs {count_new} (last 6mo)",
+         }
+     except Exception:
+         return {"trends_direction": "unknown", "trends_12mo": [], "trends_note": "lookup failed"}
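+ # Worked example of the direction thresholds: 55 recent hits vs 40 older hits
+ # gives 55 > 40 * 1.2 -> "up"; 28 vs 30 sits inside the 0.8x-1.2x band -> "flat".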
+
+
+ # ---------------------------------------------------------------------------
+ # ICP signals extractor
+ # ---------------------------------------------------------------------------
+
+ def extract_icp_signals(reddit_results: list) -> list:
+     sub_counts = {}
+     sub_scores = {}
+     for p in reddit_results:
+         sub = p.get("subreddit", "")
+         if sub:
+             sub_counts[sub] = sub_counts.get(sub, 0) + 1
+             sub_scores[sub] = sub_scores.get(sub, 0) + p.get("pain_score", 0)
+
+     signals = []
+     for sub, count in sorted(sub_counts.items(), key=lambda x: -x[1]):
+         avg_score = sub_scores[sub] / count if count > 0 else 0
+         signals.append({
+             "subreddit": sub,
+             "post_count": count,
+             "avg_pain_score": round(avg_score, 1),
+             "total_pain_score": round(sub_scores[sub], 1),
+         })
+     return signals
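+ # Example output entry (illustrative values):
+ # {"subreddit": "devops", "post_count": 12, "avg_pain_score": 41.7,
+ #  "total_pain_score": 500.1}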
+
+
+ # ---------------------------------------------------------------------------
+ # Main pipeline
+ # ---------------------------------------------------------------------------
+
+ def main():
+     global quiet
+
+     parser = argparse.ArgumentParser(description="Fetch market pain signals for map-your-market skill")
+     parser.add_argument("category", help="Market category keywords (e.g. 'developer observability')")
+     parser.add_argument("--competitors", "-c", default="", help="Comma-separated competitor names")
+     parser.add_argument("--context", default="", help="Product context for output")
+     parser.add_argument("--output", "-o", default=None, help="Output JSON file path")
+     parser.add_argument("--stdout", action="store_true", help="Print JSON to stdout")
+     parser.add_argument("--quiet", "-q", action="store_true", help="Suppress progress output")
+     args = parser.parse_args()
+
+     quiet = args.quiet
+
+     if not args.output and not args.stdout:
+         slug = re.sub(r"[^a-z0-9]+", "-", args.category.lower()).strip("-")
+         args.output = f"market-map-{slug}-{TODAY.strftime('%Y-%m-%d')}.json"
+
+     competitors = [c.strip() for c in args.competitors.split(",") if c.strip()] if args.competitors else []
+
+     if not quiet:
+         print(f"Mapping market: {args.category!r}", file=sys.stderr)
+         print(f"Competitors: {competitors or 'none'}", file=sys.stderr)
+
+     # Detect subreddits
+     subreddits = detect_subreddits(args.category, competitors)
+     if not quiet:
+         print(f"Subreddits: {subreddits}", file=sys.stderr)
+
+     # Build search queries
+     queries = build_reddit_queries(args.category, competitors)
+     if not quiet:
+         print(f"Queries: {queries}", file=sys.stderr)
+
+     # --- Reddit ---
+     if not quiet:
+         print("\n[1/5] Reddit...", file=sys.stderr)
+     reddit_results = search_reddit(queries, subreddits)
+     if not quiet:
+         print(f" Found {len(reddit_results)} Reddit signals", file=sys.stderr)
+
+     # --- HN ---
+     if not quiet:
+         print("\n[2/5] Hacker News...", file=sys.stderr)
+     hn_queries = [args.category] + competitors[:2]
+     hn_results = search_hn(hn_queries)
+     if not quiet:
+         print(f" Found {len(hn_results)} HN signals", file=sys.stderr)
+
+     # --- GitHub Issues ---
+     if not quiet:
+         print("\n[3/5] GitHub Issues...", file=sys.stderr)
+     github_results = search_github_issues(competitors, args.category)
+     if not quiet:
+         print(f" Found {len(github_results)} GitHub issue signals", file=sys.stderr)
+
+     # --- G2 ---
+     if not quiet:
+         print("\n[4/5] G2...", file=sys.stderr)
+     g2_data = scrape_g2_category(args.category)
+     if not quiet:
+         print(f" G2 vendors: {g2_data['vendor_count_g2']}", file=sys.stderr)
+
+     # --- Trend direction ---
+     if not quiet:
+         print("\n[5/5] Trend direction (HN proxy for Google Trends)...", file=sys.stderr)
+     trends = get_trends_direction(args.category)
+     if not quiet:
+         print(f" Trends direction: {trends['trends_direction']}", file=sys.stderr)
+
+     # --- Combine and score ---
+     all_pains = reddit_results + hn_results + github_results
+     all_pains.sort(key=lambda x: x["pain_score"], reverse=True)
+
+     # ICP signals
+     icp_signals = extract_icp_signals(reddit_results)
+
+     # Build summary
+     top20 = all_pains[:20]
+     total = len(all_pains)
+
+     # Competitor mention counts
+     competitor_mentioned = {}
+     for comp in competitors:
+         count = sum(1 for p in all_pains if comp.lower() in (p["title"] + " " + p["body_excerpt"]).lower())
+         if count > 0:
+             competitor_mentioned[comp] = count
+
+     output_data = {
+         "date": TODAY.strftime("%Y-%m-%d"),
+         "category": args.category,
+         "competitors": competitors,
+         "product_context": args.context,
+         "subreddits_searched": subreddits,
+         "queries_used": queries,
+         "market_signals": {
+             "vendor_count_g2": g2_data["vendor_count_g2"],
+             "top_vendors": g2_data["top_vendors"],
+             "g2_url": g2_data["g2_url"],
+             "trends_direction": trends["trends_direction"],
+             "trends_12mo": trends["trends_12mo"],
+             "hn_signals_found": len(hn_results),
+             "reddit_signals_found": len(reddit_results),
+             "github_issue_signals": len(github_results),
+         },
+         "raw_pains": all_pains,
+         "icp_signals": icp_signals,
+         "summary": {
+             "total_pain_signals": total,
+             "high_signal": top20,
+             "competitor_mentioned": competitor_mentioned,
+         },
+     }
+
+     if not quiet:
+         print(f"\nTotal signals: {total}", file=sys.stderr)
+         print(f"Top pain_score: {all_pains[0]['pain_score'] if all_pains else 0}", file=sys.stderr)
+
+     if args.stdout:
+         print(json.dumps(output_data, indent=2))
+     else:
+         with open(args.output, "w") as f:
+             json.dump(output_data, f, indent=2)
+         if not quiet:
+             print(f"Output: {args.output}", file=sys.stderr)
+
+
+ if __name__ == "__main__":
+     main()
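+
+ # Output JSON top-level keys: date, category, competitors, product_context,
+ # subreddits_searched, queries_used, market_signals, raw_pains, icp_signals,
+ # and summary (total_pain_signals, high_signal top-20, competitor_mentioned).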