@opendirectory.dev/skills 0.1.38 → 0.1.39

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,810 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ where-your-customer-lives fetch script
4
+ Discovers the channels where your ICP gathers, using a signal-trace pass plus a competitor layer.
5
+ No API keys are required. GITHUB_TOKEN is optional (improves rate limits for the competitor layer).
6
+
7
+ Usage:
8
+ python3 scripts/fetch.py "startup gtm" --icp-role "technical co-founders" --icp-pain "customer acquisition"
9
+ python3 scripts/fetch.py "devops tools" --competitors "Datadog,Grafana" --output /tmp/wcl-raw.json
10
+ GITHUB_TOKEN=your_token python3 scripts/fetch.py "B2B sales" --competitors "Clay,Apollo" --stdout
11
+ """
12
+
13
+ import argparse
14
+ import json
15
+ import math
16
+ import os
17
+ import re
18
+ import ssl
19
+ import sys
20
+ import time
21
+ import urllib.error
22
+ import urllib.parse
23
+ import urllib.request
24
+ from datetime import datetime, timedelta, timezone
25
+
26
+ _ssl_ctx = ssl._create_unverified_context()  # disables TLS certificate verification for all requests (private CPython helper)
27
+
28
+ TODAY = datetime.now(timezone.utc)
29
+ GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "")
30
+ quiet = False
31
+
32
+
33
+ # ---------------------------------------------------------------------------
34
+ # HTTP helpers (verbatim from map-your-market)
35
+ # ---------------------------------------------------------------------------
36
+
37
+ def fetch_json(url, headers=None, timeout=20):
38
+ req = urllib.request.Request(url, headers=headers or {})
39
+ req.add_header("User-Agent", "where-your-customer-lives/1.0")
40
+ try:
41
+ with urllib.request.urlopen(req, context=_ssl_ctx, timeout=timeout) as r:
42
+ return json.loads(r.read().decode("utf-8"))
43
+ except urllib.error.HTTPError as e:
44
+ if not quiet:
45
+ print(f" HTTP {e.code}: {url[:80]}", file=sys.stderr)
46
+ return None
47
+ except Exception as e:
48
+ if not quiet:
49
+ print(f" Error: {e} -- {url[:80]}", file=sys.stderr)
50
+ return None
51
+
52
+
53
+ def fetch_html(url, timeout=20):
54
+ req = urllib.request.Request(url)
55
+ req.add_header("User-Agent", "Mozilla/5.0 (compatible; where-your-customer-lives/1.0)")
56
+ req.add_header("Accept", "text/html,application/xhtml+xml")
57
+ try:
58
+ with urllib.request.urlopen(req, context=_ssl_ctx, timeout=timeout) as r:
59
+ return r.read().decode("utf-8", errors="replace")
60
+ except Exception as e:
61
+ if not quiet:
62
+ print(f" HTML fetch error: {e} -- {url[:80]}", file=sys.stderr)
63
+ return ""
64
+
65
+
66
+ def gh_get(path):
67
+ headers = {"Accept": "application/vnd.github+json"}
68
+ if GITHUB_TOKEN:
69
+ headers["Authorization"] = f"Bearer {GITHUB_TOKEN}"
70
+ return fetch_json(f"https://api.github.com{path}", headers=headers)
71
+
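+ # Note: gh_get is defined for optional GitHub API calls, but nothing in this
+ # script's pipeline calls it as of this version, so GITHUB_TOKEN only matters
+ # if the helper is reused elsewhere.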
72
+
73
+ # ---------------------------------------------------------------------------
74
+ # Subreddit detection (verbatim from map-your-market)
75
+ # ---------------------------------------------------------------------------
76
+
77
+ SUBREDDIT_MAP = {
78
+ "devops": ["devops", "sysadmin", "aws", "kubernetes", "docker"],
79
+ "observability": ["devops", "sysadmin", "dataengineering", "CloudArchitects"],
80
+ "monitoring": ["devops", "sysadmin", "networking", "aws"],
81
+ "analytics": ["analytics", "dataengineering", "datascience", "BusinessIntelligence"],
82
+ "b2b": ["startups", "entrepreneur", "SaaS", "smallbusiness"],
83
+ "saas": ["SaaS", "startups", "entrepreneur", "microsaas"],
84
+ "developer": ["programming", "webdev", "ExperiencedDevs", "devops"],
85
+ "developer tools": ["programming", "webdev", "devops", "ExperiencedDevs"],
86
+ "api": ["webdev", "programming", "devops", "node"],
87
+ "security": ["netsec", "cybersecurity", "devops", "sysadmin"],
88
+ "data": ["dataengineering", "datascience", "analytics", "BusinessIntelligence"],
89
+ "database": ["dataengineering", "Database", "PostgreSQL", "learnprogramming"],
90
+ "auth": ["webdev", "programming", "netsec", "node"],
91
+ "payments": ["webdev", "programming", "entrepreneur", "ecommerce"],
92
+ "ecommerce": ["ecommerce", "entrepreneur", "shopify", "startups"],
93
+ "marketing": ["marketing", "digital_marketing", "entrepreneur", "startups"],
94
+ "gtm": ["startups", "entrepreneur", "sales", "marketing"],
95
+ "go-to-market": ["startups", "entrepreneur", "sales", "marketing"],
96
+ "crm": ["sales", "salesforce", "entrepreneur", "smallbusiness"],
97
+ "sales": ["sales", "entrepreneur", "startups", "smallbusiness"],
98
+ "hr": ["humanresources", "remotework", "startups", "smallbusiness"],
99
+ "finance": ["personalfinance", "accounting", "startups", "smallbusiness"],
100
+ "healthcare": ["healthIT", "medicine", "startups", "technology"],
101
+ "startup": ["startups", "entrepreneur", "SaaS", "smallbusiness"],
102
+ "ai": ["MachineLearning", "artificial", "ChatGPT", "learnmachinelearning"],
103
+ "ml": ["MachineLearning", "learnmachinelearning", "datascience", "artificial"],
104
+ "llm": ["MachineLearning", "artificial", "ChatGPT", "LocalLLaMA"],
105
+ "product": ["ProductManagement", "startups", "entrepreneur", "SaaS"],
106
+ "growth": ["startups", "entrepreneur", "marketing", "digital_marketing"],
107
+ "consumer": ["technology", "apps", "selfhosted", "productivity"],
108
+ }
109
+
110
+ FALLBACK_SUBREDDITS = ["startups", "entrepreneur", "technology", "programming", "webdev"]
111
+
112
+
113
+ def detect_subreddits(category: str, competitors: list) -> list:
114
+ subs = set()
115
+ cat_lower = category.lower()
116
+
117
+ for keyword, subreddit_list in SUBREDDIT_MAP.items():
118
+ if keyword in cat_lower:
119
+ subs.update(subreddit_list)
120
+
121
+ for comp in competitors:
122
+ comp_lower = comp.lower()
123
+ if any(w in comp_lower for w in ["data", "log", "metric", "monitor", "trace"]):
124
+ subs.update(SUBREDDIT_MAP.get("observability", []))
125
+ if any(w in comp_lower for w in ["pay", "stripe", "billing"]):
126
+ subs.update(SUBREDDIT_MAP.get("payments", []))
127
+ if any(w in comp_lower for w in ["crm", "sales", "hubspot", "salesforce"]):
128
+ subs.update(SUBREDDIT_MAP.get("crm", []))
129
+
130
+ if not subs:
131
+ subs.update(FALLBACK_SUBREDDITS)
132
+
133
+ subs.add("startups")
134
+ return list(subs)[:8]
135
+
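+ # Illustrative example of the mapping above: for the category "startup gtm"
+ # with no competitors, the "gtm" and "startup" keywords both match, so the
+ # result is drawn from {startups, entrepreneur, sales, marketing, SaaS,
+ # smallbusiness} (set order is arbitrary), well under the cap of 8.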
136
+
137
+ # ---------------------------------------------------------------------------
138
+ # Pain scoring (verbatim from map-your-market)
139
+ # ---------------------------------------------------------------------------
140
+
141
+ def compute_pain_score(source: str, score_val: int, comments: int, created_at: str) -> float:
142
+ try:
143
+ if created_at:
144
+ dt = datetime.fromisoformat(created_at.replace("Z", "+00:00"))
145
+ days_old = (TODAY - dt).days
146
+ else:
147
+ days_old = 180
148
+ except Exception:
149
+ days_old = 180
150
+
151
+ if days_old < 30:
152
+ recency = 1.0
153
+ elif days_old < 90:
154
+ recency = 0.85
155
+ elif days_old < 180:
156
+ recency = 0.7
157
+ else:
158
+ recency = 0.5
159
+
160
+ if source == "github_issue":
161
+ base = score_val * 3
162
+ elif source == "reddit":
163
+ base = min(score_val, 500) + comments * 0.3
164
+ else:
165
+ base = score_val + comments * 0.3
166
+
167
+ return round(base * recency, 1)
168
+
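+ # Worked example of the formula above: a Reddit post with score 120, 40
+ # comments, posted 20 days ago scores min(120, 500) + 40 * 0.3 = 132, times
+ # the 1.0 recency weight -> pain_score 132.0; the same post at ~200 days old
+ # is weighted 0.5 -> 66.0.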
169
+
170
+ # ---------------------------------------------------------------------------
171
+ # Reddit search (from map-your-market, with the queries adapted for the ICP)
172
+ # ---------------------------------------------------------------------------
173
+
174
+ def build_icp_queries(category: str, icp_role: str, icp_pain: str, competitors: list) -> list:
175
+ queries = []
176
+ if icp_role:
177
+ queries.append(icp_role)
178
+ if icp_pain:
179
+ queries.append(icp_pain)
180
+ queries.append(category)
181
+ for comp in competitors[:2]:
182
+ queries.append(comp)
183
+ if icp_pain:
184
+ words = icp_pain.split()[:3]
185
+ queries.append(" ".join(words) + " alternative")
186
+ elif category:
187
+ cat_words = category.split()[:2]
188
+ queries.append(" ".join(cat_words) + " alternative")
189
+ return list(dict.fromkeys(queries))[:6]
190
+
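+ # Worked example matching the usage string at the top of this file: category
+ # "startup gtm", icp_role "technical co-founders", icp_pain "customer
+ # acquisition", no competitors -> ["technical co-founders",
+ # "customer acquisition", "startup gtm", "customer acquisition alternative"].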
191
+
192
+ def search_reddit(queries: list, subreddits: list, time_filter: str = "year") -> list:
193
+ results = []
194
+ seen_ids = set()
195
+
196
+ def parse_posts(data):
197
+ posts = []
198
+ if not data or "data" not in data:
199
+ return posts
200
+ for child in data["data"].get("children", []):
201
+ p = child.get("data", {})
202
+ post_id = p.get("id", "")
203
+ if not post_id or post_id in seen_ids:
204
+ continue
205
+ seen_ids.add(post_id)
206
+ score_val = p.get("score", 0)
207
+ num_comments = p.get("num_comments", 0)
208
+ created = datetime.fromtimestamp(p.get("created_utc", 0), tz=timezone.utc).isoformat()
209
+ body = (p.get("selftext", "") or "")[:500]
210
+ posts.append({
211
+ "id": post_id,
212
+ "source": "reddit",
213
+ "title": p.get("title", ""),
214
+ "body_excerpt": body,
215
+ "pain_score": compute_pain_score("reddit", score_val, num_comments, created),
216
+ "url": f"https://www.reddit.com{p.get('permalink', '')}",
217
+ "subreddit": p.get("subreddit", ""),
218
+ "score": score_val,
219
+ "comments": num_comments,
220
+ "created_at": created,
221
+ "matched_query": "",
222
+ })
223
+ return posts
224
+
225
+ def is_relevant(post: dict, query: str) -> bool:
226
+ query_words = [w.lower() for w in query.split() if len(w) > 3]
227
+ if not query_words:
228
+ return True
229
+ text = (post.get("title", "") + " " + post.get("body_excerpt", "")).lower()
230
+ return any(w in text for w in query_words)
231
+
232
+ for sub in subreddits[:6]:
233
+ for query in queries[:3]:
234
+ encoded = urllib.parse.quote_plus(query)
235
+ url = f"https://www.reddit.com/r/{sub}/search.json?q={encoded}&sort=top&t={time_filter}&restrict_sr=true&limit=25"
236
+ if not quiet:
237
+ print(f" Reddit r/{sub}: {query!r}", file=sys.stderr)
238
+ data = fetch_json(url, headers={"User-Agent": "where-your-customer-lives/1.0"})
239
+ posts = parse_posts(data)
240
+ for p in posts:
241
+ p["matched_query"] = query
242
+ relevant = [p for p in posts if is_relevant(p, query)]
243
+ results.extend(relevant)
244
+ time.sleep(2)
245
+
246
+ results = [r for r in results if r["pain_score"] >= 2.0]
247
+ return results
248
+
249
+
250
+ # ---------------------------------------------------------------------------
251
+ # HN search (verbatim from map-your-market)
252
+ # ---------------------------------------------------------------------------
253
+
254
+ def search_hn(queries: list, days_back: int = 365) -> list:
255
+ results = []
256
+ seen_ids = set()
257
+ cutoff_ts = int((TODAY - timedelta(days=days_back)).timestamp())
258
+
259
+ for query in queries:
260
+ encoded = urllib.parse.quote_plus(query)
261
+ url = f"https://hn.algolia.com/api/v1/search?query={encoded}&tags=story&numericFilters=created_at_i>{cutoff_ts}&hitsPerPage=50"
262
+ if not quiet:
263
+ print(f" HN: {query!r}", file=sys.stderr)
264
+ data = fetch_json(url)
265
+ if data:
266
+ for hit in data.get("hits", []):
267
+ obj_id = hit.get("objectID", "")
268
+ if not obj_id or obj_id in seen_ids:
269
+ continue
270
+ seen_ids.add(obj_id)
271
+ points = hit.get("points") or 0
272
+ num_comments = hit.get("num_comments") or 0
273
+ if points < 3:
274
+ continue
275
+ created = hit.get("created_at", "")
276
+ results.append({
277
+ "id": obj_id,
278
+ "source": "hn",
279
+ "title": hit.get("title", ""),
280
+ "body_excerpt": (hit.get("story_text") or "")[:400],
281
+ "pain_score": compute_pain_score("hn", points, num_comments, created),
282
+ "url": hit.get("url") or f"https://news.ycombinator.com/item?id={obj_id}",
283
+ "subreddit": "",
284
+ "score": points,
285
+ "comments": num_comments,
286
+ "created_at": created,
287
+ "matched_query": query,
288
+ })
289
+ time.sleep(1)
290
+
291
+ return results
292
+
293
+
294
+ # ---------------------------------------------------------------------------
295
+ # NEW: Reddit subreddit metadata
296
+ # ---------------------------------------------------------------------------
297
+
298
+ def get_subreddit_metadata(subreddit: str) -> dict:
299
+ url = f"https://www.reddit.com/r/{subreddit}/about.json"
300
+ data = fetch_json(url, headers={"User-Agent": "where-your-customer-lives/1.0"})
301
+ if not data or "data" not in data:
302
+ return {"subscribers": 0, "active_user_count": 0, "description": ""}
303
+ d = data["data"]
304
+ desc = (d.get("public_description") or d.get("description") or "")[:200]
305
+ desc = re.sub(r'\s+', ' ', desc).strip()
306
+ return {
307
+ "subscribers": d.get("subscribers", 0) or 0,
308
+ "active_user_count": d.get("active_user_count", 0) or 0,
309
+ "description": desc,
310
+ }
311
+
312
+
313
+ # ---------------------------------------------------------------------------
314
+ # NEW: DuckDuckGo channel discovery
315
+ # ---------------------------------------------------------------------------
316
+
317
+ _MEMBER_PATTERNS = [
318
+ r'(\d[\d,]+)\s*[Kk]\+?\s*(?:members|subscribers|followers|users)',
319
+ r'(\d[\d,]+)\+?\s*(?:members|subscribers|followers|users|professionals|engineers|developers|founders)',
320
+ r'(?:join|with|over|reach)\s+(\d[\d,]+)\+?\s*(?:members|subscribers|followers)',
321
+ r'(\d[\d,]+)\s*(?:member|subscriber|follower)\s*(?:community|group|list)',
322
+ ]
323
+
324
+
325
+ def parse_member_count(text: str) -> int:
326
+ for pattern in _MEMBER_PATTERNS:
327
+ m = re.search(pattern, text, re.IGNORECASE)
328
+ if m:
329
+ num_str = m.group(1).replace(",", "")
330
+ try:
331
+ count = int(num_str)
332
+ except ValueError:
333
+ continue
334
+ if re.search(r'\d\s*[Kk]', m.group(0)):
335
+ count *= 1000
336
+ if 10 <= count <= 50_000_000:
337
+ return count
338
+ return 0
339
+
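+ # Illustrative inputs for the patterns above: "Join 12,000+ members" -> 12000
+ # and "over 50K subscribers" -> 50000 (a K suffix multiplies by 1000); counts
+ # outside the 10..50,000,000 sanity range are rejected.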
340
+
341
+ def _decode_ddg_url(raw_url: str) -> str:
342
+ """DDG wraps destinations in //duckduckgo.com/l/?uddg=... -- extract the real URL."""
343
+ m = re.search(r'uddg=([^&]+)', raw_url)
344
+ if m:
345
+ return urllib.parse.unquote(m.group(1))
346
+ if raw_url.startswith("//"):
347
+ return "https:" + raw_url
348
+ return raw_url
349
+
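+ # Example (hypothetical URL): "//duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com%2Fslack&rut=abc"
+ # decodes to "https://example.com/slack".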
350
+
351
+ def parse_ddg_results(html: str) -> list:
352
+ results = []
353
+ # DDG HTML: href comes after class="result__a" in attribute order
354
+ # Pattern: extract raw href (DDG redirect) and title text
355
+ hrefs = re.findall(r'<a[^>]*class="result__a"[^>]*href="([^"]+)"', html)
356
+ titles_raw = re.findall(r'<a[^>]*class="result__a"[^>]*>(.*?)</a>', html, re.DOTALL)
357
+ snippets_raw = re.findall(
358
+ r'class="result__snippet"[^>]*>(.*?)</(?:a|span|div)>',
359
+ html, re.DOTALL | re.IGNORECASE
360
+ )
361
+
362
+ titles = [re.sub(r'<[^>]+>', '', t).strip() for t in titles_raw]
363
+ snippets = [re.sub(r'<[^>]+>', '', s).strip() for s in snippets_raw]
364
+
365
+ for i, (raw_href, title) in enumerate(zip(hrefs, titles)):
366
+ if not title or len(title) < 4:
367
+ continue
368
+ real_url = _decode_ddg_url(raw_href)
369
+ snippet = snippets[i] if i < len(snippets) else ""
370
+ results.append({"title": title, "url": real_url, "snippet": snippet})
371
+
372
+ return results[:12]
373
+
374
+
375
+ _HTML_ENTITIES = {
376
+ "&#x27;": "'", "&amp;": "&", "&lt;": "<", "&gt;": ">", "&quot;": '"',
377
+ "&#39;": "'", "&apos;": "'",
378
+ }
379
+
380
+ def _decode_html(text: str) -> str:
381
+ for entity, char in _HTML_ENTITIES.items():
382
+ text = text.replace(entity, char)
383
+ return text
384
+
385
+
386
+ _LISTICLE_PATTERNS = re.compile(
387
+ r'^(?:\d+\s+(?:best|top|great)|best\s+\d+|top\s+\d+|how\s+to|list\s+of|ultimate\s+list|'
388
+ r'complete\s+list|the\s+\d+|full\s+list)',
389
+ re.IGNORECASE
390
+ )
391
+
392
+
393
+ def clean_channel_name(title: str, channel_type: str) -> str:
394
+ """Extract a clean channel name from a DDG result title."""
395
+ title = _decode_html(title)
396
+
397
+ # Reject listicle titles (they're articles, not channels)
398
+ if _LISTICLE_PATTERNS.match(title.strip()):
399
+ return ""
400
+
401
+ # Strip common suffixes
402
+ name = re.sub(r'\s*[-|:]\s*(?:Home|Official|Website|Sign up|Join|Free|Login|Welcome).*$', '', title, flags=re.IGNORECASE)
403
+ name = name.strip()
404
+
405
+ if channel_type == "slack":
406
+ m = re.search(r'([A-Za-z][^|:\n]{3,50}?)\s+(?:Slack|Workspace)\b', title, re.IGNORECASE)
407
+ if m:
408
+ return m.group(0).strip()
409
+ m = re.search(r'([A-Za-z][^|:\n]{3,50}?)\s+Community\b', title, re.IGNORECASE)
410
+ if m:
411
+ return m.group(0).strip()
412
+
413
+ elif channel_type == "discord":
414
+ m = re.search(r'([A-Za-z][^|:\n]{3,50}?)\s+(?:Discord|Server)\b', title, re.IGNORECASE)
415
+ if m:
416
+ return m.group(0).strip()
417
+
418
+ elif channel_type == "conference":
419
+ m = re.search(r'([A-Za-z][^|:\n]{3,60}?(?:Conf(?:erence)?|Summit|Con\b|Meetup|Camp))', title, re.IGNORECASE)
420
+ if m:
421
+ return m.group(0).strip()
422
+
423
+ elif channel_type == "podcast":
424
+ # Strip platform suffixes: "- Apple Podcasts", "- Spotify", "| Spotify", "Episodes |"
425
+ name = re.sub(r'\s*[-|]\s*(?:Apple\s+Podcasts?|Spotify|Google\s+Podcasts?|Stitcher|Podbean|Buzzsprout|Anchor).*$', '', name, flags=re.IGNORECASE)
426
+ name = re.sub(r'^Episodes\s*\|\s*', '', name, flags=re.IGNORECASE)
427
+ name = re.sub(r'#\d+.*$', '', name) # strip episode number suffixes
428
+ name = name.strip()
429
+
430
+ elif channel_type == "linkedin_group":
431
+ name = re.sub(r'\s*\|\s*LinkedIn.*$', '', name, flags=re.IGNORECASE).strip()
432
+
433
+ elif channel_type == "youtube":
434
+ name = re.sub(r'\s*-\s*YouTube.*$', '', name, flags=re.IGNORECASE).strip()
435
+ name = re.sub(r'\s*\|\s*YouTube.*$', '', name, flags=re.IGNORECASE).strip()
436
+
437
+ return name[:70] if len(name) > 5 else ""
438
+
439
+
440
+ def search_channels_ddg(query: str, channel_type: str) -> list:
441
+ encoded = urllib.parse.quote_plus(query)
442
+ url = f"https://html.duckduckgo.com/html/?q={encoded}"
443
+ if not quiet:
444
+ print(f" DDG [{channel_type}]: {query!r}", file=sys.stderr)
445
+ html = fetch_html(url)
446
+ if not html:
447
+ return []
448
+
449
+ ddg_results = parse_ddg_results(html)
450
+ channels = []
451
+
452
+ for r in ddg_results:
453
+ combined = _decode_html(f"{r['title']} {r['snippet']}")
454
+ member_count = parse_member_count(combined)
455
+ name = clean_channel_name(r["title"], channel_type)
456
+ if not name or len(name) < 4:
457
+ continue
458
+
459
+ # Skip obvious noise (search engine meta-pages, encyclopedias)
460
+ bad_domains = ["google.com", "bing.com", "duckduckgo.com", "wikipedia.org", "wikidata.org"]
461
+ if any(d in r.get("url", "") for d in bad_domains):
462
+ continue
463
+
464
+ channels.append({
465
+ "name": name,
466
+ "type": channel_type,
467
+ "url": r["url"],
468
+ "members": member_count,
469
+ "active_users": 0,
470
+ "description": r["snippet"][:200],
471
+ "activity_score": 5, # default; DDG doesn't expose post frequency
472
+ "icp_signal_count": 0,
473
+ "competitor_mentions": 0,
474
+ "entry_type": "open",
475
+ "discovery_method": "ddg_search",
476
+ "evidence_posts": [],
477
+ })
478
+
479
+ return channels
480
+
481
+
482
+ # ---------------------------------------------------------------------------
483
+ # NEW: Channel scoring
484
+ # ---------------------------------------------------------------------------
485
+
486
+ def score_channel(channel: dict) -> float:
487
+ icp_signals = channel.get("icp_signal_count", 0)
488
+ members = channel.get("members", 0)
489
+ activity = channel.get("activity_score", 0)
490
+ comp_mentions = channel.get("competitor_mentions", 0)
491
+ entry_type = channel.get("entry_type", "open")
492
+
493
+ score = (
494
+ icp_signals * 10
495
+ + min(math.log10(max(members, 1)) * 15, 50)
496
+ + min(activity, 30)
497
+ + comp_mentions * 5
498
+ )
499
+
500
+ if entry_type == "paid":
501
+ score -= 20
502
+ elif entry_type == "invite-only":
503
+ score -= 10
504
+
505
+ return round(score, 1)
506
+
507
+
508
+ def get_tier(score: float) -> str:
509
+ if score >= 100:
510
+ return "top-priority"
511
+ elif score >= 60:
512
+ return "high"
513
+ elif score >= 30:
514
+ return "medium"
515
+ else:
516
+ return "low"
517
+
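+ # Worked example with illustrative numbers: a channel with 12 ICP signals,
+ # 1,000,000 members, activity at the cap, and 2 competitor mentions scores
+ # 12*10 + min(log10(1e6)*15, 50) + 30 + 2*5 = 120 + 50 + 30 + 10 = 210, which
+ # falls in the "top-priority" tier; an invite-only channel would lose 10 points.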
518
+
519
+ # ---------------------------------------------------------------------------
520
+ # NEW: Deduplicate channels
521
+ # ---------------------------------------------------------------------------
522
+
523
+ def deduplicate_channels(channels: list) -> list:
524
+ seen = {}
525
+ for ch in channels:
526
+ key = ch["name"].lower().strip()
527
+ if key not in seen:
528
+ seen[key] = dict(ch)
529
+ else:
530
+ existing = seen[key]
531
+ existing["icp_signal_count"] = max(
532
+ existing.get("icp_signal_count", 0), ch.get("icp_signal_count", 0)
533
+ )
534
+ existing["competitor_mentions"] = existing.get("competitor_mentions", 0) + ch.get("competitor_mentions", 0)
535
+ if ch.get("members", 0) > existing.get("members", 0):
536
+ existing["members"] = ch["members"]
537
+ existing["evidence_posts"] = existing.get("evidence_posts", []) + ch.get("evidence_posts", [])
538
+ # Prefer signal_trace as discovery method
539
+ if ch.get("discovery_method") == "signal_trace":
540
+ existing["discovery_method"] = "signal_trace"
541
+
542
+ return list(seen.values())
543
+
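+ # Merge semantics for duplicate names (e.g. a subreddit surfaced by both
+ # signal-trace and DDG): keep the higher signal and member counts, sum
+ # competitor mentions, concatenate evidence posts, and prefer "signal_trace"
+ # as the recorded discovery method.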
544
+
545
+ # ---------------------------------------------------------------------------
546
+ # Main pipeline
547
+ # ---------------------------------------------------------------------------
548
+
549
+ def main():
550
+ global quiet
551
+
552
+ parser = argparse.ArgumentParser(description="Discover channels where your ICP gathers")
553
+ parser.add_argument("category", help="Market category (e.g. 'startup gtm sales tools')")
554
+ parser.add_argument("--icp-role", default="", help="ICP role description (e.g. 'technical co-founders')")
555
+ parser.add_argument("--icp-pain", default="", help="ICP primary pain (e.g. 'customer acquisition')")
556
+ parser.add_argument("--product", default="", help="Product one-liner")
557
+ parser.add_argument("--competitors", "-c", default="", help="Comma-separated competitor names")
558
+ parser.add_argument("--output", "-o", default=None, help="Output JSON file path")
559
+ parser.add_argument("--stdout", action="store_true", help="Print JSON to stdout")
560
+ parser.add_argument("--quiet", "-q", action="store_true", help="Suppress progress output")
561
+ args = parser.parse_args()
562
+
563
+ quiet = args.quiet
564
+ competitors = [c.strip() for c in args.competitors.split(",") if c.strip()] if args.competitors else []
565
+
566
+ if not args.output and not args.stdout:
567
+ slug = re.sub(r"[^a-z0-9]+", "-", args.category.lower()).strip("-")
568
+ args.output = f"/tmp/wcl-raw-{slug}-{TODAY.strftime('%Y-%m-%d')}.json"
569
+
570
+ if not quiet:
571
+ print(f"Finding channels for: {args.category!r}", file=sys.stderr)
572
+ print(f"ICP role: {args.icp_role or 'not specified'}", file=sys.stderr)
573
+ print(f"ICP pain: {args.icp_pain or 'not specified'}", file=sys.stderr)
574
+ print(f"Competitors: {competitors or 'none'}", file=sys.stderr)
575
+
576
+ # -----------------------------------------------------------------------
577
+ # Step 1: Detect subreddits + build ICP queries
578
+ # -----------------------------------------------------------------------
579
+ subreddits = detect_subreddits(args.category, competitors)
580
+ queries = build_icp_queries(args.category, args.icp_role, args.icp_pain, competitors)
581
+
582
+ if not quiet:
583
+ print(f"\nSubreddits: {subreddits}", file=sys.stderr)
584
+ print(f"Queries: {queries}", file=sys.stderr)
585
+
586
+ # -----------------------------------------------------------------------
587
+ # Step 2: Signal-trace via Reddit
588
+ # -----------------------------------------------------------------------
589
+ if not quiet:
590
+ print("\n[1/5] Signal-trace: Reddit...", file=sys.stderr)
591
+
592
+ reddit_posts = search_reddit(queries, subreddits)
593
+
594
+ if not quiet:
595
+ print(f" Found {len(reddit_posts)} ICP posts", file=sys.stderr)
596
+
597
+ # Aggregate by subreddit
598
+ sub_stats = {}
599
+ for post in reddit_posts:
600
+ sub = post.get("subreddit", "")
601
+ if not sub:
602
+ continue
603
+ if sub not in sub_stats:
604
+ sub_stats[sub] = {
605
+ "icp_signal_count": 0,
606
+ "competitor_mentions": 0,
607
+ "total_score": 0.0,
608
+ "evidence_posts": [],
609
+ }
610
+ sub_stats[sub]["icp_signal_count"] += 1
611
+ sub_stats[sub]["total_score"] += post.get("pain_score", 0)
612
+ post_text = (post.get("title", "") + " " + post.get("body_excerpt", "")).lower()
613
+ for comp in competitors:
614
+ if comp.lower() in post_text:
615
+ sub_stats[sub]["competitor_mentions"] += 1
616
+ if len(sub_stats[sub]["evidence_posts"]) < 3:
617
+ sub_stats[sub]["evidence_posts"].append({
618
+ "title": post.get("title", ""),
619
+ "score": post.get("pain_score", 0),
620
+ "url": post.get("url", ""),
621
+ "pain_match": post.get("matched_query", ""),
622
+ })
623
+
624
+ # Fetch metadata + build Reddit channels
625
+ reddit_channels = []
626
+ for sub, stats in sub_stats.items():
627
+ if not quiet:
628
+ print(f" Metadata: r/{sub}", file=sys.stderr)
629
+ meta = get_subreddit_metadata(sub)
630
+ time.sleep(1)
631
+
632
+ channel = {
633
+ "name": f"r/{sub}",
634
+ "type": "reddit",
635
+ "url": f"https://reddit.com/r/{sub}",
636
+ "members": meta.get("subscribers", 0),
637
+ "active_users": meta.get("active_user_count", 0),
638
+ "description": meta.get("description", ""),
639
+ "activity_score": min((meta.get("active_user_count", 0) or 0) // 100, 30),
640
+ "icp_signal_count": stats["icp_signal_count"],
641
+ "competitor_mentions": stats["competitor_mentions"],
642
+ "entry_type": "open",
643
+ "discovery_method": "signal_trace",
644
+ "evidence_posts": stats["evidence_posts"],
645
+ }
646
+ channel["channel_score"] = score_channel(channel)
647
+ channel["tier"] = get_tier(channel["channel_score"])
648
+ reddit_channels.append(channel)
649
+
650
+ # -----------------------------------------------------------------------
651
+ # Step 3: HN signal-trace
652
+ # -----------------------------------------------------------------------
653
+ if not quiet:
654
+ print("\n[2/5] HN signal-trace...", file=sys.stderr)
655
+
656
+ hn_results = search_hn(queries[:3])
657
+
658
+ if not quiet:
659
+ print(f" Found {len(hn_results)} HN signals", file=sys.stderr)
660
+
661
+ hn_channel = None
662
+ if len(hn_results) >= 3:
663
+ hn_comp_mentions = sum(
664
+ 1 for r in hn_results
665
+ if any(c.lower() in (r.get("title", "") + r.get("body_excerpt", "")).lower() for c in competitors)
666
+ )
667
+ hn_channel = {
668
+ "name": "Hacker News",
669
+ "type": "forum",
670
+ "url": "https://news.ycombinator.com",
671
+ "members": 0,
672
+ "active_users": 0,
673
+ "description": "Tech and startup community; strong signal for developer and founder ICPs",
674
+ "activity_score": 20,
675
+ "icp_signal_count": len(hn_results),
676
+ "competitor_mentions": hn_comp_mentions,
677
+ "entry_type": "open",
678
+ "discovery_method": "signal_trace",
679
+ "evidence_posts": [
680
+ {
681
+ "title": r.get("title", ""),
682
+ "score": r.get("pain_score", 0),
683
+ "url": r.get("url", ""),
684
+ "pain_match": r.get("matched_query", ""),
685
+ }
686
+ for r in sorted(hn_results, key=lambda x: x.get("pain_score", 0), reverse=True)[:3]
687
+ ],
688
+ }
689
+ hn_channel["channel_score"] = score_channel(hn_channel)
690
+ hn_channel["tier"] = get_tier(hn_channel["channel_score"])
691
+
692
+ # -----------------------------------------------------------------------
693
+ # Step 4: DuckDuckGo channel discovery (non-Reddit types)
694
+ # -----------------------------------------------------------------------
695
+ if not quiet:
696
+ print("\n[3/5] Discovering Slack/Discord/newsletter/podcast/conference channels...", file=sys.stderr)
697
+
698
+ icp_label = args.icp_role or args.category
699
+ ddg_channels = []
700
+
701
+ channel_searches = [
702
+ (f"{args.category} slack community", "slack"),
703
+ (f"{icp_label} discord server community", "discord"),
704
+ (f"{args.category} newsletter weekly", "newsletter"),
705
+ (f"{icp_label} podcast episodes", "podcast"),
706
+ (f"{args.category} conference summit 2025", "conference"),
707
+ (f"site:linkedin.com/groups {args.category}", "linkedin_group"),
708
+ (f"best {args.category} youtube channel", "youtube"),
709
+ ]
710
+
711
+ for query, ch_type in channel_searches:
712
+ found = search_channels_ddg(query, ch_type)
713
+ ddg_channels.extend(found[:3])
714
+ time.sleep(2.5)
715
+
716
+ for ch in ddg_channels:
717
+ ch["channel_score"] = score_channel(ch)
718
+ ch["tier"] = get_tier(ch["channel_score"])
719
+
720
+ # -----------------------------------------------------------------------
721
+ # Step 5: Competitor layer -- where competitors are discussed
722
+ # -----------------------------------------------------------------------
723
+ if competitors:
724
+ if not quiet:
725
+ print("\n[4/5] Competitor layer...", file=sys.stderr)
726
+ for comp in competitors[:3]:
727
+ comp_channels = search_channels_ddg(
728
+ f"{comp} community users discussion forum",
729
+ "forum"
730
+ )
731
+ for ch in comp_channels[:2]:
732
+ ch["competitor_mentions"] = 3
733
+ ch["channel_score"] = score_channel(ch)
734
+ ch["tier"] = get_tier(ch["channel_score"])
735
+ ddg_channels.append(ch)
736
+ time.sleep(2)
737
+ else:
738
+ if not quiet:
739
+ print("\n[4/5] Competitor layer: skipped (no competitors provided)", file=sys.stderr)
740
+
741
+ # -----------------------------------------------------------------------
742
+ # Step 6: Combine, deduplicate, rank
743
+ # -----------------------------------------------------------------------
744
+ if not quiet:
745
+ print("\n[5/5] Ranking channels...", file=sys.stderr)
746
+
747
+ all_channels = list(reddit_channels)
748
+ if hn_channel:
749
+ all_channels.append(hn_channel)
750
+ all_channels.extend(ddg_channels)
751
+
752
+ all_channels = deduplicate_channels(all_channels)
753
+
754
+ # Re-score after deduplication (competitor_mentions may have changed)
755
+ for ch in all_channels:
756
+ ch["channel_score"] = score_channel(ch)
757
+ ch["tier"] = get_tier(ch["channel_score"])
758
+
759
+ all_channels.sort(key=lambda x: x.get("channel_score", 0), reverse=True)
760
+ # Filter: keep channels with any positive signal (score > 0) -- DDG channels with no
761
+ # member count still score 5 (the activity default) and so remain in the output
762
+ all_channels = [ch for ch in all_channels if ch.get("channel_score", 0) > 0]
763
+
764
+ by_type: dict = {}
765
+ for ch in all_channels:
766
+ t = ch["type"]
767
+ by_type[t] = by_type.get(t, 0) + 1
768
+
769
+ top_priority = [ch["name"] for ch in all_channels if ch.get("tier") == "top-priority"]
770
+ high_channels = [ch["name"] for ch in all_channels if ch.get("tier") == "high"]
771
+
772
+ output_data = {
773
+ "date": TODAY.strftime("%Y-%m-%d"),
774
+ "product": args.product,
775
+ "icp_role": args.icp_role,
776
+ "icp_pain": args.icp_pain,
777
+ "category": args.category,
778
+ "competitors": competitors,
779
+ "subreddits_searched": subreddits,
780
+ "queries_used": queries,
781
+ "reddit_posts_found": len(reddit_posts),
782
+ "hn_signals_found": len(hn_results),
783
+ "channels_discovered": all_channels,
784
+ "summary": {
785
+ "total_channels": len(all_channels),
786
+ "top_priority": top_priority,
787
+ "high": high_channels,
788
+ "by_type": by_type,
789
+ "competitor_layer_ran": bool(competitors),
790
+ },
791
+ }
792
+
793
+ if not quiet:
794
+ print(f"\nChannels discovered: {len(all_channels)}", file=sys.stderr)
795
+ print(f"Top priority: {len(top_priority)}", file=sys.stderr)
796
+ print(f"By type: {by_type}", file=sys.stderr)
797
+ for i, ch in enumerate(all_channels[:5], 1):
798
+ print(f" #{i} {ch['name']} [{ch['type']}] score={ch.get('channel_score', 0)} tier={ch.get('tier', '')}", file=sys.stderr)
799
+
800
+ if args.stdout:
801
+ print(json.dumps(output_data, indent=2))
802
+ else:
803
+ with open(args.output, "w") as f:
804
+ json.dump(output_data, f, indent=2)
805
+ if not quiet:
806
+ print(f"Output: {args.output}", file=sys.stderr)
807
+
808
+
809
+ if __name__ == "__main__":
810
+ main()