delimit-cli 4.0.1 → 4.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,562 @@
1
+ """Reddit bulk scanner -- fetch, categorize, and rank posts for outreach.
2
+
3
+ Scans 25+ subreddits via the residential proxy, scores each post on
4
+ engagement, freshness, comment opportunity, and venture relevance,
5
+ then returns a ranked list of outreach targets.
6
+
7
+ Rate limited by default to one request every 4-6 seconds (a 4 s base delay plus up to 2 s of random jitter) to stay well under Reddit limits.
8
+ Results are persisted to ~/.delimit/reddit_scans/{timestamp}.json (scan start time formatted as YYYY-MM-DDTHHMMSS) for dedup.
9
+ """
10
+
11
+ import json
12
+ import logging
13
+ import os
14
+ import re
15
+ import time
16
+ import urllib.error
17
+ import urllib.request
18
+ from datetime import datetime, timezone
19
+ from pathlib import Path
20
+ from typing import Any, Dict, List, Optional, Tuple
21
+
22
logger = logging.getLogger("delimit.ai.reddit_scanner")

# ---------------------------------------------------------------------------
# Subreddit groups
# ---------------------------------------------------------------------------

# Scan group name -> subreddit names (no "r/" prefix) belonging to that group.
SCAN_GROUPS: Dict[str, List[str]] = {
    "delimit_core": [
        "ClaudeAI",
        "vibecoding",
        "cursor",
        "AI_Agents",
    ],
    "delimit_adjacent": [
        "devops",
        "programming",
        "ContextEngineering",
        "LocalLLaMA",
        "MachineLearning",
    ],
    "domainvested": [
        "Domains",
        "Entrepreneur",
        "SideProject",
        "flipping",
    ],
    "wirereport": [
        "sportsbook",
        "sportsbetting",
    ],
    "stakeone": [
        "harmony_one",
        "CryptoCurrency",
        "defi",
    ],
    "karma_building": [
        "SaaS",
        "opensource",
        "webdev",
        "startups",
        "ExperiencedDevs",
        "selfhosted",
        "IndieHackers",
    ],
}

# Both derived tables are filled in one pass over SCAN_GROUPS:
#   ALL_SUBREDDITS - every subreddit, flattened in declaration order
#   _SUB_TO_GROUP  - reverse lookup: lower-cased subreddit -> group
ALL_SUBREDDITS: List[str] = []
_SUB_TO_GROUP: Dict[str, str] = {}
for _group, _subs in SCAN_GROUPS.items():
    for _sub in _subs:
        ALL_SUBREDDITS.append(_sub)
        _SUB_TO_GROUP[_sub.lower()] = _group
44
+
45
+ # ---------------------------------------------------------------------------
46
+ # Venture keywords (loaded from disk or defined inline as fallback)
47
+ # ---------------------------------------------------------------------------
48
+
49
# Built-in venture -> keyword phrases, used by _load_venture_keywords() when
# the on-disk ventures config is missing or yields no topics.
_VENTURE_KEYWORDS_FALLBACK: Dict[str, List[str]] = {
    "delimit": [
        "api governance",
        "breaking changes",
        "openapi",
        "api linting",
        "mcp server",
        "mcp tools",
        "claude.md",
        "claude code",
        "ai coding",
        "vibe coding",
        "semver",
        "api compatibility",
        "schema migration",
        "api versioning",
        "contract testing",
        "session handoff",
        "agent state",
        "context engineering",
    ],
    "domainvested": [
        "domain investing",
        "domain appraisal",
        "domain flipping",
        "expired domains",
        "brandable domains",
        "domain valuation",
        "namepros",
        "domain name",
    ],
    "wirereport": [
        "sports api",
        "live sports data",
        "sports scores",
        "sports news automation",
        "sports betting api",
    ],
    "stakeone": [
        "harmony one",
        "harmony validator",
        "one staking",
        "harmony blockchain",
        "harmony network",
    ],
}
71
+
72
+ # ---------------------------------------------------------------------------
73
+ # Pain point categories for product intelligence
74
+ # ---------------------------------------------------------------------------
75
+
76
# Pain-point category -> trigger phrases.
# extract_pain_points() searches for these as substrings of the LOWER-CASED
# post text, so every phrase here must itself be lower-case; a phrase with an
# uppercase letter (previously "API broke") can never match.
PAIN_CATEGORIES: Dict[str, List[str]] = {
    "context_loss": ["lost context", "re-explain", "starting from zero", "forgot", "doesn't remember"],
    "rate_limits": ["rate limit", "session limit", "throttled", "burned through", "ran out"],
    "multi_model": ["switching between", "codex and claude", "multiple models", "different tool"],
    "code_quality": ["broke my", "deleted", "undid", "regression", "broke production"],
    "session_management": ["session died", "context window", "compact", "handoff"],
    "governance": ["breaking change", "api broke", "schema", "backward compat"],
    "onboarding": ["how to start", "getting started", "setup", "configure"],
    "cost": ["expensive", "pricing", "cost", "$200", "billing"],
}

# Which pain categories map to Delimit features
_PAIN_TO_RELEVANCE: Dict[str, str] = {
    "context_loss": "existing_feature",        # persistent context / session handoff
    "session_management": "existing_feature",  # session handoff, compact
    "governance": "existing_feature",          # API governance, breaking change detection
    "multi_model": "existing_feature",         # cross-model continuity
    "code_quality": "planned_feature",         # test verification, guardrails
    "onboarding": "planned_feature",           # delimit init, doctor, setup
    "rate_limits": "new_opportunity",          # not directly addressed yet
    "cost": "new_opportunity",                 # pricing transparency / cost tracking
}
98
+
99
# Local proxy endpoint through which all Reddit JSON requests are made.
PROXY_URL = "http://127.0.0.1:4819/reddit-fetch"

# Directory that receives one JSON file per scan.
SCANS_DIR = Path.home().joinpath(".delimit", "reddit_scans")

# Optional on-disk config read by _load_venture_keywords().
VENTURES_CONFIG_PATH = Path.home().joinpath(".delimit", "social_target_ventures.json")

# Posts by these authors are always skipped
SKIP_AUTHORS = {
    "delimitdev",
    "delimit_ai",
    "AutoModerator",
    "[deleted]",
}
105
+
106
+ # ---------------------------------------------------------------------------
107
+ # Keyword loading
108
+ # ---------------------------------------------------------------------------
109
+
110
+
111
def _load_venture_keywords() -> Dict[str, List[str]]:
    """Load venture keywords from disk config, falling back to the built-in list.

    Reads ``VENTURES_CONFIG_PATH`` as UTF-8 JSON and collects, for every entry
    under its ``"ventures"`` object, the lower-cased strings found in that
    entry's ``"topics"`` list.

    Returns:
        A mapping of venture name -> lower-cased keyword phrases. When the
        file is absent, unreadable, not valid JSON, has an unexpected shape,
        or yields no usable topics, a lower-cased copy of
        ``_VENTURE_KEYWORDS_FALLBACK`` is returned instead.

    This function never raises for a bad config file; read/parse problems are
    logged as warnings.
    """
    data: Any = None
    try:
        # Read directly instead of exists()+read so the file vanishing between
        # the two calls cannot raise.
        data = json.loads(VENTURES_CONFIG_PATH.read_text(encoding="utf-8"))
    except FileNotFoundError:
        pass  # no config on disk is the normal case: use the built-ins quietly
    except (OSError, ValueError) as exc:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        logger.warning("Failed to load venture keywords: %s", exc)

    result: Dict[str, List[str]] = {}
    ventures = data.get("ventures") if isinstance(data, dict) else None
    if isinstance(ventures, dict):
        for name, cfg in ventures.items():
            topics = cfg.get("topics") if isinstance(cfg, dict) else None
            if not isinstance(topics, (list, tuple)):
                continue
            # Skip non-strings and blank entries: a blank keyword is a
            # substring of every post and would tag everything as relevant.
            cleaned = [t.lower() for t in topics if isinstance(t, str) and t.strip()]
            if cleaned:
                result[name] = cleaned
    elif data is not None:
        logger.warning(
            "Ignoring venture keyword config with unexpected structure: %s",
            VENTURES_CONFIG_PATH,
        )

    if result:
        return result
    return {k: [t.lower() for t in v] for k, v in _VENTURE_KEYWORDS_FALLBACK.items()}
127
+
128
+
129
+ # ---------------------------------------------------------------------------
130
+ # Fetching
131
+ # ---------------------------------------------------------------------------
132
+
133
+
134
def _fetch_subreddit(
    subreddit: str,
    sort: str = "hot",
    limit: int = 10,
    *,
    proxy_url: str = PROXY_URL,
) -> List[Dict[str, Any]]:
    """Fetch posts from a single subreddit via the proxy endpoint.

    The proxy endpoint expects a query parameter ``url`` containing the
    actual Reddit JSON URL (fully percent-encoded).

    Args:
        subreddit: Subreddit name without the ``r/`` prefix.
        sort: Listing name placed in the URL path (e.g. ``"hot"``).
        limit: Value sent as Reddit's ``limit`` query parameter.
        proxy_url: Base URL of the proxy endpoint.

    Returns:
        A list of post dicts with the keys ``id``, ``title``, ``author``,
        ``score``, ``num_comments``, ``subreddit``, ``permalink``,
        ``selftext`` (first 200 characters) and ``created_utc``. Stickied
        posts and posts by ``SKIP_AUTHORS`` are omitted. An empty list is
        returned when the request fails or the payload is not shaped like a
        Reddit listing; this function does not raise for those cases.
    """
    # quote() is documented in urllib.parse; urllib.request only happens to
    # re-export it. Imported locally so this block needs no file-level change.
    from urllib.parse import quote

    reddit_url = f"https://www.reddit.com/r/{subreddit}/{sort}.json?limit={limit}&raw_json=1"
    fetch_url = f"{proxy_url}?url={quote(reddit_url, safe='')}"

    req = urllib.request.Request(
        fetch_url,
        headers={"User-Agent": "delimit-scanner/1.0", "Accept": "application/json"},
    )

    try:
        with urllib.request.urlopen(req, timeout=15) as resp:
            body = json.loads(resp.read().decode())
    except Exception as exc:
        # Deliberately broad: one failing subreddit (network, HTTP, decode or
        # JSON error) must not abort the scan of the remaining subreddits.
        logger.warning("Failed to fetch r/%s: %s", subreddit, exc)
        return []

    # Reddit returns {"data": {"children": [...]}}; validate each level so a
    # null or an error payload yields an empty result instead of a crash.
    listing = body.get("data") if isinstance(body, dict) else None
    children = listing.get("children") if isinstance(listing, dict) else None
    if not isinstance(children, list):
        logger.warning("Unexpected listing payload for r/%s", subreddit)
        return []

    posts: List[Dict[str, Any]] = []
    for child in children:
        d = child.get("data") if isinstance(child, dict) else None
        if not isinstance(d, dict) or not d:
            continue
        # Skip stickied
        if d.get("stickied"):
            continue
        # Skip our own posts (and other always-ignored authors)
        author = d.get("author")
        if not isinstance(author, str):
            author = ""
        if author in SKIP_AUTHORS:
            continue

        selftext = d.get("selftext")
        posts.append({
            "id": d.get("id", ""),
            "title": d.get("title", ""),
            "author": author,
            "score": d.get("score", 0),
            "num_comments": d.get("num_comments", 0),
            "subreddit": d.get("subreddit", subreddit),
            "permalink": d.get("permalink", ""),
            # Only a short excerpt is kept for keyword matching.
            "selftext": selftext[:200] if isinstance(selftext, str) else "",
            "created_utc": d.get("created_utc", 0),
        })

    return posts
192
+
193
+
194
def fetch_all(
    limit_per_sub: int = 10,
    sort: str = "hot",
    *,
    rate_limit: float = 4.0,
    proxy_url: str = PROXY_URL,
    subreddits: Optional[List[str]] = None,
) -> List[Dict[str, Any]]:
    """Fetch posts from every target subreddit, one request at a time.

    ``subreddits`` selects the targets; when it is ``None`` or empty,
    ``ALL_SUBREDDITS`` is scanned. Before every request except the first,
    the call sleeps ``rate_limit`` seconds plus up to 2 seconds of random
    jitter (no sleep at all when ``rate_limit`` is not positive).

    Returns one flat list containing the posts of all subreddits, each post
    tagged with a ``group`` key naming its scan group (``"unknown"`` for
    subreddits that belong to no group).
    """
    import random

    names = subreddits if subreddits else ALL_SUBREDDITS
    combined: List[Dict[str, Any]] = []

    for position, name in enumerate(names):
        if position and rate_limit > 0:
            # Add jitter to avoid bot-pattern detection
            pause = rate_limit + random.uniform(0, 2.0)
            time.sleep(pause)

        fetched = _fetch_subreddit(name, sort=sort, limit=limit_per_sub, proxy_url=proxy_url)
        group_name = _SUB_TO_GROUP.get(name.lower(), "unknown")
        for entry in fetched:
            entry["group"] = group_name
            combined.append(entry)
        logger.info("Fetched %d posts from r/%s (%s)", len(fetched), name, group_name)

    return combined
223
+
224
+
225
+ # ---------------------------------------------------------------------------
226
+ # Scoring & classification
227
+ # ---------------------------------------------------------------------------
228
+
229
+
230
def _age_hours(created_utc: float, now: Optional[float] = None) -> float:
    """Return how many hours old a post is.

    Args:
        created_utc: Post creation time as a Unix timestamp in seconds.
        now: Reference Unix timestamp in seconds; the current time is used
            only when this is ``None``.

    Returns:
        The age in hours, clamped to ``0.0`` for timestamps that lie after
        the reference time.
    """
    # Compare against None rather than truthiness so an explicit reference
    # time of 0 is honoured instead of being replaced by the wall clock.
    now_ts = time.time() if now is None else now
    return max(0.0, (now_ts - created_utc) / 3600.0)
234
+
235
+
236
def _freshness_multiplier(age_h: float) -> float:
    """Map a post's age in hours to its freshness boost.

    Posts younger than 6 hours score 2.0, posts younger than 12 hours
    score 1.5, and everything else scores 1.0.
    """
    # (exclusive upper age bound in hours, multiplier), youngest tier first.
    tiers = ((6, 2.0), (12, 1.5))
    for upper_bound, multiplier in tiers:
        if age_h < upper_bound:
            return multiplier
    return 1.0
243
+
244
+
245
def _relevance_tags(title: str, selftext: str, venture_keywords: Dict[str, List[str]]) -> List[str]:
    """Collect the venture keywords that occur in a post.

    The title and body are joined with a space and lower-cased; a keyword
    counts as a hit when it is a substring of that text. Hits are returned
    once each, in the order the keywords are first encountered while walking
    ``venture_keywords``.
    """
    haystack = " ".join((title, selftext)).lower()
    hits = (
        keyword
        for keyword_list in venture_keywords.values()
        for keyword in keyword_list
        if keyword in haystack
    )
    # dict.fromkeys drops repeats while keeping first-seen order.
    return list(dict.fromkeys(hits))
254
+
255
+
256
def _suggest_angle(relevance_tags: List[str], group: str) -> str:
    """Produce a short hint on how to engage with a post.

    With no relevance tags the hint depends only on whether the post is in
    the ``karma_building`` group. Otherwise the hint is a group-specific
    phrase followed by up to the first three tags, comma-separated; groups
    without a dedicated phrase get a generic "engage on ..." hint.
    """
    if relevance_tags:
        # Group -> lead-in phrase; the tag list is appended after a space.
        lead_ins = {
            "delimit_core": "expert comment on",
            "delimit_adjacent": "helpful technical reply mentioning",
            "domainvested": "domain industry insight on",
            "wirereport": "sports data perspective on",
            "stakeone": "validator/staking expertise on",
            "karma_building": "genuine helpful comment touching on",
        }
        lead = lead_ins.get(group, "engage on")
        return lead + " " + ", ".join(relevance_tags[:3])

    if group == "karma_building":
        return "general expertise comment for karma building"
    return "tangentially relevant -- low priority"
274
+
275
+
276
def extract_pain_points(title: str, selftext: str) -> Dict[str, Any]:
    """Extract actionable product insights from a post's text.

    The title and body are joined and lower-cased, then searched for every
    phrase in ``PAIN_CATEGORIES``. Matching is a case-insensitive substring
    test.

    Returns a dict with:
        pain_point: the stripped title, truncated to 120 characters
        delimit_relevance: existing_feature | planned_feature | new_opportunity | not_relevant
        suggested_ledger_item: one-line ledger title (empty string if not relevant)
        product_insight: one-sentence takeaway for the first matched category
        matched_categories: list of PAIN_CATEGORIES keys that matched,
            in PAIN_CATEGORIES order
    """
    combined = (title + " " + selftext).lower()

    # Lower-case each phrase as well: the text is lower-cased, so a phrase
    # containing uppercase letters (e.g. "API broke") could otherwise never
    # match.
    matched_cats: List[str] = [
        category
        for category, phrases in PAIN_CATEGORIES.items()
        if any(phrase.lower() in combined for phrase in phrases)
    ]

    if not matched_cats:
        return {
            "pain_point": "",
            "delimit_relevance": "not_relevant",
            "suggested_ledger_item": "",
            "product_insight": "",
            "matched_categories": [],
        }

    # Overall relevance is the highest-priority relevance among all matched
    # categories (earlier in this tuple wins).
    relevance_priority = ("existing_feature", "planned_feature", "new_opportunity")
    found_relevances = {_PAIN_TO_RELEVANCE.get(cat) for cat in matched_cats}
    best_relevance = next(
        (rel for rel in relevance_priority if rel in found_relevances),
        "not_relevant",
    )

    # Build pain_point: summarize from title (truncated, cleaned)
    pain_point = title.strip()
    if len(pain_point) > 120:
        pain_point = pain_point[:117] + "..."

    # The first matched category drives the ledger prefix and the insight.
    primary_cat = matched_cats[0]
    readable_cat = primary_cat.replace("_", " ")

    cat_labels = {
        "context_loss": "Context persistence",
        "rate_limits": "Rate limit mitigation",
        "multi_model": "Multi-model workflow",
        "code_quality": "Code safety guardrail",
        "session_management": "Session management",
        "governance": "API governance",
        "onboarding": "Onboarding flow",
        "cost": "Cost management",
    }
    ledger_prefix = cat_labels.get(primary_cat, readable_cat.title())

    # Ledger item: category label plus the first 80 chars of the title.
    ledger_item = ""
    if best_relevance != "not_relevant":
        short_title = title.strip()[:80].rstrip(".")
        ledger_item = f"{ledger_prefix}: {short_title}"

    # Build product insight
    cat_insights = {
        "context_loss": "Users lose productivity when context does not persist across sessions",
        "rate_limits": "Rate limits and session caps are a recurring friction point for power users",
        "multi_model": "Users want to move between AI tools without rebuilding context",
        "code_quality": "Users fear AI making destructive changes without guardrails",
        "session_management": "Session lifecycle management is a top concern for daily AI users",
        "governance": "Teams need automated detection of breaking changes in APIs",
        "onboarding": "New users struggle with initial setup and configuration",
        "cost": "Cost predictability and transparency matter to individual developers",
    }
    insight = cat_insights.get(primary_cat, f"Users express frustration with {readable_cat}")

    return {
        "pain_point": pain_point,
        "delimit_relevance": best_relevance,
        "suggested_ledger_item": ledger_item,
        "product_insight": insight,
        "matched_categories": matched_cats,
    }
363
+
364
+
365
def _build_product_insights(scored_posts: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Roll the per-post ``pain_points`` up into one product-insight summary.

    Posts without ``pain_points`` or without any matched category are
    ignored.

    Returns a dict with:
        top_pain_points: up to 10 ``{"category", "count"}`` entries, most
            frequent first
        new_opportunities: up to 20 entries for posts rated
            ``new_opportunity`` that carry a suggested ledger item
        existing_feature_validation: up to 20 entries for posts rated
            ``existing_feature``
    """
    from collections import Counter

    category_counts: Counter = Counter()
    opportunities: List[Dict[str, str]] = []
    validations: List[Dict[str, str]] = []

    for post in scored_posts:
        pain = post.get("pain_points")
        categories = pain.get("matched_categories") if pain else None
        if not categories:
            continue

        for category in categories:
            category_counts[category] += 1

        summary = {
            "title": post.get("title", ""),
            "subreddit": post.get("subreddit", ""),
            "url": post.get("url", ""),
            "pain_point": pain.get("pain_point", ""),
            "suggested_ledger_item": pain.get("suggested_ledger_item", ""),
        }

        rating = pain.get("delimit_relevance", "not_relevant")
        if rating == "existing_feature":
            validations.append(summary)
        elif rating == "new_opportunity" and pain.get("suggested_ledger_item"):
            opportunities.append(summary)

    # Most frequent categories first
    ranked_categories = category_counts.most_common(10)

    return {
        "top_pain_points": [
            {"category": name, "count": total} for name, total in ranked_categories
        ],
        "new_opportunities": opportunities[:20],
        "existing_feature_validation": validations[:20],
    }
412
+
413
+
414
def score_and_classify(
    posts: List[Dict[str, Any]],
    *,
    now: Optional[float] = None,
    venture_keywords: Optional[Dict[str, List[str]]] = None,
) -> List[Dict[str, Any]]:
    """Score and classify posts, returning them sorted by rank (best first).

    Args:
        posts: Post dicts as produced by ``fetch_all``. Missing or null
            numeric/text fields are treated as ``0`` / ``""``.
        now: Reference Unix timestamp in seconds; the current time is used
            only when this is ``None``.
        venture_keywords: Venture -> keyword phrases used for relevance
            tagging. When ``None`` or empty, ``_load_venture_keywords()``
            supplies them.

    Returns:
        New dicts (the inputs are not modified) carrying every original key
        plus:
            engagement_score, age_hours, freshness_mult, relevance_tags,
            karma_building, suggested_angle, priority, final_score, url,
            pain_points, rank
        sorted by ``final_score`` descending, with ``rank`` starting at 1.
    """
    kw = venture_keywords or _load_venture_keywords()
    # Compare against None so an explicit reference time of 0 is honoured.
    now_ts = time.time() if now is None else now
    scored: List[Dict[str, Any]] = []

    for post in posts:
        # `or` rather than a .get() default: a key that is present but null
        # must also fall back, otherwise the arithmetic below raises.
        score = post.get("score") or 0
        comments = post.get("num_comments") or 0
        created = post.get("created_utc") or 0
        group = post.get("group") or "unknown"
        title = post.get("title") or ""
        selftext = post.get("selftext") or ""

        age_h = _age_hours(created, now_ts)
        # Comments are weighted above upvotes.
        engagement = score * 0.4 + comments * 0.6
        fresh_mult = _freshness_multiplier(age_h)

        # Comment opportunity bonus: high engagement but room to comment
        comment_opp = 1.0
        if engagement > 5 and comments < 30:
            comment_opp = 1.3

        tags = _relevance_tags(title, selftext, kw)
        relevance_mult = 1.0 + 0.2 * min(len(tags), 5)  # up to 2.0x

        final_score = engagement * fresh_mult * comment_opp * relevance_mult
        is_karma = group == "karma_building"

        # Classification
        if post.get("stickied") or age_h > 48 or comments > 100:
            priority = "skip"
        elif final_score >= 30 and age_h < 12 and comments < 50:
            priority = "high_priority"
        elif final_score >= 10 or (len(tags) >= 2 and age_h < 24):
            priority = "medium_priority"
        elif final_score >= 3:
            priority = "low_priority"
        else:
            priority = "skip"

        angle = _suggest_angle(tags, group)
        pain = extract_pain_points(title, selftext)

        scored.append({
            **post,
            "engagement_score": round(engagement, 1),
            "age_hours": round(age_h, 1),
            "freshness_mult": fresh_mult,
            "relevance_tags": tags,
            "karma_building": is_karma,
            "suggested_angle": angle,
            "priority": priority,
            "final_score": round(final_score, 2),
            "url": f"https://reddit.com{post.get('permalink') or ''}",
            "pain_points": pain,
        })

    # Sort by final_score descending
    scored.sort(key=lambda x: x["final_score"], reverse=True)

    # Assign ranks
    for i, item in enumerate(scored):
        item["rank"] = i + 1

    return scored
492
+
493
+
494
+ # ---------------------------------------------------------------------------
495
+ # Main scan orchestrator
496
+ # ---------------------------------------------------------------------------
497
+
498
+
499
def scan_all(
    limit_per_sub: int = 10,
    sort: str = "hot",
    *,
    rate_limit: float = 4.0,
    proxy_url: str = PROXY_URL,
) -> Dict[str, Any]:
    """Run a complete scan: fetch, score, summarize and persist.

    All configured subreddits are fetched with the given listing ``sort``,
    per-subreddit ``limit_per_sub`` and request spacing ``rate_limit``; the
    posts are then scored and the result is written to disk via
    ``_save_scan`` before being returned.

    Returns a dict with the keys ``scanned_at`` (UTC ISO timestamp of the
    scan start), ``total_posts``, ``subreddits_scanned`` (distinct
    subreddits among the fetched posts), ``targets`` (scored posts whose
    priority is not ``skip``), ``by_group``, ``stats`` (post count per
    priority) and ``product_insights``.
    """
    started_at = datetime.now(timezone.utc)

    fetched = fetch_all(
        limit_per_sub=limit_per_sub,
        sort=sort,
        rate_limit=rate_limit,
        proxy_url=proxy_url,
    )
    ranked = score_and_classify(fetched)

    # One pass fills both the per-group buckets and the priority tally.
    grouped: Dict[str, List[Dict[str, Any]]] = {}
    tally: Dict[str, int] = {"high_priority": 0, "medium_priority": 0, "low_priority": 0, "skip": 0}
    for entry in ranked:
        grouped.setdefault(entry.get("group", "unknown"), []).append(entry)
        label = entry.get("priority", "skip")
        tally[label] = tally.get(label, 0) + 1

    # Only include non-skip targets in the top-level targets list
    actionable = [entry for entry in ranked if entry["priority"] != "skip"]

    # Product intelligence summary
    insights = _build_product_insights(ranked)

    distinct_subreddits = {post.get("subreddit", "") for post in fetched}

    result: Dict[str, Any] = {
        "scanned_at": started_at.isoformat(),
        "total_posts": len(fetched),
        "subreddits_scanned": len(distinct_subreddits),
        "targets": actionable,
        "by_group": grouped,
        "stats": tally,
        "product_insights": insights,
    }

    # Persist to disk
    _save_scan(result, started_at)

    return result
553
+
554
+
555
def _save_scan(result: Dict[str, Any], scan_time: datetime) -> Path:
    """Write a scan result as indented JSON under ``SCANS_DIR``.

    The directory is created when missing. The file is named after
    ``scan_time`` formatted as ``YYYY-MM-DDTHHMMSS`` with a ``.json``
    suffix; values JSON cannot encode natively are stringified.

    Returns the path of the written file.
    """
    SCANS_DIR.mkdir(parents=True, exist_ok=True)

    target = SCANS_DIR / scan_time.strftime("%Y-%m-%dT%H%M%S.json")
    payload = json.dumps(result, indent=2, default=str)
    target.write_text(payload)

    logger.info("Scan saved to %s", target)
    return target