@opendirectory.dev/skills 0.1.42 → 0.1.44

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,449 @@
1
+ """
2
+ research.py -- two-phase data collector for pricing-finder.
3
+
4
+ Phase 1 (discover): finds competitor candidates via DuckDuckGo search.
5
+ Phase 2 (fetch-pricing): fetches each competitor's pricing page with 3-tier fallback.
6
+
7
+ Uses only free dependencies -- no API keys required.
8
+ Optional: --tavily-key (better search), --firecrawl-key (better JS rendering).
9
+
10
+ Usage:
11
+ # Phase 1
12
+ python3 scripts/research.py \
13
+ --phase discover \
14
+ --product-analysis /tmp/pf-product-analysis.json \
15
+ --output /tmp/pf-competitors-raw.json
16
+
17
+ # Phase 2
18
+ python3 scripts/research.py \
19
+ --phase fetch-pricing \
20
+ --competitors /tmp/pf-competitors-confirmed.json \
21
+ --output /tmp/pf-pricing-raw.json
22
+ """
23
+
24
+ import argparse
25
+ import json
26
+ import os
27
+ import random
28
+ import ssl
29
+ import sys
30
+ import time
31
+ import urllib.request
32
+ from datetime import date
33
+ from urllib.parse import urlparse
34
+
35
+ try:
36
+ from ddgs import DDGS
37
+ except ImportError:
38
+ try:
39
+ from duckduckgo_search import DDGS
40
+ except ImportError:
41
+ DDGS = None
42
+
43
+ try:
44
+ import requests
45
+ import html2text
46
+ from bs4 import BeautifulSoup
47
+ except ImportError:
48
+ requests = None
49
+ html2text = None
50
+ BeautifulSoup = None
51
+
52
# NOTE(review): certificate verification is deliberately disabled for the
# urllib-based API calls in this script; acceptable for a best-effort
# scraper, but flag if its output is ever used for anything sensitive.
_ssl_ctx = ssl._create_unverified_context()

# Global verbosity flag; flipped on by --quiet in main().
quiet = False

# Rotated per request so fetches look like ordinary desktop browsers.
USER_AGENTS = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15",
]
61
+
62
+
63
def log(msg):
    """Write a progress message to stderr unless --quiet was given."""
    if quiet:
        return
    print(msg, file=sys.stderr)
66
+
67
+
68
def random_headers():
    """Return browser-like HTTP request headers with a random User-Agent."""
    ua = random.choice(USER_AGENTS)
    return {
        "User-Agent": ua,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
    }
76
+
77
+
78
+ # ---------------------------------------------------------------------------
79
+ # Search: DuckDuckGo (free) or Tavily (optional upgrade)
80
+ # ---------------------------------------------------------------------------
81
+
82
def ddg_search(query, max_results=8):
    """Run a DuckDuckGo text search.

    Returns a list of {title, url, snippet} dicts (snippet capped at
    300 chars). Returns [] when the library is missing or the search
    raises.
    """
    if DDGS is None:
        log(" ERROR: duckduckgo_search not installed. Run: pip install duckduckgo-search")
        return []
    log(f" DDG search: {query[:80]}")
    hits = []
    try:
        with DDGS() as ddgs:
            for r in ddgs.text(query, max_results=max_results):
                hits.append({
                    "title": r.get("title", ""),
                    "url": r.get("href", ""),
                    "snippet": r.get("body", "")[:300],
                })
    except Exception as e:
        log(f" DDG error: {e}")
        return []
    return hits
102
+
103
+
104
def tavily_search(query, key, max_results=8):
    """Query the Tavily API (optional upgrade); same shape as ddg_search."""
    log(f" Tavily search: {query[:80]}")
    body = {
        "api_key": key,
        "query": query,
        "search_depth": "advanced",
        "max_results": max_results,
    }
    try:
        req = urllib.request.Request(
            "https://api.tavily.com/search",
            data=json.dumps(body).encode(),
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        with urllib.request.urlopen(req, timeout=25, context=_ssl_ctx) as resp:
            data = json.loads(resp.read())
        return [
            {
                "title": r.get("title", ""),
                "url": r.get("url", ""),
                "snippet": r.get("content", "")[:300],
            }
            for r in data.get("results", [])
        ]
    except Exception as e:
        # Any failure (network, JSON, schema) degrades to "no results";
        # the caller then falls back to DuckDuckGo.
        log(f" Tavily error: {e}")
        return []
133
+
134
+
135
def search(query, tavily_key=None, max_results=8):
    """Dispatch to Tavily when a key is available, else DuckDuckGo.

    Also falls back to DuckDuckGo when Tavily returns no results.
    """
    if not tavily_key:
        return ddg_search(query, max_results)
    hits = tavily_search(query, tavily_key, max_results)
    return hits if hits else ddg_search(query, max_results)
142
+
143
+
144
+ # ---------------------------------------------------------------------------
145
+ # Fetch: requests+BS4 (free) or Firecrawl (optional upgrade)
146
+ # ---------------------------------------------------------------------------
147
+
148
def fetch_url_bs4(url, timeout=20):
    """Fetch *url* and convert its HTML to markdown.

    Returns (markdown, status): markdown is capped at 8000 chars and is
    empty on any failure, with the reason in status ("ok" on success).
    """
    if requests is None or html2text is None:
        return "", "requests/html2text not installed"
    try:
        resp = requests.get(
            url,
            headers=random_headers(),
            timeout=timeout,
            allow_redirects=True,
        )
        if resp.status_code != 200:
            return "", f"HTTP {resp.status_code}"
        converter = html2text.HTML2Text()
        converter.ignore_links = False      # keep links: pricing pages often link plan details
        converter.ignore_images = True
        converter.body_width = 0            # no re-wrapping; preserve table-ish layouts
        return converter.handle(resp.text)[:8000], "ok"
    except Exception as e:
        return "", str(e)
169
+
170
+
171
+ _GOOGLE_INTERSTITIAL_SIGNALS = [
172
+ "if you are not redirected",
173
+ "please click here",
174
+ "accounts.google.com/ServiceLogin",
175
+ ]
176
+
177
+
178
def fetch_google_cache(url, timeout=20):
    """Fetch the Google cache copy of *url* as a fallback for blocked pages.

    NOTE(review): webcache.googleusercontent.com appears to have been
    retired by Google; expect this tier to fail in practice -- confirm
    and consider removing it.
    """
    cache_url = f"https://webcache.googleusercontent.com/search?q=cache:{url}&hl=en"
    content, status = fetch_url_bs4(cache_url, timeout)
    if content:
        for sig in _GOOGLE_INTERSTITIAL_SIGNALS:
            if sig in content:
                # Got Google's interstitial, not the cached page.
                return "", "google_cache_interstitial"
    return content, status
185
+
186
+
187
def fetch_firecrawl(url, key, timeout=30):
    """Scrape *url* through the Firecrawl API (optional JS-rendering upgrade).

    Returns (markdown, status); markdown capped at 8000 chars, empty on
    failure with the error text in status.
    """
    body = {
        "url": url,
        "formats": ["markdown"],
        "onlyMainContent": True,
    }
    try:
        req = urllib.request.Request(
            "https://api.firecrawl.dev/v1/scrape",
            data=json.dumps(body).encode(),
            headers={
                "Authorization": f"Bearer {key}",
                "Content-Type": "application/json",
            },
            method="POST",
        )
        with urllib.request.urlopen(req, timeout=timeout, context=_ssl_ctx) as resp:
            data = json.loads(resp.read())
        # v1 nests markdown under "data"; keep the top-level fallback too.
        md = data.get("data", {}).get("markdown", "") or data.get("markdown", "")
        return md[:8000], "ok"
    except Exception as e:
        return "", str(e)
210
+
211
+
212
def infer_pricing_urls(base_url):
    """Return candidate pricing-page URLs derived from *base_url*.

    Probes the common paths (/pricing, /plans, /price, /pricing/) on the
    site root. Returns [] when *base_url* has no usable host, so callers
    never attempt malformed URLs -- previously an empty base_url produced
    candidates like "://pricing".
    """
    try:
        parsed = urlparse(base_url)
        # Without a hostname there is nothing to probe.
        if not parsed.netloc:
            return []
        # Default to https if the scheme is somehow missing.
        scheme = parsed.scheme or "https"
        base = f"{scheme}://{parsed.netloc}"
        return [
            f"{base}/pricing",
            f"{base}/plans",
            f"{base}/price",
            f"{base}/pricing/",
        ]
    except Exception:
        # urlparse rarely raises; stay fail-safe with no candidates.
        return []
225
+
226
+
227
def build_result(name, pricing_url, content, source, quality, note=None):
    """Package a fetched pricing page into the output record schema.

    `note` is attached as "data_quality_note" only when truthy.
    """
    record = dict(
        name=name,
        pricing_url=pricing_url,
        content=content,
        content_length=len(content),
        source=source,
        data_quality=quality,
    )
    if note:
        record["data_quality_note"] = note
    return record
239
+
240
+
241
+ # ---------------------------------------------------------------------------
242
+ # Phase 1: competitor discovery
243
+ # ---------------------------------------------------------------------------
244
+
245
def run_discover(product_analysis, tavily_key=None):
    """Phase 1: search the web for competitor candidates.

    Runs two queries (direct alternatives + category survey) and returns
    the raw results for downstream confirmation.
    """
    name = product_analysis.get("product_name", "")
    taxonomy = product_analysis.get("industry_taxonomy", {})
    l2 = taxonomy.get("l2", "")
    l3 = taxonomy.get("l3", "")

    log(f"\nPhase 1: competitor discovery for '{name}'")
    log(f" taxonomy: {l2} > {l3}")

    q1 = f"{name} competitors alternatives {l3}"
    q2 = f"{l2} {l3} software tool pricing plans"

    r1 = search(q1, tavily_key, max_results=8)
    r2 = search(q2, tavily_key, max_results=8)

    log(f" Q1 results: {len(r1)}")
    log(f" Q2 results: {len(r2)}")
    log("Phase 1 complete.")

    return {
        "date": str(date.today()),
        "product_name": name,
        "competitor_searches": [
            {"query": q1, "results": r1},
            {"query": q2, "results": r2},
        ],
    }
271
+
272
+
273
+ # ---------------------------------------------------------------------------
274
+ # Phase 2: fetch pricing pages
275
+ # ---------------------------------------------------------------------------
276
+
277
def fetch_pricing_page(competitor, firecrawl_key=None, tavily_key=None):
    """Fetch one competitor's pricing page with a tiered fallback chain.

    Tier order per candidate URL: Firecrawl (if keyed) -> plain requests
    -> Google cache. A fetch "wins" when it yields more than 500 chars of
    markdown. If every URL fails, falls back to search snippets, marked
    low quality with an explanatory note.
    """
    name = competitor.get("name", "")
    pricing_url = competitor.get("pricing_url", "")
    base_url = competitor.get("url", "")

    log(f"\n Fetching pricing: {name}")

    # Explicit pricing URL first, then inferred /pricing-style guesses;
    # deduplicate while preserving order.
    candidates = []
    initial = [pricing_url] if pricing_url else []
    for u in initial + infer_pricing_urls(base_url):
        if u and u not in candidates:
            candidates.append(u)

    for url in candidates:
        # Tier 1: Firecrawl (JS rendering), only when a key was supplied.
        if firecrawl_key:
            content, status = fetch_firecrawl(url, firecrawl_key)
            if len(content) > 500:
                log(f" firecrawl OK: {len(content)} chars from {url}")
                return build_result(name, url, content, "firecrawl", "high")
            log(f" firecrawl: {status} ({len(content)} chars)")

        # Tier 2: plain requests + html2text (free path).
        content, status = fetch_url_bs4(url)
        if len(content) > 500:
            log(f" requests OK: {len(content)} chars from {url}")
            return build_result(name, url, content, "requests", "high")
        log(f" requests: {status} ({len(content)} chars)")

        # Tier 3: Google cache copy (medium quality: may be stale).
        content, status = fetch_google_cache(url)
        if len(content) > 500:
            log(f" google_cache OK: {len(content)} chars from {url}")
            return build_result(name, url, content, "google_cache", "medium")
        log(f" google_cache: {status} ({len(content)} chars)")

        time.sleep(1)

    # Last resort: stitch search snippets together (prices may be partial).
    log(f" All fetch attempts failed. Using search snippet fallback.")
    fallback_query = f'"{name}" pricing plans cost per month tiers'
    results = search(fallback_query, tavily_key, max_results=5)
    snippet = " ".join(r.get("snippet", "") for r in results)[:3000]
    return build_result(
        name, pricing_url, snippet, "search_snippet", "low",
        note="Pricing page fetch failed. Data from search snippets only -- prices may be incomplete."
    )
327
+
328
+
329
def run_fetch_pricing(confirmed_competitors, firecrawl_key=None, tavily_key=None):
    """Phase 2: pull pricing data for every confirmed competitor."""
    log(f"\nPhase 2: fetching pricing pages for {len(confirmed_competitors)} competitors")

    results = []
    for comp in confirmed_competitors:
        results.append(fetch_pricing_page(comp, firecrawl_key, tavily_key))
        time.sleep(1.5)  # polite delay between sites

    log("\nPhase 2 complete.")
    return {
        "date": str(date.today()),
        "competitors_fetched": len(confirmed_competitors),
        "results": results,
    }
344
+
345
+
346
+ # ---------------------------------------------------------------------------
347
+ # CLI
348
+ # ---------------------------------------------------------------------------
349
+
350
def main():
    """CLI entry point: parse arguments, run the chosen phase, write JSON.

    Exits non-zero on missing dependencies or missing/invalid input files.
    """
    global quiet

    parser = argparse.ArgumentParser(description="pricing-finder research script")
    parser.add_argument("--phase", required=True,
                        choices=["discover", "fetch-pricing"],
                        help="Which phase to run")
    parser.add_argument("--product-analysis", required=True,
                        help="Path to pf-product-analysis.json")
    parser.add_argument("--competitors", default="",
                        help="Path to pf-competitors-confirmed.json (Phase 2 only)")
    parser.add_argument("--tavily-key", default=os.environ.get("TAVILY_API_KEY", ""),
                        help="Tavily API key for upgraded search (optional)")
    parser.add_argument("--firecrawl-key", default=os.environ.get("FIRECRAWL_API_KEY", ""),
                        help="Firecrawl API key for upgraded JS rendering (optional)")
    parser.add_argument("--output", required=True,
                        help="Path to write JSON output")
    parser.add_argument("--quiet", action="store_true",
                        help="Suppress progress output to stderr")
    args = parser.parse_args()
    quiet = args.quiet

    # Hard requirement: at least one working search backend.
    if DDGS is None and not args.tavily_key:
        print(
            "ERROR: duckduckgo_search not installed and no Tavily key provided.\n"
            "Run: pip install duckduckgo-search requests beautifulsoup4 html2text",
            file=sys.stderr,
        )
        sys.exit(1)

    if not os.path.exists(args.product_analysis):
        print(f"ERROR: {args.product_analysis} not found", file=sys.stderr)
        sys.exit(1)

    with open(args.product_analysis) as f:
        product_analysis = json.load(f)

    if args.phase == "discover":
        output = run_discover(product_analysis, args.tavily_key or None)
    else:
        # fetch-pricing (argparse `choices` guarantees one of the two phases).
        if not args.competitors:
            print("ERROR: --competitors required for fetch-pricing phase", file=sys.stderr)
            sys.exit(1)
        if not os.path.exists(args.competitors):
            print(f"ERROR: {args.competitors} not found", file=sys.stderr)
            sys.exit(1)
        with open(args.competitors) as f:
            competitors_data = json.load(f)
        confirmed = competitors_data.get("confirmed_competitors", [])
        if not confirmed:
            print("ERROR: no confirmed_competitors in input file", file=sys.stderr)
            sys.exit(1)
        output = run_fetch_pricing(
            confirmed,
            firecrawl_key=args.firecrawl_key or None,
            tavily_key=args.tavily_key or None,
        )

    with open(args.output, "w") as f:
        json.dump(output, f, indent=2)
    log(f"\nOutput written to {args.output}")

    # One-line summary on stdout for the calling skill to surface.
    if args.phase == "discover":
        total = sum(len(s["results"]) for s in output["competitor_searches"])
        print(f"Discover complete: {len(output['competitor_searches'])} queries, {total} total results")
    else:
        print(f"Fetch complete: {output['competitors_fetched']} competitors")
        for r in output.get("results", []):
            quality_label = {"high": "GOOD", "medium": "OK", "low": "SNIPPET ONLY"}.get(
                r["data_quality"], r["data_quality"]
            )
            print(f" {r['name']:22} {r['source']:15} {r['content_length']:5} chars [{quality_label}]")


if __name__ == "__main__":
    main()