devpost-scraper 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
@@ -0,0 +1,135 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import os
5
+ from typing import Any, Awaitable, Callable, Mapping
6
+
7
+ from backboard import BackboardClient
8
+ from backboard.exceptions import BackboardAPIError
9
+
10
# Signature for async tool-call handlers: takes the parsed tool arguments
# dict and returns a JSON-serializable result dict.
ToolHandler = Callable[[dict[str, Any]], Awaitable[dict[str, Any]]]
11
+
12
+
13
class BackboardClientError(Exception):
    """Signal a failed Backboard operation (missing config, bad run state, tool-loop errors)."""
15
+
16
+
17
def build_client() -> BackboardClient:
    """Instantiate a BackboardClient from the BACKBOARD_API_KEY env var.

    Raises:
        BackboardClientError: if the variable is unset or blank.
    """
    key = (os.getenv("BACKBOARD_API_KEY") or "").strip()
    if key:
        return BackboardClient(api_key=key)
    raise BackboardClientError(
        "Missing required environment variable `BACKBOARD_API_KEY`."
    )
24
+
25
+
26
async def ensure_assistant(
    client: BackboardClient,
    *,
    assistant_id: str | None,
    name: str,
    system_prompt: str,
    tools: list[dict[str, Any]],
) -> str:
    """Return an existing assistant id, or create a new assistant and return its id.

    If ``assistant_id`` is truthy it is returned as-is and the client is not
    called at all; otherwise a new assistant is created via the Backboard API.
    """
    if assistant_id:
        return assistant_id
    created = await client.create_assistant(
        name=name,
        system_prompt=system_prompt,
        tools=tools,
    )
    return str(created.assistant_id)
42
+
43
+
44
+ async def _collect_stream(stream: Any) -> dict[str, Any]:
45
+ """Drain a streaming add_message response into a unified result dict."""
46
+ content_parts: list[str] = []
47
+ tool_calls: list[Any] = []
48
+ run_id: str | None = None
49
+ status = "completed"
50
+
51
+ async for chunk in stream:
52
+ t = chunk.get("type")
53
+ if t == "content_streaming":
54
+ content_parts.append(chunk.get("content", ""))
55
+ elif t == "tool_submit_required":
56
+ status = "REQUIRES_ACTION"
57
+ run_id = chunk.get("run_id")
58
+ tool_calls = chunk.get("tool_calls", [])
59
+ elif t == "run_ended":
60
+ if chunk.get("status") not in (None, "completed"):
61
+ raise BackboardClientError(
62
+ f"Run ended with status: {chunk.get('status')}"
63
+ )
64
+
65
+ return {
66
+ "content": "".join(content_parts) or None,
67
+ "status": status,
68
+ "tool_calls": tool_calls,
69
+ "run_id": run_id,
70
+ }
71
+
72
+
73
async def run_in_thread(
    client: BackboardClient,
    *,
    assistant_id: str,
    user_message: str,
    tool_handlers: Mapping[str, ToolHandler],
    llm_provider: str = "openai",
    model_name: str = "gpt-4o-mini",
    max_tool_rounds: int = 6,
) -> str:
    """Create a thread, send a message via streaming, execute the tool loop.

    Repeatedly satisfies REQUIRES_ACTION rounds by dispatching each tool
    call to the matching handler in ``tool_handlers`` and submitting the
    JSON-encoded outputs, up to ``max_tool_rounds`` rounds.

    Returns:
        The assistant's final text content.

    Raises:
        BackboardClientError: on too many tool rounds, a missing run_id,
            missing tool_calls, an unregistered tool name, or an empty
            final response.
    """
    thread = await client.create_thread(assistant_id)

    stream = await client.add_message(
        thread_id=thread.thread_id,
        content=user_message,
        stream=True,
        llm_provider=llm_provider,
        model_name=model_name,
    )
    result = await _collect_stream(stream)

    rounds = 0
    while result["status"] == "REQUIRES_ACTION":
        rounds += 1
        if rounds > max_tool_rounds:
            raise BackboardClientError(
                f"Tool loop exceeded {max_tool_rounds} rounds — aborting."
            )
        if not result["run_id"]:
            raise BackboardClientError("REQUIRES_ACTION without run_id.")
        if not result["tool_calls"]:
            raise BackboardClientError("REQUIRES_ACTION without tool_calls.")

        tool_outputs = []
        for tc in result["tool_calls"]:
            # Tool calls may arrive either as plain dicts or as objects with
            # attribute access; support both shapes.
            name = tc["function"]["name"] if isinstance(tc, dict) else tc.function.name
            args_raw = (
                tc["function"].get("arguments", "{}")
                if isinstance(tc, dict)
                else (tc.function.arguments or "{}")
            )
            # Arguments may already be decoded; only json.loads when a string.
            args = args_raw if isinstance(args_raw, dict) else json.loads(args_raw or "{}")
            tc_id = tc["id"] if isinstance(tc, dict) else tc.id

            handler = tool_handlers.get(name)
            if handler is None:
                raise BackboardClientError(f"No handler registered for tool `{name}`.")

            call_result = await handler(args)
            tool_outputs.append({"tool_call_id": tc_id, "output": json.dumps(call_result)})

        # Submitting outputs yields a fresh stream; the loop re-checks its status.
        stream = await client.submit_tool_outputs(
            thread_id=thread.thread_id,
            run_id=result["run_id"],
            tool_outputs=tool_outputs,
            stream=True,
        )
        result = await _collect_stream(stream)

    if not result["content"]:
        raise BackboardClientError("Run completed without content.")
    return result["content"]
devpost_scraper/cli.py ADDED
@@ -0,0 +1,364 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import asyncio
5
+ import json
6
+ import os
7
+ import sys
8
+ from pathlib import Path
9
+ from typing import Any
10
+ from urllib.parse import urlparse
11
+
12
+ from dotenv import load_dotenv, set_key
13
+
14
+ from devpost_scraper.backboard_client import (
15
+ BackboardClientError,
16
+ build_client,
17
+ ensure_assistant,
18
+ run_in_thread,
19
+ )
20
+ from devpost_scraper.csv_export import write_projects
21
+ from devpost_scraper.models import DevpostProject, HackathonParticipant
22
+ from devpost_scraper.scraper import (
23
+ find_author_email,
24
+ find_participant_email,
25
+ get_hackathon_participants,
26
+ get_project_details,
27
+ search_projects,
28
+ )
29
+
30
# Dotenv file (relative to the working directory) used to persist state
# between runs.
_ENV_FILE = Path(".env")
# .env key under which the created assistant's id is stored for reuse.
_ASSISTANT_ID_KEY = "DEVPOST_ASSISTANT_ID"
32
+
33
# The assistant's ONLY job is to search and return raw project URLs.
# Python handles all enrichment directly — no tool loop explosion.
# NOTE: this is a runtime string sent to the model; _parse_search_results
# depends on the "JSON array only" contract stated here.
_SYSTEM_PROMPT = """\
You are a Devpost search assistant. Given a search term:

1. Call search_devpost_projects for page 1 and page 2.
2. Deduplicate results by URL.
3. Return ONLY a valid JSON array — no prose, no markdown, no code fences.

Each element: {"title": "...", "tagline": "...", "url": "...", "built_with": "..."}
built_with is a comma-separated string of technology names.
Never call the same tool with the same arguments twice.\
"""
46
+
47
# OpenAI-style function-tool schema advertised to the assistant.
# Exactly one tool is exposed: paginated Devpost project search, handled
# locally by _handle_search.
_TOOLS: list[dict[str, Any]] = [
    {
        "type": "function",
        "function": {
            "name": "search_devpost_projects",
            "description": "Search Devpost for hackathon projects matching a query.",
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {"type": "string", "description": "Search query term"},
                    "page": {"type": "integer", "description": "Page number (default 1)"},
                },
                "required": ["query"],
            },
        },
    },
]
64
+
65
+
66
async def _handle_search(args: dict[str, Any]) -> dict[str, Any]:
    """Tool handler: run a Devpost search for the assistant's tool call."""
    q = args["query"]
    p = int(args.get("page") or 1)
    print(f" [tool] search_devpost_projects(query={q!r}, page={p})", file=sys.stderr)
    return await search_projects(query=q, page=p)
71
+
72
+
73
# Maps each tool name from _TOOLS to the coroutine that executes it locally.
_TOOL_HANDLERS = {
    "search_devpost_projects": _handle_search,
}
76
+
77
+
78
async def _load_or_create_assistant(client: Any) -> str:
    """Return a cached assistant id from .env, creating and persisting one if absent.

    Side effects: may create the .env file and write DEVPOST_ASSISTANT_ID to it.
    """
    # override=True so a value written by a previous call in this process
    # is picked up rather than a stale environment value.
    load_dotenv(_ENV_FILE, override=True)
    stored_id = os.getenv(_ASSISTANT_ID_KEY, "").strip()
    if stored_id:
        print(f"[info] Reusing assistant {stored_id}", file=sys.stderr)
        return stored_id

    print("[info] Creating Backboard assistant…", file=sys.stderr)
    aid = await ensure_assistant(
        client,
        assistant_id=None,
        name="devpost-scraper-v3",
        system_prompt=_SYSTEM_PROMPT,
        tools=_TOOLS,
    )
    # Persist so later runs reuse the same assistant instead of creating new ones.
    _ENV_FILE.touch(exist_ok=True)
    set_key(str(_ENV_FILE), _ASSISTANT_ID_KEY, str(aid))
    print(f"[info] Created assistant {aid} — saved to .env", file=sys.stderr)
    return str(aid)
97
+
98
+
99
+ def _parse_search_results(raw: str) -> list[dict[str, Any]]:
100
+ raw = raw.strip()
101
+ if raw.startswith("```"):
102
+ raw = "\n".join(
103
+ line for line in raw.splitlines()
104
+ if not line.strip().startswith("```")
105
+ ).strip()
106
+ try:
107
+ data = json.loads(raw)
108
+ except json.JSONDecodeError as exc:
109
+ raise SystemExit(
110
+ f"[error] Assistant returned invalid JSON: {exc}\n\nRaw:\n{raw}"
111
+ ) from exc
112
+ if not isinstance(data, list):
113
+ raise SystemExit(f"[error] Expected JSON array, got {type(data).__name__}")
114
+ return [item for item in data if isinstance(item, dict) and item.get("url")]
115
+
116
+
117
async def _enrich_project(
    item: dict[str, Any],
    search_term: str,
) -> DevpostProject:
    """Build a DevpostProject row from one search hit.

    Scrapes the project detail page and walks the author-email chain;
    both steps are best-effort — failures are logged to stderr and the
    corresponding fields fall back to the search-result values or "".
    """
    url = item["url"]

    # detail page enrichment
    details: dict[str, Any] = {}
    try:
        details = await get_project_details(url=url)
        print(f" [enrich] details {url}", file=sys.stderr)
    except Exception as exc:
        # Deliberate broad catch: a single bad page must not abort the run.
        print(f" [warn] details failed for {url}: {exc}", file=sys.stderr)

    # email chain
    email_data: dict[str, Any] = {}
    try:
        email_data = await find_author_email(project_url=url)
        if email_data.get("email"):
            print(f" [email] {email_data['email']} ← {url}", file=sys.stderr)
        else:
            print(f" [email] (none found) ← {url}", file=sys.stderr)
    except Exception as exc:
        print(f" [warn] email failed for {url}: {exc}", file=sys.stderr)

    author_urls: list[str] = email_data.get("author_profile_urls", [])

    # Detail-page values win; search-result values are the fallback.
    return DevpostProject(
        search_term=search_term,
        title=details.get("title") or item.get("title", ""),
        tagline=details.get("tagline") or item.get("tagline", ""),
        url=url,
        hackathon_name=details.get("hackathon_name", ""),
        hackathon_url=details.get("hackathon_url", ""),
        summary=details.get("summary", ""),
        built_with=details.get("built_with") or item.get("built_with", ""),
        prizes=details.get("prizes", ""),
        team_size=details.get("team_size", ""),
        author_profile_url=author_urls[0] if author_urls else "",
        email=email_data.get("email", ""),
    )
158
+
159
+
160
async def run(search_terms: list[str], output: str | None) -> None:
    """Orchestrate the pipeline: assistant-driven search per term, then local enrichment and CSV export.

    Args:
        search_terms: Devpost search queries, processed sequentially.
        output: CSV path, or None to print CSV to stdout.
    """
    load_dotenv(_ENV_FILE, override=True)
    client = build_client()
    assistant_id = await _load_or_create_assistant(client)

    all_projects: list[DevpostProject] = []

    for term in search_terms:
        print(f"\n[info] Searching Devpost for: {term!r}", file=sys.stderr)
        # The assistant only performs the search (returns a JSON array of
        # project stubs); all enrichment happens locally below.
        raw = await run_in_thread(
            client,
            assistant_id=assistant_id,
            user_message=(
                f"Search Devpost for: {term!r}\n"
                "Collect page 1 and page 2. Return a JSON array of projects."
            ),
            tool_handlers=_TOOL_HANDLERS,
            llm_provider=os.getenv("BACKBOARD_LLM_PROVIDER", "openai"),
            model_name=os.getenv("BACKBOARD_MODEL", "gpt-4o-mini"),
        )
        items = _parse_search_results(raw)
        print(f"[info] Found {len(items)} projects — enriching…", file=sys.stderr)

        # Enrich sequentially to be polite to external sites
        projects: list[DevpostProject] = []
        for item in items:
            project = await _enrich_project(item, search_term=term)
            projects.append(project)

        print(f"[info] Collected {len(projects)} projects for {term!r}", file=sys.stderr)
        all_projects.extend(projects)

    print(f"\n[info] Total projects: {len(all_projects)}", file=sys.stderr)
    write_projects(all_projects, output)
    if output:
        print(f"[info] Wrote → {output}", file=sys.stderr)
196
+
197
+
198
def main() -> None:
    """CLI entry point for devpost-scraper: parse args, then run the async pipeline."""
    parser = argparse.ArgumentParser(
        prog="devpost-scraper",
        description="Extract Devpost project data and export to CSV.",
    )
    add = parser.add_argument
    add(
        "search_terms",
        nargs="+",
        metavar="TERM",
        help="One or more search terms to query on Devpost",
    )
    add(
        "--output", "-o",
        metavar="FILE",
        default=None,
        help="Output CSV file path (default: stdout)",
    )
    ns = parser.parse_args()
    asyncio.run(run(search_terms=ns.search_terms, output=ns.output))


if __name__ == "__main__":
    main()
221
+
222
+
223
# .env key holding the Devpost session cookie value used for
# authenticated participants-page requests.
_PARTICIPANTS_JWT_KEY = "DEVPOST_SESSION"
224
+
225
+
226
async def _run_participants(
    hackathon_url: str,
    jwt_token: str,
    output: str | None,
    no_email: bool,
) -> None:
    """Crawl all participant pages of a hackathon and export rows to CSV.

    Args:
        hackathon_url: hackathon participants URL to crawl.
        jwt_token: Devpost session cookie value for authenticated requests.
        output: CSV path; None prints CSV to stdout.
        no_email: when True, skip the per-participant email/link enrichment.
    """
    all_participants: list[HackathonParticipant] = []
    page = 1

    print(f"[info] Fetching participants from {hackathon_url}", file=sys.stderr)

    # Page through until the server reports no further pages or an empty batch.
    while True:
        data = await get_hackathon_participants(hackathon_url, jwt_token, page=page)
        batch = data.get("participants", [])
        has_more = data.get("has_more", False)

        if not batch:
            print(f"[info] No participants on page {page}, stopping.", file=sys.stderr)
            break

        print(f"[info] Page {page}: {len(batch)} participants", file=sys.stderr)

        for raw in batch:
            profile_url = raw.get("profile_url", "")
            email = ""
            github_url = ""
            linkedin_url = ""

            # Optional enrichment: best-effort, failures only logged.
            if not no_email and profile_url:
                try:
                    email_data = await find_participant_email(profile_url)
                    email = email_data.get("email", "")
                    github_url = email_data.get("github_url", "")
                    linkedin_url = email_data.get("linkedin_url", "")
                    parts = [f for f in [email, github_url, linkedin_url] if f]
                    if parts:
                        print(f" [found] {', '.join(parts)} ← {profile_url}", file=sys.stderr)
                    else:
                        print(f" [none] ← {profile_url}", file=sys.stderr)
                except Exception as exc:
                    print(f" [warn] enrich failed for {profile_url}: {exc}", file=sys.stderr)

            all_participants.append(
                HackathonParticipant(
                    hackathon_url=hackathon_url,
                    username=raw.get("username", ""),
                    name=raw.get("name", ""),
                    specialty=raw.get("specialty", ""),
                    profile_url=profile_url,
                    github_url=github_url,
                    linkedin_url=linkedin_url,
                    email=email,
                )
            )

        if not has_more:
            break
        page += 1

    print(f"\n[info] Total participants: {len(all_participants)}", file=sys.stderr)

    import csv

    fieldnames = HackathonParticipant.fieldnames()
    rows = [p.model_dump() for p in all_participants]

    if output:
        with open(output, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(rows)
        print(f"[info] Wrote → {output}", file=sys.stderr)
    else:
        # Build the CSV in memory so it is emitted as one stdout write.
        import io
        buf = io.StringIO()
        writer = csv.DictWriter(buf, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)
        print(buf.getvalue())
305
+
306
+
307
def participants_main() -> None:
    """CLI entry point for devpost-participants: parse args, resolve the session cookie, crawl.

    Side effects: may write the --jwt value to .env (key DEVPOST_SESSION)
    for reuse on later runs.
    """
    load_dotenv(_ENV_FILE, override=True)

    parser = argparse.ArgumentParser(
        prog="devpost-participants",
        description="Crawl Devpost hackathon participants page and export to CSV.",
    )
    parser.add_argument(
        "hackathon_url",
        metavar="URL",
        help="Hackathon participants URL (e.g. https://hack-days-niet.devpost.com/participants)",
    )
    parser.add_argument(
        "--jwt",
        metavar="TOKEN",
        default=None,
        help="Value of the _devpost session cookie from your browser. Falls back to DEVPOST_SESSION in .env",
    )
    parser.add_argument(
        "--output", "-o",
        metavar="FILE",
        default=None,
        help="Output CSV file path (default: stdout)",
    )
    parser.add_argument(
        "--no-email",
        action="store_true",
        default=False,
        help="Skip email enrichment (faster)",
    )
    args = parser.parse_args()

    # Default the output filename to the hackathon subdomain slug.
    if not args.output:
        parsed = urlparse(args.hackathon_url)
        slug = parsed.hostname.split(".")[0] if parsed.hostname else "hackathon"
        args.output = f"{slug}-participants.csv"
        print(f"[info] No -o given, defaulting to {args.output}", file=sys.stderr)

    jwt_token = args.jwt or os.getenv(_PARTICIPANTS_JWT_KEY, "").strip()
    if not jwt_token:
        raise SystemExit(
            "[error] No session cookie. Pass --jwt TOKEN or set DEVPOST_SESSION in .env\n"
            " Copy the _devpost cookie value from browser DevTools → Application → Cookies"
        )

    # Persist JWT to .env for reuse
    if args.jwt:
        _ENV_FILE.touch(exist_ok=True)
        set_key(str(_ENV_FILE), _PARTICIPANTS_JWT_KEY, args.jwt)

    asyncio.run(
        _run_participants(
            hackathon_url=args.hackathon_url,
            jwt_token=jwt_token,
            output=args.output,
            no_email=args.no_email,
        )
    )
@@ -0,0 +1,30 @@
1
+ from __future__ import annotations
2
+
3
+ import csv
4
+ import sys
5
+ from pathlib import Path
6
+ from typing import Iterable
7
+
8
+ from devpost_scraper.models import DevpostProject
9
+
10
+
11
def write_projects(projects: Iterable[DevpostProject], output: str | None) -> None:
    """Write projects to CSV.

    Args:
        projects: project records to serialize, one CSV row each.
        output: destination path (parent directories are created as
            needed); ``None`` writes to stdout instead.
    """
    fieldnames = DevpostProject.fieldnames()

    if output:
        path = Path(output)
        path.parent.mkdir(parents=True, exist_ok=True)
        fh = path.open("w", newline="", encoding="utf-8")
        close = True
    else:
        fh = sys.stdout
        close = False

    try:
        # extrasaction="ignore": model_dump() may carry fields outside the CSV schema.
        writer = csv.DictWriter(fh, fieldnames=fieldnames, extrasaction="ignore")
        writer.writeheader()
        for project in projects:
            writer.writerow(project.model_dump())
    finally:
        # Fix: previously the file handle leaked if serializing a row raised.
        if close:
            fh.close()
@@ -0,0 +1,54 @@
1
+ from __future__ import annotations
2
+
3
+ from pydantic import BaseModel, ConfigDict
4
+
5
+
6
class HackathonParticipant(BaseModel):
    """One hackathon attendee row, plus any contact links found during enrichment."""

    # Ignore unknown keys so scraped dicts can be passed straight in.
    model_config = ConfigDict(extra="ignore")

    hackathon_url: str = ""
    username: str = ""      # Devpost username (profile slug)
    name: str = ""          # display name as shown on the participant card
    specialty: str = ""     # e.g. "Full Stack Developer", derived from card CSS class
    profile_url: str = ""   # https://devpost.com/<username>
    github_url: str = ""    # filled by enrichment, may stay empty
    linkedin_url: str = ""  # filled by enrichment, may stay empty
    email: str = ""         # filled by enrichment, may stay empty

    @classmethod
    def fieldnames(cls) -> list[str]:
        """CSV column order for export; mirrors the field declaration order."""
        return ["hackathon_url", "username", "name", "specialty", "profile_url", "github_url", "linkedin_url", "email"]
21
+
22
+
23
class DevpostProject(BaseModel):
    """One scraped Devpost project row, combining search-result and detail-page data."""

    # Ignore unknown keys so scraped dicts can be passed straight in.
    model_config = ConfigDict(extra="ignore")

    search_term: str = ""         # query that surfaced this project
    title: str = ""
    tagline: str = ""
    url: str = ""                 # canonical Devpost project URL
    hackathon_name: str = ""
    hackathon_url: str = ""
    summary: str = ""             # detail-page text, truncated by the scraper
    built_with: str = ""          # comma-separated technology names
    prizes: str = ""              # "; "-joined prize strings
    team_size: str = ""           # stringified count, "" when unknown
    author_profile_url: str = ""  # first author profile found, if any
    email: str = ""               # best-effort author email, may stay empty

    @classmethod
    def fieldnames(cls) -> list[str]:
        """CSV column order for export; mirrors the field declaration order."""
        return [
            "search_term",
            "title",
            "tagline",
            "url",
            "hackathon_name",
            "hackathon_url",
            "summary",
            "built_with",
            "prizes",
            "team_size",
            "author_profile_url",
            "email",
        ]
@@ -0,0 +1,510 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import re
5
+ from typing import Any
6
+ from urllib.parse import urljoin, urlparse
7
+
8
+ import httpx
9
+ from bs4 import BeautifulSoup
10
+
11
# Loose email matcher for scraping page text; requires a dot in the domain
# part (TLD of 2+ letters). Not a full RFC 5322 validator by design.
_EMAIL_RE = re.compile(r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}")
12
+
13
+ # Domains we will follow when walking external links from a Devpost profile
14
+ _WALKABLE_DOMAINS = {
15
+ "github.com",
16
+ "linktr.ee",
17
+ "bio.link",
18
+ "beacons.ai",
19
+ "linkin.bio",
20
+ "carrd.co",
21
+ "about.me",
22
+ "bento.me",
23
+ }
24
+
25
+ _SEARCH_URL = "https://devpost.com/software/search"
26
+ _GITHUB_API_URL = "https://api.github.com/users"
27
+
28
+
29
+ def _github_headers() -> dict[str, str]:
30
+ """Build GitHub API headers, including auth token if GITHUB_TOKEN is set."""
31
+ headers = {
32
+ "Accept": "application/vnd.github+json",
33
+ "User-Agent": "devpost-scraper/1.0",
34
+ }
35
+ token = os.environ.get("GITHUB_TOKEN", "").strip()
36
+ if token:
37
+ headers["Authorization"] = f"Bearer {token}"
38
+ return headers
39
# Headers for Devpost XHR/JSON endpoints; X-Requested-With makes the
# server return JSON instead of the HTML page.
_JSON_HEADERS = {
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "X-Requested-With": "XMLHttpRequest",
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/122.0.0.0 Safari/537.36"
    ),
}
# Headers for plain HTML page fetches; shares the browser User-Agent above.
_HTML_HEADERS = {
    "Accept": "text/html,application/xhtml+xml",
    "User-Agent": _JSON_HEADERS["User-Agent"],
}
52
+
53
+
54
async def search_projects(query: str, page: int = 1) -> dict[str, Any]:
    """Search Devpost projects via the JSON search endpoint.

    Returns a dict with keys "projects" (normalized entries), "total_count",
    "page" and "per_page".
    """
    async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
        resp = await client.get(
            _SEARCH_URL,
            params={"query": query, "page": page},
            headers=_JSON_HEADERS,
        )
        resp.raise_for_status()
        payload = resp.json()

    def _normalize(entry: dict[str, Any]) -> dict[str, Any]:
        # built_with may be a list of names or an arbitrary scalar; flatten to str.
        tech = entry.get("built_with") or []
        return {
            "title": entry.get("name", ""),
            "tagline": entry.get("tagline", ""),
            "url": entry.get("url", ""),
            "built_with": ", ".join(tech) if isinstance(tech, list) else str(tech),
            "like_count": entry.get("like_count", 0),
        }

    return {
        "projects": [_normalize(e) for e in payload.get("software", [])],
        "total_count": payload.get("total_count", 0),
        "page": page,
        "per_page": payload.get("per_page", 24),
    }
84
+
85
+
86
async def get_project_details(url: str) -> dict[str, Any]:
    """Fetch a Devpost project page and extract detail fields.

    Returns a dict with string values for title, tagline, url, summary,
    built_with, hackathon_name, hackathon_url, prizes and team_size;
    fields default to "" when the corresponding markup is absent.

    NOTE(review): all selectors target Devpost's current markup — if the
    site changes its templates these fields silently come back empty.
    """
    async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
        resp = await client.get(url, headers=_HTML_HEADERS)
        resp.raise_for_status()
        html = resp.text

    soup = BeautifulSoup(html, "html.parser")

    # Each lookup tries the primary selector, then a legacy/alternate one.
    title = _text(soup.select_one("h1#app-title") or soup.select_one("h1.app_title"))
    tagline = _text(soup.select_one("p#app-details-header-tagline") or soup.select_one("p.large"))

    summary_el = soup.select_one("div#app-details") or soup.select_one("div.app-details")
    # Cap the summary at 500 characters of flattened text.
    summary = summary_el.get_text(" ", strip=True)[:500] if summary_el else ""

    built_tags = [t.get_text(strip=True) for t in soup.select("span.cp-tag")]
    built_with = ", ".join(built_tags)

    hackathon_name = ""
    hackathon_url = ""
    challenge_link = soup.select_one("a.challenge-link") or soup.select_one("a[href*='/hackathons/']")
    if challenge_link:
        hackathon_name = challenge_link.get_text(strip=True)
        hackathon_url = challenge_link.get("href", "")

    prizes: list[str] = []
    for prize_el in soup.select("div.prize, li.prize, span.prize-name"):
        text = prize_el.get_text(strip=True)
        if text:
            prizes.append(text)

    # Team size is inferred from the number of member elements on the page.
    team_members = soup.select("ul#app-team li, div.software-team-member")
    team_size = str(len(team_members)) if team_members else ""

    return {
        "title": title,
        "tagline": tagline,
        "url": url,
        "summary": summary,
        "built_with": built_with,
        "hackathon_name": hackathon_name,
        "hackathon_url": hackathon_url,
        "prizes": "; ".join(prizes),
        "team_size": team_size,
    }
131
+
132
+
133
+ def _text(el: Any) -> str:
134
+ if el is None:
135
+ return ""
136
+ return el.get_text(strip=True)
137
+
138
+
139
def _extract_emails(html: str) -> list[str]:
    """Find all email addresses in an HTML document (mailto: + bare text).

    Addresses are lowercased and deduplicated. NOTE: results come from a
    set, so the returned order is unspecified.
    """
    soup = BeautifulSoup(html, "html.parser")
    found: set[str] = set()

    # mailto: links — strip any "?subject=..." query suffix.
    for a in soup.find_all("a", href=True):
        href: str = a["href"]
        if href.startswith("mailto:"):
            addr = href[7:].split("?")[0].strip()
            if addr:
                found.add(addr.lower())

    # Bare addresses in the rendered text (tags collapsed to spaces).
    for match in _EMAIL_RE.finditer(soup.get_text(" ")):
        found.add(match.group().lower())

    # Filter out obviously invalid / placeholder emails
    return [e for e in found if "." in e.split("@")[-1] and len(e) < 80]
156
+
157
+
158
# Single-segment devpost.com paths that are site sections, not usernames;
# used to filter candidate profile links.
_DEVPOST_NON_PROFILE_PATHS = {
    "software", "hackathons", "settings", "portfolio", "search",
    "about", "contact", "help", "careers", "login", "register",
}
162
+
163
+
164
async def get_author_profile_urls(project_url: str) -> dict[str, Any]:
    """From a Devpost project page, return the author profile URLs.

    Heuristic: any devpost.com link whose path is a single slug that is not
    a known site section is treated as a profile. Returns
    {"author_profile_urls": [deduped, insertion-ordered URLs]}.
    """
    async with httpx.AsyncClient(timeout=20.0, follow_redirects=True) as client:
        resp = await client.get(project_url, headers=_HTML_HEADERS)
        resp.raise_for_status()
        html = resp.text

    soup = BeautifulSoup(html, "html.parser")
    profiles: list[str] = []

    for a in soup.find_all("a", href=True):
        href: str = a["href"]
        # Resolve site-relative links against the devpost.com origin.
        if href.startswith("/"):
            href = f"https://devpost.com{href}"
        parsed = urlparse(href)
        if parsed.netloc not in ("devpost.com", "www.devpost.com"):
            continue
        path_parts = [p for p in parsed.path.strip("/").split("/") if p]
        if len(path_parts) != 1:
            continue
        slug = path_parts[0]
        if slug in _DEVPOST_NON_PROFILE_PATHS:
            continue
        # Devpost usernames are alphanumeric + dashes, no dots or slashes
        if re.match(r"^[a-zA-Z0-9_\-]+$", slug):
            profiles.append(f"https://devpost.com/{slug}")

    # dict.fromkeys deduplicates while preserving first-seen order.
    return {"author_profile_urls": list(dict.fromkeys(profiles))}
192
+
193
+
194
async def get_profile_external_links(profile_url: str) -> dict[str, Any]:
    """From a Devpost author profile, return external links and on-page emails.

    Args:
        profile_url: Devpost profile URL to fetch.

    Returns:
        {"profile_url": ..., "external_links": [deduped http(s) links whose
        host is not devpost.com], "emails_on_profile": [emails in page text]}.
    """
    async with httpx.AsyncClient(timeout=20.0, follow_redirects=True) as client:
        resp = await client.get(profile_url, headers=_HTML_HEADERS)
        resp.raise_for_status()
        html = resp.text

    soup = BeautifulSoup(html, "html.parser")
    external: list[str] = []
    emails = _extract_emails(html)

    for a in soup.find_all("a", href=True):
        href: str = a["href"]
        parsed = urlparse(href)
        if parsed.scheme not in ("http", "https"):
            continue
        # BUG FIX: lstrip("www.") treats the argument as a character set and
        # strips any run of leading 'w'/'.' chars, mangling hosts (e.g.
        # "web.example.com" -> "eb.example.com"); removeprefix drops only the
        # literal "www." prefix.
        domain = parsed.netloc.removeprefix("www.")
        if domain and domain not in ("devpost.com",):
            external.append(href)

    return {
        "profile_url": profile_url,
        "external_links": list(dict.fromkeys(external)),
        "emails_on_profile": emails,
    }
219
+
220
+
221
async def extract_emails_from_url(url: str) -> dict[str, Any]:
    """Fetch any URL and return {"url", "emails"}; on fetch failure the dict
    carries an "error" key instead of raising."""
    try:
        async with httpx.AsyncClient(timeout=20.0, follow_redirects=True) as client:
            resp = await client.get(url, headers=_HTML_HEADERS)
            resp.raise_for_status()
            page = resp.text
    except Exception as exc:
        # Best-effort by design: link walking must never abort enrichment.
        return {"url": url, "emails": [], "error": str(exc)}

    return {"url": url, "emails": _extract_emails(page)}
233
+
234
+
235
async def get_hackathon_participants(
    hackathon_url: str,
    jwt_token: str,
    page: int = 1,
) -> dict[str, Any]:
    """
    Fetch one page of participants from a Devpost hackathon participants page.
    Requires a valid Devpost session token (sent as the ``_devpost`` cookie).
    Returns {"participants": [...], "has_more": bool, "page": int}.
    """
    # Normalize to the bare hackathon base, then append /participants once.
    base = hackathon_url.rstrip("/").removesuffix("/participants")
    url = f"{base}/participants"

    # Devpost serves participant HTML fragments via XHR; plain GET returns empty on page 2+
    headers = {
        **_JSON_HEADERS,
        "Accept": "text/javascript, application/javascript",
        "Cookie": f"_devpost={jwt_token}",
        "Referer": url,
    }

    async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
        resp = await client.get(url, params={"page": page}, headers=headers)
        resp.raise_for_status()
        html = resp.text

    soup = BeautifulSoup(html, "html.parser")

    # Each participant is a div.participant with data-participant-id
    cards = soup.select("div.participant")

    # CSS classes on participant card encode specialty, e.g. "participant full-stack-developer"
    _CARD_SKIP_CLASSES = {"participant"}

    participants: list[dict[str, Any]] = []
    for card in cards:
        link = card.select_one("a.user-profile-link")
        if not link:
            # Skip cards without a profile link (e.g. anonymized entries).
            continue
        profile_href: str = link.get("href", "")
        if profile_href.startswith("/"):
            profile_href = f"https://devpost.com{profile_href}"

        # Username is the first path segment of the profile URL.
        parsed = urlparse(profile_href)
        slug = parsed.path.strip("/").split("/")[0] if parsed.path else ""

        # Name lives in img[alt] or h5 inside .user-name
        img = card.select_one("img[alt]")
        name = img["alt"].strip() if img and img.get("alt") else slug

        # Specialty is encoded as extra CSS class on the card div
        card_classes = [c for c in card.get("class", []) if c not in _CARD_SKIP_CLASSES]
        specialty = card_classes[0].replace("-", " ").title() if card_classes else ""

        participants.append({
            "username": slug,
            "name": name,
            "profile_url": profile_href,
            "specialty": specialty,
        })

    # Pagination: Devpost renders <a rel="next"> when more pages exist
    next_link = soup.select_one('a[rel="next"]')
    has_more = next_link is not None

    return {"participants": participants, "has_more": has_more, "page": page}
301
+
302
+
303
+ _GITHUB_ORG_PATHS = {"orgs", "repos", "topics", "collections", "explore", "marketplace", "about"}
304
+
305
+
306
+ _NOREPLY_SUFFIXES = ("@users.noreply.github.com",)
307
+
308
+
309
+ def _github_username_from_url(github_url: str) -> str:
310
+ """Extract a GitHub username from a profile URL. Returns '' for non-user URLs."""
311
+ parsed = urlparse(github_url)
312
+ path_parts = [p for p in parsed.path.strip("/").split("/") if p]
313
+ if len(path_parts) != 1 or path_parts[0] in _GITHUB_ORG_PATHS:
314
+ return ""
315
+ return path_parts[0]
316
+
317
+
318
+ def _is_real_email(email: str) -> bool:
319
+ """Filter out GitHub noreply and placeholder addresses."""
320
+ if not email:
321
+ return False
322
+ email = email.lower().strip()
323
+ if email.endswith(_NOREPLY_SUFFIXES):
324
+ return False
325
+ if "noreply" in email or "github.com" in email:
326
+ return False
327
+ return "." in email.split("@")[-1]
328
+
329
+
330
async def get_github_email(github_url: str) -> str:
    """
    Try three GitHub API strategies to find a user's email:
    1. /users/{user} — public profile email field (often private)
    2. /users/{user}/repos?sort=pushed → /repos/{owner}/{repo}/commits — mine commit author email
    3. /users/{user}/events/public — fallback: PushEvent commit payloads

    Returns "" when no real address is found, the URL is not a user profile,
    or any request fails (best-effort: all exceptions are swallowed).
    """
    username = _github_username_from_url(github_url)
    if not username:
        return ""

    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            # Strategy 1: profile email
            resp = await client.get(
                f"{_GITHUB_API_URL}/{username}",
                headers=_github_headers(),
            )
            if resp.status_code == 200:
                email = (resp.json().get("email") or "").strip()
                if _is_real_email(email):
                    return email

            # Strategy 2: most-recently-pushed repo → commits → author email
            resp = await client.get(
                f"{_GITHUB_API_URL}/{username}/repos",
                params={"sort": "pushed", "per_page": 3, "type": "owner"},
                headers=_github_headers(),
            )
            if resp.status_code == 200:
                for repo in resp.json():
                    full_name = repo.get("full_name", "")
                    # Forks carry upstream authors' emails, so skip them.
                    if repo.get("fork") or not full_name:
                        continue
                    commit_resp = await client.get(
                        f"https://api.github.com/repos/{full_name}/commits",
                        params={"author": username, "per_page": 5},
                        headers=_github_headers(),
                    )
                    if commit_resp.status_code != 200:
                        continue
                    for c in commit_resp.json():
                        author = c.get("commit", {}).get("author", {})
                        email = (author.get("email") or "").strip().lower()
                        if _is_real_email(email):
                            return email

            # Strategy 3: PushEvent payloads (fallback, often has 0 commits)
            resp = await client.get(
                f"{_GITHUB_API_URL}/{username}/events/public",
                params={"per_page": 20},
                headers=_github_headers(),
            )
            if resp.status_code == 200:
                for event in resp.json():
                    if event.get("type") != "PushEvent":
                        continue
                    for commit in event.get("payload", {}).get("commits", []):
                        email = (commit.get("author", {}).get("email") or "").strip().lower()
                        if _is_real_email(email):
                            return email

    except Exception:
        # Deliberate: enrichment is best-effort; network/rate-limit errors
        # degrade to "no email found" rather than aborting the caller.
        pass

    return ""
396
+
397
+
398
+ _DEVPOST_OWNED_DOMAINS = {
399
+ "devpost.com", "devpost.team", "info.devpost.com",
400
+ "secure.devpost.com", "d2dmyh35ffsxbl.cloudfront.net",
401
+ "d112y698adiu2z.cloudfront.net",
402
+ }
403
+
404
+
405
+ def _is_personal_link(url: str) -> bool:
406
+ """Filter out Devpost-owned links (nav, footer, CDN) from external link lists."""
407
+ parsed = urlparse(url)
408
+ domain = parsed.netloc.lstrip("www.")
409
+ return domain not in _DEVPOST_OWNED_DOMAINS
410
+
411
+
412
async def find_participant_email(profile_url: str) -> dict[str, Any]:
    """
    Enrich a participant from their Devpost profile:
    1. Extract GitHub URL, LinkedIn URL from profile social links
    2. Try GitHub API for public email
    3. Walk other external links for email (linktr.ee, bio.link, etc.)

    Returns a dict with profile_url, github_url, linkedin_url, the external
    links actually fetched, and email (first one found, "" when none).
    """
    result: dict[str, Any] = {
        "profile_url": profile_url,
        "external_links_walked": [],
        "github_url": "",
        "linkedin_url": "",
        "email": "",
    }

    profile_data = await get_profile_external_links(profile_url)
    all_emails: list[str] = list(profile_data.get("emails_on_profile", []))

    personal_links = [
        link for link in profile_data.get("external_links", [])
        if _is_personal_link(link)
    ]

    # First pass: capture GitHub + LinkedIn URLs
    for link in personal_links:
        parsed = urlparse(link)
        # Bug fix: lstrip("www.") strips characters from {'w', '.'}, not the
        # literal prefix; removeprefix drops only a leading "www.".
        domain = parsed.netloc.removeprefix("www.")
        path_parts = [p for p in parsed.path.strip("/").split("/") if p]

        # Only user profiles: a single path segment that isn't an org/site path.
        if domain == "github.com" and path_parts and not result["github_url"]:
            if path_parts[0] not in _GITHUB_ORG_PATHS and len(path_parts) == 1:
                result["github_url"] = link

        # Personal LinkedIn profiles live under /in/; skip /company/ pages.
        if domain == "linkedin.com" and "/in/" in parsed.path and not result["linkedin_url"]:
            if "/company/" not in parsed.path:
                result["linkedin_url"] = link

    # Try GitHub API for a public email, but only if the profile had none.
    if result["github_url"] and not all_emails:
        email = await get_github_email(result["github_url"])
        if email:
            all_emails.append(email)

    # Walk remaining external links (allow-listed domains only) for an email.
    if not all_emails:
        for link in personal_links:
            parsed = urlparse(link)
            domain = parsed.netloc.removeprefix("www.")
            if domain in ("github.com", "linkedin.com"):
                continue
            if domain not in _WALKABLE_DOMAINS:
                continue
            result["external_links_walked"].append(link)
            link_data = await extract_emails_from_url(link)
            all_emails.extend(link_data.get("emails", []))
            if all_emails:
                break  # first hit wins; stop fetching further links

    result["email"] = all_emails[0] if all_emails else ""
    return result
469
+
470
+
471
async def find_author_email(project_url: str) -> dict[str, Any]:
    """
    Full chain: project page → author profile(s) → external links → emails.
    Returns the first email found along with the chain of URLs walked.

    Returns a dict with project_url, author_profile_urls, the external links
    actually fetched, and email ("" when nothing was found).
    """
    result: dict[str, Any] = {
        "project_url": project_url,
        "author_profile_urls": [],
        "external_links_walked": [],
        "email": "",
    }

    # Step 1: get author profiles from project page
    profiles_data = await get_author_profile_urls(project_url)
    author_urls: list[str] = profiles_data.get("author_profile_urls", [])
    result["author_profile_urls"] = author_urls

    all_emails: list[str] = []

    for profile_url in author_urls[:3]:  # cap at 3 authors to bound requests
        profile_data = await get_profile_external_links(profile_url)

        # Emails published directly on the profile page
        all_emails.extend(profile_data.get("emails_on_profile", []))

        # Walk allow-listed external links from the profile
        for link in profile_data.get("external_links", []):
            parsed = urlparse(link)
            # Bug fix: lstrip("www.") strips characters from {'w', '.'}, not
            # the literal prefix; removeprefix drops only a leading "www.".
            domain = parsed.netloc.removeprefix("www.")
            if domain not in _WALKABLE_DOMAINS:
                continue
            result["external_links_walked"].append(link)
            link_data = await extract_emails_from_url(link)
            all_emails.extend(link_data.get("emails", []))

        if all_emails:
            break  # stop after first author with a result

    result["email"] = all_emails[0] if all_emails else ""
    return result
@@ -0,0 +1,101 @@
1
+ Metadata-Version: 2.4
2
+ Name: devpost-scraper
3
+ Version: 0.1.0
4
+ Summary: CLI for extracting Devpost data with Backboard tool-calling and exporting results to CSV.
5
+ Requires-Python: >=3.11
6
+ Requires-Dist: backboard-sdk>=1.5.9
7
+ Requires-Dist: beautifulsoup4>=4.12.0
8
+ Requires-Dist: httpx>=0.27.0
9
+ Requires-Dist: pydantic>=2.7.0
10
+ Requires-Dist: python-dotenv>=1.0.1
11
+ Description-Content-Type: text/markdown
12
+
13
+ # Devpost Scraper
14
+
15
+ CLI for extracting Devpost project data with a Backboard assistant that can call a Devpost MCP tool server and export structured results to CSV.
16
+
17
+ ## Requirements
18
+
19
+ - Python 3.11+
20
+ - `uv`
21
+ - Node.js / `npx` available on your machine
22
+ - A Backboard API key
23
+
24
+ ## Environment
25
+
26
+ Create a `.env` file from `.env.example` and set:
27
+
28
+ - `BACKBOARD_API_KEY`
29
+ - `BACKBOARD_MODEL` (optional)
30
+ - `DEVPOST_ASSISTANT_NAME` (optional)
31
+
32
+ ## MCP server
33
+
34
+ This project is designed to use a Devpost MCP server with this configuration:
35
+
36
+ ```json
37
+ {
38
+ "mcpServers": {
39
+ "devpost": {
40
+ "command": "npx",
41
+ "args": ["devpost-mcp-server"]
42
+ }
43
+ }
44
+ }
45
+ ```
46
+
47
+ ## Install
48
+
49
+ ```bash
50
+ uv sync
51
+ ```
52
+
53
+ ## Run
54
+
55
+ ```bash
56
+ uv run devpost-scraper "ai agents" --output ai_agents.csv
57
+ uv run devpost-scraper "developer tools" "climate tech" --output results.csv
58
+ ```
59
+
60
+ You can also use the startup script:
61
+
62
+ ```bash
63
+ ./start.sh "ai agents" --output ai_agents.csv
64
+ ```
65
+
66
+ ## What it does
67
+
68
+ 1. Creates or reuses a Backboard assistant configured for Devpost extraction.
69
+ 2. Creates a thread for the run.
70
+ 3. Sends a prompt that asks the assistant to use the Devpost MCP toolset.
71
+ 4. Handles tool-calling loops until the assistant returns completed structured content.
72
+ 5. Parses the structured JSON result.
73
+ 6. Writes the extracted rows to CSV.
74
+
75
+ ## Expected output shape
76
+
77
+ Each extracted row should contain fields like:
78
+
79
+ - `search_term`
80
+ - `project_title`
81
+ - `tagline`
82
+ - `project_url`
83
+ - `hackathon_name`
84
+ - `hackathon_url`
85
+ - `summary`
86
+ - `built_with`
87
+ - `prizes`
88
+ - `submission_date`
89
+ - `team_size`
90
+
91
+ ## Notes
92
+
93
+ - The CLI is intentionally API-heavy and UI-free.
94
+ - The Backboard assistant must have access to the Devpost MCP tools in the environment where it runs.
95
+ - If your Backboard account or environment requires additional tool registration, wire that into the assistant creation flow in the client module.
96
+
97
+ ## Development
98
+
99
+ ```bash
100
+ uv run python -m devpost_scraper.cli "ai agents" --output out.csv
101
+ ```
@@ -0,0 +1,10 @@
1
+ devpost_scraper/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ devpost_scraper/backboard_client.py,sha256=P4eLrSfN1jxhP2sU-eaK1LWKsAUUfdyEJK1AbXaElcY,4309
3
+ devpost_scraper/cli.py,sha256=q4j1JhOhHr8G6J6HMoQIj_QPY17Nj_wH4Lylk_C6S7o,12142
4
+ devpost_scraper/csv_export.py,sha256=wUx10ImfWwPLV2ScGChssqueF4Nf5Xipc1QSsck9VPQ,809
5
+ devpost_scraper/models.py,sha256=WlNwnyc0ykTMGq9UMdrB_9rLAGQqTNKhfl8XpWVKezw,1288
6
+ devpost_scraper/scraper.py,sha256=7Zh1S0wmlb_-aKmylzyt9gl-zYaK9jPFdtrt5R-a5bc,18132
7
+ devpost_scraper-0.1.0.dist-info/METADATA,sha256=I0dfLxINMgcsgpu8QBzRkT9tqIq68cFxk6fW-CDI7n8,2342
8
+ devpost_scraper-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
9
+ devpost_scraper-0.1.0.dist-info/entry_points.txt,sha256=c_5q8zgdUVaV80s5zpdLFupZXNuibia-gcWfjOokuuM,122
10
+ devpost_scraper-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.29.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ devpost-participants = devpost_scraper.cli:participants_main
3
+ devpost-scraper = devpost_scraper.cli:main