devpost-scraper 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- devpost_scraper-0.1.0/.env.example +13 -0
- devpost_scraper-0.1.0/.gitignore +11 -0
- devpost_scraper-0.1.0/PKG-INFO +101 -0
- devpost_scraper-0.1.0/README.md +89 -0
- devpost_scraper-0.1.0/pyproject.toml +27 -0
- devpost_scraper-0.1.0/src/devpost_scraper/__init__.py +0 -0
- devpost_scraper-0.1.0/src/devpost_scraper/backboard_client.py +135 -0
- devpost_scraper-0.1.0/src/devpost_scraper/cli.py +364 -0
- devpost_scraper-0.1.0/src/devpost_scraper/csv_export.py +30 -0
- devpost_scraper-0.1.0/src/devpost_scraper/models.py +54 -0
- devpost_scraper-0.1.0/src/devpost_scraper/scraper.py +510 -0
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# Backboard API key — get yours at https://app.backboard.io
|
|
2
|
+
BACKBOARD_API_KEY=
|
|
3
|
+
|
|
4
|
+
# Persisted automatically after first run — do not delete
|
|
5
|
+
DEVPOST_ASSISTANT_ID=
|
|
6
|
+
|
|
7
|
+
# Devpost session cookie (_devpost) for authenticated endpoints (e.g. /participants)
|
|
8
|
+
# Copy the _devpost cookie value from your browser DevTools after logging in
|
|
9
|
+
DEVPOST_SESSION=
|
|
10
|
+
|
|
11
|
+
# GitHub personal access token for higher API rate limits (5000/hr vs 60/hr)
|
|
12
|
+
# Generate at https://github.com/settings/tokens (no scopes needed, public data only)
|
|
13
|
+
GITHUB_TOKEN=
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: devpost-scraper
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: CLI for extracting Devpost data with Backboard tool-calling and exporting results to CSV.
|
|
5
|
+
Requires-Python: >=3.11
|
|
6
|
+
Requires-Dist: backboard-sdk>=1.5.9
|
|
7
|
+
Requires-Dist: beautifulsoup4>=4.12.0
|
|
8
|
+
Requires-Dist: httpx>=0.27.0
|
|
9
|
+
Requires-Dist: pydantic>=2.7.0
|
|
10
|
+
Requires-Dist: python-dotenv>=1.0.1
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
|
|
13
|
+
# Devpost Scraper
|
|
14
|
+
|
|
15
|
+
CLI for extracting Devpost project data with a Backboard assistant that can call a Devpost MCP tool server and export structured results to CSV.
|
|
16
|
+
|
|
17
|
+
## Requirements
|
|
18
|
+
|
|
19
|
+
- Python 3.11+
|
|
20
|
+
- `uv`
|
|
21
|
+
- Node.js / `npx` available on your machine
|
|
22
|
+
- A Backboard API key
|
|
23
|
+
|
|
24
|
+
## Environment
|
|
25
|
+
|
|
26
|
+
Create a `.env` file from `.env.example` and set:
|
|
27
|
+
|
|
28
|
+
- `BACKBOARD_API_KEY`
|
|
29
|
+
- `BACKBOARD_MODEL` (optional)
|
|
30
|
+
- `DEVPOST_ASSISTANT_NAME` (optional)
|
|
31
|
+
|
|
32
|
+
## MCP server
|
|
33
|
+
|
|
34
|
+
This project is designed to use a Devpost MCP server with this configuration:
|
|
35
|
+
|
|
36
|
+
```json
|
|
37
|
+
{
|
|
38
|
+
"mcpServers": {
|
|
39
|
+
"devpost": {
|
|
40
|
+
"command": "npx",
|
|
41
|
+
"args": ["devpost-mcp-server"]
|
|
42
|
+
}
|
|
43
|
+
}
|
|
44
|
+
}
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## Install
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
uv sync
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Run
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
uv run devpost-scraper "ai agents" --output ai_agents.csv
|
|
57
|
+
uv run devpost-scraper "developer tools" "climate tech" --output results.csv
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
You can also use the startup script:
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
./start.sh "ai agents" --output ai_agents.csv
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## What it does
|
|
67
|
+
|
|
68
|
+
1. Creates or reuses a Backboard assistant configured for Devpost extraction.
|
|
69
|
+
2. Creates a thread for the run.
|
|
70
|
+
3. Sends a prompt that asks the assistant to use the Devpost MCP toolset.
|
|
71
|
+
4. Handles tool-calling loops until the assistant returns completed structured content.
|
|
72
|
+
5. Parses the structured JSON result.
|
|
73
|
+
6. Writes the extracted rows to CSV.
|
|
74
|
+
|
|
75
|
+
## Expected output shape
|
|
76
|
+
|
|
77
|
+
Each extracted row should contain fields like:
|
|
78
|
+
|
|
79
|
+
- `search_term`
|
|
80
|
+
- `project_title`
|
|
81
|
+
- `tagline`
|
|
82
|
+
- `project_url`
|
|
83
|
+
- `hackathon_name`
|
|
84
|
+
- `hackathon_url`
|
|
85
|
+
- `summary`
|
|
86
|
+
- `built_with`
|
|
87
|
+
- `prizes`
|
|
88
|
+
- `submission_date`
|
|
89
|
+
- `team_size`
|
|
90
|
+
|
|
91
|
+
## Notes
|
|
92
|
+
|
|
93
|
+
- The CLI is intentionally API-heavy and UI-free.
|
|
94
|
+
- The Backboard assistant must have access to the Devpost MCP tools in the environment where it runs.
|
|
95
|
+
- If your Backboard account or environment requires additional tool registration, wire that into the assistant creation flow in the client module.
|
|
96
|
+
|
|
97
|
+
## Development
|
|
98
|
+
|
|
99
|
+
```bash
|
|
100
|
+
uv run python -m devpost_scraper.cli "ai agents" --output out.csv
|
|
101
|
+
```
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
# Devpost Scraper
|
|
2
|
+
|
|
3
|
+
CLI for extracting Devpost project data with a Backboard assistant that can call a Devpost MCP tool server and export structured results to CSV.
|
|
4
|
+
|
|
5
|
+
## Requirements
|
|
6
|
+
|
|
7
|
+
- Python 3.11+
|
|
8
|
+
- `uv`
|
|
9
|
+
- Node.js / `npx` available on your machine
|
|
10
|
+
- A Backboard API key
|
|
11
|
+
|
|
12
|
+
## Environment
|
|
13
|
+
|
|
14
|
+
Create a `.env` file from `.env.example` and set:
|
|
15
|
+
|
|
16
|
+
- `BACKBOARD_API_KEY`
|
|
17
|
+
- `BACKBOARD_MODEL` (optional)
|
|
18
|
+
- `DEVPOST_ASSISTANT_NAME` (optional)
|
|
19
|
+
|
|
20
|
+
## MCP server
|
|
21
|
+
|
|
22
|
+
This project is designed to use a Devpost MCP server with this configuration:
|
|
23
|
+
|
|
24
|
+
```json
|
|
25
|
+
{
|
|
26
|
+
"mcpServers": {
|
|
27
|
+
"devpost": {
|
|
28
|
+
"command": "npx",
|
|
29
|
+
"args": ["devpost-mcp-server"]
|
|
30
|
+
}
|
|
31
|
+
}
|
|
32
|
+
}
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
## Install
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
uv sync
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Run
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
uv run devpost-scraper "ai agents" --output ai_agents.csv
|
|
45
|
+
uv run devpost-scraper "developer tools" "climate tech" --output results.csv
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
You can also use the startup script:
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
./start.sh "ai agents" --output ai_agents.csv
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## What it does
|
|
55
|
+
|
|
56
|
+
1. Creates or reuses a Backboard assistant configured for Devpost extraction.
|
|
57
|
+
2. Creates a thread for the run.
|
|
58
|
+
3. Sends a prompt that asks the assistant to use the Devpost MCP toolset.
|
|
59
|
+
4. Handles tool-calling loops until the assistant returns completed structured content.
|
|
60
|
+
5. Parses the structured JSON result.
|
|
61
|
+
6. Writes the extracted rows to CSV.
|
|
62
|
+
|
|
63
|
+
## Expected output shape
|
|
64
|
+
|
|
65
|
+
Each extracted row should contain fields like:
|
|
66
|
+
|
|
67
|
+
- `search_term`
|
|
68
|
+
- `project_title`
|
|
69
|
+
- `tagline`
|
|
70
|
+
- `project_url`
|
|
71
|
+
- `hackathon_name`
|
|
72
|
+
- `hackathon_url`
|
|
73
|
+
- `summary`
|
|
74
|
+
- `built_with`
|
|
75
|
+
- `prizes`
|
|
76
|
+
- `submission_date`
|
|
77
|
+
- `team_size`
|
|
78
|
+
|
|
79
|
+
## Notes
|
|
80
|
+
|
|
81
|
+
- The CLI is intentionally API-heavy and UI-free.
|
|
82
|
+
- The Backboard assistant must have access to the Devpost MCP tools in the environment where it runs.
|
|
83
|
+
- If your Backboard account or environment requires additional tool registration, wire that into the assistant creation flow in the client module.
|
|
84
|
+
|
|
85
|
+
## Development
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
uv run python -m devpost_scraper.cli "ai agents" --output out.csv
|
|
89
|
+
```
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "devpost-scraper"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "CLI for extracting Devpost data with Backboard tool-calling and exporting results to CSV."
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.11"
|
|
7
|
+
dependencies = [
|
|
8
|
+
"backboard-sdk>=1.5.9",
|
|
9
|
+
"beautifulsoup4>=4.12.0",
|
|
10
|
+
"httpx>=0.27.0",
|
|
11
|
+
"pydantic>=2.7.0",
|
|
12
|
+
"python-dotenv>=1.0.1",
|
|
13
|
+
]
|
|
14
|
+
|
|
15
|
+
[project.scripts]
|
|
16
|
+
devpost-scraper = "devpost_scraper.cli:main"
|
|
17
|
+
devpost-participants = "devpost_scraper.cli:participants_main"
|
|
18
|
+
|
|
19
|
+
[build-system]
|
|
20
|
+
requires = ["hatchling"]
|
|
21
|
+
build-backend = "hatchling.build"
|
|
22
|
+
|
|
23
|
+
[tool.hatch.build.targets.wheel]
|
|
24
|
+
packages = ["src/devpost_scraper"]
|
|
25
|
+
|
|
26
|
+
[tool.uv]
|
|
27
|
+
package = true
|
|
File without changes
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
5
|
+
from typing import Any, Awaitable, Callable, Mapping
|
|
6
|
+
|
|
7
|
+
from backboard import BackboardClient
|
|
8
|
+
from backboard.exceptions import BackboardAPIError
|
|
9
|
+
|
|
10
|
+
ToolHandler = Callable[[dict[str, Any]], Awaitable[dict[str, Any]]]
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class BackboardClientError(Exception):
    """Raised when a Backboard operation fails.

    Single error type for this module: missing configuration, runs ending
    with a non-completed status, and tool-loop protocol violations
    (missing run_id/tool_calls, unknown tool names) all surface as this.
    """
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def build_client() -> BackboardClient:
    """Construct a BackboardClient from the BACKBOARD_API_KEY env var.

    Raises:
        BackboardClientError: if the variable is unset or blank.
    """
    key = os.getenv("BACKBOARD_API_KEY", "").strip()
    if key:
        return BackboardClient(api_key=key)
    raise BackboardClientError(
        "Missing required environment variable `BACKBOARD_API_KEY`."
    )
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
async def ensure_assistant(
    client: BackboardClient,
    *,
    assistant_id: str | None,
    name: str,
    system_prompt: str,
    tools: list[dict[str, Any]],
) -> str:
    """Return an existing assistant id, or create a new assistant.

    When *assistant_id* is truthy it is returned unchanged; otherwise a
    fresh assistant is created with the given name/prompt/tools and its
    id is returned as a string.
    """
    if not assistant_id:
        created = await client.create_assistant(
            name=name, system_prompt=system_prompt, tools=tools
        )
        assistant_id = str(created.assistant_id)
    return assistant_id
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
async def _collect_stream(stream: Any) -> dict[str, Any]:
|
|
45
|
+
"""Drain a streaming add_message response into a unified result dict."""
|
|
46
|
+
content_parts: list[str] = []
|
|
47
|
+
tool_calls: list[Any] = []
|
|
48
|
+
run_id: str | None = None
|
|
49
|
+
status = "completed"
|
|
50
|
+
|
|
51
|
+
async for chunk in stream:
|
|
52
|
+
t = chunk.get("type")
|
|
53
|
+
if t == "content_streaming":
|
|
54
|
+
content_parts.append(chunk.get("content", ""))
|
|
55
|
+
elif t == "tool_submit_required":
|
|
56
|
+
status = "REQUIRES_ACTION"
|
|
57
|
+
run_id = chunk.get("run_id")
|
|
58
|
+
tool_calls = chunk.get("tool_calls", [])
|
|
59
|
+
elif t == "run_ended":
|
|
60
|
+
if chunk.get("status") not in (None, "completed"):
|
|
61
|
+
raise BackboardClientError(
|
|
62
|
+
f"Run ended with status: {chunk.get('status')}"
|
|
63
|
+
)
|
|
64
|
+
|
|
65
|
+
return {
|
|
66
|
+
"content": "".join(content_parts) or None,
|
|
67
|
+
"status": status,
|
|
68
|
+
"tool_calls": tool_calls,
|
|
69
|
+
"run_id": run_id,
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
async def run_in_thread(
    client: BackboardClient,
    *,
    assistant_id: str,
    user_message: str,
    tool_handlers: Mapping[str, ToolHandler],
    llm_provider: str = "openai",
    model_name: str = "gpt-4o-mini",
    max_tool_rounds: int = 6,
) -> str:
    """Create a thread, send a message via streaming, execute the tool loop.

    Args:
        client: Connected Backboard client.
        assistant_id: Assistant to run the thread against.
        user_message: Initial user message content.
        tool_handlers: Maps tool name -> async handler(args) -> result dict.
        llm_provider: Provider name forwarded to add_message.
        model_name: Model name forwarded to add_message.
        max_tool_rounds: Hard cap on REQUIRES_ACTION round-trips.

    Returns:
        The assistant's final text content.

    Raises:
        BackboardClientError: on tool-loop cap overflow, a malformed
            REQUIRES_ACTION result, an unregistered tool name, or a run
            that completes without any content.
    """
    thread = await client.create_thread(assistant_id)

    stream = await client.add_message(
        thread_id=thread.thread_id,
        content=user_message,
        stream=True,
        llm_provider=llm_provider,
        model_name=model_name,
    )
    result = await _collect_stream(stream)

    rounds = 0
    while result["status"] == "REQUIRES_ACTION":
        rounds += 1
        if rounds > max_tool_rounds:
            raise BackboardClientError(
                f"Tool loop exceeded {max_tool_rounds} rounds — aborting."
            )
        if not result["run_id"]:
            raise BackboardClientError("REQUIRES_ACTION without run_id.")
        if not result["tool_calls"]:
            raise BackboardClientError("REQUIRES_ACTION without tool_calls.")

        tool_outputs = []
        for tc in result["tool_calls"]:
            # Tool calls may arrive either as plain dicts or as SDK objects;
            # both shapes are supported below.
            name = tc["function"]["name"] if isinstance(tc, dict) else tc.function.name
            args_raw = (
                tc["function"].get("arguments", "{}")
                if isinstance(tc, dict)
                else (tc.function.arguments or "{}")
            )
            # Arguments may already be decoded (dict) or still a JSON string.
            args = args_raw if isinstance(args_raw, dict) else json.loads(args_raw or "{}")
            tc_id = tc["id"] if isinstance(tc, dict) else tc.id

            handler = tool_handlers.get(name)
            if handler is None:
                raise BackboardClientError(f"No handler registered for tool `{name}`.")

            call_result = await handler(args)
            tool_outputs.append({"tool_call_id": tc_id, "output": json.dumps(call_result)})

        # Hand the tool results back and drain the follow-up stream; the
        # model may request further tool rounds until it produces content.
        stream = await client.submit_tool_outputs(
            thread_id=thread.thread_id,
            run_id=result["run_id"],
            tool_outputs=tool_outputs,
            stream=True,
        )
        result = await _collect_stream(stream)

    if not result["content"]:
        raise BackboardClientError("Run completed without content.")
    return result["content"]
|
|
@@ -0,0 +1,364 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import asyncio
|
|
5
|
+
import json
|
|
6
|
+
import os
|
|
7
|
+
import sys
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any
|
|
10
|
+
from urllib.parse import urlparse
|
|
11
|
+
|
|
12
|
+
from dotenv import load_dotenv, set_key
|
|
13
|
+
|
|
14
|
+
from devpost_scraper.backboard_client import (
|
|
15
|
+
BackboardClientError,
|
|
16
|
+
build_client,
|
|
17
|
+
ensure_assistant,
|
|
18
|
+
run_in_thread,
|
|
19
|
+
)
|
|
20
|
+
from devpost_scraper.csv_export import write_projects
|
|
21
|
+
from devpost_scraper.models import DevpostProject, HackathonParticipant
|
|
22
|
+
from devpost_scraper.scraper import (
|
|
23
|
+
find_author_email,
|
|
24
|
+
find_participant_email,
|
|
25
|
+
get_hackathon_participants,
|
|
26
|
+
get_project_details,
|
|
27
|
+
search_projects,
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
_ENV_FILE = Path(".env")
|
|
31
|
+
_ASSISTANT_ID_KEY = "DEVPOST_ASSISTANT_ID"
|
|
32
|
+
|
|
33
|
+
# The assistant's ONLY job is to search and return raw project URLs.
|
|
34
|
+
# Python handles all enrichment directly — no tool loop explosion.
|
|
35
|
+
_SYSTEM_PROMPT = """\
|
|
36
|
+
You are a Devpost search assistant. Given a search term:
|
|
37
|
+
|
|
38
|
+
1. Call search_devpost_projects for page 1 and page 2.
|
|
39
|
+
2. Deduplicate results by URL.
|
|
40
|
+
3. Return ONLY a valid JSON array — no prose, no markdown, no code fences.
|
|
41
|
+
|
|
42
|
+
Each element: {"title": "...", "tagline": "...", "url": "...", "built_with": "..."}
|
|
43
|
+
built_with is a comma-separated string of technology names.
|
|
44
|
+
Never call the same tool with the same arguments twice.\
|
|
45
|
+
"""
|
|
46
|
+
|
|
47
|
+
_TOOLS: list[dict[str, Any]] = [
|
|
48
|
+
{
|
|
49
|
+
"type": "function",
|
|
50
|
+
"function": {
|
|
51
|
+
"name": "search_devpost_projects",
|
|
52
|
+
"description": "Search Devpost for hackathon projects matching a query.",
|
|
53
|
+
"parameters": {
|
|
54
|
+
"type": "object",
|
|
55
|
+
"properties": {
|
|
56
|
+
"query": {"type": "string", "description": "Search query term"},
|
|
57
|
+
"page": {"type": "integer", "description": "Page number (default 1)"},
|
|
58
|
+
},
|
|
59
|
+
"required": ["query"],
|
|
60
|
+
},
|
|
61
|
+
},
|
|
62
|
+
},
|
|
63
|
+
]
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
async def _handle_search(args: dict[str, Any]) -> dict[str, Any]:
    """Tool handler: run a Devpost search on behalf of the assistant."""
    term = args["query"]
    page_no = int(args.get("page") or 1)
    print(f" [tool] search_devpost_projects(query={term!r}, page={page_no})", file=sys.stderr)
    return await search_projects(query=term, page=page_no)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
_TOOL_HANDLERS = {
|
|
74
|
+
"search_devpost_projects": _handle_search,
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
async def _load_or_create_assistant(client: Any) -> str:
    """Return the persisted assistant id, creating one on first run.

    The id is cached in .env under DEVPOST_ASSISTANT_ID so subsequent
    runs reuse the same assistant instead of creating a new one.
    """
    load_dotenv(_ENV_FILE, override=True)
    cached = os.getenv(_ASSISTANT_ID_KEY, "").strip()
    if cached:
        print(f"[info] Reusing assistant {cached}", file=sys.stderr)
        return cached

    print("[info] Creating Backboard assistant…", file=sys.stderr)
    new_id = str(
        await ensure_assistant(
            client,
            assistant_id=None,
            name="devpost-scraper-v3",
            system_prompt=_SYSTEM_PROMPT,
            tools=_TOOLS,
        )
    )
    _ENV_FILE.touch(exist_ok=True)
    set_key(str(_ENV_FILE), _ASSISTANT_ID_KEY, new_id)
    print(f"[info] Created assistant {new_id} — saved to .env", file=sys.stderr)
    return new_id
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _parse_search_results(raw: str) -> list[dict[str, Any]]:
|
|
100
|
+
raw = raw.strip()
|
|
101
|
+
if raw.startswith("```"):
|
|
102
|
+
raw = "\n".join(
|
|
103
|
+
line for line in raw.splitlines()
|
|
104
|
+
if not line.strip().startswith("```")
|
|
105
|
+
).strip()
|
|
106
|
+
try:
|
|
107
|
+
data = json.loads(raw)
|
|
108
|
+
except json.JSONDecodeError as exc:
|
|
109
|
+
raise SystemExit(
|
|
110
|
+
f"[error] Assistant returned invalid JSON: {exc}\n\nRaw:\n{raw}"
|
|
111
|
+
) from exc
|
|
112
|
+
if not isinstance(data, list):
|
|
113
|
+
raise SystemExit(f"[error] Expected JSON array, got {type(data).__name__}")
|
|
114
|
+
return [item for item in data if isinstance(item, dict) and item.get("url")]
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
async def _enrich_project(
    item: dict[str, Any],
    search_term: str,
) -> DevpostProject:
    """Build a DevpostProject row from a search hit, best-effort enriched.

    Detail-page scraping and author-email discovery each run under their
    own try/except: a failure in either step is logged to stderr and the
    row is still produced from whatever data was gathered.
    """
    url = item["url"]

    # Detail-page enrichment (tolerated on failure).
    detail_info: dict[str, Any] = {}
    try:
        detail_info = await get_project_details(url=url)
        print(f" [enrich] details {url}", file=sys.stderr)
    except Exception as exc:
        print(f" [warn] details failed for {url}: {exc}", file=sys.stderr)

    # Author-email chain (tolerated on failure).
    contact: dict[str, Any] = {}
    try:
        contact = await find_author_email(project_url=url)
        if contact.get("email"):
            print(f" [email] {contact['email']} ← {url}", file=sys.stderr)
        else:
            print(f" [email] (none found) ← {url}", file=sys.stderr)
    except Exception as exc:
        print(f" [warn] email failed for {url}: {exc}", file=sys.stderr)

    profile_links: list[str] = contact.get("author_profile_urls", [])

    # Detail-page values win; search-hit values are the fallback.
    return DevpostProject(
        search_term=search_term,
        title=detail_info.get("title") or item.get("title", ""),
        tagline=detail_info.get("tagline") or item.get("tagline", ""),
        url=url,
        hackathon_name=detail_info.get("hackathon_name", ""),
        hackathon_url=detail_info.get("hackathon_url", ""),
        summary=detail_info.get("summary", ""),
        built_with=detail_info.get("built_with") or item.get("built_with", ""),
        prizes=detail_info.get("prizes", ""),
        team_size=detail_info.get("team_size", ""),
        author_profile_url=profile_links[0] if profile_links else "",
        email=contact.get("email", ""),
    )
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
async def run(search_terms: list[str], output: str | None) -> None:
    """Search each term via the Backboard assistant, enrich hits, write CSV.

    Args:
        search_terms: Devpost queries to run, one assistant thread each.
        output: CSV file path; None writes to stdout.
    """
    load_dotenv(_ENV_FILE, override=True)
    client = build_client()
    assistant_id = await _load_or_create_assistant(client)

    collected: list[DevpostProject] = []

    for term in search_terms:
        print(f"\n[info] Searching Devpost for: {term!r}", file=sys.stderr)
        reply = await run_in_thread(
            client,
            assistant_id=assistant_id,
            user_message=(
                f"Search Devpost for: {term!r}\n"
                "Collect page 1 and page 2. Return a JSON array of projects."
            ),
            tool_handlers=_TOOL_HANDLERS,
            llm_provider=os.getenv("BACKBOARD_LLM_PROVIDER", "openai"),
            model_name=os.getenv("BACKBOARD_MODEL", "gpt-4o-mini"),
        )
        hits = _parse_search_results(reply)
        print(f"[info] Found {len(hits)} projects — enriching…", file=sys.stderr)

        # Sequential on purpose: be polite to external sites.
        term_rows = [await _enrich_project(hit, search_term=term) for hit in hits]

        print(f"[info] Collected {len(term_rows)} projects for {term!r}", file=sys.stderr)
        collected.extend(term_rows)

    print(f"\n[info] Total projects: {len(collected)}", file=sys.stderr)
    write_projects(collected, output)
    if output:
        print(f"[info] Wrote → {output}", file=sys.stderr)
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def main() -> None:
    """CLI entry point: parse arguments and launch the scrape pipeline."""
    cli = argparse.ArgumentParser(
        prog="devpost-scraper",
        description="Extract Devpost project data and export to CSV.",
    )
    cli.add_argument(
        "search_terms",
        nargs="+",
        metavar="TERM",
        help="One or more search terms to query on Devpost",
    )
    cli.add_argument(
        "--output", "-o",
        metavar="FILE",
        default=None,
        help="Output CSV file path (default: stdout)",
    )
    opts = cli.parse_args()
    asyncio.run(run(search_terms=opts.search_terms, output=opts.output))
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
if __name__ == "__main__":
|
|
220
|
+
main()
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
_PARTICIPANTS_JWT_KEY = "DEVPOST_SESSION"
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
def _write_participants_csv(
    rows: list[dict[str, Any]],
    fieldnames: list[str],
    output: str | None,
) -> None:
    """Serialize participant rows to CSV at *output*, or stdout when None."""
    # Imported at point of use, matching this module's existing style.
    import csv
    import io

    # Render once to a buffer so the file and stdout paths share one
    # serialization code path.
    buf = io.StringIO()
    writer = csv.DictWriter(buf, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(rows)

    if output:
        with open(output, "w", newline="", encoding="utf-8") as f:
            f.write(buf.getvalue())
        print(f"[info] Wrote → {output}", file=sys.stderr)
    else:
        print(buf.getvalue())


async def _run_participants(
    hackathon_url: str,
    jwt_token: str,
    output: str | None,
    no_email: bool,
) -> None:
    """Page through a hackathon's participants and export them to CSV.

    Args:
        hackathon_url: The hackathon's /participants page URL.
        jwt_token: Devpost session cookie value for authenticated access.
        output: CSV destination path; None writes to stdout.
        no_email: When True, skip the per-profile contact enrichment.

    Pages are fetched until the API reports no more results or returns an
    empty batch. Per-profile enrichment failures are logged and tolerated.
    """
    all_participants: list[HackathonParticipant] = []
    page = 1

    print(f"[info] Fetching participants from {hackathon_url}", file=sys.stderr)

    while True:
        data = await get_hackathon_participants(hackathon_url, jwt_token, page=page)
        batch = data.get("participants", [])
        has_more = data.get("has_more", False)

        if not batch:
            print(f"[info] No participants on page {page}, stopping.", file=sys.stderr)
            break

        print(f"[info] Page {page}: {len(batch)} participants", file=sys.stderr)

        for raw in batch:
            profile_url = raw.get("profile_url", "")
            email = ""
            github_url = ""
            linkedin_url = ""

            if not no_email and profile_url:
                try:
                    email_data = await find_participant_email(profile_url)
                    email = email_data.get("email", "")
                    github_url = email_data.get("github_url", "")
                    linkedin_url = email_data.get("linkedin_url", "")
                    parts = [f for f in [email, github_url, linkedin_url] if f]
                    if parts:
                        print(f" [found] {', '.join(parts)} ← {profile_url}", file=sys.stderr)
                    else:
                        print(f" [none] ← {profile_url}", file=sys.stderr)
                except Exception as exc:
                    # Best-effort enrichment: keep the bare participant row.
                    print(f" [warn] enrich failed for {profile_url}: {exc}", file=sys.stderr)

            all_participants.append(
                HackathonParticipant(
                    hackathon_url=hackathon_url,
                    username=raw.get("username", ""),
                    name=raw.get("name", ""),
                    specialty=raw.get("specialty", ""),
                    profile_url=profile_url,
                    github_url=github_url,
                    linkedin_url=linkedin_url,
                    email=email,
                )
            )

        if not has_more:
            break
        page += 1

    print(f"\n[info] Total participants: {len(all_participants)}", file=sys.stderr)

    _write_participants_csv(
        [p.model_dump() for p in all_participants],
        HackathonParticipant.fieldnames(),
        output,
    )
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
def participants_main() -> None:
    """CLI entry point for the devpost-participants command."""
    load_dotenv(_ENV_FILE, override=True)

    cli = argparse.ArgumentParser(
        prog="devpost-participants",
        description="Crawl Devpost hackathon participants page and export to CSV.",
    )
    cli.add_argument(
        "hackathon_url",
        metavar="URL",
        help="Hackathon participants URL (e.g. https://hack-days-niet.devpost.com/participants)",
    )
    cli.add_argument(
        "--jwt",
        metavar="TOKEN",
        default=None,
        help="Value of the _devpost session cookie from your browser. Falls back to DEVPOST_SESSION in .env",
    )
    cli.add_argument(
        "--output", "-o",
        metavar="FILE",
        default=None,
        help="Output CSV file path (default: stdout)",
    )
    cli.add_argument(
        "--no-email",
        action="store_true",
        default=False,
        help="Skip email enrichment (faster)",
    )
    opts = cli.parse_args()

    # Derive a default output file name from the hackathon subdomain.
    if not opts.output:
        url_parts = urlparse(opts.hackathon_url)
        slug = url_parts.hostname.split(".")[0] if url_parts.hostname else "hackathon"
        opts.output = f"{slug}-participants.csv"
        print(f"[info] No -o given, defaulting to {opts.output}", file=sys.stderr)

    session_token = opts.jwt or os.getenv(_PARTICIPANTS_JWT_KEY, "").strip()
    if not session_token:
        raise SystemExit(
            "[error] No session cookie. Pass --jwt TOKEN or set DEVPOST_SESSION in .env\n"
            " Copy the _devpost cookie value from browser DevTools → Application → Cookies"
        )

    # Persist an explicitly passed JWT to .env for reuse on later runs.
    if opts.jwt:
        _ENV_FILE.touch(exist_ok=True)
        set_key(str(_ENV_FILE), _PARTICIPANTS_JWT_KEY, opts.jwt)

    asyncio.run(
        _run_participants(
            hackathon_url=opts.hackathon_url,
            jwt_token=session_token,
            output=opts.output,
            no_email=opts.no_email,
        )
    )
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import csv
|
|
4
|
+
import sys
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Iterable
|
|
7
|
+
|
|
8
|
+
from devpost_scraper.models import DevpostProject
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def write_projects(projects: Iterable[DevpostProject], output: str | None) -> None:
    """Write projects to CSV. Prints to stdout if output is None.

    Args:
        projects: Rows to serialize; extra model fields are ignored.
        output: Destination file path (parents created as needed), or
            None to stream the CSV to stdout.

    The output file handle is closed even if serialization raises
    (previously an exception mid-write leaked the handle).
    """
    fieldnames = DevpostProject.fieldnames()

    if output:
        path = Path(output)
        path.parent.mkdir(parents=True, exist_ok=True)
        fh = path.open("w", newline="", encoding="utf-8")
        close = True
    else:
        fh = sys.stdout
        close = False

    try:
        writer = csv.DictWriter(fh, fieldnames=fieldnames, extrasaction="ignore")
        writer.writeheader()
        for project in projects:
            writer.writerow(project.model_dump())
    finally:
        # Never close stdout; always close a file we opened ourselves.
        if close:
            fh.close()
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pydantic import BaseModel, ConfigDict
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class HackathonParticipant(BaseModel):
    """One row of a hackathon's participants list, as exported to CSV.

    All fields default to "" so partially-scraped records still serialize
    cleanly; unexpected keys from upstream payloads are ignored.
    """

    model_config = ConfigDict(extra="ignore")

    hackathon_url: str = ""  # participants page this row came from
    username: str = ""  # Devpost handle
    name: str = ""  # display name
    specialty: str = ""  # self-reported specialty label
    profile_url: str = ""  # Devpost profile URL
    github_url: str = ""  # discovered during email enrichment, if any
    linkedin_url: str = ""  # discovered during email enrichment, if any
    email: str = ""  # discovered during email enrichment, if any

    @classmethod
    def fieldnames(cls) -> list[str]:
        """CSV column order (matches field declaration order)."""
        return ["hackathon_url", "username", "name", "specialty", "profile_url", "github_url", "linkedin_url", "email"]
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class DevpostProject(BaseModel):
    """One enriched Devpost project row, as exported to CSV.

    Fields default to "" so rows remain serializable even when detail-page
    or email enrichment fails; unexpected keys are ignored.
    """

    model_config = ConfigDict(extra="ignore")

    search_term: str = ""  # the query that surfaced this project
    title: str = ""
    tagline: str = ""
    url: str = ""  # project page URL (dedup key upstream)
    hackathon_name: str = ""
    hackathon_url: str = ""
    summary: str = ""
    built_with: str = ""  # comma-separated technology names
    prizes: str = ""
    team_size: str = ""
    author_profile_url: str = ""  # first author profile found, if any
    email: str = ""  # author contact found via enrichment, if any

    @classmethod
    def fieldnames(cls) -> list[str]:
        """CSV column order (matches field declaration order)."""
        return [
            "search_term",
            "title",
            "tagline",
            "url",
            "hackathon_name",
            "hackathon_url",
            "summary",
            "built_with",
            "prizes",
            "team_size",
            "author_profile_url",
            "email",
        ]
|
|
@@ -0,0 +1,510 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import re
|
|
5
|
+
from typing import Any
|
|
6
|
+
from urllib.parse import urljoin, urlparse
|
|
7
|
+
|
|
8
|
+
import httpx
|
|
9
|
+
from bs4 import BeautifulSoup
|
|
10
|
+
|
|
11
|
+
_EMAIL_RE = re.compile(r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}")
|
|
12
|
+
|
|
13
|
+
# Domains we will follow when walking external links from a Devpost profile
|
|
14
|
+
_WALKABLE_DOMAINS = {
|
|
15
|
+
"github.com",
|
|
16
|
+
"linktr.ee",
|
|
17
|
+
"bio.link",
|
|
18
|
+
"beacons.ai",
|
|
19
|
+
"linkin.bio",
|
|
20
|
+
"carrd.co",
|
|
21
|
+
"about.me",
|
|
22
|
+
"bento.me",
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
_SEARCH_URL = "https://devpost.com/software/search"
|
|
26
|
+
_GITHUB_API_URL = "https://api.github.com/users"
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _github_headers() -> dict[str, str]:
|
|
30
|
+
"""Build GitHub API headers, including auth token if GITHUB_TOKEN is set."""
|
|
31
|
+
headers = {
|
|
32
|
+
"Accept": "application/vnd.github+json",
|
|
33
|
+
"User-Agent": "devpost-scraper/1.0",
|
|
34
|
+
}
|
|
35
|
+
token = os.environ.get("GITHUB_TOKEN", "").strip()
|
|
36
|
+
if token:
|
|
37
|
+
headers["Authorization"] = f"Bearer {token}"
|
|
38
|
+
return headers
|
|
39
|
+
# Headers that make Devpost return JSON (the search endpoint sniffs XHR requests).
_JSON_HEADERS = {
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "X-Requested-With": "XMLHttpRequest",
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/122.0.0.0 Safari/537.36"
    ),
}
# Headers for plain HTML page fetches; reuses the browser-like User-Agent above.
_HTML_HEADERS = {
    "Accept": "text/html,application/xhtml+xml",
    "User-Agent": _JSON_HEADERS["User-Agent"],
}
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
async def search_projects(query: str, page: int = 1) -> dict[str, Any]:
    """Search Devpost projects. Returns raw API payload with 'software' list."""
    async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
        response = await client.get(
            _SEARCH_URL,
            params={"query": query, "page": page},
            headers=_JSON_HEADERS,
        )
        response.raise_for_status()
        payload = response.json()

    def _built_with(entry: dict[str, Any]) -> str:
        # The API usually returns a list of technology names, but be defensive.
        raw = entry.get("built_with") or []
        return ", ".join(raw) if isinstance(raw, list) else str(raw)

    projects = [
        {
            "title": entry.get("name", ""),
            "tagline": entry.get("tagline", ""),
            "url": entry.get("url", ""),
            "built_with": _built_with(entry),
            "like_count": entry.get("like_count", 0),
        }
        for entry in payload.get("software", [])
    ]

    return {
        "projects": projects,
        "total_count": payload.get("total_count", 0),
        "page": page,
        "per_page": payload.get("per_page", 24),
    }
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
async def get_project_details(url: str) -> dict[str, Any]:
    """Fetch a Devpost project page and extract detail fields.

    Returns a flat dict of strings: title, tagline, url, summary (first 500
    chars), built_with, hackathon name/url, prizes ("; "-joined) and team_size.
    """
    async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
        resp = await client.get(url, headers=_HTML_HEADERS)
        resp.raise_for_status()
        html = resp.text

    soup = BeautifulSoup(html, "html.parser")

    # Devpost has used both id- and class-based markup for these elements,
    # so each selector has a fallback.
    title = _text(soup.select_one("h1#app-title") or soup.select_one("h1.app_title"))
    tagline = _text(soup.select_one("p#app-details-header-tagline") or soup.select_one("p.large"))

    summary_el = soup.select_one("div#app-details") or soup.select_one("div.app-details")
    # Cap the summary at 500 characters to keep CSV cells manageable.
    summary = summary_el.get_text(" ", strip=True)[:500] if summary_el else ""

    built_tags = [t.get_text(strip=True) for t in soup.select("span.cp-tag")]
    built_with = ", ".join(built_tags)

    hackathon_name = ""
    hackathon_url = ""
    challenge_link = soup.select_one("a.challenge-link") or soup.select_one("a[href*='/hackathons/']")
    if challenge_link:
        hackathon_name = challenge_link.get_text(strip=True)
        hackathon_url = challenge_link.get("href", "")

    prizes: list[str] = []
    for prize_el in soup.select("div.prize, li.prize, span.prize-name"):
        text = prize_el.get_text(strip=True)
        if text:
            prizes.append(text)

    # Team size is inferred from the number of member cards; "" when none found.
    team_members = soup.select("ul#app-team li, div.software-team-member")
    team_size = str(len(team_members)) if team_members else ""

    return {
        "title": title,
        "tagline": tagline,
        "url": url,
        "summary": summary,
        "built_with": built_with,
        "hackathon_name": hackathon_name,
        "hackathon_url": hackathon_url,
        "prizes": "; ".join(prizes),
        "team_size": team_size,
    }
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def _text(el: Any) -> str:
|
|
134
|
+
if el is None:
|
|
135
|
+
return ""
|
|
136
|
+
return el.get_text(strip=True)
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def _extract_emails(html: str) -> list[str]:
    """Find all email addresses in an HTML document (mailto: + bare text)."""
    soup = BeautifulSoup(html, "html.parser")
    collected: set[str] = set()

    # mailto: links — strip any "?subject=..." query tail.
    for anchor in soup.find_all("a", href=True):
        target: str = anchor["href"]
        if not target.startswith("mailto:"):
            continue
        address = target[len("mailto:"):].split("?")[0].strip()
        if address:
            collected.add(address.lower())

    # Bare addresses in the visible page text.
    page_text = soup.get_text(" ")
    collected.update(m.group().lower() for m in _EMAIL_RE.finditer(page_text))

    # Filter out obviously invalid / placeholder emails
    return [e for e in collected if "." in e.split("@")[-1] and len(e) < 80]
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
# Single-segment devpost.com paths that are site pages, not user profiles.
_DEVPOST_NON_PROFILE_PATHS = {
    "software", "hackathons", "settings", "portfolio", "search",
    "about", "contact", "help", "careers", "login", "register",
}
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
async def get_author_profile_urls(project_url: str) -> dict[str, Any]:
    """From a Devpost project page, return the author profile URLs."""
    async with httpx.AsyncClient(timeout=20.0, follow_redirects=True) as client:
        resp = await client.get(project_url, headers=_HTML_HEADERS)
        resp.raise_for_status()
        html = resp.text

    soup = BeautifulSoup(html, "html.parser")
    found: list[str] = []

    for anchor in soup.find_all("a", href=True):
        target: str = anchor["href"]
        if target.startswith("/"):
            target = f"https://devpost.com{target}"
        parts = urlparse(target)
        if parts.netloc not in ("devpost.com", "www.devpost.com"):
            continue
        segments = [seg for seg in parts.path.strip("/").split("/") if seg]
        # Profile URLs have exactly one path segment: devpost.com/<username>
        if len(segments) != 1:
            continue
        candidate = segments[0]
        if candidate in _DEVPOST_NON_PROFILE_PATHS:
            continue
        # Devpost usernames are alphanumeric + dashes, no dots or slashes
        if re.match(r"^[a-zA-Z0-9_\-]+$", candidate):
            found.append(f"https://devpost.com/{candidate}")

    # Deduplicate while preserving first-seen order.
    return {"author_profile_urls": list(dict.fromkeys(found))}
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
async def get_profile_external_links(profile_url: str) -> dict[str, Any]:
    """From a Devpost author profile, return external links.

    Returns a dict with the profile URL, deduplicated off-Devpost links
    (first-seen order preserved) and any emails found on the profile page.
    """
    async with httpx.AsyncClient(timeout=20.0, follow_redirects=True) as client:
        resp = await client.get(profile_url, headers=_HTML_HEADERS)
        resp.raise_for_status()
        html = resp.text

    soup = BeautifulSoup(html, "html.parser")
    external: list[str] = []
    emails = _extract_emails(html)

    for a in soup.find_all("a", href=True):
        href: str = a["href"]
        parsed = urlparse(href)
        if parsed.scheme not in ("http", "https"):
            continue
        # BUG FIX: str.lstrip("www.") strips the *character set* {w, .}, which
        # mangles hosts like "web.dev" -> "eb.dev". removeprefix drops only a
        # literal leading "www." (project requires Python >= 3.11).
        domain = parsed.netloc.removeprefix("www.")
        if domain and domain != "devpost.com":
            external.append(href)

    return {
        "profile_url": profile_url,
        "external_links": list(dict.fromkeys(external)),
        "emails_on_profile": emails,
    }
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
async def extract_emails_from_url(url: str) -> dict[str, Any]:
    """Fetch any URL and return email addresses found on the page."""
    try:
        async with httpx.AsyncClient(timeout=20.0, follow_redirects=True) as client:
            response = await client.get(url, headers=_HTML_HEADERS)
            response.raise_for_status()
            page_html = response.text
    except Exception as exc:  # best-effort: report the failure instead of raising
        return {"url": url, "emails": [], "error": str(exc)}

    return {"url": url, "emails": _extract_emails(page_html)}
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
async def get_hackathon_participants(
    hackathon_url: str,
    jwt_token: str,
    page: int = 1,
) -> dict[str, Any]:
    """
    Fetch one page of participants from a Devpost hackathon participants page.
    Requires a valid Devpost session JWT (sent as the `_devpost` cookie).
    Returns {"participants": [...], "has_more": bool, "page": int}.
    """
    # Normalise to the hackathon base URL whether or not /participants was passed.
    base = hackathon_url.rstrip("/").removesuffix("/participants")
    url = f"{base}/participants"

    # Devpost serves participant HTML fragments via XHR; plain GET returns empty on page 2+
    headers = {
        **_JSON_HEADERS,
        "Accept": "text/javascript, application/javascript",
        "Cookie": f"_devpost={jwt_token}",
        "Referer": url,
    }

    async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
        resp = await client.get(url, params={"page": page}, headers=headers)
        resp.raise_for_status()
        html = resp.text

    soup = BeautifulSoup(html, "html.parser")

    # Each participant is a div.participant with data-participant-id
    cards = soup.select("div.participant")

    # CSS classes on participant card encode specialty, e.g. "participant full-stack-developer"
    _CARD_SKIP_CLASSES = {"participant"}

    participants: list[dict[str, Any]] = []
    for card in cards:
        link = card.select_one("a.user-profile-link")
        if not link:
            continue
        profile_href: str = link.get("href", "")
        if profile_href.startswith("/"):
            profile_href = f"https://devpost.com{profile_href}"

        # Username is the first path segment of the profile URL.
        parsed = urlparse(profile_href)
        slug = parsed.path.strip("/").split("/")[0] if parsed.path else ""

        # Name lives in img[alt] or h5 inside .user-name; fall back to the slug.
        img = card.select_one("img[alt]")
        name = img["alt"].strip() if img and img.get("alt") else slug

        # Specialty is encoded as extra CSS class on the card div
        card_classes = [c for c in card.get("class", []) if c not in _CARD_SKIP_CLASSES]
        specialty = card_classes[0].replace("-", " ").title() if card_classes else ""

        participants.append({
            "username": slug,
            "name": name,
            "profile_url": profile_href,
            "specialty": specialty,
        })

    # Pagination: Devpost renders <a rel="next"> when more pages exist
    next_link = soup.select_one('a[rel="next"]')
    has_more = next_link is not None

    return {"participants": participants, "has_more": has_more, "page": page}
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
_GITHUB_ORG_PATHS = {"orgs", "repos", "topics", "collections", "explore", "marketplace", "about"}
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
_NOREPLY_SUFFIXES = ("@users.noreply.github.com",)
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
def _github_username_from_url(github_url: str) -> str:
|
|
310
|
+
"""Extract a GitHub username from a profile URL. Returns '' for non-user URLs."""
|
|
311
|
+
parsed = urlparse(github_url)
|
|
312
|
+
path_parts = [p for p in parsed.path.strip("/").split("/") if p]
|
|
313
|
+
if len(path_parts) != 1 or path_parts[0] in _GITHUB_ORG_PATHS:
|
|
314
|
+
return ""
|
|
315
|
+
return path_parts[0]
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
def _is_real_email(email: str) -> bool:
|
|
319
|
+
"""Filter out GitHub noreply and placeholder addresses."""
|
|
320
|
+
if not email:
|
|
321
|
+
return False
|
|
322
|
+
email = email.lower().strip()
|
|
323
|
+
if email.endswith(_NOREPLY_SUFFIXES):
|
|
324
|
+
return False
|
|
325
|
+
if "noreply" in email or "github.com" in email:
|
|
326
|
+
return False
|
|
327
|
+
return "." in email.split("@")[-1]
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
async def get_github_email(github_url: str) -> str:
    """
    Try three GitHub API strategies to find a user's email:
    1. /users/{user} — public profile email field (often private)
    2. /users/{user}/repos?sort=pushed → /repos/{owner}/{repo}/commits — mine commit author email
    3. /users/{user}/events/public — fallback: PushEvent commit payloads

    Returns "" when no plausible address is found, the URL is not a user
    profile, or any network/API error occurs (failures are swallowed
    deliberately — this is best-effort enrichment).
    """
    username = _github_username_from_url(github_url)
    if not username:
        return ""

    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            # Strategy 1: profile email
            resp = await client.get(
                f"{_GITHUB_API_URL}/{username}",
                headers=_github_headers(),
            )
            if resp.status_code == 200:
                email = (resp.json().get("email") or "").strip()
                if _is_real_email(email):
                    return email

            # Strategy 2: most-recently-pushed repo → commits → author email
            resp = await client.get(
                f"{_GITHUB_API_URL}/{username}/repos",
                params={"sort": "pushed", "per_page": 3, "type": "owner"},
                headers=_github_headers(),
            )
            if resp.status_code == 200:
                for repo in resp.json():
                    full_name = repo.get("full_name", "")
                    # Skip forks: their commit history is mostly other people's.
                    if repo.get("fork") or not full_name:
                        continue
                    commit_resp = await client.get(
                        f"https://api.github.com/repos/{full_name}/commits",
                        params={"author": username, "per_page": 5},
                        headers=_github_headers(),
                    )
                    if commit_resp.status_code != 200:
                        continue
                    for c in commit_resp.json():
                        author = c.get("commit", {}).get("author", {})
                        email = (author.get("email") or "").strip().lower()
                        if _is_real_email(email):
                            return email

            # Strategy 3: PushEvent payloads (fallback, often has 0 commits)
            resp = await client.get(
                f"{_GITHUB_API_URL}/{username}/events/public",
                params={"per_page": 20},
                headers=_github_headers(),
            )
            if resp.status_code == 200:
                for event in resp.json():
                    if event.get("type") != "PushEvent":
                        continue
                    for commit in event.get("payload", {}).get("commits", []):
                        email = (commit.get("author", {}).get("email") or "").strip().lower()
                        if _is_real_email(email):
                            return email

    except Exception:
        # Best-effort: any network/JSON error means "no email found".
        pass

    return ""
|
|
396
|
+
|
|
397
|
+
|
|
398
|
+
_DEVPOST_OWNED_DOMAINS = {
|
|
399
|
+
"devpost.com", "devpost.team", "info.devpost.com",
|
|
400
|
+
"secure.devpost.com", "d2dmyh35ffsxbl.cloudfront.net",
|
|
401
|
+
"d112y698adiu2z.cloudfront.net",
|
|
402
|
+
}
|
|
403
|
+
|
|
404
|
+
|
|
405
|
+
def _is_personal_link(url: str) -> bool:
|
|
406
|
+
"""Filter out Devpost-owned links (nav, footer, CDN) from external link lists."""
|
|
407
|
+
parsed = urlparse(url)
|
|
408
|
+
domain = parsed.netloc.lstrip("www.")
|
|
409
|
+
return domain not in _DEVPOST_OWNED_DOMAINS
|
|
410
|
+
|
|
411
|
+
|
|
412
|
+
async def find_participant_email(profile_url: str) -> dict[str, Any]:
    """
    Enrich a participant from their Devpost profile:
    1. Extract GitHub URL, LinkedIn URL from profile social links
    2. Try GitHub API for public email
    3. Walk other external links for email (linktr.ee, bio.link, etc.)

    Returns a dict with profile_url, github_url, linkedin_url, email (first
    address found, "" when none) and the external links that were walked.
    """
    result: dict[str, Any] = {
        "profile_url": profile_url,
        "external_links_walked": [],
        "github_url": "",
        "linkedin_url": "",
        "email": "",
    }

    profile_data = await get_profile_external_links(profile_url)
    all_emails: list[str] = list(profile_data.get("emails_on_profile", []))

    personal_links = [l for l in profile_data.get("external_links", []) if _is_personal_link(l)]

    # First pass: capture GitHub + LinkedIn URLs
    for link in personal_links:
        parsed = urlparse(link)
        # BUG FIX: lstrip("www.") strips the character set {w, .} (mangling
        # hosts like "web.dev"); removeprefix drops only a literal "www.".
        domain = parsed.netloc.removeprefix("www.")
        path_parts = [p for p in parsed.path.strip("/").split("/") if p]

        if domain == "github.com" and path_parts and not result["github_url"]:
            # Only single-segment paths are user profiles (not repos/org pages).
            if path_parts[0] not in _GITHUB_ORG_PATHS and len(path_parts) == 1:
                result["github_url"] = link

        if domain == "linkedin.com" and "/in/" in parsed.path and not result["linkedin_url"]:
            if "/company/" not in parsed.path:
                result["linkedin_url"] = link

    # Try GitHub API for public email
    if result["github_url"] and not all_emails:
        email = await get_github_email(result["github_url"])
        if email:
            all_emails.append(email)

    # Walk remaining external links for email
    if not all_emails:
        for link in personal_links:
            parsed = urlparse(link)
            domain = parsed.netloc.removeprefix("www.")
            if domain in ("github.com", "linkedin.com"):
                continue
            if domain not in _WALKABLE_DOMAINS:
                continue
            result["external_links_walked"].append(link)
            link_data = await extract_emails_from_url(link)
            all_emails.extend(link_data.get("emails", []))
            if all_emails:
                break  # stop at the first link that yields an address

    result["email"] = all_emails[0] if all_emails else ""
    return result
|
|
469
|
+
|
|
470
|
+
|
|
471
|
+
async def find_author_email(project_url: str) -> dict[str, Any]:
    """
    Full chain: project page → author profile(s) → external links → emails.
    Returns the first email found along with the chain of URLs walked.
    """
    result: dict[str, Any] = {
        "project_url": project_url,
        "author_profile_urls": [],
        "external_links_walked": [],
        "email": "",
    }

    # Step 1: get author profiles from project page
    profiles_data = await get_author_profile_urls(project_url)
    author_urls: list[str] = profiles_data.get("author_profile_urls", [])
    result["author_profile_urls"] = author_urls

    all_emails: list[str] = []

    for profile_url in author_urls[:3]:  # cap at 3 authors
        profile_data = await get_profile_external_links(profile_url)

        # Emails directly on profile page
        all_emails.extend(profile_data.get("emails_on_profile", []))

        # Walk external links from profile
        for link in profile_data.get("external_links", []):
            parsed = urlparse(link)
            # BUG FIX: lstrip("www.") strips the character set {w, .} (mangling
            # hosts like "web.dev"); removeprefix drops only a literal "www.".
            domain = parsed.netloc.removeprefix("www.")
            if domain not in _WALKABLE_DOMAINS:
                continue
            result["external_links_walked"].append(link)
            link_data = await extract_emails_from_url(link)
            all_emails.extend(link_data.get("emails", []))

        if all_emails:
            break  # stop after first author with a result

    result["email"] = all_emails[0] if all_emails else ""
    return result
|