signalforge-cli 0.4.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- signalforge_cli-0.4.1/.env.example +18 -0
- signalforge_cli-0.4.1/.gitignore +13 -0
- signalforge_cli-0.4.1/PKG-INFO +223 -0
- signalforge_cli-0.4.1/README.md +211 -0
- signalforge_cli-0.4.1/pyproject.toml +30 -0
- signalforge_cli-0.4.1/src/devpost_scraper/__init__.py +0 -0
- signalforge_cli-0.4.1/src/devpost_scraper/backboard_client.py +135 -0
- signalforge_cli-0.4.1/src/devpost_scraper/cli.py +1094 -0
- signalforge_cli-0.4.1/src/devpost_scraper/csv_export.py +30 -0
- signalforge_cli-0.4.1/src/devpost_scraper/customerio.py +153 -0
- signalforge_cli-0.4.1/src/devpost_scraper/db.py +373 -0
- signalforge_cli-0.4.1/src/devpost_scraper/models.py +157 -0
- signalforge_cli-0.4.1/src/devpost_scraper/scraper.py +648 -0
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# Backboard API key — get yours at https://app.backboard.io
|
|
2
|
+
BACKBOARD_API_KEY=
|
|
3
|
+
|
|
4
|
+
# Persisted automatically after first run — do not delete
|
|
5
|
+
DEVPOST_ASSISTANT_ID=
|
|
6
|
+
|
|
7
|
+
# Devpost session cookie (_devpost) for authenticated endpoints (e.g. /participants)
|
|
8
|
+
# Copy the _devpost cookie value from your browser DevTools after logging in
|
|
9
|
+
DEVPOST_SESSION=
|
|
10
|
+
|
|
11
|
+
# GitHub personal access token for higher API rate limits (5000/hr vs 60/hr)
|
|
12
|
+
# Generate at https://github.com/settings/tokens (no scopes needed, public data only)
|
|
13
|
+
GITHUB_TOKEN=
|
|
14
|
+
|
|
15
|
+
# Customer.io Track API credentials (required for --emit-events)
|
|
16
|
+
# Find at https://fly.customer.io/settings/api_credentials
|
|
17
|
+
CUSTOMERIO_SITE_ID=
|
|
18
|
+
CUSTOMERIO_API_KEY=
|
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: signalforge-cli
|
|
3
|
+
Version: 0.4.1
|
|
4
|
+
Summary: CLI for extracting emails and sending customer.io events
|
|
5
|
+
Requires-Python: >=3.11
|
|
6
|
+
Requires-Dist: backboard-sdk>=1.5.9
|
|
7
|
+
Requires-Dist: beautifulsoup4>=4.12.0
|
|
8
|
+
Requires-Dist: httpx>=0.27.0
|
|
9
|
+
Requires-Dist: pydantic>=2.7.0
|
|
10
|
+
Requires-Dist: python-dotenv>=1.0.1
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
|
|
13
|
+
# SignalForge
|
|
14
|
+
|
|
15
|
+
```
|
|
16
|
+
_____ _ ________
|
|
17
|
+
/ ___/(_)___ _____ ____ _/ / ____/___ _________ ____
|
|
18
|
+
\__ \/ / __ `/ __ \/ __ `/ / /_ / __ \/ ___/ __ `/ _ \
|
|
19
|
+
___/ / / /_/ / / / / /_/ / / __/ / /_/ / / / /_/ / __/
|
|
20
|
+
/____/_/\__, /_/ /_/\__,_/_/_/ \____/_/ \__, /\___/
|
|
21
|
+
/____/ /____/
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
SignalForge is a CLI toolkit for mining developer signals from public sources,
|
|
25
|
+
enriching them with emails, storing results in SQLite, and emitting Customer.io events.
|
|
26
|
+
|
|
27
|
+
Commands:
|
|
28
|
+
|
|
29
|
+
| Command | Purpose |
|
|
30
|
+
|---|---|
|
|
31
|
+
| `signalforge` | Search Devpost projects by keyword, enrich with emails, export CSV |
|
|
32
|
+
| `signalforge-participants` | Scrape a single hackathon's participant list, export CSV |
|
|
33
|
+
| `signalforge-harvest` | Walk the hackathon listing, scrape all participants, store in SQLite, emit delta events |
|
|
34
|
+
| `signalforge-github-forks` | Mine GitHub fork owners and optionally enrich with emails |
|
|
35
|
+
| `signalforge-rb2b` | Import RB2B visitor CSVs, store in SQLite, emit `visited_site` events |
|
|
36
|
+
|
|
37
|
+
## Requirements
|
|
38
|
+
|
|
39
|
+
- Python 3.11+
|
|
40
|
+
- [`uv`](https://docs.astral.sh/uv/)
|
|
41
|
+
- A [Backboard](https://app.backboard.io) API key (for `signalforge` only)
|
|
42
|
+
|
|
43
|
+
## Install
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
uv sync
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## Environment
|
|
50
|
+
|
|
51
|
+
Copy `.env.example` → `.env` and fill in:
|
|
52
|
+
|
|
53
|
+
| Variable | Required for | Notes |
|
|
54
|
+
|---|---|---|
|
|
55
|
+
| `BACKBOARD_API_KEY` | `signalforge` | Backboard account key |
|
|
56
|
+
| `DEVPOST_ASSISTANT_ID` | auto | Persisted on first run |
|
|
57
|
+
| `DEVPOST_SESSION` | `signalforge-participants`, `signalforge-harvest` | `_devpost` cookie from browser DevTools |
|
|
58
|
+
| `GITHUB_TOKEN` | optional | GitHub PAT for 5000 req/hr (vs 60). No scopes needed |
|
|
59
|
+
| `CUSTOMERIO_SITE_ID` | `--emit-events` | Customer.io Track API |
|
|
60
|
+
| `CUSTOMERIO_API_KEY` | `--emit-events` | Customer.io Track API |
|
|
61
|
+
|
|
62
|
+
---
|
|
63
|
+
|
|
64
|
+
## signalforge
|
|
65
|
+
|
|
66
|
+
Search Devpost projects by keyword, enrich each with detail page + author email, export CSV.
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
uv run signalforge "ai agents" --output results.csv
|
|
70
|
+
uv run signalforge "climate tech" "developer tools" -o results.csv
|
|
71
|
+
|
|
72
|
+
# Or via start.sh
|
|
73
|
+
./start.sh "ai agents" --output results.csv
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
---
|
|
77
|
+
|
|
78
|
+
## signalforge-participants
|
|
79
|
+
|
|
80
|
+
Scrape a single hackathon's participant list and export to CSV.
|
|
81
|
+
|
|
82
|
+
```bash
|
|
83
|
+
# First time — pass session cookie
|
|
84
|
+
uv run signalforge-participants "https://authorizedtoact.devpost.com/participants" \
|
|
85
|
+
--jwt "<_devpost cookie value>" -o participants.csv
|
|
86
|
+
|
|
87
|
+
# Reuse saved session from .env
|
|
88
|
+
uv run signalforge-participants "https://authorizedtoact.devpost.com/participants" -o out.csv
|
|
89
|
+
|
|
90
|
+
# Skip email enrichment
|
|
91
|
+
uv run signalforge-participants "https://..." --no-email -o out.csv
|
|
92
|
+
|
|
93
|
+
# Emit Customer.io events after scrape
|
|
94
|
+
uv run signalforge-participants "https://..." --emit-events -o out.csv
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
---
|
|
98
|
+
|
|
99
|
+
## signalforge-harvest
|
|
100
|
+
|
|
101
|
+
Automated pipeline: walk the hackathon listing → scrape participants → store in SQLite → emit Customer.io events for delta (new) participants.
|
|
102
|
+
|
|
103
|
+
### Basic usage
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
# Scrape 3 pages of open hackathons (27 hackathons), enrich new participants, emit events
|
|
107
|
+
uv run signalforge-harvest --emit-events
|
|
108
|
+
|
|
109
|
+
# Fast first run — scrape without email enrichment
|
|
110
|
+
uv run signalforge-harvest --no-email
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
### Flags
|
|
114
|
+
|
|
115
|
+
| Flag | Default | Description |
|
|
116
|
+
|---|---|---|
|
|
117
|
+
| `--pages N` | `3` | Number of hackathon listing pages to fetch (9 per page) |
|
|
118
|
+
| `--hackathons N` | `0` (all) | Only process the first N hackathons from the listing |
|
|
119
|
+
| `--jwt TOKEN` | `.env` | Devpost `_devpost` session cookie |
|
|
120
|
+
| `--db PATH` | `devpost_harvest.db` | SQLite database path |
|
|
121
|
+
| `--status {open,ended,upcoming}` | `open` | Hackathon status filter (repeatable) |
|
|
122
|
+
| `--max-participants N` | `0` (unlimited) | Cap participants scraped per hackathon |
|
|
123
|
+
| `--no-email` | off | Skip email enrichment entirely (even for new participants) |
|
|
124
|
+
| `--emit-events` | off | Emit Customer.io events for unemitted participants during scrape |
|
|
125
|
+
| `--emit-unsent` | off | Skip scraping — just emit events for all unsent participants in DB |
|
|
126
|
+
| `--rescrape` | off | Re-scrape hackathons already scraped in a previous run |
|
|
127
|
+
|
|
128
|
+
### How it works
|
|
129
|
+
|
|
130
|
+
```
|
|
131
|
+
Phase 1: Discover hackathons
|
|
132
|
+
GET /api/hackathons?status[]=open → paginated JSON listing
|
|
133
|
+
|
|
134
|
+
Phase 2: Per hackathon
|
|
135
|
+
2a. Fast scan — scrape all participant pages (no enrichment, ~1 req per 20 participants)
|
|
136
|
+
2b. Upsert into SQLite → detect delta (new participants not previously in DB)
|
|
137
|
+
2c. Email-enrich delta only — GitHub API + link walking (skipped with --no-email)
|
|
138
|
+
2d. Emit Customer.io events for unemitted participants (only with --emit-events)
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
### Delta logic
|
|
142
|
+
|
|
143
|
+
On subsequent runs, the fast scan re-fetches participant lists but only new participants
|
|
144
|
+
(not previously in SQLite) get the expensive email enrichment. Already-emitted participants
|
|
145
|
+
are never re-emitted. This makes re-runs fast and safe to repeat.
|
|
146
|
+
|
|
147
|
+
### Common workflows
|
|
148
|
+
|
|
149
|
+
```bash
|
|
150
|
+
# Initial bulk scrape (no events yet)
|
|
151
|
+
uv run signalforge-harvest --pages 5
|
|
152
|
+
|
|
153
|
+
# Emit all unsent events from the DB (no scraping, no JWT needed)
|
|
154
|
+
uv run signalforge-harvest --emit-unsent
|
|
155
|
+
|
|
156
|
+
# Quick delta check on first hackathon only
|
|
157
|
+
uv run signalforge-harvest --hackathons 1 --rescrape --emit-events
|
|
158
|
+
|
|
159
|
+
# Re-scan all hackathons for new participants, enrich + emit
|
|
160
|
+
uv run signalforge-harvest --rescrape --emit-events
|
|
161
|
+
|
|
162
|
+
# Include ended hackathons
|
|
163
|
+
uv run signalforge-harvest --status open --status ended
|
|
164
|
+
|
|
165
|
+
# Fast delta scan (skip email enrichment for new participants too)
|
|
166
|
+
uv run signalforge-harvest --rescrape --no-email
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
### SQLite schema
|
|
170
|
+
|
|
171
|
+
The database (`devpost_harvest.db`) has two tables:
|
|
172
|
+
|
|
173
|
+
- **`hackathons`** — id, url, title, org, state, dates, registrations, prize, themes.
|
|
174
|
+
`last_scraped_at` is set after participants are scraped.
|
|
175
|
+
- **`participants`** — (hackathon_url, username) primary key, enrichment fields,
|
|
176
|
+
`first_seen_at`, `last_seen_at`, `event_emitted_at`.
|
|
177
|
+
|
|
178
|
+
### Customer.io events
|
|
179
|
+
|
|
180
|
+
Event name: `devpost_hackathon`. Uses participant email as the Customer.io user ID.
|
|
181
|
+
|
|
182
|
+
Event data: hackathon_url, hackathon_title, username, name, specialty, profile_url, github_url, linkedin_url.
|
|
183
|
+
|
|
184
|
+
Email templates in `emails/` use `{{customer.first_name}}` and `{{event.*}}` Liquid variables.
|
|
185
|
+
|
|
186
|
+
---
|
|
187
|
+
|
|
188
|
+
## signalforge-github-forks
|
|
189
|
+
|
|
190
|
+
Mine fork owners and enrich with emails (optional), stored in the same SQLite DB
|
|
191
|
+
under a synthetic `hackathon_url` like `github:forks:owner/repo`.
|
|
192
|
+
|
|
193
|
+
```bash
|
|
194
|
+
# Presets
|
|
195
|
+
uv run signalforge-github-forks --preset mem0 --emit-events
|
|
196
|
+
uv run signalforge-github-forks --preset supermemory --no-email
|
|
197
|
+
|
|
198
|
+
# Custom repo
|
|
199
|
+
uv run signalforge-github-forks --repo owner/repo --limit 1000 --mode first_n
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
---
|
|
203
|
+
|
|
204
|
+
## signalforge-rb2b
|
|
205
|
+
|
|
206
|
+
Import RB2B visitor exports into SQLite and emit Customer.io `visited_site` events
|
|
207
|
+
for identified visitors.
|
|
208
|
+
|
|
209
|
+
```bash
|
|
210
|
+
# Import CSV(s) and emit events for newly added identified visitors
|
|
211
|
+
uv run signalforge-rb2b daily_2026-03-*.csv --emit-events
|
|
212
|
+
|
|
213
|
+
# Emit any unsent identified visitors from the DB
|
|
214
|
+
uv run signalforge-rb2b --emit-unsent
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
---
|
|
218
|
+
|
|
219
|
+
## Development
|
|
220
|
+
|
|
221
|
+
```bash
|
|
222
|
+
uv run python -m devpost_scraper.cli "ai agents" --output out.csv
|
|
223
|
+
```
|
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
# SignalForge
|
|
2
|
+
|
|
3
|
+
```
|
|
4
|
+
_____ _ ________
|
|
5
|
+
/ ___/(_)___ _____ ____ _/ / ____/___ _________ ____
|
|
6
|
+
\__ \/ / __ `/ __ \/ __ `/ / /_ / __ \/ ___/ __ `/ _ \
|
|
7
|
+
___/ / / /_/ / / / / /_/ / / __/ / /_/ / / / /_/ / __/
|
|
8
|
+
/____/_/\__, /_/ /_/\__,_/_/_/ \____/_/ \__, /\___/
|
|
9
|
+
/____/ /____/
|
|
10
|
+
```
|
|
11
|
+
|
|
12
|
+
SignalForge is a CLI toolkit for mining developer signals from public sources,
|
|
13
|
+
enriching them with emails, storing results in SQLite, and emitting Customer.io events.
|
|
14
|
+
|
|
15
|
+
Commands:
|
|
16
|
+
|
|
17
|
+
| Command | Purpose |
|
|
18
|
+
|---|---|
|
|
19
|
+
| `signalforge` | Search Devpost projects by keyword, enrich with emails, export CSV |
|
|
20
|
+
| `signalforge-participants` | Scrape a single hackathon's participant list, export CSV |
|
|
21
|
+
| `signalforge-harvest` | Walk the hackathon listing, scrape all participants, store in SQLite, emit delta events |
|
|
22
|
+
| `signalforge-github-forks` | Mine GitHub fork owners and optionally enrich with emails |
|
|
23
|
+
| `signalforge-rb2b` | Import RB2B visitor CSVs, store in SQLite, emit `visited_site` events |
|
|
24
|
+
|
|
25
|
+
## Requirements
|
|
26
|
+
|
|
27
|
+
- Python 3.11+
|
|
28
|
+
- [`uv`](https://docs.astral.sh/uv/)
|
|
29
|
+
- A [Backboard](https://app.backboard.io) API key (for `signalforge` only)
|
|
30
|
+
|
|
31
|
+
## Install
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
uv sync
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Environment
|
|
38
|
+
|
|
39
|
+
Copy `.env.example` → `.env` and fill in:
|
|
40
|
+
|
|
41
|
+
| Variable | Required for | Notes |
|
|
42
|
+
|---|---|---|
|
|
43
|
+
| `BACKBOARD_API_KEY` | `signalforge` | Backboard account key |
|
|
44
|
+
| `DEVPOST_ASSISTANT_ID` | auto | Persisted on first run |
|
|
45
|
+
| `DEVPOST_SESSION` | `signalforge-participants`, `signalforge-harvest` | `_devpost` cookie from browser DevTools |
|
|
46
|
+
| `GITHUB_TOKEN` | optional | GitHub PAT for 5000 req/hr (vs 60). No scopes needed |
|
|
47
|
+
| `CUSTOMERIO_SITE_ID` | `--emit-events` | Customer.io Track API |
|
|
48
|
+
| `CUSTOMERIO_API_KEY` | `--emit-events` | Customer.io Track API |
|
|
49
|
+
|
|
50
|
+
---
|
|
51
|
+
|
|
52
|
+
## signalforge
|
|
53
|
+
|
|
54
|
+
Search Devpost projects by keyword, enrich each with detail page + author email, export CSV.
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
uv run signalforge "ai agents" --output results.csv
|
|
58
|
+
uv run signalforge "climate tech" "developer tools" -o results.csv
|
|
59
|
+
|
|
60
|
+
# Or via start.sh
|
|
61
|
+
./start.sh "ai agents" --output results.csv
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
---
|
|
65
|
+
|
|
66
|
+
## signalforge-participants
|
|
67
|
+
|
|
68
|
+
Scrape a single hackathon's participant list and export to CSV.
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
# First time — pass session cookie
|
|
72
|
+
uv run signalforge-participants "https://authorizedtoact.devpost.com/participants" \
|
|
73
|
+
--jwt "<_devpost cookie value>" -o participants.csv
|
|
74
|
+
|
|
75
|
+
# Reuse saved session from .env
|
|
76
|
+
uv run signalforge-participants "https://authorizedtoact.devpost.com/participants" -o out.csv
|
|
77
|
+
|
|
78
|
+
# Skip email enrichment
|
|
79
|
+
uv run signalforge-participants "https://..." --no-email -o out.csv
|
|
80
|
+
|
|
81
|
+
# Emit Customer.io events after scrape
|
|
82
|
+
uv run signalforge-participants "https://..." --emit-events -o out.csv
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
---
|
|
86
|
+
|
|
87
|
+
## signalforge-harvest
|
|
88
|
+
|
|
89
|
+
Automated pipeline: walk the hackathon listing → scrape participants → store in SQLite → emit Customer.io events for delta (new) participants.
|
|
90
|
+
|
|
91
|
+
### Basic usage
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
# Scrape 3 pages of open hackathons (27 hackathons), enrich new participants, emit events
|
|
95
|
+
uv run signalforge-harvest --emit-events
|
|
96
|
+
|
|
97
|
+
# Fast first run — scrape without email enrichment
|
|
98
|
+
uv run signalforge-harvest --no-email
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
### Flags
|
|
102
|
+
|
|
103
|
+
| Flag | Default | Description |
|
|
104
|
+
|---|---|---|
|
|
105
|
+
| `--pages N` | `3` | Number of hackathon listing pages to fetch (9 per page) |
|
|
106
|
+
| `--hackathons N` | `0` (all) | Only process the first N hackathons from the listing |
|
|
107
|
+
| `--jwt TOKEN` | `.env` | Devpost `_devpost` session cookie |
|
|
108
|
+
| `--db PATH` | `devpost_harvest.db` | SQLite database path |
|
|
109
|
+
| `--status {open,ended,upcoming}` | `open` | Hackathon status filter (repeatable) |
|
|
110
|
+
| `--max-participants N` | `0` (unlimited) | Cap participants scraped per hackathon |
|
|
111
|
+
| `--no-email` | off | Skip email enrichment entirely (even for new participants) |
|
|
112
|
+
| `--emit-events` | off | Emit Customer.io events for unemitted participants during scrape |
|
|
113
|
+
| `--emit-unsent` | off | Skip scraping — just emit events for all unsent participants in DB |
|
|
114
|
+
| `--rescrape` | off | Re-scrape hackathons already scraped in a previous run |
|
|
115
|
+
|
|
116
|
+
### How it works
|
|
117
|
+
|
|
118
|
+
```
|
|
119
|
+
Phase 1: Discover hackathons
|
|
120
|
+
GET /api/hackathons?status[]=open → paginated JSON listing
|
|
121
|
+
|
|
122
|
+
Phase 2: Per hackathon
|
|
123
|
+
2a. Fast scan — scrape all participant pages (no enrichment, ~1 req per 20 participants)
|
|
124
|
+
2b. Upsert into SQLite → detect delta (new participants not previously in DB)
|
|
125
|
+
2c. Email-enrich delta only — GitHub API + link walking (skipped with --no-email)
|
|
126
|
+
2d. Emit Customer.io events for unemitted participants (only with --emit-events)
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
### Delta logic
|
|
130
|
+
|
|
131
|
+
On subsequent runs, the fast scan re-fetches participant lists but only new participants
|
|
132
|
+
(not previously in SQLite) get the expensive email enrichment. Already-emitted participants
|
|
133
|
+
are never re-emitted. This makes re-runs fast and safe to repeat.
|
|
134
|
+
|
|
135
|
+
### Common workflows
|
|
136
|
+
|
|
137
|
+
```bash
|
|
138
|
+
# Initial bulk scrape (no events yet)
|
|
139
|
+
uv run signalforge-harvest --pages 5
|
|
140
|
+
|
|
141
|
+
# Emit all unsent events from the DB (no scraping, no JWT needed)
|
|
142
|
+
uv run signalforge-harvest --emit-unsent
|
|
143
|
+
|
|
144
|
+
# Quick delta check on first hackathon only
|
|
145
|
+
uv run signalforge-harvest --hackathons 1 --rescrape --emit-events
|
|
146
|
+
|
|
147
|
+
# Re-scan all hackathons for new participants, enrich + emit
|
|
148
|
+
uv run signalforge-harvest --rescrape --emit-events
|
|
149
|
+
|
|
150
|
+
# Include ended hackathons
|
|
151
|
+
uv run signalforge-harvest --status open --status ended
|
|
152
|
+
|
|
153
|
+
# Fast delta scan (skip email enrichment for new participants too)
|
|
154
|
+
uv run signalforge-harvest --rescrape --no-email
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
### SQLite schema
|
|
158
|
+
|
|
159
|
+
The database (`devpost_harvest.db`) has two tables:
|
|
160
|
+
|
|
161
|
+
- **`hackathons`** — id, url, title, org, state, dates, registrations, prize, themes.
|
|
162
|
+
`last_scraped_at` is set after participants are scraped.
|
|
163
|
+
- **`participants`** — (hackathon_url, username) primary key, enrichment fields,
|
|
164
|
+
`first_seen_at`, `last_seen_at`, `event_emitted_at`.
|
|
165
|
+
|
|
166
|
+
### Customer.io events
|
|
167
|
+
|
|
168
|
+
Event name: `devpost_hackathon`. Uses participant email as the Customer.io user ID.
|
|
169
|
+
|
|
170
|
+
Event data: hackathon_url, hackathon_title, username, name, specialty, profile_url, github_url, linkedin_url.
|
|
171
|
+
|
|
172
|
+
Email templates in `emails/` use `{{customer.first_name}}` and `{{event.*}}` Liquid variables.
|
|
173
|
+
|
|
174
|
+
---
|
|
175
|
+
|
|
176
|
+
## signalforge-github-forks
|
|
177
|
+
|
|
178
|
+
Mine fork owners and enrich with emails (optional), stored in the same SQLite DB
|
|
179
|
+
under a synthetic `hackathon_url` like `github:forks:owner/repo`.
|
|
180
|
+
|
|
181
|
+
```bash
|
|
182
|
+
# Presets
|
|
183
|
+
uv run signalforge-github-forks --preset mem0 --emit-events
|
|
184
|
+
uv run signalforge-github-forks --preset supermemory --no-email
|
|
185
|
+
|
|
186
|
+
# Custom repo
|
|
187
|
+
uv run signalforge-github-forks --repo owner/repo --limit 1000 --mode first_n
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
---
|
|
191
|
+
|
|
192
|
+
## signalforge-rb2b
|
|
193
|
+
|
|
194
|
+
Import RB2B visitor exports into SQLite and emit Customer.io `visited_site` events
|
|
195
|
+
for identified visitors.
|
|
196
|
+
|
|
197
|
+
```bash
|
|
198
|
+
# Import CSV(s) and emit events for newly added identified visitors
|
|
199
|
+
uv run signalforge-rb2b daily_2026-03-*.csv --emit-events
|
|
200
|
+
|
|
201
|
+
# Emit any unsent identified visitors from the DB
|
|
202
|
+
uv run signalforge-rb2b --emit-unsent
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
---
|
|
206
|
+
|
|
207
|
+
## Development
|
|
208
|
+
|
|
209
|
+
```bash
|
|
210
|
+
uv run python -m devpost_scraper.cli "ai agents" --output out.csv
|
|
211
|
+
```
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "signalforge-cli"
|
|
3
|
+
version = "0.4.1"
|
|
4
|
+
description = "CLI for extracting emails and sending customer.io events"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.11"
|
|
7
|
+
dependencies = [
|
|
8
|
+
"backboard-sdk>=1.5.9",
|
|
9
|
+
"beautifulsoup4>=4.12.0",
|
|
10
|
+
"httpx>=0.27.0",
|
|
11
|
+
"pydantic>=2.7.0",
|
|
12
|
+
"python-dotenv>=1.0.1",
|
|
13
|
+
]
|
|
14
|
+
|
|
15
|
+
[project.scripts]
|
|
16
|
+
signalforge = "devpost_scraper.cli:main"
|
|
17
|
+
signalforge-participants = "devpost_scraper.cli:participants_main"
|
|
18
|
+
signalforge-harvest = "devpost_scraper.cli:harvest_main"
|
|
19
|
+
signalforge-github-forks = "devpost_scraper.cli:github_forks_main"
|
|
20
|
+
signalforge-rb2b = "devpost_scraper.cli:rb2b_main"
|
|
21
|
+
|
|
22
|
+
[build-system]
|
|
23
|
+
requires = ["hatchling"]
|
|
24
|
+
build-backend = "hatchling.build"
|
|
25
|
+
|
|
26
|
+
[tool.hatch.build.targets.wheel]
|
|
27
|
+
packages = ["src/devpost_scraper"]
|
|
28
|
+
|
|
29
|
+
[tool.uv]
|
|
30
|
+
package = true
|
|
File without changes
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
5
|
+
from typing import Any, Awaitable, Callable, Mapping
|
|
6
|
+
|
|
7
|
+
from backboard import BackboardClient
|
|
8
|
+
from backboard.exceptions import BackboardAPIError
|
|
9
|
+
|
|
10
|
+
ToolHandler = Callable[[dict[str, Any]], Awaitable[dict[str, Any]]]
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class BackboardClientError(Exception):
    """Module-level error raised whenever a Backboard operation fails."""
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def build_client() -> BackboardClient:
    """Construct a ``BackboardClient`` from the ``BACKBOARD_API_KEY`` env var.

    Raises:
        BackboardClientError: when the variable is unset or blank.
    """
    key = os.getenv("BACKBOARD_API_KEY", "").strip()
    if key:
        return BackboardClient(api_key=key)
    raise BackboardClientError(
        "Missing required environment variable `BACKBOARD_API_KEY`."
    )
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
async def ensure_assistant(
    client: BackboardClient,
    *,
    assistant_id: str | None,
    name: str,
    system_prompt: str,
    tools: list[dict[str, Any]],
) -> str:
    """Return an assistant id, creating a new assistant when none is given.

    A truthy *assistant_id* is returned unchanged; otherwise an assistant is
    created through *client* and its id is returned as a string.
    """
    if not assistant_id:
        created = await client.create_assistant(
            name=name,
            system_prompt=system_prompt,
            tools=tools,
        )
        assistant_id = str(created.assistant_id)
    return assistant_id
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
async def _collect_stream(stream: Any) -> dict[str, Any]:
|
|
45
|
+
"""Drain a streaming add_message response into a unified result dict."""
|
|
46
|
+
content_parts: list[str] = []
|
|
47
|
+
tool_calls: list[Any] = []
|
|
48
|
+
run_id: str | None = None
|
|
49
|
+
status = "completed"
|
|
50
|
+
|
|
51
|
+
async for chunk in stream:
|
|
52
|
+
t = chunk.get("type")
|
|
53
|
+
if t == "content_streaming":
|
|
54
|
+
content_parts.append(chunk.get("content", ""))
|
|
55
|
+
elif t == "tool_submit_required":
|
|
56
|
+
status = "REQUIRES_ACTION"
|
|
57
|
+
run_id = chunk.get("run_id")
|
|
58
|
+
tool_calls = chunk.get("tool_calls", [])
|
|
59
|
+
elif t == "run_ended":
|
|
60
|
+
if chunk.get("status") not in (None, "completed"):
|
|
61
|
+
raise BackboardClientError(
|
|
62
|
+
f"Run ended with status: {chunk.get('status')}"
|
|
63
|
+
)
|
|
64
|
+
|
|
65
|
+
return {
|
|
66
|
+
"content": "".join(content_parts) or None,
|
|
67
|
+
"status": status,
|
|
68
|
+
"tool_calls": tool_calls,
|
|
69
|
+
"run_id": run_id,
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
async def run_in_thread(
|
|
74
|
+
client: BackboardClient,
|
|
75
|
+
*,
|
|
76
|
+
assistant_id: str,
|
|
77
|
+
user_message: str,
|
|
78
|
+
tool_handlers: Mapping[str, ToolHandler],
|
|
79
|
+
llm_provider: str = "openai",
|
|
80
|
+
model_name: str = "gpt-4o-mini",
|
|
81
|
+
max_tool_rounds: int = 6,
|
|
82
|
+
) -> str:
|
|
83
|
+
"""Create a thread, send a message via streaming, execute the tool loop."""
|
|
84
|
+
thread = await client.create_thread(assistant_id)
|
|
85
|
+
|
|
86
|
+
stream = await client.add_message(
|
|
87
|
+
thread_id=thread.thread_id,
|
|
88
|
+
content=user_message,
|
|
89
|
+
stream=True,
|
|
90
|
+
llm_provider=llm_provider,
|
|
91
|
+
model_name=model_name,
|
|
92
|
+
)
|
|
93
|
+
result = await _collect_stream(stream)
|
|
94
|
+
|
|
95
|
+
rounds = 0
|
|
96
|
+
while result["status"] == "REQUIRES_ACTION":
|
|
97
|
+
rounds += 1
|
|
98
|
+
if rounds > max_tool_rounds:
|
|
99
|
+
raise BackboardClientError(
|
|
100
|
+
f"Tool loop exceeded {max_tool_rounds} rounds — aborting."
|
|
101
|
+
)
|
|
102
|
+
if not result["run_id"]:
|
|
103
|
+
raise BackboardClientError("REQUIRES_ACTION without run_id.")
|
|
104
|
+
if not result["tool_calls"]:
|
|
105
|
+
raise BackboardClientError("REQUIRES_ACTION without tool_calls.")
|
|
106
|
+
|
|
107
|
+
tool_outputs = []
|
|
108
|
+
for tc in result["tool_calls"]:
|
|
109
|
+
name = tc["function"]["name"] if isinstance(tc, dict) else tc.function.name
|
|
110
|
+
args_raw = (
|
|
111
|
+
tc["function"].get("arguments", "{}")
|
|
112
|
+
if isinstance(tc, dict)
|
|
113
|
+
else (tc.function.arguments or "{}")
|
|
114
|
+
)
|
|
115
|
+
args = args_raw if isinstance(args_raw, dict) else json.loads(args_raw or "{}")
|
|
116
|
+
tc_id = tc["id"] if isinstance(tc, dict) else tc.id
|
|
117
|
+
|
|
118
|
+
handler = tool_handlers.get(name)
|
|
119
|
+
if handler is None:
|
|
120
|
+
raise BackboardClientError(f"No handler registered for tool `{name}`.")
|
|
121
|
+
|
|
122
|
+
call_result = await handler(args)
|
|
123
|
+
tool_outputs.append({"tool_call_id": tc_id, "output": json.dumps(call_result)})
|
|
124
|
+
|
|
125
|
+
stream = await client.submit_tool_outputs(
|
|
126
|
+
thread_id=thread.thread_id,
|
|
127
|
+
run_id=result["run_id"],
|
|
128
|
+
tool_outputs=tool_outputs,
|
|
129
|
+
stream=True,
|
|
130
|
+
)
|
|
131
|
+
result = await _collect_stream(stream)
|
|
132
|
+
|
|
133
|
+
if not result["content"]:
|
|
134
|
+
raise BackboardClientError("Run completed without content.")
|
|
135
|
+
return result["content"]
|