prospector_engine 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +73 -56
- data/lib/prospector/engine.rb +10 -2
- data/lib/prospector/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 8e754e44af2ba6797161e333fc790c36f3a5e8b79e30deb4dc5eef78ddb820c1
|
|
4
|
+
data.tar.gz: cf8338a50a7dfb29f2814b250dea974d69306c1e5c3ae7d69272663b4cd8cd43
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: dc08f57c755577e9589a729e671d21b3729dad5a81a6e0b8129ffe432eb07edaaaba8a0ad755827c45247616f2a11517a0750ed295d5bdb498c22461401b6648
|
|
7
|
+
data.tar.gz: 22026bf0d973173b0c22d4b3c4648a0ac9957b83ae8dd164dab0cee6e3256bb3aa29f64a101207ed3f8084d1416337713df738835d6e1416abba16d16dfe8113
|
data/README.md
CHANGED
|
@@ -1,20 +1,21 @@
|
|
|
1
1
|
# Prospector
|
|
2
2
|
|
|
3
|
-
A Rails engine for discovering businesses from multiple sources with AI-powered keyword generation and classification.
|
|
3
|
+
A Rails engine for discovering businesses from multiple sources with AI-powered keyword generation, contact enrichment, and classification.
|
|
4
4
|
|
|
5
|
-
Prospector handles the full discovery pipeline: generate search keywords with AI, fetch business listings from external APIs, classify results for domain relevance, and present an admin review interface -- all inside your existing Rails app.
|
|
5
|
+
Prospector handles the full discovery pipeline: generate search keywords with AI, fetch business listings from external APIs, scrape websites for contact info, classify results for domain relevance, and present an admin review interface -- all inside your existing Rails app.
|
|
6
6
|
|
|
7
7
|
## Features
|
|
8
8
|
|
|
9
|
-
- **Multi-source adapters** -- Ships with Google Places. Pluggable interface for adding Yelp, Bing,
|
|
9
|
+
- **Multi-source adapters** -- Ships with Google Places. Pluggable interface for adding Yelp, Bing, or custom sources.
|
|
10
|
+
- **Contact enrichment** -- Automatically scrapes candidate websites for email and social links (Facebook, Instagram, LinkedIn, TikTok, YouTube). SSRF-protected with IP validation on every redirect hop.
|
|
10
11
|
- **AI keyword generation** -- On-demand LLM-powered keyword generation for any business domain. Keywords are stored and reused across runs.
|
|
11
12
|
- **AI classification** -- Automatically classifies discovered businesses for domain relevance. Non-relevant results are auto-rejected.
|
|
12
|
-
- **Flexible geography** -- Search by metro area, city, coordinates + radius, ZIP code, or bounding box.
|
|
13
|
-
- **Admin UI** -- Mountable admin interface with self-contained
|
|
13
|
+
- **Flexible geography** -- Search by metro area, city, coordinates + radius, ZIP code, or bounding box. 42 preloaded US metro areas available as presets.
|
|
14
|
+
- **Admin UI** -- Mountable admin interface with self-contained dark theme. Review candidates, approve/reject, bulk approve, trigger reclassification.
|
|
14
15
|
- **Keyword management** -- Admin UI for viewing, adding, toggling, and AI-generating search keywords per category.
|
|
15
16
|
- **Background jobs** -- Fetch, classify, and bulk approve jobs integrate with your existing queue (Solid Queue, Sidekiq, etc.).
|
|
16
|
-
- **Turbo Streams** -- Real-time updates when Turbo is present. Gracefully degrades without it.
|
|
17
|
-
- **
|
|
17
|
+
- **Turbo Streams** -- Real-time progress updates when Turbo is present. Gracefully degrades without it.
|
|
18
|
+
- **Instrumentation** -- ActiveSupport::Notifications events for run status changes, candidate approvals, and candidate rejections.
|
|
18
19
|
|
|
19
20
|
## Requirements
|
|
20
21
|
|
|
@@ -51,11 +52,20 @@ Edit `config/initializers/prospector.rb`:
|
|
|
51
52
|
|
|
52
53
|
```ruby
|
|
53
54
|
Prospector.configure do |config|
|
|
54
|
-
# Required: domain slug for keyword generation and classification context
|
|
55
|
+
# Required: domain slug for keyword generation and classification context.
|
|
55
56
|
config.domain = "motorcycle_services"
|
|
56
57
|
|
|
57
|
-
# Required:
|
|
58
|
-
|
|
58
|
+
# Required: admin authentication.
|
|
59
|
+
config.authenticate_admin_with do |controller|
|
|
60
|
+
controller.current_user&.admin?
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
# Classifier class (must be set before classification runs).
|
|
64
|
+
# Can be a class constant or a string for lazy resolution.
|
|
65
|
+
config.classifier = MotorcycleClassifier
|
|
66
|
+
# config.classifier = "MotorcycleClassifier"
|
|
67
|
+
|
|
68
|
+
# Called when an admin approves a candidate (optional).
|
|
59
69
|
# Runs AFTER the candidate status is committed (safe to enqueue jobs).
|
|
60
70
|
config.on_approve do |candidate|
|
|
61
71
|
data = candidate.normalized_data
|
|
@@ -67,26 +77,16 @@ Prospector.configure do |config|
|
|
|
67
77
|
zip_code: data["zip_code"],
|
|
68
78
|
phone_number: data["phone_number"],
|
|
69
79
|
website: data["website"],
|
|
80
|
+
email: candidate.email,
|
|
70
81
|
latitude: data["latitude"],
|
|
71
82
|
longitude: data["longitude"],
|
|
72
83
|
service_types: candidate.llm_categories,
|
|
73
|
-
|
|
84
|
+
facebook_url: candidate.facebook_url,
|
|
85
|
+
instagram_url: candidate.instagram_url
|
|
74
86
|
)
|
|
75
87
|
end
|
|
76
88
|
|
|
77
|
-
# Required: admin authentication.
|
|
78
|
-
# Receives the controller instance.
|
|
79
|
-
config.authenticate_admin_with do |controller|
|
|
80
|
-
controller.current_user&.admin?
|
|
81
|
-
end
|
|
82
|
-
|
|
83
|
-
# Required: classifier class (inherits LlmClassifier::Classifier).
|
|
84
|
-
# Defines categories, model, and classification rules for your domain.
|
|
85
|
-
# See "Defining a Classifier" below.
|
|
86
|
-
config.classifier = MotorcycleClassifier
|
|
87
|
-
|
|
88
89
|
# Optional: check for duplicates before creating candidates.
|
|
89
|
-
# Return true to skip the candidate.
|
|
90
90
|
config.duplicate_check do |source_uid:, name:, **|
|
|
91
91
|
Business.exists?(["import_metadata->>'place_id' = ?", source_uid])
|
|
92
92
|
end
|
|
@@ -94,7 +94,7 @@ Prospector.configure do |config|
|
|
|
94
94
|
# Optional: default source adapter (default: :google_places)
|
|
95
95
|
config.default_source = :google_places
|
|
96
96
|
|
|
97
|
-
# Optional: default AI model
|
|
97
|
+
# Optional: default AI model for classification and keyword generation
|
|
98
98
|
config.default_classifier_model = "anthropic:claude-sonnet-4-20250514"
|
|
99
99
|
|
|
100
100
|
# Optional: job queue name (default: :default)
|
|
@@ -102,6 +102,8 @@ Prospector.configure do |config|
|
|
|
102
102
|
end
|
|
103
103
|
```
|
|
104
104
|
|
|
105
|
+
**Note:** `validate!` runs only in production. In development, a missing `domain` or `authenticate_admin_with` setting causes runtime errors rather than startup failures.
|
|
106
|
+
|
|
105
107
|
Set the required environment variables:
|
|
106
108
|
|
|
107
109
|
```bash
|
|
@@ -116,20 +118,21 @@ ANTHROPIC_API_KEY=your_anthropic_api_key # or whatever ruby_llm needs
|
|
|
116
118
|
Visit `/prospector` in your browser. The admin interface provides:
|
|
117
119
|
|
|
118
120
|
- **Runs** -- Create, monitor, retry, restart, and cancel discovery runs
|
|
119
|
-
- **Candidates** -- Review discovered businesses, approve or reject individually or in bulk
|
|
121
|
+
- **Candidates** -- Review discovered businesses with contact info, approve or reject individually or in bulk
|
|
120
122
|
- **Keywords** -- View, add, toggle, and AI-generate search keywords per category
|
|
121
123
|
|
|
122
124
|
### Creating a Run
|
|
123
125
|
|
|
124
126
|
1. Click "New Run" in the admin UI
|
|
125
127
|
2. Select a geography type (metro area, city, coordinates, ZIP code, or bounding box)
|
|
126
|
-
3.
|
|
127
|
-
4.
|
|
128
|
+
3. For metro areas, optionally select from 42 preloaded US metros or enter a custom one
|
|
129
|
+
4. Fill in the geography details
|
|
130
|
+
5. Click "Start Run"
|
|
128
131
|
|
|
129
132
|
The run progresses through these stages automatically:
|
|
130
133
|
|
|
131
134
|
```
|
|
132
|
-
pending -> running (
|
|
135
|
+
pending -> running (fetch + enrich) -> classifying (AI) -> completed
|
|
133
136
|
```
|
|
134
137
|
|
|
135
138
|
### Defining a Classifier
|
|
@@ -145,7 +148,7 @@ class MotorcycleClassifier < LlmClassifier::Classifier
|
|
|
145
148
|
|
|
146
149
|
model "anthropic:claude-sonnet-4-20250514"
|
|
147
150
|
multi_label true
|
|
148
|
-
require_categories true
|
|
151
|
+
require_categories true
|
|
149
152
|
|
|
150
153
|
system_prompt <<~PROMPT
|
|
151
154
|
You are classifying businesses for the motorcycle services domain.
|
|
@@ -164,13 +167,7 @@ class MotorcycleClassifier < LlmClassifier::Classifier
|
|
|
164
167
|
end
|
|
165
168
|
```
|
|
166
169
|
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
```ruby
|
|
170
|
-
config.classifier = MotorcycleClassifier
|
|
171
|
-
```
|
|
172
|
-
|
|
173
|
-
The classifier receives a hash with `name`, `address`, `website`, `description`, and `source_types` keys. It returns an `LlmClassifier::Result` with `.categories`, `.confidence`, and `.reasoning`.
|
|
170
|
+
The classifier receives a hash with the keys `name`, `address`, `website`, `description`, and `source_types` (a comma-joined string).
|
|
174
171
|
|
|
175
172
|
### Programmatic Usage
|
|
176
173
|
|
|
@@ -195,13 +192,19 @@ Prospector::Keyword.create!(
|
|
|
195
192
|
# Approve a candidate programmatically
|
|
196
193
|
candidate = Prospector::Candidate.find(id)
|
|
197
194
|
candidate.approve! # fires the on_approve callback
|
|
195
|
+
|
|
196
|
+
# Access enriched contact info
|
|
197
|
+
candidate.email # => "info@example.com"
|
|
198
|
+
candidate.facebook_url # => "https://facebook.com/example"
|
|
199
|
+
candidate.instagram_url # => "https://instagram.com/example"
|
|
198
200
|
```
|
|
199
201
|
|
|
200
202
|
### Geography Types
|
|
201
203
|
|
|
202
204
|
```ruby
|
|
203
|
-
# Metro area (text search)
|
|
204
|
-
Prospector::Geography::MetroArea.new(name: "San Francisco", primary_state: "CA")
|
|
205
|
+
# Metro area (text search) -- 42 US metros available as presets
|
|
206
|
+
Prospector::Geography::MetroArea.new(name: "San Francisco Bay Area", primary_state: "CA")
|
|
207
|
+
Prospector::Geography::MetroArea::PRELOADED # => [{name: "Atlanta", primary_state: "GA"}, ...]
|
|
205
208
|
|
|
206
209
|
# City (text search)
|
|
207
210
|
Prospector::Geography::City.new(city: "Austin", state: "TX")
|
|
@@ -226,8 +229,6 @@ class MyApp::YelpAdapter < Prospector::Sources::Base
|
|
|
226
229
|
def self.adapter_key = "yelp"
|
|
227
230
|
|
|
228
231
|
def fetch(geography:, keywords:)
|
|
229
|
-
# Call the Yelp API for each keyword + geography combination.
|
|
230
|
-
# Return an Array of Prospector::Sources::Result.
|
|
231
232
|
keywords.flat_map do |keyword|
|
|
232
233
|
search_yelp(keyword, geography).map do |biz|
|
|
233
234
|
Prospector::Sources::Result.new(
|
|
@@ -260,8 +261,6 @@ Prospector.configure do |config|
|
|
|
260
261
|
end
|
|
261
262
|
```
|
|
262
263
|
|
|
263
|
-
Then select "Yelp" as the source when creating a run.
|
|
264
|
-
|
|
265
264
|
## Database Tables
|
|
266
265
|
|
|
267
266
|
Prospector creates four tables, all prefixed with `prospector_`:
|
|
@@ -269,7 +268,7 @@ Prospector creates four tables, all prefixed with `prospector_`:
|
|
|
269
268
|
| Table | Purpose |
|
|
270
269
|
|-------|---------|
|
|
271
270
|
| `prospector_runs` | Discovery runs with status, geography, and progress counters |
|
|
272
|
-
| `prospector_candidates` | Discovered businesses pending review |
|
|
271
|
+
| `prospector_candidates` | Discovered businesses with contact info (email, social links) pending review |
|
|
273
272
|
| `prospector_classification_runs` | Tracks AI reclassification operations |
|
|
274
273
|
| `prospector_keywords` | Stored search keywords per domain and category |
|
|
275
274
|
|
|
@@ -284,33 +283,51 @@ Prospector::Run (state machine)
|
|
|
284
283
|
| |-- Pipeline::Orchestrator
|
|
285
284
|
| | |-- Pipeline::Normalizer (address parsing)
|
|
286
285
|
| | |-- Duplicate checking (via config.duplicate_check)
|
|
287
|
-
| |
|
|
286
|
+
| | |-- Creates Prospector::Candidate records
|
|
287
|
+
| | '-- Enrichment::ContactScraper (email + social links)
|
|
288
288
|
| '-- Enqueues ClassifyJob
|
|
289
289
|
|
|
|
290
290
|
|-- ClassifyJob
|
|
291
291
|
| '-- Classification::Runner
|
|
292
|
-
| |-- Calls LLM via
|
|
292
|
+
| |-- Calls LLM via llm_classifier
|
|
293
293
|
| |-- Stores categories, confidence, reasoning in metadata
|
|
294
294
|
| '-- Auto-rejects candidates with no relevant categories
|
|
295
295
|
|
|
|
296
296
|
'-- Admin UI
|
|
297
|
-
|--
|
|
298
|
-
|--
|
|
299
|
-
|
|
297
|
+
|-- Runs: create, monitor, retry, restart, cancel
|
|
298
|
+
|-- Candidates: review, approve, reject, restore, bulk approve
|
|
299
|
+
|-- Keywords: view, add, toggle active, AI-generate
|
|
300
|
+
'-- Reclassification with model override
|
|
300
301
|
```
|
|
301
302
|
|
|
302
303
|
## Run States
|
|
303
304
|
|
|
304
305
|
```
|
|
305
306
|
pending --> running --> classifying --> completed
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
^
|
|
310
|
-
|
|
|
311
|
-
|
|
307
|
+
| | |
|
|
308
|
+
v v v
|
|
309
|
+
failed cancelled
|
|
310
|
+
^ ^
|
|
311
|
+
|____________|
|
|
312
|
+
(retry/restart)
|
|
312
313
|
```
|
|
313
314
|
|
|
315
|
+
- `cancellable?` -- pending, running, or classifying
|
|
316
|
+
- `retryable?` -- failed only
|
|
317
|
+
- `restartable?` -- completed, failed, or cancelled
|
|
318
|
+
- `restart!` destroys pending/rejected candidates and resets all counters
|
|
319
|
+
- `reset_for_retry!` resets status and error fields only
|
|
320
|
+
|
|
321
|
+
## Instrumentation
|
|
322
|
+
|
|
323
|
+
Prospector emits ActiveSupport::Notifications events:
|
|
324
|
+
|
|
325
|
+
| Event | Payload |
|
|
326
|
+
|-------|---------|
|
|
327
|
+
| `prospector.run.status_changed` | `run`, `status`, `previous_status` |
|
|
328
|
+
| `prospector.candidate.approved` | `candidate`, `record` |
|
|
329
|
+
| `prospector.candidate.rejected` | `candidate`, `reason` |
|
|
330
|
+
|
|
314
331
|
## Development
|
|
315
332
|
|
|
316
333
|
The gem ships with a devcontainer for containerized development:
|
|
@@ -320,10 +337,10 @@ The gem ships with a devcontainer for containerized development:
|
|
|
320
337
|
docker compose -f .devcontainer/compose.yaml up -d --build
|
|
321
338
|
|
|
322
339
|
# Run tests (inside the container)
|
|
323
|
-
docker exec -w /workspaces/
|
|
340
|
+
docker exec -w /workspaces/prospector_engine prospector_engine-app-1 bundle exec rake test
|
|
324
341
|
|
|
325
342
|
# Run a single test file
|
|
326
|
-
docker exec -w /workspaces/
|
|
343
|
+
docker exec -w /workspaces/prospector_engine prospector_engine-app-1 bundle exec ruby -Itest test/models/run_test.rb
|
|
327
344
|
```
|
|
328
345
|
|
|
329
346
|
The test suite uses a dummy Rails app (`test/dummy/`) with PostgreSQL.
|
data/lib/prospector/engine.rb
CHANGED
|
@@ -2,8 +2,16 @@ module Prospector
|
|
|
2
2
|
class Engine < ::Rails::Engine
|
|
3
3
|
isolate_namespace Prospector
|
|
4
4
|
|
|
5
|
-
|
|
6
|
-
|
|
5
|
+
initializer "prospector.autoload", before: :set_autoload_paths do |app|
|
|
6
|
+
lib_path = root.join("lib")
|
|
7
|
+
app.config.autoload_paths << lib_path
|
|
8
|
+
app.config.eager_load_paths << lib_path
|
|
9
|
+
|
|
10
|
+
Rails.autoloaders.main.ignore(
|
|
11
|
+
root.join("lib/prospector/version.rb"),
|
|
12
|
+
root.join("lib/generators")
|
|
13
|
+
)
|
|
14
|
+
end
|
|
7
15
|
|
|
8
16
|
initializer "prospector.configuration" do
|
|
9
17
|
Prospector.config.validate! if Rails.env.production?
|
data/lib/prospector/version.rb
CHANGED