prospector_engine 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/MIT-LICENSE +20 -0
- data/README.md +333 -0
- data/Rakefile +9 -0
- data/app/CLAUDE.md +43 -0
- data/app/assets/stylesheets/prospector/application.css +476 -0
- data/app/controllers/prospector/application_controller.rb +16 -0
- data/app/controllers/prospector/candidates_controller.rb +31 -0
- data/app/controllers/prospector/keyword_generations_controller.rb +10 -0
- data/app/controllers/prospector/keywords_controller.rb +38 -0
- data/app/controllers/prospector/run_bulk_approvals_controller.rb +13 -0
- data/app/controllers/prospector/run_cancellations_controller.rb +9 -0
- data/app/controllers/prospector/run_reclassifications_controller.rb +21 -0
- data/app/controllers/prospector/run_restarts_controller.rb +14 -0
- data/app/controllers/prospector/run_retries_controller.rb +14 -0
- data/app/controllers/prospector/runs_controller.rb +47 -0
- data/app/jobs/prospector/application_job.rb +5 -0
- data/app/jobs/prospector/bulk_approve_job.rb +14 -0
- data/app/jobs/prospector/classify_job.rb +17 -0
- data/app/jobs/prospector/fetch_job.rb +8 -0
- data/app/models/prospector/application_record.rb +6 -0
- data/app/models/prospector/candidate.rb +93 -0
- data/app/models/prospector/classification_run.rb +15 -0
- data/app/models/prospector/keyword.rb +16 -0
- data/app/models/prospector/run.rb +94 -0
- data/app/views/prospector/candidates/show.html.erb +63 -0
- data/app/views/prospector/keywords/index.html.erb +72 -0
- data/app/views/prospector/layouts/prospector.html.erb +38 -0
- data/app/views/prospector/runs/index.html.erb +33 -0
- data/app/views/prospector/runs/new.html.erb +109 -0
- data/app/views/prospector/runs/show.html.erb +111 -0
- data/config/routes.rb +15 -0
- data/db/prospector_schema.rb +81 -0
- data/lib/generators/prospector/install/install_generator.rb +31 -0
- data/lib/generators/prospector/install/templates/create_prospector_tables.rb +83 -0
- data/lib/generators/prospector/install/templates/prospector.rb +37 -0
- data/lib/prospector/CLAUDE.md +52 -0
- data/lib/prospector/classification/runner.rb +105 -0
- data/lib/prospector/configuration.rb +56 -0
- data/lib/prospector/engine.rb +18 -0
- data/lib/prospector/enrichment/contact_scraper.rb +188 -0
- data/lib/prospector/error.rb +8 -0
- data/lib/prospector/geography/base.rb +40 -0
- data/lib/prospector/geography/bounding_box.rb +58 -0
- data/lib/prospector/geography/city.rb +29 -0
- data/lib/prospector/geography/coordinates.rb +43 -0
- data/lib/prospector/geography/metro_area.rb +74 -0
- data/lib/prospector/geography/zip_code.rb +25 -0
- data/lib/prospector/keywords/generator.rb +74 -0
- data/lib/prospector/pipeline/normalizer.rb +57 -0
- data/lib/prospector/pipeline/orchestrator.rb +151 -0
- data/lib/prospector/sources/base.rb +13 -0
- data/lib/prospector/sources/google_places/adapter.rb +92 -0
- data/lib/prospector/sources/google_places/client.rb +58 -0
- data/lib/prospector/sources/google_places/us_address_validator.rb +24 -0
- data/lib/prospector/sources/result.rb +21 -0
- data/lib/prospector/version.rb +3 -0
- data/lib/prospector.rb +20 -0
- metadata +185 -0
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
require "httparty"
|
|
2
|
+
require "ipaddr"
|
|
3
|
+
|
|
4
|
+
module Prospector
|
|
5
|
+
module Enrichment
|
|
6
|
+
class ContactScraper
|
|
7
|
+
include HTTParty
|
|
8
|
+
|
|
9
|
+
SOCIAL_PATTERNS = {
|
|
10
|
+
facebook_url: %r{https?://(?:www\.)?facebook\.com/(?!(?:sharer|share)(?:/|\?|$))[^"'\s<>]{1,200}}i,
|
|
11
|
+
instagram_url: %r{https?://(?:www\.)?instagram\.com/[^"'\s<>]{1,200}}i,
|
|
12
|
+
linkedin_url: %r{https?://(?:www\.)?linkedin\.com/[^"'\s<>]{1,200}}i,
|
|
13
|
+
tiktok_url: %r{https?://(?:www\.)?tiktok\.com/[^"'\s<>]{1,200}}i,
|
|
14
|
+
youtube_url: %r{https?://(?:www\.)?youtube\.com/[^"'\s<>]{1,200}}i
|
|
15
|
+
}.freeze
|
|
16
|
+
|
|
17
|
+
# Patterns that identify share/intent widgets rather than a business's own
# profile. The Facebook entries are anchored on a path boundary so that real
# pages whose slug merely starts with "share" (e.g. facebook.com/sharetea)
# are no longer thrown away as share links.
SHARE_PATTERNS = [
  %r{facebook\.com/sharer(?:\.php|[/?#]|\z)}i,
  %r{facebook\.com/share(?:\.php|[/?#]|\z)}i,
  %r{twitter\.com/intent}i,
  %r{linkedin\.com/shareArticle}i
].freeze
|
|
21
|
+
|
|
22
|
+
MAILTO_PATTERN = /mailto:([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})/i
|
|
23
|
+
EMAIL_PATTERN = /\b([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})\b/
|
|
24
|
+
|
|
25
|
+
FILTERED_EMAIL_PREFIXES = %w[
|
|
26
|
+
noreply no-reply donotreply do-not-reply
|
|
27
|
+
bounce mailer-daemon postmaster abuse spam
|
|
28
|
+
webmaster hostmaster admin administrator
|
|
29
|
+
unsubscribe notifications notification alerts
|
|
30
|
+
newsletter privacy security legal compliance dmca
|
|
31
|
+
].freeze
|
|
32
|
+
|
|
33
|
+
# Address ranges the scraper must never connect to (SSRF guard).
# In addition to the RFC 1918, loopback, link-local and IPv4-mapped ranges,
# this also covers carrier-grade NAT space (100.64.0.0/10), protocol and
# benchmark ranges, multicast, reserved space and the IPv6 unspecified and
# multicast ranges.
BLOCKED_IP_RANGES = %w[
  0.0.0.0/8
  10.0.0.0/8
  100.64.0.0/10
  127.0.0.0/8
  169.254.0.0/16
  172.16.0.0/12
  192.0.0.0/24
  192.168.0.0/16
  198.18.0.0/15
  224.0.0.0/4
  240.0.0.0/4
  ::/128
  ::1/128
  ::ffff:0:0/96
  fc00::/7
  fe80::/10
  ff00::/8
].map { |cidr| IPAddr.new(cidr) }.freeze
|
|
40
|
+
|
|
41
|
+
ALLOWED_SCHEMES = %w[http https].freeze
|
|
42
|
+
TIMEOUT_SECONDS = 10
|
|
43
|
+
MAX_REDIRECTS = 3
|
|
44
|
+
USER_AGENT = "Mozilla/5.0 (compatible; Prospector/1.0)"
|
|
45
|
+
|
|
46
|
+
attr_reader :result, :error
|
|
47
|
+
|
|
48
|
+
# Convenience entry point: builds a scraper for +url+ and runs it.
#
# @param url [String, nil] website address to inspect
# @return [ContactScraper] the scraper instance after #scrape has run
def self.scrape(url) = new(url).scrape
|
|
51
|
+
|
|
52
|
+
# Stores the raw address; nothing is validated or fetched until #scrape.
#
# @param url [String, nil] website address as supplied by the caller
def initialize(url)
  @url = url
end
|
|
55
|
+
|
|
56
|
+
# Runs the scrape: normalises the URL, validates and fetches it, and stores
# the extracted data in @result. A blank URL is a no-op (no result, no error).
#
# @return [ContactScraper] self, so callers can chain #success?, #result, #error
def scrape
  unless @url.blank?
    page = validate_and_fetch(normalize_url(@url))
    @result = extract_all(page) if page
  end

  self
end
|
|
63
|
+
|
|
64
|
+
# @return [Boolean] true when no error was recorded and something was extracted
def success?
  @error.nil? && @result.present?
end
|
|
65
|
+
|
|
66
|
+
private
|
|
67
|
+
|
|
68
|
+
# Applies the scheme and private-network checks before fetching anything.
#
# @param url [String] normalised URL
# @return [String, nil] the page body, or nil with @error set when a check fails
def validate_and_fetch(url)
  if !allowed_scheme?(url)
    set_error_nil("Invalid URL scheme")
  elsif !safe_url?(url)
    set_error_nil("URL targets a private network")
  else
    fetch_html(url)
  end
end
|
|
74
|
+
|
|
75
|
+
# Combines the social-link and email extractions into one hash.
# Email keys win on collision because they are merged last.
#
# @param html [String] page body
# @return [Hash{Symbol => String}]
def extract_all(html)
  social = extract_social_links(html)
  contact = extract_emails(html)
  social.merge(contact)
end
|
|
78
|
+
|
|
79
|
+
# Makes sure the address carries an explicit http(s) scheme.
#
# Surrounding whitespace is removed first; pasted values with a leading or
# trailing space previously became "https:// example.com", which cannot be
# parsed. Protocol-relative addresses ("//host/path") are completed with
# "https:" instead of being turned into "https:////host/path".
#
# @param url [String] raw address
# @return [String] address beginning with http:// or https://
def normalize_url(url)
  candidate = url.to_s.strip
  return candidate if candidate.match?(%r{\Ahttps?://}i)
  return "https:#{candidate}" if candidate.start_with?("//")

  "https://#{candidate}"
end
|
|
82
|
+
|
|
83
|
+
# @param url [String]
# @return [Boolean] true when the URL parses and its scheme is listed in
#   ALLOWED_SCHEMES (compared case-insensitively); false for unparsable URLs
def allowed_scheme?(url)
  scheme = URI.parse(url).scheme
  ALLOWED_SCHEMES.include?(scheme&.downcase)
rescue URI::InvalidURIError
  false
end
|
|
88
|
+
|
|
89
|
+
# SSRF guard: decides whether the URL's host resolves only to addresses
# outside BLOCKED_IP_RANGES.
#
# Every resolved address is checked, not just the first one, so a name with
# one public and one private record is rejected. IPv6 literals are unwrapped
# from their brackets before resolution, and an IPv6 zone suffix ("%eth0")
# is dropped before the range comparison. Any resolution or parsing failure
# is treated as unsafe.
#
# NOTE(review): the HTTP client resolves the host again when it connects, so
# this check alone does not rule out DNS rebinding.
#
# @param url [String]
# @return [Boolean]
def safe_url?(url)
  host = URI.parse(url).hostname.to_s
  return false if host.empty?

  addresses = Addrinfo.getaddrinfo(host, nil, nil, :STREAM).map(&:ip_address)
  return false if addresses.empty?

  addresses.none? do |address|
    ip = IPAddr.new(address.split("%").first)
    BLOCKED_IP_RANGES.any? { |range| range.include?(ip) }
  end
rescue SocketError, URI::InvalidURIError, IPAddr::InvalidAddressError
  false
end
|
|
95
|
+
|
|
96
|
+
# Fetches +url+ and returns its body on HTTP 200; any other status is handed
# to #handle_non_ok. Any StandardError raised along the way is recorded in
# @error and turned into nil.
#
# @param url [String]
# @param redirects [Integer] redirect hops still allowed
# @return [String, nil]
def fetch_html(url, redirects = MAX_REDIRECTS)
  response = make_request(url)

  if response.code == 200
    response.body
  else
    handle_non_ok(response, redirects, url)
  end
rescue StandardError => e
  set_error_nil(e.message)
end
|
|
104
|
+
|
|
105
|
+
# Issues one GET through the class-level HTTParty client. Redirects are not
# followed automatically, so each hop can be checked by #follow_redirect.
#
# @param url [String]
# @return [Object] whatever the class-level +get+ returns
def make_request(url)
  options = {
    headers: { "User-Agent" => USER_AGENT },
    timeout: TIMEOUT_SECONDS,
    follow_redirects: false
  }

  self.class.get(url, options)
end
|
|
108
|
+
|
|
109
|
+
# Handles every non-200 response: redirects are followed, anything else is
# recorded as "HTTP <code>".
#
# @return [String, nil] body of the redirect target, or nil on error
def handle_non_ok(response, redirects, url)
  if redirect?(response)
    follow_redirect(response, redirects, url)
  else
    set_error_nil("HTTP #{response.code}")
  end
end
|
|
114
|
+
|
|
115
|
+
# @return [Boolean] true for a 3xx status that carries a Location header
def redirect?(response)
  status = response.code
  return false unless status.between?(300, 399)

  response.headers["location"].present?
end
|
|
118
|
+
|
|
119
|
+
# Follows one redirect hop after re-validating the target.
#
# The target must resolve against the current URL, use an allowed scheme and
# pass the private-network check. The scheme check was previously applied
# only to the initial URL, so a redirect to another scheme reached the HTTP
# client before being rejected.
#
# @param response [#headers] the 3xx response
# @param remaining [Integer] redirect hops still allowed
# @param original_url [String] URL that produced +response+
# @return [String, nil] body of the target, or nil with @error set
def follow_redirect(response, remaining, original_url)
  return set_error_nil("Too many redirects") if remaining <= 0

  location = resolve_redirect(response.headers["location"], original_url)
  unless location && allowed_scheme?(location) && safe_url?(location)
    return set_error_nil("Invalid redirect")
  end

  fetch_html(location, remaining - 1)
end
|
|
127
|
+
|
|
128
|
+
# Resolves a Location header (absolute or relative) against the URL that
# produced it.
#
# @param location [String] Location header value
# @param base [String] URL of the redirecting response
# @return [String, nil] absolute URL, or nil when the two cannot be joined
def resolve_redirect(location, base)
  target = URI.join(base, location)
  target.to_s
rescue URI::BadURIError, URI::InvalidURIError
  nil
end
|
|
133
|
+
|
|
134
|
+
# Finds the first usable profile link for each network in SOCIAL_PATTERNS.
#
# @param html [String, nil] page body
# @return [Hash{Symbol => String, nil}] keyed by the SOCIAL_PATTERNS key;
#   networks with no match are omitted
def extract_social_links(html)
  return {} if html.blank?

  links = {}
  SOCIAL_PATTERNS.each do |network, pattern|
    found = first_valid_match(html, pattern)
    links[network] = clean_url(found) if found
  end
  links
end
|
|
142
|
+
|
|
143
|
+
# Returns the first occurrence of +pattern+ in document order that is not a
# share/intent link.
#
# @param html [String]
# @param pattern [Regexp] expected to have no capture groups, so that
#   String#scan yields plain strings
# @return [String, nil]
def first_valid_match(html, pattern)
  html.scan(pattern).find { |candidate| !share_intent?(candidate) }
end
|
|
146
|
+
|
|
147
|
+
# @param url [String]
# @return [Boolean] true when the URL matches any entry in SHARE_PATTERNS
def share_intent?(url)
  SHARE_PATTERNS.any? do |share_pattern|
    url.match?(share_pattern)
  end
end
|
|
150
|
+
|
|
151
|
+
# Wraps the chosen contact address in the result-hash shape.
#
# @param html [String, nil] page body
# @return [Hash] +{ email: "..." }+ when an address is found, otherwise +{}+
def extract_emails(html)
  return {} if html.blank?

  contact = find_contact_email(html)
  return {} unless contact

  { email: contact }
end
|
|
157
|
+
|
|
158
|
+
# Picks the best contact address on the page (lower-cased).
#
# mailto: links are preferred over addresses that merely appear in the text.
# The plain-text scan now also runs when mailto: links exist but every one
# of them is filtered out (for example only noreply@); previously that case
# returned nil even though a usable address was present in the text.
#
# @param html [String]
# @return [String, nil]
def find_contact_email(html)
  [MAILTO_PATTERN, EMAIL_PATTERN].each do |pattern|
    candidates = html.scan(pattern).flatten.map(&:downcase).uniq
    chosen = candidates.find { |candidate| valid_contact_email?(candidate) }
    return chosen if chosen
  end

  nil
end
|
|
163
|
+
|
|
164
|
+
# Decides whether an extracted address is worth keeping as a contact.
#
# Rejects:
# * strings without both a local part and a domain;
# * asset file names that look like addresses, such as "logo@2x.png", which
#   the plain-text email pattern matches on pages with retina images;
# * local parts starting with any entry in FILTERED_EMAIL_PREFIXES.
#
# @param email [String] lower-cased candidate
# @return [Boolean]
def valid_contact_email?(email)
  local_part, domain = email.split("@", 2)
  return false if local_part.to_s.empty? || domain.to_s.empty?
  return false if domain.match?(/\.(?:png|jpe?g|gif|svg|webp|bmp|ico|css|js)\z/i)

  FILTERED_EMAIL_PREFIXES.none? { |prefix| local_part.start_with?(prefix) }
end
|
|
167
|
+
|
|
168
|
+
# Trims anything from the first quote or ">" onwards and confirms that what
# remains parses as a URI.
#
# @param url [String, nil]
# @return [String, nil] the trimmed URL, or nil when blank or unparsable
def clean_url(url)
  return nil if url.blank?

  trimmed = url.gsub(/["'>].*$/, "").strip
  URI.parse(trimmed) # raises for malformed input; the value itself is unused
  trimmed
rescue URI::InvalidURIError
  nil
end
|
|
176
|
+
|
|
177
|
+
# Records a failure message.
#
# @param msg [String]
# @return [ContactScraper] self
def set_error(msg)
  tap { @error = msg }
end
|
|
181
|
+
|
|
182
|
+
# Records a failure message and returns nil, for use where a nil result
# signals "nothing fetched".
#
# @param msg [String]
# @return [nil]
def set_error_nil(msg)
  set_error(msg)
  nil
end
|
|
186
|
+
end
|
|
187
|
+
end
|
|
188
|
+
end
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
module Prospector
  # Root of the engine's exception hierarchy; rescuing it catches every
  # error class declared below.
  class Error < StandardError
  end

  # Direct child of Error.
  class ConfigurationError < Error
  end

  # Parent of the adapter-related errors below.
  class AdapterError < Error
  end

  # Specialisation of AdapterError.
  class AdapterNotFoundError < AdapterError
  end

  # Specialisation of AdapterError.
  class MissingApiKeyError < AdapterError
  end

  # Direct child of Error.
  class ClassificationError < Error
  end
end
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
module Prospector
|
|
2
|
+
module Geography
|
|
3
|
+
# Abstract parent of the geography value objects. Subclasses supply #type,
# #to_query_string and #to_h, and may override #to_coordinate_restriction
# and #label.
class Base
  TYPES = %w[metro_area city coordinates zip_code bounding_box].freeze

  # Rebuilds a geography from its hash form by dispatching on the "type"
  # entry (string or symbol key) to the matching subclass's .from_h.
  #
  # @param data [Hash]
  # @return [Object] whatever the subclass's .from_h returns
  # @raise [ArgumentError] when the type is missing or unknown
  def self.from_h(data)
    type = data["type"] || data[:type]

    geography_class =
      case type.to_s
      when "metro_area"
        MetroArea
      when "city"
        City
      when "coordinates"
        Coordinates
      when "zip_code"
        ZipCode
      when "bounding_box"
        BoundingBox
      else
        raise ArgumentError, "Unknown geography type: #{type}"
      end

    geography_class.from_h(data)
  end

  # @return [String] one of TYPES (subclass responsibility)
  def type
    raise NotImplementedError
  end

  # @return [String, nil] text form of the area (subclass responsibility)
  def to_query_string
    raise NotImplementedError
  end

  # @return [Hash, nil] nil unless a subclass overrides it
  def to_coordinate_restriction
    nil
  end

  # Human-readable description: the query string when there is one,
  # otherwise the type followed by the inspected hash form.
  #
  # @return [String]
  def label
    query = to_query_string
    return query if query

    "#{type}: #{to_h.inspect}"
  end

  # @return [Hash] serialisable attributes (subclass responsibility)
  def to_h
    raise NotImplementedError
  end
end
|
|
39
|
+
end
|
|
40
|
+
end
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
module Prospector
|
|
2
|
+
module Geography
|
|
3
|
+
class BoundingBox < Base
|
|
4
|
+
attr_reader :ne_lat, :ne_lng, :sw_lat, :sw_lng
|
|
5
|
+
|
|
6
|
+
# Stores the four corner coordinates, each coerced with #to_f.
#
# @param ne_lat [#to_f] latitude of the north-east corner
# @param ne_lng [#to_f] longitude of the north-east corner
# @param sw_lat [#to_f] latitude of the south-west corner
# @param sw_lng [#to_f] longitude of the south-west corner
def initialize(ne_lat:, ne_lng:, sw_lat:, sw_lng:)
  @ne_lat, @ne_lng, @sw_lat, @sw_lng = [ne_lat, ne_lng, sw_lat, sw_lng].map(&:to_f)
end
|
|
12
|
+
|
|
13
|
+
# Builds a box from a hash; each corner is looked up by string key first,
# then by symbol key.
#
# @param data [Hash]
# @return [BoundingBox]
def self.from_h(data)
  corners = %i[ne_lat ne_lng sw_lat sw_lng].to_h do |corner|
    [corner, data[corner.to_s] || data[corner]]
  end

  new(**corners)
end
|
|
21
|
+
|
|
22
|
+
# @return [String] the type tag for this geography
def type
  "bounding_box"
end
|
|
23
|
+
|
|
24
|
+
# A box has no text form, so this is always nil.
#
# @return [nil]
def to_query_string
  nil
end
|
|
25
|
+
|
|
26
|
+
# Approximates the box with a circle centred on the box and reaching its
# corners (half the corner-to-corner distance), capped at 50 km.
#
# A box whose south-west longitude is greater than its north-east longitude
# crosses the ±180° meridian. The plain average of the two longitudes then
# lands on the opposite side of the globe, so the centre is shifted by 180°
# and wrapped back into range.
#
# @return [Hash] +{ circle: { center: { latitude:, longitude: }, radius: } }+
def to_coordinate_restriction
  center_lat = (ne_lat + sw_lat) / 2.0
  center_lng = (ne_lng + sw_lng) / 2.0

  if sw_lng > ne_lng
    center_lng += 180.0
    center_lng -= 360.0 if center_lng > 180.0
  end

  radius = haversine_distance(sw_lat, sw_lng, ne_lat, ne_lng) / 2.0

  {
    circle: {
      center: { latitude: center_lat, longitude: center_lng },
      radius: [radius, 50_000.0].min
    }
  }
end
|
|
38
|
+
|
|
39
|
+
# @return [String] both corners, south-west first
def label
  format("Box (%s,%s) to (%s,%s)", sw_lat, sw_lng, ne_lat, ne_lng)
end
|
|
40
|
+
|
|
41
|
+
# @return [Hash{String => Float}] the four corners under string keys
def to_h
  {
    "ne_lat" => ne_lat,
    "ne_lng" => ne_lng,
    "sw_lat" => sw_lat,
    "sw_lng" => sw_lng
  }
end
|
|
44
|
+
|
|
45
|
+
private
|
|
46
|
+
|
|
47
|
+
# Great-circle distance between two points using the haversine formula on a
# sphere of radius 6,371 km.
#
# @param lat1 [Float] degrees
# @param lng1 [Float] degrees
# @param lat2 [Float] degrees
# @param lng2 [Float] degrees
# @return [Float] distance in metres
def haversine_distance(lat1, lng1, lat2, lng2)
  earth_radius = 6_371_000
  d_lat = to_rad(lat2 - lat1)
  d_lng = to_rad(lng2 - lng1)

  a = Math.sin(d_lat / 2)**2 +
      Math.cos(to_rad(lat1)) * Math.cos(to_rad(lat2)) * Math.sin(d_lng / 2)**2
  # Rounding can push `a` slightly outside [0, 1] for near-antipodal points,
  # which would make Math.sqrt(1 - a) raise Math::DomainError.
  a = a.clamp(0.0, 1.0)

  earth_radius * 2 * Math.atan2(Math.sqrt(a), Math.sqrt(1 - a))
end
|
|
54
|
+
|
|
55
|
+
# @param deg [Numeric] angle in degrees
# @return [Float] the same angle in radians
def to_rad(deg)
  (deg * Math::PI) / 180
end
|
|
56
|
+
end
|
|
57
|
+
end
|
|
58
|
+
end
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
module Prospector
|
|
2
|
+
module Geography
|
|
3
|
+
# Geography identified by a city name and a state, rendered as "City, ST".
class City < Base
  attr_reader :city, :state

  # Builds a City from a hash; each attribute is looked up by string key
  # first, then by symbol key.
  #
  # @param data [Hash]
  # @return [City]
  def self.from_h(data)
    city, state = %w[city state].map { |key| data[key] || data[key.to_sym] }
    new(city: city, state: state)
  end

  # @param city [String]
  # @param state [String]
  def initialize(city:, state:)
    @city = city
    @state = state
  end

  # @return [String] the type tag for this geography
  def type
    "city"
  end

  # @return [String] "<city>, <state>"
  def to_query_string
    "#{city}, #{state}"
  end

  # @return [String] same text as #to_query_string
  def label
    to_query_string
  end

  # @return [Hash{String => String}]
  def to_h
    {
      "city" => city,
      "state" => state
    }
  end
end
|
|
28
|
+
end
|
|
29
|
+
end
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
module Prospector
|
|
2
|
+
module Geography
|
|
3
|
+
# Circular search area described by a centre point and a radius in metres.
class Coordinates < Base
  # Radius used when the caller does not supply one.
  DEFAULT_RADIUS_METERS = 10_000

  # Largest radius emitted in a coordinate restriction. This mirrors the
  # 50_000.0 cap that BoundingBox#to_coordinate_restriction applies to its
  # circle.
  # NOTE(review): presumably the downstream places API rejects larger
  # values; confirm against its documentation.
  MAX_RESTRICTION_RADIUS_METERS = 50_000.0

  attr_reader :lat, :lng, :radius_meters

  # @param lat [#to_f] latitude of the centre
  # @param lng [#to_f] longitude of the centre
  # @param radius_meters [#to_f] requested radius, stored as given
  def initialize(lat:, lng:, radius_meters: DEFAULT_RADIUS_METERS)
    @lat = lat.to_f
    @lng = lng.to_f
    @radius_meters = radius_meters.to_f
  end

  # Builds an instance from a hash; each attribute is looked up by string
  # key first, then by symbol key. A missing radius falls back to the
  # default.
  #
  # @param data [Hash]
  # @return [Coordinates]
  def self.from_h(data)
    new(
      lat: data["lat"] || data[:lat],
      lng: data["lng"] || data[:lng],
      radius_meters: data["radius_meters"] || data[:radius_meters] || DEFAULT_RADIUS_METERS
    )
  end

  def type = "coordinates"

  # A point and radius has no text form, so this is always nil.
  def to_query_string = nil

  # Circle restriction for this area. The radius is clamped to
  # 0..MAX_RESTRICTION_RADIUS_METERS. It used to be passed through uncapped,
  # unlike the circle built by BoundingBox. #radius_meters, #label and #to_h
  # still report the requested value.
  #
  # @return [Hash] +{ circle: { center: { latitude:, longitude: }, radius: } }+
  def to_coordinate_restriction
    {
      circle: {
        center: { latitude: lat, longitude: lng },
        radius: radius_meters.clamp(0.0, MAX_RESTRICTION_RADIUS_METERS)
      }
    }
  end

  # @return [String] centre and radius in kilometres, rounded to one decimal
  def label
    radius_km = (radius_meters / 1000).round(1)
    "#{lat}, #{lng} (#{radius_km}km radius)"
  end

  # @return [Hash{String => Float}]
  def to_h
    { "lat" => lat, "lng" => lng, "radius_meters" => radius_meters }
  end
end
|
|
42
|
+
end
|
|
43
|
+
end
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
module Prospector
|
|
2
|
+
module Geography
|
|
3
|
+
# Geography identified by a metro-area name and its primary state.
class MetroArea < Base
  # Built-in list of metro areas, each as +{ name:, primary_state: }+.
  PRELOADED = [
    ["Atlanta", "GA"],
    ["Austin", "TX"],
    ["Baltimore", "MD"],
    ["Boston", "MA"],
    ["Charlotte", "NC"],
    ["Chicago", "IL"],
    ["Cincinnati", "OH"],
    ["Cleveland", "OH"],
    ["Columbus", "OH"],
    ["Dallas-Fort Worth", "TX"],
    ["Denver", "CO"],
    ["Detroit", "MI"],
    ["Houston", "TX"],
    ["Indianapolis", "IN"],
    ["Jacksonville", "FL"],
    ["Kansas City", "MO"],
    ["Las Vegas", "NV"],
    ["Los Angeles", "CA"],
    ["Memphis", "TN"],
    ["Miami", "FL"],
    ["Milwaukee", "WI"],
    ["Minneapolis-St. Paul", "MN"],
    ["Nashville", "TN"],
    ["New Orleans", "LA"],
    ["New York", "NY"],
    ["Oklahoma City", "OK"],
    ["Orlando", "FL"],
    ["Philadelphia", "PA"],
    ["Phoenix", "AZ"],
    ["Pittsburgh", "PA"],
    ["Portland", "OR"],
    ["Raleigh-Durham", "NC"],
    ["Richmond", "VA"],
    ["Sacramento", "CA"],
    ["Salt Lake City", "UT"],
    ["San Antonio", "TX"],
    ["San Diego", "CA"],
    ["San Francisco Bay Area", "CA"],
    ["Seattle", "WA"],
    ["St. Louis", "MO"],
    ["Tampa Bay", "FL"],
    ["Washington D.C.", "DC"]
  ].map { |name, primary_state| { name: name, primary_state: primary_state } }.freeze

  attr_reader :name, :primary_state

  # Builds a MetroArea from a hash; each attribute is looked up by string
  # key first, then by symbol key.
  #
  # @param data [Hash]
  # @return [MetroArea]
  def self.from_h(data)
    name, primary_state = %w[name primary_state].map { |key| data[key] || data[key.to_sym] }
    new(name: name, primary_state: primary_state)
  end

  # @param name [String]
  # @param primary_state [String]
  def initialize(name:, primary_state:)
    @name = name
    @primary_state = primary_state
  end

  # @return [String] the type tag for this geography
  def type
    "metro_area"
  end

  # @return [String] "<name>, <primary_state>"
  def to_query_string
    "#{name}, #{primary_state}"
  end

  # @return [String] same text as #to_query_string
  def label
    to_query_string
  end

  # @return [Hash{String => String}]
  def to_h
    {
      "name" => name,
      "primary_state" => primary_state
    }
  end
end
|
|
73
|
+
end
|
|
74
|
+
end
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
module Prospector
|
|
2
|
+
module Geography
|
|
3
|
+
# Geography identified by a ZIP code, which is always stored as a String.
class ZipCode < Base
  attr_reader :zip

  # Builds a ZipCode from a hash; the value is looked up under "zip" first,
  # then under :zip.
  #
  # @param data [Hash]
  # @return [ZipCode]
  def self.from_h(data)
    zip = data["zip"] || data[:zip]
    new(zip: zip)
  end

  # @param zip [#to_s]
  def initialize(zip:)
    @zip = zip.to_s
  end

  # @return [String] the type tag for this geography
  def type
    "zip_code"
  end

  # @return [String] the ZIP code itself
  def to_query_string
    zip
  end

  # @return [String] "ZIP <zip>"
  def label
    "ZIP #{zip}"
  end

  # @return [Hash{String => String}]
  def to_h
    { "zip" => zip }
  end
end
|
|
24
|
+
end
|
|
25
|
+
end
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
module Prospector
|
|
2
|
+
module Keywords
|
|
3
|
+
# Produces search keywords for a domain's categories. Stored keywords are
# reused when present; otherwise they are generated with an LLM and persisted.
class Generator
  # Leading list decoration ("1.", "2)", "-", "*", "+") that a model may emit
  # even though the prompt asks for bare lines.
  LIST_MARKER = /\A(?:[-*+]|\d+[.)])\s+/

  # @param domain [String]
  # @param categories [String, Array<String>]
  # @return [Array] keywords for every category, in category order
  def self.for(domain:, categories:)
    new(domain: domain, categories: categories).generate
  end

  # @param domain [String]
  # @param categories [String, Array<String>] a single category is wrapped
  def initialize(domain:, categories:)
    @domain = domain
    @categories = Array(categories)
  end

  # @return [Array] keywords for every category, flattened
  def generate
    @categories.flat_map { |category| keywords_for(category) }
  end

  private

  # Returns the stored keywords for the category when any exist; otherwise
  # generates new ones, persists them and returns them.
  def keywords_for(category)
    existing = Keyword.keywords_for(domain: @domain, category: category)
    return existing if existing.any?

    generated = generate_via_llm(category)
    persist_keywords(category, generated)
    generated
  end

  # Asks the configured classifier model for keywords.
  def generate_via_llm(category)
    # Loaded lazily so the dependency is only needed when generation runs.
    require "ruby_llm"

    chat = RubyLLM.chat(model: Prospector.config.default_classifier_model)
    response = chat.ask(prompt_for(category))
    parse_keywords(response.content)
  end

  def prompt_for(category)
    <<~PROMPT
      Generate 4-6 search keywords for finding businesses in the "#{category}" category
      within the "#{@domain}" domain.

      These keywords will be used as Google Places text search queries appended with a location
      (e.g., "keyword in Dallas, TX"). Generate keywords that would find relevant businesses.

      Return ONLY the keywords, one per line, no numbering or bullet points.
    PROMPT
  end

  # Turns the model's reply into a clean keyword list.
  #
  # Tolerates a nil reply and replies that ignore the formatting
  # instruction: list numbering and bullets, wrapping double quotes, blank
  # lines and duplicates are all removed, so they neither reach the search
  # query nor get persisted.
  #
  # @param text [String, nil]
  # @return [Array<String>]
  def parse_keywords(text)
    keywords = text.to_s.each_line.filter_map do |line|
      keyword = line.strip.sub(LIST_MARKER, "")
      keyword = keyword.delete_prefix('"').delete_suffix('"').strip
      keyword unless keyword.empty?
    end

    keywords.uniq
  end

  def persist_keywords(category, keywords)
    keywords.each { |keyword| upsert_keyword(category, keyword) }
  end

  # Creates the keyword row unless it already exists, tagging new rows with
  # the generating model and a timestamp.
  def upsert_keyword(category, kw)
    Keyword.find_or_create_by!(
      domain: @domain,
      category: category,
      keyword: kw
    ) do |record|
      record.source = "llm"
      record.metadata = {
        "generated_by" => Prospector.config.default_classifier_model,
        "generated_at" => Time.current.iso8601
      }
    end
  rescue ActiveRecord::RecordNotUnique
    # A unique-constraint violation means the row already exists (for
    # example, a concurrent insert won the race), so there is nothing to do.
    nil
  end
end
|
|
73
|
+
end
|
|
74
|
+
end
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
module Prospector
|
|
2
|
+
module Pipeline
|
|
3
|
+
# Converts a source result into the flat, string-keyed hash used for a
# candidate, splitting the formatted address into street/city/state/zip.
class Normalizer
  # Two capital letters followed by a five-digit ZIP, e.g. "TX 75201".
  STATE_ZIP_PATTERN = /\b([A-Z]{2})\s+(\d{5})\b/

  # @param result [#name, #formatted_address, #phone_number, #website, #latitude, #longitude]
  # @return [Hash{String => Object}]
  def self.normalize(result) = new(result).call

  def initialize(result)
    @result = result
  end

  # @return [Hash{String => Object}] normalised attributes; address parts
  #   that could not be determined are nil
  def call
    street, city, state, zip = parse_address.values_at(:street, :city, :state, :zip)

    {
      "business_name" => @result.name,
      "street_address" => street,
      "city" => city,
      "state" => state,
      "zip_code" => zip,
      "phone_number" => @result.phone_number,
      "website" => @result.website,
      "latitude" => @result.latitude,
      "longitude" => @result.longitude
    }
  end

  private

  # Splits the comma-separated address. When no "ST 12345" segment is found,
  # only the first segment is kept, as the street.
  def parse_address
    address = @result.formatted_address
    return {} if address.blank?

    segments = address.split(",").map(&:strip)
    anchor = find_state_zip_index(segments)

    anchor ? build_parts(segments, anchor) : { street: segments[0] }
  end

  # Index of the last segment that contains a state and ZIP, or nil.
  def find_state_zip_index(segments)
    segments.rindex { |segment| segment.match?(STATE_ZIP_PATTERN) }
  end

  # The segment just before the state/ZIP segment is the city and everything
  # before that is the street. With a single leading segment it is treated
  # as the street; with none, both are nil.
  def build_parts(segments, idx)
    state, zip = segments[idx].match(STATE_ZIP_PATTERN).captures

    street, city =
      if idx >= 2
        [segments[0..(idx - 2)].join(", "), segments[idx - 1]]
      elsif idx == 1
        [segments[0], nil]
      else
        [nil, nil]
      end

    { street: street, city: city, state: state, zip: zip }
  end
end
|
|
56
|
+
end
|
|
57
|
+
end
|