prospector_engine 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. checksums.yaml +7 -0
  2. data/MIT-LICENSE +20 -0
  3. data/README.md +333 -0
  4. data/Rakefile +9 -0
  5. data/app/CLAUDE.md +43 -0
  6. data/app/assets/stylesheets/prospector/application.css +476 -0
  7. data/app/controllers/prospector/application_controller.rb +16 -0
  8. data/app/controllers/prospector/candidates_controller.rb +31 -0
  9. data/app/controllers/prospector/keyword_generations_controller.rb +10 -0
  10. data/app/controllers/prospector/keywords_controller.rb +38 -0
  11. data/app/controllers/prospector/run_bulk_approvals_controller.rb +13 -0
  12. data/app/controllers/prospector/run_cancellations_controller.rb +9 -0
  13. data/app/controllers/prospector/run_reclassifications_controller.rb +21 -0
  14. data/app/controllers/prospector/run_restarts_controller.rb +14 -0
  15. data/app/controllers/prospector/run_retries_controller.rb +14 -0
  16. data/app/controllers/prospector/runs_controller.rb +47 -0
  17. data/app/jobs/prospector/application_job.rb +5 -0
  18. data/app/jobs/prospector/bulk_approve_job.rb +14 -0
  19. data/app/jobs/prospector/classify_job.rb +17 -0
  20. data/app/jobs/prospector/fetch_job.rb +8 -0
  21. data/app/models/prospector/application_record.rb +6 -0
  22. data/app/models/prospector/candidate.rb +93 -0
  23. data/app/models/prospector/classification_run.rb +15 -0
  24. data/app/models/prospector/keyword.rb +16 -0
  25. data/app/models/prospector/run.rb +94 -0
  26. data/app/views/prospector/candidates/show.html.erb +63 -0
  27. data/app/views/prospector/keywords/index.html.erb +72 -0
  28. data/app/views/prospector/layouts/prospector.html.erb +38 -0
  29. data/app/views/prospector/runs/index.html.erb +33 -0
  30. data/app/views/prospector/runs/new.html.erb +109 -0
  31. data/app/views/prospector/runs/show.html.erb +111 -0
  32. data/config/routes.rb +15 -0
  33. data/db/prospector_schema.rb +81 -0
  34. data/lib/generators/prospector/install/install_generator.rb +31 -0
  35. data/lib/generators/prospector/install/templates/create_prospector_tables.rb +83 -0
  36. data/lib/generators/prospector/install/templates/prospector.rb +37 -0
  37. data/lib/prospector/CLAUDE.md +52 -0
  38. data/lib/prospector/classification/runner.rb +105 -0
  39. data/lib/prospector/configuration.rb +56 -0
  40. data/lib/prospector/engine.rb +18 -0
  41. data/lib/prospector/enrichment/contact_scraper.rb +188 -0
  42. data/lib/prospector/error.rb +8 -0
  43. data/lib/prospector/geography/base.rb +40 -0
  44. data/lib/prospector/geography/bounding_box.rb +58 -0
  45. data/lib/prospector/geography/city.rb +29 -0
  46. data/lib/prospector/geography/coordinates.rb +43 -0
  47. data/lib/prospector/geography/metro_area.rb +74 -0
  48. data/lib/prospector/geography/zip_code.rb +25 -0
  49. data/lib/prospector/keywords/generator.rb +74 -0
  50. data/lib/prospector/pipeline/normalizer.rb +57 -0
  51. data/lib/prospector/pipeline/orchestrator.rb +151 -0
  52. data/lib/prospector/sources/base.rb +13 -0
  53. data/lib/prospector/sources/google_places/adapter.rb +92 -0
  54. data/lib/prospector/sources/google_places/client.rb +58 -0
  55. data/lib/prospector/sources/google_places/us_address_validator.rb +24 -0
  56. data/lib/prospector/sources/result.rb +21 -0
  57. data/lib/prospector/version.rb +3 -0
  58. data/lib/prospector.rb +20 -0
  59. metadata +185 -0
@@ -0,0 +1,151 @@
module Prospector
  module Pipeline
    # Runs the fetch phase of a prospecting run.
    #
    # Asks the run's source adapter for results, skips duplicates, stores the
    # remaining results as candidates on the run, attempts contact enrichment
    # for candidates that have a website, and then hands the run over to
    # ClassifyJob. Progress counters and per-result error messages are
    # accumulated in memory and written to the run when the phase ends.
    class Orchestrator
      # @param run [Object] the run record to process. It is used through
      #   source_adapter, geography, categories, candidates, id, update!,
      #   increment!, reload and cancelled?.
      def initialize(run)
        @run = run
        @adapter = Prospector.config.resolve_source(run.source_adapter.to_sym).new
        @geography = run.geography
        @skipped_count = 0
        @error_count = 0
        @errors = []
        @cancelled = false
      end

      # Marks the run as running, processes every fetched result and then
      # either moves the run on to classification or, when the run was
      # cancelled in the meantime, only records the progress made so far.
      #
      # Any StandardError marks the run as failed and is re-raised.
      def perform
        @run.update!(status: "running", started_at: Time.current)
        fetch_and_process

        if cancellation_requested?
          # Do not touch the status: writing "classifying" here would silently
          # undo the cancellation and start a classification nobody wants.
          save_progress!
        else
          complete_fetch!
        end
      rescue => e
        fail_run!(e)
        raise
      end

      private

      # Fetches results for the run's keywords and processes them one by one,
      # stopping early once a cancellation is detected.
      def fetch_and_process
        keywords = collect_keywords
        results = @adapter.fetch(geography: @geography, keywords: keywords)
        @run.update!(total_found: results.size)

        results.each_with_index do |result, index|
          if cancelled?(index)
            @cancelled = true
            break
          end

          process_result(result)
        end
      end

      # True when the loop stopped because of a cancellation, or when the run
      # was cancelled after the last periodic check (the loop only polls every
      # tenth result, so a late cancellation would otherwise be missed).
      def cancellation_requested?
        @cancelled || @run.reload.cancelled?
      end

      # Counters and error text gathered while processing results.
      def progress_attributes
        {
          skipped_count: @skipped_count,
          error_count: @error_count,
          error_messages: @errors.join("\n").presence
        }
      end

      # Persists the counters without changing the run's status.
      def save_progress!
        @run.update!(progress_attributes)
      end

      # Persists the counters, moves the run to "classifying" and enqueues the
      # classification job.
      def complete_fetch!
        @run.update!(progress_attributes.merge(status: "classifying"))
        ClassifyJob.perform_later(@run.id)
      end

      # Records a fatal error on the run.
      def fail_run!(error)
        @run.update!(
          status: "failed",
          completed_at: Time.current,
          error_messages: "Import failed: #{error.message}"
        )
      end

      # Builds the keyword list from the run's categories, falling back to the
      # configured domain when the run has none.
      def collect_keywords
        categories = @run.categories.presence || [Prospector.config.domain]
        Keywords::Generator.for(domain: Prospector.config.domain, categories: categories)
      end

      # Normalizes one result and stores it unless it is a duplicate.
      #
      # Any StandardError raised along the way (including one raised by
      # contact enrichment after the candidate was already created) is counted
      # and recorded instead of aborting the whole run.
      def process_result(result)
        normalized = Normalizer.normalize(result)

        if duplicate?(result.uid, normalized)
          @skipped_count += 1
          return
        end

        create_candidate(result, normalized)
      rescue => e
        @error_count += 1
        @errors << "Error processing #{result.name}: #{e.message}"
      end

      # A result is a duplicate when this run already holds a candidate with
      # the same source uid, or when the host application's checker says so.
      def duplicate?(source_uid, normalized)
        @run.candidates.exists?(source_uid: source_uid) ||
          check_host_duplicate(source_uid, normalized)
      end

      # Delegates to the optional host-provided duplicate check; returns false
      # when none is configured.
      def check_host_duplicate(source_uid, normalized)
        checker = Prospector.config.duplicate_check
        return false unless checker

        checker.call(
          source_uid: source_uid,
          name: normalized["business_name"],
          address: normalized["street_address"],
          city: normalized["city"],
          state: normalized["state"]
        )
      end

      # Creates a pending candidate from the result, bumps the run's
      # fetched_count and then tries to enrich the candidate's contacts.
      def create_candidate(result, normalized)
        full_address = build_address(normalized)

        candidate = @run.candidates.create!(
          name: result.name,
          address: full_address,
          latitude: result.latitude,
          longitude: result.longitude,
          phone_number: result.phone_number,
          website: result.website,
          category: result.category,
          source_uid: result.uid,
          hours_of_operation: format_hours(result.hours),
          source_data: result.raw || {},
          metadata: build_metadata(result, normalized),
          status: "pending"
        )
        @run.increment!(:fetched_count)
        enrich_contacts(candidate)
      end

      # Scrapes the candidate's website for contact details, when it has one.
      def enrich_contacts(candidate)
        return if candidate.website.blank?

        scraper = Enrichment::ContactScraper.scrape(candidate.website)
        apply_contacts(candidate, scraper) if scraper.success?
      end

      # Copies the scraped contact fields onto the candidate. Enrichment is
      # best effort: a validation failure leaves the candidate as it was.
      def apply_contacts(candidate, scraper)
        attrs = scraper.result.slice(:email, :facebook_url, :instagram_url, :linkedin_url, :tiktok_url, :youtube_url)
        candidate.update!(attrs)
      rescue ActiveRecord::RecordInvalid
        nil
      end

      # Joins the available normalized address parts with ", ".
      def build_address(normalized)
        [normalized["street_address"], normalized["city"], normalized["state"], normalized["zip_code"]].compact.join(", ")
      end

      def build_metadata(result, normalized)
        { "rating" => result.rating, "rating_count" => result.rating_count, "types" => result.types, "normalized_data" => normalized }
      end

      # Turns a list of per-day opening-hour entries into a hash keyed by
      # lowercase English day name.
      #
      # Entries that start with a day label ("Monday: 9-5", "Sun: Closed") are
      # filed under that day, whatever their position in the list. Entries
      # without a recognizable label fall back to their position, counting
      # from Sunday as Date::DAYNAMES does.
      #
      # @param hours_array [Array, nil]
      # @return [Hash{String => Object}] empty when there are no entries
      def format_hours(hours_array)
        Array(hours_array).each_with_index.to_h do |day_hours, index|
          [day_key_for(day_hours, index), day_hours]
        end
      end

      # Resolves the day name for one entry: its own leading label when that
      # is an English day name or an abbreviation of at least three letters,
      # otherwise the day at the entry's position.
      def day_key_for(day_hours, index)
        label = day_hours.to_s[/\A\s*([[:alpha:]]{3,})\.?\s*:/, 1]&.downcase
        labelled_day = label && Date::DAYNAMES.find { |day| day.downcase.start_with?(label) }
        (labelled_day || Date::DAYNAMES[index % 7]).downcase
      end

      # Polls the run for cancellation, but only on every tenth result so the
      # loop does not reload the record for each item.
      def cancelled?(index)
        return false unless (index + 1) % 10 == 0

        @run.reload.cancelled?
      end
    end
  end
end
@@ -0,0 +1,13 @@
module Prospector
  module Sources
    # Abstract parent for source adapters. Subclasses are expected to override
    # both .adapter_key and #fetch; the versions here only raise.
    class Base
      class << self
        # Identifier of the adapter; must be overridden.
        #
        # @raise [NotImplementedError] always, in this base class
        def adapter_key
          raise NotImplementedError
        end
      end

      # Fetches results for a geography and a list of keywords; must be
      # overridden.
      #
      # @raise [NotImplementedError] always, in this base class
      def fetch(geography:, keywords:)
        message = "#{self.class}#fetch must return Array<Sources::Result>"
        raise NotImplementedError, message
      end
    end
  end
end
@@ -0,0 +1,92 @@
1
+ require "set"
2
+
module Prospector
  module Sources
    module GooglePlaces
      # Source adapter backed by the Google Places client.
      #
      # Places are deduplicated by uid through a set that lives as long as the
      # adapter instance: a place returned once is never returned again by the
      # same instance, not even by a later call to #fetch.
      class Adapter < Sources::Base
        def self.adapter_key = "google_places"

        # @param api_key [String, nil] passed through to Client
        def initialize(api_key: nil)
          @client = Client.new(api_key: api_key)
          @seen_uids = Set.new
        end

        # Fetches places for the geography.
        #
        # When the geography provides a query string, one text search is run
        # per keyword ("<keyword> in <query string>"). Otherwise a nearby
        # search on the geography's coordinate restriction is run exactly
        # once: that request carries no keyword, so repeating it per keyword
        # would only re-send the same request.
        #
        # @param geography [Object] responds to to_query_string and
        #   to_coordinate_restriction
        # @param keywords [Array<String>]
        # @return [Array<Sources::Result>] empty when there are no keywords
        def fetch(geography:, keywords:)
          terms = Array(keywords)
          return [] if terms.empty?

          query_string = geography.to_query_string
          return fetch_by_location(geography) unless query_string

          terms.flat_map { |keyword| fetch_text("#{keyword} in #{query_string}") }
        end

        # Converts one raw place hash into a Result.
        #
        # @return [Sources::Result, nil] nil when the place has no id, was
        #   already returned by this adapter, or fails validation
        def extract_result(place)
          uid = place["id"]&.delete_prefix("places/")
          return if uid.blank?
          return if @seen_uids.include?(uid)
          return unless valid_place?(place)

          @seen_uids.add(uid)
          build_result(place, uid)
        end

        # Maps the fields of a raw place hash onto a Result. Description and
        # category are not provided by this source and stay nil.
        def build_result(place, uid)
          Sources::Result.new(
            uid: uid,
            name: place.dig("displayName", "text"),
            formatted_address: place["formattedAddress"],
            latitude: place.dig("location", "latitude"),
            longitude: place.dig("location", "longitude"),
            phone_number: place["nationalPhoneNumber"],
            website: place["websiteUri"],
            description: nil,
            category: nil,
            hours: place.dig("regularOpeningHours", "weekdayDescriptions"),
            rating: place["rating"],
            rating_count: place["userRatingCount"],
            types: place["types"] || [],
            raw: place
          )
        end

        private

        # Nearby search for geographies without a query string; returns an
        # empty list when the geography has no coordinate restriction either.
        def fetch_by_location(geography)
          restriction = geography.to_coordinate_restriction
          restriction ? fetch_nearby(restriction) : []
        end

        def fetch_text(query)
          response = @client.search_text(query)
          parse_response(response)
        end

        def fetch_nearby(location_restriction)
          response = @client.search_nearby(location_restriction)
          parse_response(response)
        end

        # Extracts results from a response. Any status other than 200 yields
        # an empty list rather than an error.
        def parse_response(response)
          return [] unless response.code == 200

          places = response.parsed_response["places"] || []
          places.filter_map { |place| extract_result(place) }
        end

        # A place is usable when it has a name, address, coordinates and id,
        # is operational, and its address passes the US address check.
        def valid_place?(place)
          place.dig("displayName", "text").present? &&
            place["formattedAddress"].present? &&
            place.dig("location", "latitude").present? &&
            place.dig("location", "longitude").present? &&
            place["id"].present? &&
            place["businessStatus"] == "OPERATIONAL" &&
            UsAddressValidator.us_address?(place["formattedAddress"])
        end
      end
    end
  end
end
@@ -0,0 +1,58 @@
1
+ require "httparty"
2
+
module Prospector
  module Sources
    module GooglePlaces
      # HTTP client for the two Places endpoints used by the adapter. Every
      # request is a JSON POST carrying the API key and the field mask below.
      class Client
        include HTTParty
        base_uri "https://places.googleapis.com/v1"

        # Comma-separated list of place fields requested from the API.
        FIELD_MASK = %w[
          places.id
          places.displayName
          places.formattedAddress
          places.location
          places.rating
          places.userRatingCount
          places.types
          places.businessStatus
          places.nationalPhoneNumber
          places.websiteUri
          places.regularOpeningHours
        ].join(",").freeze

        # @param api_key [String, nil] falls back to ENV["GOOGLE_MAPS_API_KEY"]
        # @raise [MissingApiKeyError] when neither source yields a key
        def initialize(api_key: nil)
          @api_key = api_key || ENV["GOOGLE_MAPS_API_KEY"]
          return if @api_key.present?

          raise MissingApiKeyError, "Google Maps API key not configured"
        end

        # POSTs a text query to /places:searchText.
        def search_text(query, region_code: "US")
          post("/places:searchText", { textQuery: query, regionCode: region_code })
        end

        # POSTs a location restriction to /places:searchNearby.
        def search_nearby(location_restriction, region_code: "US")
          payload = { locationRestriction: location_restriction, regionCode: region_code }
          post("/places:searchNearby", payload)
        end

        private

        # Sends the payload as JSON through the class-level HTTParty post.
        def post(path, body)
          options = { body: body.to_json, headers: request_headers }
          self.class.post(path, options)
        end

        def request_headers
          {
            "Content-Type" => "application/json",
            "X-Goog-Api-Key" => @api_key,
            "X-Goog-FieldMask" => FIELD_MASK
          }
        end
      end
    end
  end
end
@@ -0,0 +1,24 @@
module Prospector
  module Sources
    module GooglePlaces
      # Heuristic check for whether a formatted address is in the United
      # States (states, DC and the territories listed below).
      module UsAddressValidator
        US_STATE_CODES = %w[
          AL AK AZ AR CA CO CT DE FL GA HI ID IL IN IA KS KY LA ME MD MA MI MN MS MO
          MT NE NV NH NJ NM NY NC ND OH OK OR PA RI SC SD TN TX UT VT VA WA WV WI WY DC
          PR VI GU AS MP
        ].freeze

        # "..., <city>, <state code> [zip]" at the very end of the string.
        # Anchored with \z rather than $ so that a state code at the end of an
        # earlier line of a multi-line address does not count.
        US_STATE_PATTERN = /,\s*[^,]+,\s*(#{US_STATE_CODES.join('|')})\s*(\d{5}(-\d{4})?)?\z/i

        # "USA" or "United States" as the final word(s). The separator in front
        # keeps names that merely end in those letters (e.g. "Lampedusa") out.
        US_COUNTRY_SUFFIX = /(?:\A|[\s,])(?:usa|united states)\z/i

        # @param address [String, nil]
        # @return [Boolean] true when the address ends with a US country name
        #   or with a city / state code / optional ZIP tail; false for nil or
        #   blank input
        def self.us_address?(address)
          text = address.to_s.strip
          return false if text.empty?

          text.match?(US_COUNTRY_SUFFIX) || text.match?(US_STATE_PATTERN)
        end
      end
    end
  end
end
@@ -0,0 +1,21 @@
module Prospector
  module Sources
    # Value object describing one place returned by a source adapter.
    # Built with keyword arguments only; members that are not given are nil.
    Result = Struct.new(
      *%i[
        uid name formatted_address latitude longitude phone_number website
        description category hours rating rating_count types raw
      ],
      keyword_init: true
    )
  end
end
@@ -0,0 +1,3 @@
module Prospector
  # Gem version. Frozen so the constant cannot be mutated in place.
  VERSION = "0.1.1".freeze
end
data/lib/prospector.rb ADDED
@@ -0,0 +1,20 @@
1
+ require "prospector/version"
2
+ require "prospector/error"
3
+ require "prospector/configuration"
4
+ require "prospector/engine"
5
+
# Entry point of the gem: holds the single shared Configuration instance.
module Prospector
  # Yields the shared configuration to the given block.
  def self.configure
    yield config
  end

  # Returns the shared configuration, creating it on first use.
  def self.config
    @config ||= Configuration.new
  end

  # Replaces the shared configuration with a fresh instance.
  def self.reset_config!
    @config = Configuration.new
  end
end
metadata ADDED
@@ -0,0 +1,185 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: prospector_engine
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ platform: ruby
6
+ authors:
7
+ - AxiumFoundry
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2026-04-06 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rails
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '7.1'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '7.1'
27
+ - !ruby/object:Gem::Dependency
28
+ name: httparty
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '0.21'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '0.21'
41
+ - !ruby/object:Gem::Dependency
42
+ name: llm_classifier
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: 0.2.0
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: 0.2.0
55
+ - !ruby/object:Gem::Dependency
56
+ name: minitest
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '5.0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '5.0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: webmock
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '3.0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '3.0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: mocha
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '2.0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '2.0'
97
+ description: A Rails engine for discovering businesses from Google Places and other
98
+ sources, with AI-powered keyword generation and classification.
99
+ email:
100
+ - dev@axiumfoundry.com
101
+ executables: []
102
+ extensions: []
103
+ extra_rdoc_files: []
104
+ files:
105
+ - MIT-LICENSE
106
+ - README.md
107
+ - Rakefile
108
+ - app/CLAUDE.md
109
+ - app/assets/stylesheets/prospector/application.css
110
+ - app/controllers/prospector/application_controller.rb
111
+ - app/controllers/prospector/candidates_controller.rb
112
+ - app/controllers/prospector/keyword_generations_controller.rb
113
+ - app/controllers/prospector/keywords_controller.rb
114
+ - app/controllers/prospector/run_bulk_approvals_controller.rb
115
+ - app/controllers/prospector/run_cancellations_controller.rb
116
+ - app/controllers/prospector/run_reclassifications_controller.rb
117
+ - app/controllers/prospector/run_restarts_controller.rb
118
+ - app/controllers/prospector/run_retries_controller.rb
119
+ - app/controllers/prospector/runs_controller.rb
120
+ - app/jobs/prospector/application_job.rb
121
+ - app/jobs/prospector/bulk_approve_job.rb
122
+ - app/jobs/prospector/classify_job.rb
123
+ - app/jobs/prospector/fetch_job.rb
124
+ - app/models/prospector/application_record.rb
125
+ - app/models/prospector/candidate.rb
126
+ - app/models/prospector/classification_run.rb
127
+ - app/models/prospector/keyword.rb
128
+ - app/models/prospector/run.rb
129
+ - app/views/prospector/candidates/show.html.erb
130
+ - app/views/prospector/keywords/index.html.erb
131
+ - app/views/prospector/layouts/prospector.html.erb
132
+ - app/views/prospector/runs/index.html.erb
133
+ - app/views/prospector/runs/new.html.erb
134
+ - app/views/prospector/runs/show.html.erb
135
+ - config/routes.rb
136
+ - db/prospector_schema.rb
137
+ - lib/generators/prospector/install/install_generator.rb
138
+ - lib/generators/prospector/install/templates/create_prospector_tables.rb
139
+ - lib/generators/prospector/install/templates/prospector.rb
140
+ - lib/prospector.rb
141
+ - lib/prospector/CLAUDE.md
142
+ - lib/prospector/classification/runner.rb
143
+ - lib/prospector/configuration.rb
144
+ - lib/prospector/engine.rb
145
+ - lib/prospector/enrichment/contact_scraper.rb
146
+ - lib/prospector/error.rb
147
+ - lib/prospector/geography/base.rb
148
+ - lib/prospector/geography/bounding_box.rb
149
+ - lib/prospector/geography/city.rb
150
+ - lib/prospector/geography/coordinates.rb
151
+ - lib/prospector/geography/metro_area.rb
152
+ - lib/prospector/geography/zip_code.rb
153
+ - lib/prospector/keywords/generator.rb
154
+ - lib/prospector/pipeline/normalizer.rb
155
+ - lib/prospector/pipeline/orchestrator.rb
156
+ - lib/prospector/sources/base.rb
157
+ - lib/prospector/sources/google_places/adapter.rb
158
+ - lib/prospector/sources/google_places/client.rb
159
+ - lib/prospector/sources/google_places/us_address_validator.rb
160
+ - lib/prospector/sources/result.rb
161
+ - lib/prospector/version.rb
162
+ homepage: https://github.com/AxiumFoundry/prospector_engine
163
+ licenses:
164
+ - MIT
165
+ metadata: {}
166
+ post_install_message:
167
+ rdoc_options: []
168
+ require_paths:
169
+ - lib
170
+ required_ruby_version: !ruby/object:Gem::Requirement
171
+ requirements:
172
+ - - ">="
173
+ - !ruby/object:Gem::Version
174
+ version: 3.1.0
175
+ required_rubygems_version: !ruby/object:Gem::Requirement
176
+ requirements:
177
+ - - ">="
178
+ - !ruby/object:Gem::Version
179
+ version: '0'
180
+ requirements: []
181
+ rubygems_version: 3.4.20
182
+ signing_key:
183
+ specification_version: 4
184
+ summary: Business discovery from multiple sources with AI classification
185
+ test_files: []