store_list_scraper 0.1.0
- checksums.yaml +7 -0
- data/.DS_Store +0 -0
- data/CHANGELOG.md +5 -0
- data/CODE_OF_CONDUCT.md +84 -0
- data/Gemfile +12 -0
- data/Gemfile.lock +46 -0
- data/LICENSE.txt +21 -0
- data/README.md +35 -0
- data/Rakefile +4 -0
- data/lib/List_Scraper.rb +17 -0
- data/lib/storeListScraper/business_list.csv +27109 -0
- data/lib/storeListScraper/cli.rb +96 -0
- data/lib/storeListScraper/csv_manage.rb +28 -0
- data/lib/storeListScraper/location_scraper.rb +103 -0
- data/lib/storeListScraper/store.rb +19 -0
- data/lib/storeListScraper/update_scraper.rb +51 -0
- data/lib/storeListScraper/version.rb +4 -0
- data/sig/store_list_scraper.rbs +4 -0
- data/store_list_scraper.gemspec +44 -0
- metadata +132 -0
data/lib/storeListScraper/cli.rb ADDED
@@ -0,0 +1,96 @@

class ListScraper::CLI

  def start # greet the user and show the menu
    puts "\nHello user! ヽ(‘ ∇‘ )ノ"
    menu
  end

  def menu
    puts "\nWhat are we interested in accomplishing today?\n1) New Scrape\n2) View Business List\n3) Update Business List\n4) Exit Program\n"
    case gets.strip.to_i
    when 1
      new_scrape
    when 2
      list_view
    when 3
      list_update
    when 4
      puts "\n(⌐■_■)ノ\nGoodbye friend!\n"
      exit
    else
      puts "\n(ರ_ರ)\nThat is not a valid input.."
      menu
    end
  end

  def new_scrape
    puts "\n(∪ ◡ ∪)\nSo you want to run a scrape?\nPlease type a company name as found in my Business List.\nOR type 'list' to view the Business List\nOR type 'menu' to return to the main menu"
    name_check
  end

  def name_check
    input = gets.strip
    case input
    when "list"
      list_view
    when "menu"
      menu
    else
      # check the business list for the name
      link = ListScraper::CSVmanager.business_list_check(input)
      if link != nil
        puts "\n( ゚ヮ゚)\nFound the business! Want me to scrape a list? (y/n)"
        confirmation = gets.strip
        confirmation == 'y' ? scrape(link) : exit
      else
        puts "\n(●_●)\nSorry, but it doesn't look like that's on the list.\nPlease enter another name\nOR type 'list' to view my Business List\nOR type 'menu' to return to the main menu"
        name_check
      end
    end
  end

  def list_view
    puts "\nThis is a list of 27,000+ businesses.\nHow would you like to view it?\n1) select a letter group\n2) search a keyword\n3) return to main menu"
    case gets.strip.to_i
    when 1
      puts "Please type letter(s) you want at the START of the business name:"
      ListScraper::CSVmanager.list_view_by_letter(gets.strip)
      menu
    when 2
      puts "Please type the keyword as you expect to see it in the business name.\n(example: 'Jimmy's Pizza' contains 'Pizza')"
      ListScraper::CSVmanager.list_view_by_search(gets.strip)
      menu
    when 3
      menu
    else
      puts "\n(ರ_ರ)\nThat is not a valid input.."
      list_view
    end
  end

  def scrape(link)
    a = ListScraper::LocationScraper.new("#{ListScraper::LocationScraper.base}#{link}")
    a.page_scrape(a.link)
    a.clean_out
    puts "\n( ◕‿◕)\nI found #{a.loc_pages.length} locations across #{a.state_pages.length} states for this business.\nWould you like to export? (y/n)"
    confirmation = gets.strip
    if confirmation == 'y'
      puts "\n(°ロ°)☝\nWhat would you like to name the file?"
      file_name = gets.strip
      puts "...\n( ◉_◉)\nExporting now, this can take a while. I will alert you when done."
      a.create_stores
      ListScraper::CSVmanager.locations_export(file_name)
      puts "Export Completed Successfully!"
    else
      puts "\n(⌐■_■)ノ\nI hope you have a great day!\nGoodbye"
      exit
    end
  end

  def list_update
    puts "\n⊙﹏⊙\nRunning update, this could take several minutes.."
    ListScraper::UpdateScraper.new
  end

end
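The CLI is entirely menu-driven: start greets the user, menu dispatches on numeric input, and every branch either recurses back into menu or exits. A minimal way to launch it might look like the sketch below; the runner file name and the require path are illustrative assumptions, since the gem ships no executable (the metadata lists executables: []).

# run_scraper.rb -- hypothetical runner script, not part of the gem
require_relative 'lib/List_Scraper' # assumed to require the CLI, CSVmanager and scraper classes

ListScraper::CLI.new.start # prints the greeting, then loops on the main menu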
data/lib/storeListScraper/csv_manage.rb ADDED
@@ -0,0 +1,28 @@

class ListScraper::CSVmanager

  def self.locations_export(name)
    c = CSV.open("#{name}.csv", "w")
    c << ["IDnum", "Address", "City", "State", "ZIP"] # headers
    ListScraper::Store.all.each do |loc|
      c << [loc.idnum, loc.address, loc.city, loc.state, loc.zip]
    end
    c.close
  end

  def self.business_list_check(company)
    # check whether the business exists in the list on file
    h = CSV.read("./lib/storeListScraper/business_list.csv").find { |row| row[0] == company }
    h != nil ? h[1] : nil
  end

  def self.list_view_by_letter(letter)
    g = CSV.read("./lib/storeListScraper/business_list.csv").select { |row| row[0].downcase.start_with?(letter.downcase) }
    g.each { |item| puts item[0] }
  end

  def self.list_view_by_search(word)
    g = CSV.read("./lib/storeListScraper/business_list.csv").select { |row| row[0].downcase.include?(word.downcase) }
    g.each { |item| puts item[0] }
  end
end
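business_list_check returns the link column of the first row whose first column matches the typed company name exactly, and nil otherwise, which is why the CLI asks for the name "as found in my Business List". A small illustration of that contract, assuming business_list.csv (with the "Company Name,link" header written by UpdateScraper) contains a made-up row for ACME Hardware:

# business_list.csv (hypothetical contents):
#   Company Name,link
#   ACME Hardware,/acme-hardware
ListScraper::CSVmanager.business_list_check("ACME Hardware") # => "/acme-hardware"
ListScraper::CSVmanager.business_list_check("acme hardware") # => nil (exact, case-sensitive match only)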
data/lib/storeListScraper/location_scraper.rb ADDED
@@ -0,0 +1,103 @@

class ListScraper::LocationScraper
  attr_accessor :link, :state_pages, :city_pages, :loc_pages
  @@base = 'https://storefound.org'

  def initialize(link)
    @link = link
    @state_pages = []
    @city_pages = []
    @loc_pages = []
  end

  def self.base
    @@base
  end

  def page_scrape(page, type = 'all')
    begin
      doc = Nokogiri::HTML5(URI.open(page))
      doc.css(".main-block a").each do |lk| # pull all links from the main body
        j = lk.attribute("href").text # look at only the url text
        case type
        when 'all'
          case j.split("/").length
          when 3
            @state_pages << j
          when 4
            @city_pages << j
          when 5
            @loc_pages << j
          end
        when 'State'
          case j.split("/").length
          when 4
            @city_pages << j
          when 5
            @loc_pages << j
          end
        when 'City'
          case j.split("/").length
          when 5
            @loc_pages << j
          end
        end
      end
    rescue # OpenURI::HTTPError => e
      # skip pages that fail to load
    end
  end

  # When scraping a store, the first table of links will only ever be
  # states, cities, or locations. On state pages, cities will be picked
  # up at the bottom and need to be removed.
  # clean_out determines the next steps and de-duplicates the arrays.
  # clean_out can only be used after the first pass.
  def clean_out
    @state_pages.uniq!
    @city_pages.uniq!
    @loc_pages.uniq!
    if @state_pages.length > 0 # if state links are available, clear the other arrays and scrape each state
      @city_pages.clear
      @loc_pages.clear
      linked_page_scrape(@state_pages, 'State')
      linked_page_scrape(@city_pages, 'City')
    elsif @city_pages.length > 0
      linked_page_scrape(@city_pages, 'City')
    end
  end

  def linked_page_scrape(array, type)
    total = array.length
    i = 0
    array.each do |page|
      i += 1
      page_scrape("#{@@base}#{page}", type)
      print "#{((i.to_f / total.to_f) * 100).round(2)}% | #{type} Progress: #{i}/#{total}\r"
    end
    puts "#{((i.to_f / total.to_f) * 100).round(2)}% | #{type} Progress: #{i}/#{total}\r"
  end

  def create_stores
    total = @loc_pages.length
    i = 1
    @loc_pages.each do |loc|
      begin
        st = Nokogiri::HTML5(URI.open("#{@@base}#{loc}"))
        j = st.css("li span")
        info = {
          idnum: i,
          address: j[0].text,
          city: j[1].text,
          state: j[2].text,
          zip: j[3].text
        }
        ListScraper::Store.new(**info)
        print "#{((i.to_f / total.to_f) * 100).round(2)}% | Progress: #{i}/#{total}\r"
        i += 1
      rescue
        next # skip locations whose pages fail to load or parse
      end
    end
  end

end
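page_scrape sorts every href it finds purely by URL depth: three /-separated segments mean a state page, four a city page, five an individual location. The split counts below show the heuristic on hypothetical storefound.org paths (the slugs are made up):

"/acme-hardware/texas".split("/").length           # => 3, collected into @state_pages
"/acme-hardware/texas/austin".split("/").length    # => 4, collected into @city_pages
"/acme-hardware/texas/austin/42".split("/").length # => 5, collected into @loc_pages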
data/lib/storeListScraper/store.rb ADDED
@@ -0,0 +1,19 @@

class ListScraper::Store
  attr_accessor :idnum, :address, :city, :state, :zip
  @@all = []

  def initialize(idnum:, address:, city:, state:, zip:)
    @idnum = idnum
    @address = address
    @city = city
    @state = state
    @zip = zip
    @@all << self
  end

  def self.all
    @@all
  end

end
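Store is an in-memory record: each new instance pushes itself onto the class-level @@all array, and CSVmanager.locations_export simply walks that array. A short sketch of the flow with made-up addresses, assuming no other stores were created earlier in the session:

ListScraper::Store.new(idnum: 1, address: "123 Main St", city: "Austin", state: "TX", zip: "78701")
ListScraper::Store.new(idnum: 2, address: "9 Elm Ave", city: "Dallas", state: "TX", zip: "75201")

ListScraper::Store.all.length                  # => 2
ListScraper::CSVmanager.locations_export("tx") # writes tx.csv: a header row plus one row per store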
data/lib/storeListScraper/update_scraper.rb ADDED
@@ -0,0 +1,51 @@

class ListScraper::UpdateScraper
  attr_accessor :letters, :pages, :list
  attr_reader :base

  def initialize
    @base = 'https://storefound.org/'
    @letters = [] # array of links for each letter group
    @pages = []   # array to store pages for each letter
    File.delete('./lib/storeListScraper/business_list.csv') if File.exist?('./lib/storeListScraper/business_list.csv')
    @list = CSV.open("./lib/storeListScraper/business_list.csv", "w")
    @list << ["Company Name", "link"] # headers
    update
  end

  def update
    letters_scrape
    @letters.each do |letter|
      pages_scrape(letter)
      update_business_list
    end
    puts "Successfully updated!!"
  end

  def letters_scrape
    # scrape all main links for the letter groups
    doc = Nokogiri::HTML5(URI.open('https://storefound.org/store/starts-a/page-1'))
    doc.css('.letter-block a').each do |lk|
      @letters << lk.attribute('href').text
    end
  end

  def pages_scrape(letter_link)
    # scrape all page links for each letter group
    @pages.clear
    doc = Nokogiri::HTML5(URI.open("#{@base}#{letter_link}"))
    doc.css('.pagination a').each do |lk|
      @pages << lk.attribute('href').text
    end
  end

  def update_business_list
    # scrape all business names and corresponding links
    @pages.each do |lk|
      doc = Nokogiri::HTML5(URI.open("#{@base}#{lk}"))
      doc.css('.main-block .col-half a').each do |biz|
        @list << [biz.text, biz.attribute('href').text]
      end
    end
  end
end
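All of UpdateScraper's work happens in initialize, so rebuilding the list is a single call. Note that it deletes and rewrites lib/storeListScraper/business_list.csv via paths relative to the current working directory, so this sketch assumes it is run from the gem's root:

ListScraper::UpdateScraper.new # re-scrapes every letter group and rewrites business_list.csv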
data/store_list_scraper.gemspec ADDED
@@ -0,0 +1,44 @@

# frozen_string_literal: true

require_relative "lib/storeListScraper/version"

Gem::Specification.new do |spec|
  spec.name = "store_list_scraper"
  spec.version = ListScraper::VERSION
  spec.authors = ["itsmattpaw"]
  spec.email = ["itsmattpaw@gmail.com"]

  spec.summary = "Scrape Store Address lists from StoreFound.org."
  spec.description = "Scrape Store Address lists from StoreFound.org and export into a CSV file for distribution or use with other software such as ESRI ArcGIS."
  spec.homepage = "https://github.com/itsmattpaw/store_list_scraper"
  spec.license = "MIT"
  spec.required_ruby_version = ">= 2.6.0"

  # spec.metadata["allowed_push_host"] = "https://example.com"

  # spec.metadata["homepage_uri"] = spec.homepage
  # spec.metadata["source_code_uri"] = "http://www.bob.com"
  # spec.metadata["changelog_uri"] = "http://www.bob.com"

  # Specify which files should be added to the gem when it is released.
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
  spec.files = Dir.chdir(File.expand_path(__dir__)) do
    `git ls-files -z`.split("\x0").reject do |f|
      (f == __FILE__) || f.match(%r{\A(?:(?:bin|test|spec|features)/|\.(?:git|travis|circleci)|appveyor)})
    end
  end
  spec.bindir = "exe"
  spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  # Uncomment to register a new dependency of your gem
  # spec.add_dependency "example-gem", "~> 1.0"
  spec.add_development_dependency "rake", "~> 13.0"
  spec.add_development_dependency "pry"
  spec.add_dependency 'nokogiri', '~> 1.13'
  spec.add_dependency "open-uri"
  spec.add_dependency "csv"

  # For more information and examples about making a new gem, check out our
  # guide at: https://bundler.io/guides/creating_gem.html
end
metadata ADDED
@@ -0,0 +1,132 @@

--- !ruby/object:Gem::Specification
name: store_list_scraper
version: !ruby/object:Gem::Version
  version: 0.1.0
platform: ruby
authors:
- itsmattpaw
autorequire:
bindir: exe
cert_chain: []
date: 2022-03-18 00:00:00.000000000 Z
dependencies:
- !ruby/object:Gem::Dependency
  name: rake
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '13.0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '13.0'
- !ruby/object:Gem::Dependency
  name: pry
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
- !ruby/object:Gem::Dependency
  name: nokogiri
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '1.13'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '1.13'
- !ruby/object:Gem::Dependency
  name: open-uri
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
- !ruby/object:Gem::Dependency
  name: csv
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
description: Scrape Store Address lists from StoreFound.org and export into a CSV
  file for distribution or use with other software such as ESRI ArcGIS.
email:
- itsmattpaw@gmail.com
executables: []
extensions: []
extra_rdoc_files: []
files:
- ".DS_Store"
- CHANGELOG.md
- CODE_OF_CONDUCT.md
- Gemfile
- Gemfile.lock
- LICENSE.txt
- README.md
- Rakefile
- lib/List_Scraper.rb
- lib/storeListScraper/business_list.csv
- lib/storeListScraper/cli.rb
- lib/storeListScraper/csv_manage.rb
- lib/storeListScraper/location_scraper.rb
- lib/storeListScraper/store.rb
- lib/storeListScraper/update_scraper.rb
- lib/storeListScraper/version.rb
- sig/store_list_scraper.rbs
- store_list_scraper.gemspec
homepage: https://github.com/itsmattpaw/store_list_scraper
licenses:
- MIT
metadata: {}
post_install_message:
rdoc_options: []
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: 2.6.0
required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
requirements: []
rubygems_version: 3.0.9
signing_key:
specification_version: 4
summary: Scrape Store Address lists from StoreFound.org.
test_files: []