store_list_scraper 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.DS_Store +0 -0
- data/CHANGELOG.md +5 -0
- data/CODE_OF_CONDUCT.md +84 -0
- data/Gemfile +12 -0
- data/Gemfile.lock +46 -0
- data/LICENSE.txt +21 -0
- data/README.md +35 -0
- data/Rakefile +4 -0
- data/lib/List_Scraper.rb +17 -0
- data/lib/storeListScraper/business_list.csv +27109 -0
- data/lib/storeListScraper/cli.rb +96 -0
- data/lib/storeListScraper/csv_manage.rb +28 -0
- data/lib/storeListScraper/location_scraper.rb +103 -0
- data/lib/storeListScraper/store.rb +19 -0
- data/lib/storeListScraper/update_scraper.rb +51 -0
- data/lib/storeListScraper/version.rb +4 -0
- data/sig/store_list_scraper.rbs +4 -0
- data/store_list_scraper.gemspec +44 -0
- metadata +132 -0
data/lib/storeListScraper/cli.rb
ADDED
@@ -0,0 +1,96 @@

class ListScraper::CLI

  def start # greet the user, then show the menu
    puts "\nHello user! ヽ(‘ ∇‘ )ノ"
    menu
  end

  def menu
    puts "\nWhat are we interested in accomplishing today?\n1) New Scrape\n2) View Business List\n3) Update Business List\n4) Exit Program\n"
    case gets.strip.to_i
    when 1
      new_scrape
    when 2
      list_view
    when 3
      list_update
    when 4
      puts "\n(⌐■_■)ノ\nGoodbye friend!\n"
      exit
    else
      puts "\n(ರ_ರ)\nThat is not a valid input.."
      menu
    end
  end

  def new_scrape
    puts "\n(∪ ◡ ∪)\nSo you want to run a scrape?\nPlease type a company name as found in my Business List.\nOR type 'list' to view the Business List\nOR type 'menu' to return to the main menu"
    name_check
  end

  def name_check
    input = gets.strip
    case input
    when "list"
      list_view
    when "menu"
      menu
    else
      # check the business list for the name
      link = ListScraper::CSVmanager.business_list_check(input)
      if link != nil
        puts "\n( ゚ヮ゚)\nFound the business! Want me to scrape a list? (y/n)"
        confirmation = gets.strip
        confirmation == 'y' ? scrape(link) : exit
      else
        puts "\n(●_●)\nSorry, but it doesn't look like that's on the list.\nPlease enter another name\nOR type 'list' to view my Business List\nOR type 'menu' to return to the main menu"
        name_check
      end
    end
  end

  def list_view
    puts "\nThis is a list of 27,000+ businesses.\nHow would you like to view it?\n1) select a letter group\n2) search a keyword\n3) return to main menu"
    case gets.strip.to_i
    when 1
      puts "Please type letter(s) you want at the START of the business name:"
      ListScraper::CSVmanager.list_view_by_letter(gets.strip)
      menu
    when 2
      puts "Please type the keyword as you expect to see it in the business name.\n(example: 'Jimmy's Pizza' contains 'Pizza')"
      ListScraper::CSVmanager.list_view_by_search(gets.strip)
      menu
    when 3
      menu
    else
      puts "\n(ರ_ರ)\nThat is not a valid input.."
      list_view
    end
  end

  def scrape(link)
    a = ListScraper::LocationScraper.new("#{ListScraper::LocationScraper.base}#{link}")
    a.page_scrape(a.link)
    a.clean_out
    puts "\n( ◕‿◕)\nI found #{a.loc_pages.length} locations across #{a.state_pages.length} states for this business.\nWould you like to export? (y/n)"
    confirmation = gets.strip
    if confirmation == 'y'
      puts "\n(°ロ°)☝\nWhat would you like to name the file?"
      file_name = gets.strip
      puts "...\n( ◉_◉)\nExporting now, this can take a while. I will alert you when it is done."
      a.create_stores
      ListScraper::CSVmanager.locations_export(file_name)
      puts "Export Completed Successfully!"
    else
      puts "\n(⌐■_■)ノ\nI hope you have a great day!\nGoodbye"
      exit
    end
  end

  def list_update
    puts "\n⊙﹏⊙\nRunning update, this could take several minutes.."
    ListScraper::UpdateScraper.new
  end

end
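A minimal sketch of how the CLI above is launched, assuming the gem's entry file lib/List_Scraper.rb (not shown in this diff) requires these component files:

    # Hypothetical launcher; assumes the entry file sets up the requires.
    require "List_Scraper"

    ListScraper::CLI.new.start # prints the greeting, then loops on the menu until 4) Exit Program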
data/lib/storeListScraper/csv_manage.rb
ADDED
@@ -0,0 +1,28 @@

class ListScraper::CSVmanager

  def self.locations_export(name)
    c = CSV.open("#{name}.csv", "w")
    c << ["IDnum", "Address", "City", "State", "ZIP"] # headers
    ListScraper::Store.all.each do |loc|
      c << [loc.idnum, loc.address, loc.city, loc.state, loc.zip]
    end
    c.close
  end

  def self.business_list_check(company)
    # check whether the business exists on file
    h = CSV.read("./lib/storeListScraper/business_list.csv").find { |row| row[0] == company }
    h != nil ? h[1] : nil
  end

  def self.list_view_by_letter(letter)
    g = CSV.read("./lib/storeListScraper/business_list.csv").select { |row| row[0].downcase.start_with?(letter.downcase) }
    g.each { |item| puts item[0] }
  end

  def self.list_view_by_search(word)
    g = CSV.read("./lib/storeListScraper/business_list.csv").select { |row| row[0].downcase.include?(word.downcase) }
    g.each { |item| puts item[0] }
  end
end
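The lookup and view helpers all read business_list.csv through a path relative to the current working directory, so they only resolve when run from the gem root. A minimal sketch of calling them directly (the business name and keyword below are illustrative):

    link = ListScraper::CSVmanager.business_list_check("Jimmy's Pizza") # second CSV column (the link), or nil if not listed
    ListScraper::CSVmanager.list_view_by_letter("a")     # prints every business name starting with "a"
    ListScraper::CSVmanager.list_view_by_search("pizza") # prints every business name containing "pizza"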
data/lib/storeListScraper/location_scraper.rb
ADDED
@@ -0,0 +1,103 @@

class ListScraper::LocationScraper
  attr_accessor :link, :state_pages, :city_pages, :loc_pages
  @@base = 'https://storefound.org'

  def initialize(link)
    @link = link
    @state_pages = []
    @city_pages = []
    @loc_pages = []
  end

  def self.base
    @@base
  end

  def page_scrape(page, type = 'all')
    begin
      doc = Nokogiri::HTML5(URI.open(page))
      doc.css(".main-block a").each do |lk| # pull all links from the main body
        j = lk.attribute("href").text # look at only the url text
        case type
        when 'all'
          case j.split("/").length
          when 3
            @state_pages << j
          when 4
            @city_pages << j
          when 5
            @loc_pages << j
          end
        when 'State'
          case j.split("/").length
          when 4
            @city_pages << j
          when 5
            @loc_pages << j
          end
        when 'City'
          case j.split("/").length
          when 5
            @loc_pages << j
          end
        end
      end
    rescue # OpenURI::HTTPError and friends: skip pages that fail to load
    end
  end

  # When running a store, the first table of links will only ever be
  # states, cities, or locations. On state pages, cities will be picked
  # up at the bottom and need to be removed. clean_out determines the
  # next steps and de-duplicates the arrays; it can only be used after
  # the first pass.
  def clean_out
    @state_pages.uniq!
    @city_pages.uniq!
    @loc_pages.uniq!
    if @state_pages.length > 0 # if state links are available, clear the other arrays and scrape each state
      @city_pages.clear
      @loc_pages.clear
      linked_page_scrape(@state_pages, 'State')
      linked_page_scrape(@city_pages, 'City')
    elsif @city_pages.length > 0
      linked_page_scrape(@city_pages, 'City')
    end
  end

  def linked_page_scrape(array, type)
    total = array.length
    i = 0
    array.each do |page|
      i += 1
      page_scrape("#{@@base}#{page}", type)
      print "#{((i.to_f / total.to_f) * 100).round(2)}% | #{type} Progress: #{i}/#{total}\r"
    end
    puts "#{((i.to_f / total.to_f) * 100).round(2)}% | #{type} Progress: #{i}/#{total}\r"
  end

  def create_stores
    total = @loc_pages.length
    i = 1
    @loc_pages.each do |loc|
      begin
        st = Nokogiri::HTML5(URI.open("#{@@base}#{loc}"))
        j = st.css("li span")
        info = {
          idnum: i,
          address: j[0].text,
          city: j[1].text,
          state: j[2].text,
          zip: j[3].text
        }
        ListScraper::Store.new(**info) # pass the hash as keyword arguments
        print "#{((i.to_f / total.to_f) * 100).round(2)}% | Progress: #{i}/#{total}\r"
        i += 1
      rescue # skip location pages that fail to load or parse
        next
      end
    end
  end

end
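For reference, the CLI's scrape method drives this class in a fixed order; a condensed sketch of that pipeline, using a hypothetical /example-store link:

    scraper = ListScraper::LocationScraper.new("#{ListScraper::LocationScraper.base}/example-store")
    scraper.page_scrape(scraper.link) # first pass: sort links into state/city/location arrays by path depth
    scraper.clean_out                 # de-duplicate, then walk state and city pages as needed
    scraper.create_stores             # build a ListScraper::Store per location page
    ListScraper::CSVmanager.locations_export("example_locations")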
data/lib/storeListScraper/store.rb
ADDED
@@ -0,0 +1,19 @@

class ListScraper::Store
  attr_accessor :idnum, :address, :city, :state, :zip
  @@all = []

  def initialize(idnum:, address:, city:, state:, zip:)
    @idnum = idnum
    @address = address
    @city = city
    @state = state
    @zip = zip
    @@all << self
  end

  def self.all
    @@all
  end

end
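Store is a small value object plus a class-level registry that locations_export iterates; a minimal sketch with illustrative field values:

    ListScraper::Store.new(idnum: 1, address: "123 Main St", city: "Springfield",
                           state: "IL", zip: "62701")
    ListScraper::Store.all.length # => 1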
data/lib/storeListScraper/update_scraper.rb
ADDED
@@ -0,0 +1,51 @@

class ListScraper::UpdateScraper
  attr_accessor :letters, :pages, :list
  attr_reader :base

  def initialize
    @base = 'https://storefound.org/'
    @letters = [] # array of links for each letter group
    @pages = []   # array to store pages for each letter
    File.delete('./lib/storeListScraper/business_list.csv') if File.exist?('./lib/storeListScraper/business_list.csv')
    @list = CSV.open("./lib/storeListScraper/business_list.csv", "w")
    @list << ["Company Name", "link"] # headers
    update
  end

  def update
    letters_scrape
    @letters.each do |letter|
      pages_scrape(letter)
      update_business_list
    end
    puts "Successfully updated!!"
  end

  def letters_scrape
    # scrape the main links for each letter group
    doc = Nokogiri::HTML5(URI.open('https://storefound.org/store/starts-a/page-1'))
    doc.css('.letter-block a').each do |lk|
      @letters << lk.attribute('href').text
    end
  end

  def pages_scrape(letter_link)
    # scrape all page links for the given letter group
    @pages.clear
    doc = Nokogiri::HTML5(URI.open("#{@base}#{letter_link}"))
    doc.css('.pagination a').each do |lk|
      @pages << lk.attribute('href').text
    end
  end

  def update_business_list
    # scrape all business names and their corresponding links
    @pages.each do |lk|
      doc = Nokogiri::HTML5(URI.open("#{@base}#{lk}"))
      doc.css('.main-block .col-half a').each do |biz|
        @list << [biz.text, biz.attribute('href').text]
      end
    end
  end
end
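The updater does all of its work from the constructor, so the CLI simply instantiates it; a one-line sketch, with the caveat that it deletes and rewrites business_list.csv relative to the current working directory:

    ListScraper::UpdateScraper.new # walks every letter group and page, rebuilding business_list.csv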
data/store_list_scraper.gemspec
ADDED
@@ -0,0 +1,44 @@
# frozen_string_literal: true

require_relative "lib/storeListScraper/version"

Gem::Specification.new do |spec|
  spec.name = "store_list_scraper"
  spec.version = ListScraper::VERSION
  spec.authors = ["itsmattpaw"]
  spec.email = ["itsmattpaw@gmail.com"]

  spec.summary = "Scrape Store Address lists from StoreFound.org."
  spec.description = "Scrape Store Address lists from StoreFound.org and export them into a CSV file for distribution or use with other software such as Esri ArcGIS."
  spec.homepage = "https://github.com/itsmattpaw/store_list_scraper"
  spec.license = "MIT"
  spec.required_ruby_version = ">= 2.6.0"

  # spec.metadata["allowed_push_host"] = "https://example.com"

  # spec.metadata["homepage_uri"] = spec.homepage
  # spec.metadata["source_code_uri"] = "http://www.bob.com"
  # spec.metadata["changelog_uri"] = "http://www.bob.com"

  # Specify which files should be added to the gem when it is released.
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
  spec.files = Dir.chdir(File.expand_path(__dir__)) do
    `git ls-files -z`.split("\x0").reject do |f|
      (f == __FILE__) || f.match(%r{\A(?:(?:bin|test|spec|features)/|\.(?:git|travis|circleci)|appveyor)})
    end
  end
  spec.bindir = "exe"
  spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  # Uncomment to register a new dependency of your gem
  # spec.add_dependency "example-gem", "~> 1.0"
  spec.add_development_dependency "rake", "~> 13.0"
  spec.add_development_dependency "pry"
  spec.add_dependency "nokogiri", "~> 1.13"
  spec.add_dependency "open-uri"
  spec.add_dependency "csv"

  # For more information and examples about making a new gem, check out our
  # guide at: https://bundler.io/guides/creating_gem.html
end
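A Gemfile entry that pulls this gem in as declared above (the version constraint is illustrative):

    source "https://rubygems.org"

    gem "store_list_scraper", "~> 0.1.0"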
metadata
ADDED
@@ -0,0 +1,132 @@
--- !ruby/object:Gem::Specification
name: store_list_scraper
version: !ruby/object:Gem::Version
  version: 0.1.0
platform: ruby
authors:
- itsmattpaw
autorequire:
bindir: exe
cert_chain: []
date: 2022-03-18 00:00:00.000000000 Z
dependencies:
- !ruby/object:Gem::Dependency
  name: rake
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '13.0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '13.0'
- !ruby/object:Gem::Dependency
  name: pry
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
- !ruby/object:Gem::Dependency
  name: nokogiri
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '1.13'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '1.13'
- !ruby/object:Gem::Dependency
  name: open-uri
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
- !ruby/object:Gem::Dependency
  name: csv
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
description: Scrape Store Address lists from StoreFound.org and export them into
  a CSV file for distribution or use with other software such as Esri ArcGIS.
email:
- itsmattpaw@gmail.com
executables: []
extensions: []
extra_rdoc_files: []
files:
- ".DS_Store"
- CHANGELOG.md
- CODE_OF_CONDUCT.md
- Gemfile
- Gemfile.lock
- LICENSE.txt
- README.md
- Rakefile
- lib/List_Scraper.rb
- lib/storeListScraper/business_list.csv
- lib/storeListScraper/cli.rb
- lib/storeListScraper/csv_manage.rb
- lib/storeListScraper/location_scraper.rb
- lib/storeListScraper/store.rb
- lib/storeListScraper/update_scraper.rb
- lib/storeListScraper/version.rb
- sig/store_list_scraper.rbs
- store_list_scraper.gemspec
homepage: https://github.com/itsmattpaw/store_list_scraper
licenses:
- MIT
metadata: {}
post_install_message:
rdoc_options: []
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: 2.6.0
required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
requirements: []
rubygems_version: 3.0.9
signing_key:
specification_version: 4
summary: Scrape Store Address lists from StoreFound.org.
test_files: []