store_list_scraper 0.1.0

lib/storeListScraper/cli.rb ADDED
@@ -0,0 +1,96 @@
+
+class ListScraper::CLI
+
+  def start # greet the user and show the menu
+    puts "\nHello user! ヽ(‘ ∇‘ )ノ"
+    menu
+  end
+
+  def menu
+    puts "\nWhat are we interested in accomplishing today?\n1) New Scrape\n2) View Business List\n3) Update Business List\n4) Exit Program\n"
+    case gets.strip.to_i
+    when 1
+      new_scrape
+    when 2
+      list_view
+    when 3
+      list_update
+    when 4
+      puts "\n(⌐■_■)ノ\nGoodbye friend!\n"
+      exit
+    else
+      puts "\n(ರ_ರ)\nThat is not a valid input."
+      menu
+    end
+  end
+
+  def new_scrape
+    puts "\n(∪ ◡ ∪)\nSo you want to run a scrape?\nPlease type a company name as found in my Business List.\nOR type 'list' to view the Business List\nOR type 'menu' to return to the main menu"
+    name_check
+  end
+
+  def name_check
+    input = gets.strip
+    case input
+    when "list"
+      list_view
+    when "menu"
+      menu
+    else
+      # check the business list for the name
+      link = ListScraper::CSVmanager.business_list_check(input)
+      if link != nil
+        puts "\n( ゚ヮ゚)\nFound the business! Want me to scrape a list? (y/n)"
+        confirmation = gets.strip
+        confirmation == 'y' ? scrape(link) : exit
+      else
+        puts "\n(●_●)\nSorry, but it doesn't look like that's on the list.\nPlease enter another name\nOR type 'list' to view my Business List\nOR type 'menu' to return to the main menu"
+        name_check
+      end
+    end
+  end
+
+  def list_view
+    puts "\nThis is a list of 27,000+ businesses.\nHow would you like to view it?\n1) select a letter group\n2) search a keyword\n3) return to main menu"
+    case gets.strip.to_i
+    when 1
+      puts "Please type letter(s) you want at the START of the business name:"
+      ListScraper::CSVmanager.list_view_by_letter(gets.strip)
+      menu
+    when 2
+      puts "Please type the keyword as you expect to see it in the business name.\n(example: 'Jimmy's Pizza' contains 'Pizza')"
+      ListScraper::CSVmanager.list_view_by_search(gets.strip)
+      menu
+    when 3
+      menu
+    else
+      puts "\n(ರ_ರ)\nThat is not a valid input."
+      list_view
+    end
+  end
+
+  def scrape(link)
+    a = ListScraper::LocationScraper.new("#{ListScraper::LocationScraper.base}#{link}")
+    a.page_scrape(a.link)
+    a.clean_out
+    puts "\n( ◕‿◕)\nI found #{a.loc_pages.length} locations across #{a.state_pages.length} states for this business.\nWould you like to export? (y/n)"
+    confirmation = gets.strip
+    if confirmation == 'y'
+      puts "\n(°ロ°)☝\nWhat would you like to name the file?"
+      file_name = gets.strip
+      puts "...\n( ◉_◉)\nExporting now, this can take a while. I will alert you when it is done."
+      a.create_stores
+      ListScraper::CSVmanager.locations_export(file_name)
+      puts "Export Completed Successfully!"
+    else
+      puts "\n(⌐■_■)ノ\nI hope you have a great day!\nGoodbye"
+      exit
+    end
+  end
+
+  def list_update
+    puts "\n⊙﹏⊙\nRunning update, this could take several minutes..."
+    ListScraper::UpdateScraper.new
+  end
+
+end
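The CLI assumes Nokogiri, open-uri, CSV, and the other ListScraper classes are already loaded. This release ships no executable (the metadata lists executables as empty), so the loader below is only a sketch of what lib/List_Scraper.rb would plausibly need; the require list is inferred from the calls made in the classes in this diff.

  # Hypothetical contents of lib/List_Scraper.rb -- a minimal loader sketch, not the published file.
  require 'nokogiri'   # HTML parsing (Nokogiri::HTML5)
  require 'open-uri'   # URI.open for fetching pages
  require 'csv'        # CSV.open / CSV.read

  module ListScraper; end

  require_relative 'storeListScraper/version'
  require_relative 'storeListScraper/store'
  require_relative 'storeListScraper/csv_manage'
  require_relative 'storeListScraper/location_scraper'
  require_relative 'storeListScraper/update_scraper'
  require_relative 'storeListScraper/cli'

  # A launcher script could then be as small as:
  #   ListScraper::CLI.new.start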
lib/storeListScraper/csv_manage.rb ADDED
@@ -0,0 +1,28 @@
+
+class ListScraper::CSVmanager
+
+  def self.locations_export(name)
+    c = CSV.open("#{name}.csv", "w")
+    c << ["IDnum", "Address", "City", "State", "ZIP"] # headers
+    ListScraper::Store.all.each do |loc|
+      c << [loc.idnum, loc.address, loc.city, loc.state, loc.zip]
+    end
+    c.close
+  end
+
+  def self.business_list_check(company)
+    # check whether the business exists on file; return its link or nil
+    h = CSV.read("./lib/storeListScraper/business_list.csv").find { |row| row[0] == company }
+    h != nil ? h[1] : nil
+  end
+
+  def self.list_view_by_letter(letter)
+    g = CSV.read("./lib/storeListScraper/business_list.csv").select { |row| row[0].downcase.start_with?(letter.downcase) }
+    g.each { |item| puts item[0] }
+  end
+
+  def self.list_view_by_search(word)
+    g = CSV.read("./lib/storeListScraper/business_list.csv").select { |row| row[0].downcase.include?(word.downcase) }
+    g.each { |item| puts item[0] }
+  end
+end
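All three lookup helpers read the bundled business_list.csv through a path relative to the current working directory, so they only resolve when run from the gem's project root. A hedged usage sketch follows; the company name and search terms are made up for illustration.

  # Assumes the working directory is the gem root so ./lib/storeListScraper/business_list.csv resolves.
  link = ListScraper::CSVmanager.business_list_check("Ace Hardware")  # hypothetical company name
  puts link ? "found: #{link}" : "not on file"

  ListScraper::CSVmanager.list_view_by_letter("ac")     # prints names starting with "ac"
  ListScraper::CSVmanager.list_view_by_search("pizza")  # prints names containing "pizza"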
lib/storeListScraper/location_scraper.rb ADDED
@@ -0,0 +1,103 @@
+
+class ListScraper::LocationScraper
+  attr_accessor :link, :state_pages, :city_pages, :loc_pages
+  @@base = 'https://storefound.org'
+
+  def initialize(link)
+    @link = link
+    @state_pages = []
+    @city_pages = []
+    @loc_pages = []
+  end
+
+  def self.base
+    @@base
+  end
+
+  def page_scrape(page, type = 'all')
+    begin
+      doc = Nokogiri::HTML5(URI.open(page))
+      doc.css(".main-block a").each do |lk| # pull all links from the main body
+        j = lk.attribute("href").text # look at only the URL text
+        case type
+        when 'all'
+          case j.split("/").length
+          when 3
+            @state_pages << j
+          when 4
+            @city_pages << j
+          when 5
+            @loc_pages << j
+          end
+        when 'State'
+          case j.split("/").length
+          when 4
+            @city_pages << j
+          when 5
+            @loc_pages << j
+          end
+        when 'City'
+          case j.split("/").length
+          when 5
+            @loc_pages << j
+          end
+        end
+      end
+    rescue # OpenURI::HTTPError and similar: skip pages that fail to load
+    end
+  end
+
+  # When scraping a store, the first table of links will
+  # only ever be states, cities, or locations. On state pages,
+  # cities will be picked up at the bottom and need to be removed.
+  # clean_out determines the next steps and de-duplicates the arrays.
+  # clean_out can only be used after the first pass.
+  def clean_out
+    @state_pages.uniq!
+    @city_pages.uniq!
+    @loc_pages.uniq!
+    if @state_pages.length > 0 # if state links are available, clear the other arrays and scrape each state
+      @city_pages.clear
+      @loc_pages.clear
+      linked_page_scrape(@state_pages, 'State')
+      linked_page_scrape(@city_pages, 'City')
+    elsif @city_pages.length > 0
+      linked_page_scrape(@city_pages, 'City')
+    end
+  end
+
+  def linked_page_scrape(array, type)
+    total = array.length
+    i = 0
+    array.each do |page|
+      i += 1
+      page_scrape("#{@@base}#{page}", type)
+      print "#{((i.to_f / total.to_f) * 100).round(2)}% | #{type} Progress: #{i}/#{total}\r"
+    end
+    puts "#{((i.to_f / total.to_f) * 100).round(2)}% | #{type} Progress: #{i}/#{total}\r"
+  end
+
+  def create_stores
+    total = @loc_pages.length
+    i = 1
+    @loc_pages.each do |loc|
+      begin
+        st = Nokogiri::HTML5(URI.open("#{@@base}#{loc}"))
+        j = st.css("li span")
+        info = {
+          idnum: i,
+          address: j[0].text,
+          city: j[1].text,
+          state: j[2].text,
+          zip: j[3].text
+        }
+        ListScraper::Store.new(**info) # splat the hash into keyword arguments (required on Ruby 3+)
+        print "#{((i.to_f / total.to_f) * 100).round(2)}% | Progress: #{i}/#{total}\r"
+        i += 1
+      rescue
+        next # skip locations whose page fails to load or parse
+      end
+    end
+  end
+
+end
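Outside the CLI, the scraper can be driven directly. The sketch below mirrors CLI#scrape; the "/store/example-store" path and the output file name are made-up stand-ins for a link value taken from business_list.csv.

  # Sketch of the scrape pipeline, assuming a link path from business_list.csv.
  scraper = ListScraper::LocationScraper.new("#{ListScraper::LocationScraper.base}/store/example-store") # hypothetical path
  scraper.page_scrape(scraper.link)  # first pass: collect state/city/location links
  scraper.clean_out                  # de-duplicate and walk state -> city pages
  scraper.create_stores              # build ListScraper::Store objects from each location page
  ListScraper::CSVmanager.locations_export("example_locations")  # writes example_locations.csv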
lib/storeListScraper/store.rb ADDED
@@ -0,0 +1,19 @@
+
+class ListScraper::Store
+  attr_accessor :idnum, :address, :city, :state, :zip
+  @@all = []
+
+  def initialize(idnum:, address:, city:, state:, zip:)
+    @idnum = idnum
+    @address = address
+    @city = city
+    @state = state
+    @zip = zip
+    @@all << self
+  end
+
+  def self.all
+    @@all
+  end
+
+end
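Store is a plain value object that registers every instance in a class-level array, which is exactly what locations_export iterates over. For example (the record is made up):

  # Each new Store adds itself to Store.all, so the exporter never needs a separate collection.
  ListScraper::Store.new(idnum: 1, address: "123 Main St", city: "Springfield", state: "IL", zip: "62701") # made-up record
  ListScraper::Store.all.length #=> 1 (assuming no other stores have been created)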
lib/storeListScraper/update_scraper.rb ADDED
@@ -0,0 +1,51 @@
+
+class ListScraper::UpdateScraper
+  attr_accessor :letters, :pages, :list
+  attr_reader :base
+
+  def initialize
+    @base = 'https://storefound.org/'
+    @letters = [] # links for each letter group
+    @pages = []   # page links within the current letter group
+    File.delete('./lib/storeListScraper/business_list.csv') if File.exist?('./lib/storeListScraper/business_list.csv')
+    @list = CSV.open("./lib/storeListScraper/business_list.csv", "w")
+    @list << ["Company Name", "link"] # headers
+    update
+  end
+
+  def update
+    letters_scrape
+    @letters.each do |letter|
+      pages_scrape(letter)
+      update_business_list
+    end
+    puts "Successfully updated!!"
+  end
+
+  def letters_scrape
+    # scrape the main link for each letter group
+    doc = Nokogiri::HTML5(URI.open('https://storefound.org/store/starts-a/page-1'))
+    doc.css('.letter-block a').each do |lk|
+      @letters << lk.attribute('href').text
+    end
+  end
+
+  def pages_scrape(letter_link)
+    # scrape all page links for a letter group
+    @pages.clear
+    doc = Nokogiri::HTML5(URI.open("#{@base}#{letter_link}"))
+    doc.css('.pagination a').each do |lk|
+      @pages << lk.attribute('href').text
+    end
+  end
+
+  def update_business_list
+    # scrape all business names and their corresponding links
+    @pages.each do |lk|
+      doc = Nokogiri::HTML5(URI.open("#{@base}#{lk}"))
+      doc.css('.main-block .col-half a').each do |biz|
+        @list << [biz.text, biz.attribute('href').text]
+      end
+    end
+  end
+end
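Rebuilding the business list is a side effect of construction: instantiating the class deletes the old CSV, re-crawls every letter group, and writes a fresh file. A hedged invocation, again assuming the gem root is the working directory:

  # Re-crawls storefound.org's A-Z index and rewrites ./lib/storeListScraper/business_list.csv.
  # Run from the gem root; the crawl can take several minutes.
  ListScraper::UpdateScraper.new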
lib/storeListScraper/version.rb ADDED
@@ -0,0 +1,4 @@
+
+module ListScraper
+  VERSION = "0.1.0"
+end
sig/store_list_scraper.rbs ADDED
@@ -0,0 +1,4 @@
+module ListScraper
+  VERSION: String
+  # See the writing guide of rbs: https://github.com/ruby/rbs#guides
+end
store_list_scraper.gemspec ADDED
@@ -0,0 +1,44 @@
+# frozen_string_literal: true
+
+require_relative "lib/storeListScraper/version"
+
+Gem::Specification.new do |spec|
+  spec.name = "store_list_scraper"
+  spec.version = ListScraper::VERSION
+  spec.authors = ["itsmattpaw"]
+  spec.email = ["itsmattpaw@gmail.com"]
+
+  spec.summary = "Scrape Store Address lists from StoreFound.org."
+  spec.description = "Scrape Store Address lists from StoreFound.org and export them into a CSV file for distribution or for use with other software such as ESRI ArcGIS."
+  spec.homepage = "https://github.com/itsmattpaw/store_list_scraper"
+  spec.license = "MIT"
+  spec.required_ruby_version = ">= 2.6.0"
+
+  # spec.metadata["allowed_push_host"] = "https://example.com"
+
+  # spec.metadata["homepage_uri"] = spec.homepage
+  # spec.metadata["source_code_uri"] = "http://www.bob.com"
+  # spec.metadata["changelog_uri"] = "http://www.bob.com"
+
+  # Specify which files should be added to the gem when it is released.
+  # `git ls-files -z` loads the files that have been added to git.
+  spec.files = Dir.chdir(File.expand_path(__dir__)) do
+    `git ls-files -z`.split("\x0").reject do |f|
+      (f == __FILE__) || f.match(%r{\A(?:(?:bin|test|spec|features)/|\.(?:git|travis|circleci)|appveyor)})
+    end
+  end
+  spec.bindir = "exe"
+  spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
+  spec.require_paths = ["lib"]
+
+  # Uncomment to register a new dependency of your gem
+  # spec.add_dependency "example-gem", "~> 1.0"
+  spec.add_development_dependency "rake", "~> 13.0"
+  spec.add_development_dependency "pry"
+  spec.add_dependency "nokogiri", "~> 1.13"
+  spec.add_dependency "open-uri"
+  spec.add_dependency "csv"
+
+  # For more information and examples about making a new gem, check out our
+  # guide at: https://bundler.io/guides/creating_gem.html
+end
metadata ADDED
@@ -0,0 +1,132 @@
+--- !ruby/object:Gem::Specification
+name: store_list_scraper
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+platform: ruby
+authors:
+- itsmattpaw
+autorequire:
+bindir: exe
+cert_chain: []
+date: 2022-03-18 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '13.0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '13.0'
+- !ruby/object:Gem::Dependency
+  name: pry
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.13'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.13'
+- !ruby/object:Gem::Dependency
+  name: open-uri
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: csv
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+description: Scrape Store Address lists from StoreFound.org and export them into
+  a CSV file for distribution or for use with other software such as ESRI ArcGIS.
+email:
+- itsmattpaw@gmail.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- ".DS_Store"
+- CHANGELOG.md
+- CODE_OF_CONDUCT.md
+- Gemfile
+- Gemfile.lock
+- LICENSE.txt
+- README.md
+- Rakefile
+- lib/List_Scraper.rb
+- lib/storeListScraper/business_list.csv
+- lib/storeListScraper/cli.rb
+- lib/storeListScraper/csv_manage.rb
+- lib/storeListScraper/location_scraper.rb
+- lib/storeListScraper/store.rb
+- lib/storeListScraper/update_scraper.rb
+- lib/storeListScraper/version.rb
+- sig/store_list_scraper.rbs
+- store_list_scraper.gemspec
+homepage: https://github.com/itsmattpaw/store_list_scraper
+licenses:
+- MIT
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: 2.6.0
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubygems_version: 3.0.9
+signing_key:
+specification_version: 4
+summary: Scrape Store Address lists from StoreFound.org.
+test_files: []