store_list_scraper 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,96 @@
module ListScraper
  # Interactive command-line front end for the scraper.
  #
  # Prints text menus, reads the user's replies from standard input and
  # dispatches to CSVmanager, LocationScraper and UpdateScraper.
  class CLI
    INVALID_INPUT = "\n(ರ_ರ)\nThat is not a valid input..".freeze

    # Greets the user and opens the main menu.
    def start
      puts "\nHello user! ヽ(‘ ∇‘ )ノ"
      menu
    end

    # Shows the main menu until a valid option is entered, then runs it.
    # Option 4 terminates the process through Kernel#exit.
    def menu
      loop do
        puts "\nWhat are we interested in accomplishing today?\n1) New Scrape\n2) View Business List\n3) Update Business List\n4) Exit Program\n"
        case read_input.to_i
        when 1 then return new_scrape
        when 2 then return list_view
        when 3 then return list_update
        when 4
          puts "\n(⌐■_■)ノ\nGoodbye friend!\n"
          exit
        else
          puts INVALID_INPUT
        end
      end
    end

    # Explains the scrape workflow and waits for a business name.
    def new_scrape
      puts "\n(∪ ◡ ∪)\nSo you want to run a scrape?\nPlease type a company name as found in my Business List.\nOR type 'list' to view the Business List\nOr 'menu' to return to the main menu"
      name_check
    end

    # Reads names until one is found in the business list, or the user types
    # 'list' / 'menu'. A found business is scraped after a 'y' confirmation;
    # any other answer exits the program.
    def name_check
      loop do
        input = read_input
        case input
        when 'list' then return list_view
        when 'menu' then return menu
        end

        link = ListScraper::CSVmanager.business_list_check(input)
        if link
          puts "\n( ゚ヮ゚)\nFound the business! Want me to scrape a list? (y/n)"
          exit unless read_input == 'y'
          return scrape(link)
        end

        puts "\n(●_●)\nSorry, but it doesn't look like that's on the list.\nPlease enter another name\nOR type 'list' to view my Business List\nOR type 'menu' to return to the main menu"
      end
    end

    # Lets the user browse the business list by leading letters or by keyword,
    # then returns to the main menu.
    def list_view
      loop do
        puts "\nThis is a list of 27,000+ businesses,\nHow would you like to view it?\n1) select a letter group\n2) search a keyword\n3) return to main menu"
        case read_input.to_i
        when 1
          puts "Please type letter(s) you want at the START of the business name:"
          ListScraper::CSVmanager.list_view_by_letter(read_input)
          return menu
        when 2
          puts "Please type the keyword as you expect to see it in the business name.\n(example: 'Jimmy's Pizza' contains 'Pizza')"
          ListScraper::CSVmanager.list_view_by_search(read_input)
          return menu
        when 3
          return menu
        else
          puts INVALID_INPUT
        end
      end
    end

    # Scrapes every location page reachable from +link+ (a path appended to
    # LocationScraper.base), reports the counts and, on a 'y' answer, exports
    # the stores to "<file name>.csv". Any other answer exits the program.
    def scrape(link)
      scraper = ListScraper::LocationScraper.new("#{ListScraper::LocationScraper.base}#{link}")
      scraper.page_scrape(scraper.link)
      scraper.clean_out
      puts "\n( ◕‿◕)\nI found #{scraper.loc_pages.length} locations across #{scraper.state_pages.length} states for this business.\nWould you like to export? (y/n)"
      unless read_input == 'y'
        puts "\n(⌐■_■)ノ\nI hope you have a great day!\nGoodbye"
        exit
      end

      puts "\n(°ロ°)☝\nWhat would you like to name the file?"
      file_name = read_input
      puts "...\n( ◉_◉)\nExporting now, this can take awhile. I will alert when done."
      scraper.create_stores
      ListScraper::CSVmanager.locations_export(file_name)
      puts "Export Completed Successfully!"
    end

    # Rebuilds the business list; UpdateScraper does all its work in #initialize.
    def list_update
      puts "\n⊙﹏⊙\nRunning update, this could take several minutes.."
      ListScraper::UpdateScraper.new
    end

    private

    # Reads one stripped line from standard input.
    # Uses $stdin explicitly because Kernel#gets would read from files named
    # in ARGV. Exits the program at end-of-input instead of calling #strip on nil.
    def read_input
      line = $stdin.gets
      exit if line.nil?
      line.strip
    end
  end
end
@@ -0,0 +1,28 @@
module ListScraper
  # Reads the business list CSV and writes scraped store locations to CSV.
  class CSVmanager
    # Default location of the business list, relative to the working directory.
    BUSINESS_LIST_PATH = './lib/storeListScraper/business_list.csv'.freeze
    EXPORT_HEADERS = %w[IDnum Address City State ZIP].freeze

    # Writes one row per store to "<name>.csv", preceded by a header row.
    #
    # @param name [String] output path without the ".csv" extension
    # @param stores [Enumerable] objects responding to idnum, address, city,
    #   state and zip; defaults to every Store created so far
    # @return [nil]
    def self.locations_export(name, stores = ListScraper::Store.all)
      # Block form closes the file even when writing a row raises.
      CSV.open("#{name}.csv", 'w') do |csv|
        csv << EXPORT_HEADERS
        stores.each do |loc|
          csv << [loc.idnum, loc.address, loc.city, loc.state, loc.zip]
        end
      end
      nil
    end

    # Looks up a business by exact (case-sensitive) name in the first column.
    #
    # @param company [String] name to look for
    # @param path [String] CSV file to search
    # @return [String, nil] the second column of the matching row, or nil
    def self.business_list_check(company, path: BUSINESS_LIST_PATH)
      wanted = company.to_s
      match = CSV.foreach(path).find { |row| row[0] == wanted }
      match && match[1]
    end

    # Prints every business name starting with +letter+ (case-insensitive).
    #
    # @return [Array<Array>] the matching rows
    def self.list_view_by_letter(letter, path: BUSINESS_LIST_PATH)
      prefix = letter.downcase
      print_matches(path) { |name| name.start_with?(prefix) }
    end

    # Prints every business name containing +word+ (case-insensitive).
    #
    # @return [Array<Array>] the matching rows
    def self.list_view_by_search(word, path: BUSINESS_LIST_PATH)
      needle = word.downcase
      print_matches(path) { |name| name.include?(needle) }
    end

    # Prints the first column of each row whose downcased name satisfies the
    # block. Rows without a name (blank CSV lines parse to an empty row) are
    # skipped instead of raising on nil.
    def self.print_matches(path)
      rows = CSV.read(path).select { |row| row[0] && yield(row[0].downcase) }
      rows.each { |row| puts row[0] }
    end
    private_class_method :print_matches
  end
end
@@ -0,0 +1,103 @@
module ListScraper
  # Collects state, city and location page links for one business and turns
  # each location page into a Store.
  class LocationScraper
    BASE = 'https://storefound.org'.freeze

    attr_accessor :link, :state_pages, :city_pages, :loc_pages

    # @return [String] site root that scraped hrefs are appended to
    def self.base
      BASE
    end

    # @param link [String] full URL of the business's first page
    def initialize(link)
      @link = link
      @state_pages = []
      @city_pages = []
      @loc_pages = []
    end

    # Fetches +page+ and files every ".main-block a" href by the number of
    # "/"-separated segments it has: 3 -> state, 4 -> city, 5 -> location.
    #
    # @param page [String] URL to fetch
    # @param type [String] 'all' keeps all three kinds, 'State' keeps city and
    #   location links, 'City' keeps location links only
    #
    # Fetch/parse failures are reported on stderr and the page is skipped, so
    # one bad page does not abort a long scrape.
    def page_scrape(page, type = 'all')
      doc = Nokogiri::HTML5(URI.open(page))
      doc.css('.main-block a').each do |anchor|
        href = anchor['href']
        next if href.nil? # an anchor without href must not discard the rest of the page

        bucket = bucket_for(href.split('/').length, type)
        bucket << href if bucket
      end
    rescue StandardError => e
      warn "Skipping #{page}: #{e.class}: #{e.message}"
    end

    # De-duplicates the links gathered by the first #page_scrape pass and
    # follows them down to location pages.
    #
    # When state links exist, city/location links from the first pass are
    # discarded and rebuilt by scraping every state page and then every city
    # page. Otherwise, when only city links exist, each city page is scraped.
    # Only meaningful after the first pass.
    def clean_out
      @state_pages.uniq!
      @city_pages.uniq!
      @loc_pages.uniq!
      if @state_pages.any?
        @city_pages.clear
        @loc_pages.clear
        linked_page_scrape(@state_pages, 'State')
        @city_pages.uniq! # the same city can be linked from several state pages
        linked_page_scrape(@city_pages, 'City')
      elsif @city_pages.any?
        linked_page_scrape(@city_pages, 'City')
      end
      # The linked passes can collect a location more than once.
      @loc_pages.uniq!
    end

    # Scrapes every path in +array+ (appended to BASE) with the given +type+,
    # printing a progress line that is rewritten in place.
    def linked_page_scrape(array, type)
      total = array.length
      return if total.zero? # avoids printing "NaN%" for an empty list

      array.each_with_index do |page, index|
        page_scrape("#{BASE}#{page}", type.to_s)
        print "#{progress_line(index + 1, total, "#{type} Progress")}\r"
      end
      puts "#{progress_line(total, total, "#{type} Progress")}\r"
    end

    # Fetches each location page and creates a Store from the text of its
    # first four "li span" elements (address, city, state, zip, in that order).
    # Stores get consecutive ids starting at 1; pages that fail are reported
    # on stderr and skipped without consuming an id.
    def create_stores
      total = @loc_pages.length
      next_id = 1
      @loc_pages.each_with_index do |loc, index|
        begin
          spans = Nokogiri::HTML5(URI.open("#{BASE}#{loc}")).css('li span')
          address, city, state, zip = spans.first(4).map(&:text)
          if zip.nil?
            warn "Skipping #{loc}: expected four address fields"
          else
            ListScraper::Store.new(idnum: next_id, address: address, city: city, state: state, zip: zip)
            next_id += 1
          end
        rescue StandardError => e
          warn "Skipping #{loc}: #{e.class}: #{e.message}"
        end
        # Progress counts pages attempted, so it reaches 100% even with failures.
        print "#{progress_line(index + 1, total, 'Progress')}\r"
      end
      puts if total.positive? # finish the in-place progress line
    end

    private

    # Picks the array a link with +depth+ segments belongs in for this pass,
    # or nil when that kind of link is ignored for +type+.
    def bucket_for(depth, type)
      case depth
      when 3 then @state_pages if type == 'all'
      when 4 then @city_pages if %w[all State].include?(type)
      when 5 then @loc_pages if %w[all State City].include?(type)
      end
    end

    # Formats "<percent>% | <label>: <done>/<total>".
    def progress_line(done, total, label)
      "#{(done.to_f / total * 100).round(2)}% | #{label}: #{done}/#{total}"
    end
  end
end
@@ -0,0 +1,19 @@
module ListScraper
  # One scraped store location. Every instance registers itself in Store.all
  # when it is created.
  class Store
    attr_accessor :idnum, :address, :city, :state, :zip

    # Class-level instance variable rather than @@all, so the registry is not
    # shared with (or clobbered by) subclasses.
    @all = []

    # @return [Array<Store>] every Store created so far, in creation order
    def self.all
      @all
    end

    # @param idnum [Integer] sequential id assigned by the scraper
    # @param address [String] street address
    # @param city [String]
    # @param state [String]
    # @param zip [String]
    def initialize(idnum:, address:, city:, state:, zip:)
      @idnum = idnum
      @address = address
      @city = city
      @state = state
      @zip = zip
      Store.all << self
    end
  end
end
@@ -0,0 +1,51 @@
module ListScraper
  # Rebuilds the business list CSV by walking the site's letter index.
  # All work happens in #initialize: creating an instance runs the update.
  class UpdateScraper
    # Must be the same file CSVmanager reads; the directory was previously
    # misspelled "storeListScrapper", so the update never reached that file.
    BUSINESS_LIST_PATH = './lib/storeListScraper/business_list.csv'.freeze

    attr_accessor :letters, :pages, :list
    attr_reader :base

    # Truncates the business list, writes the header row and runs #update.
    # The CSV handle is closed afterwards (also on error) so buffered rows
    # are flushed to disk.
    def initialize
      @base = 'https://storefound.org/'
      @letters = [] # links for each letter group
      @pages = []   # page links for the letter currently being processed
      # Mode 'w' truncates an existing file, so no separate delete is needed.
      @list = CSV.open(BUSINESS_LIST_PATH, 'w')
      @list << ['Company Name', 'link'] # headers
      update
    ensure
      @list&.close
    end

    # Scrapes the letter groups, then for each letter its pages and the
    # businesses listed on them.
    def update
      letters_scrape
      @letters.each do |letter|
        pages_scrape(letter)
        update_business_list
      end
      puts "Successfully updated!!"
    end

    # Collects the href of every ".letter-block a" link on the first "A" page.
    def letters_scrape
      doc = Nokogiri::HTML5(URI.open('https://storefound.org/store/starts-a/page-1'))
      doc.css('.letter-block a').each do |anchor|
        href = anchor['href']
        @letters << href if href
      end
    end

    # Replaces @pages with the ".pagination a" hrefs found on a letter page.
    # NOTE(review): @base ends with "/"; if hrefs also start with "/" the
    # request URL contains "//" — confirm the site tolerates that.
    def pages_scrape(letter_link)
      @pages.clear
      doc = Nokogiri::HTML5(URI.open("#{@base}#{letter_link}"))
      doc.css('.pagination a').each do |anchor|
        href = anchor['href']
        @pages << href if href
      end
    end

    # Appends a [name, href] row to the list for every
    # ".main-block .col-half a" link on each page in @pages.
    def update_business_list
      @pages.each do |page_link|
        doc = Nokogiri::HTML5(URI.open("#{@base}#{page_link}"))
        doc.css('.main-block .col-half a').each do |biz|
          href = biz['href']
          next if href.nil? # a row without a link cannot be scraped later

          @list << [biz.text, href]
        end
      end
    end
  end
end
@@ -0,0 +1,4 @@
1
+
2
+ module ListScraper
3
+ # Release version of the gem; the gemspec assigns it to spec.version.
+ VERSION = "0.1.0"
4
+ end
@@ -0,0 +1,4 @@
1
+ module ListScraper
2
+ VERSION: String
3
+ # The declaration above types ListScraper::VERSION as a String.
+ # See the writing guide of rbs: https://github.com/ruby/rbs#guides
4
+ end
@@ -0,0 +1,44 @@
# frozen_string_literal: true

# Gem specification for store_list_scraper. The version comes from
# ListScraper::VERSION in lib/storeListScraper/version.rb.
require_relative "lib/storeListScraper/version"

Gem::Specification.new do |spec|
  spec.name = "store_list_scraper"
  spec.version = ListScraper::VERSION
  spec.authors = ["itsmattpaw"]
  spec.email = ["itsmattpaw@gmail.com"]

  spec.summary = "Scrape Store Address lists from StoreFound.org."
  spec.description = "Scrape Store Address lists from StoreFound.org and export into a CSV file " \
                     "for distribution or use with other software such as Esri ArcGIS."
  spec.homepage = "https://github.com/itsmattpaw/store_list_scraper"
  spec.license = "MIT"
  spec.required_ruby_version = ">= 2.6.0"

  #spec.metadata["allowed_push_host"] = "https://example.com"

  #spec.metadata["homepage_uri"] = spec.homepage
  #spec.metadata["source_code_uri"] = "http://www.bob.com"
  #spec.metadata["changelog_uri"] = "http://www.bob.com"

  # Specify which files should be added to the gem when it is released.
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
  # Dotfiles for git/CI and macOS Finder metadata (.DS_Store) are left out of the package.
  spec.files = Dir.chdir(File.expand_path(__dir__)) do
    `git ls-files -z`.split("\x0").reject do |f|
      (f == __FILE__) || f.match(%r{\A(?:(?:bin|test|spec|features)/|\.(?:git|travis|circleci|DS_Store)|appveyor)})
    end
  end
  spec.bindir = "exe"
  spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  # Uncomment to register a new dependency of your gem
  # spec.add_dependency "example-gem", "~> 1.0"
  spec.add_development_dependency "rake", "~> 13.0"
  spec.add_development_dependency "pry"
  spec.add_dependency 'nokogiri', '~> 1.13'
  spec.add_dependency "open-uri"
  spec.add_dependency "csv"

  # For more information and examples about making a new gem, check out our
  # guide at: https://bundler.io/guides/creating_gem.html
end
metadata ADDED
@@ -0,0 +1,132 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: store_list_scraper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - itsmattpaw
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2022-03-18 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rake
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '13.0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '13.0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: pry
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: nokogiri
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '1.13'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '1.13'
55
+ - !ruby/object:Gem::Dependency
56
+ name: open-uri
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: csv
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ description: Scrape Store Address lists from StoreFound.org and export into a CSV
84
+ file for distribution or use withe other softwares such as ESRI ARC GIS.
85
+ email:
86
+ - itsmattpaw@gmail.com
87
+ executables: []
88
+ extensions: []
89
+ extra_rdoc_files: []
90
+ files:
91
+ - ".DS_Store"
92
+ - CHANGELOG.md
93
+ - CODE_OF_CONDUCT.md
94
+ - Gemfile
95
+ - Gemfile.lock
96
+ - LICENSE.txt
97
+ - README.md
98
+ - Rakefile
99
+ - lib/List_Scraper.rb
100
+ - lib/storeListScraper/business_list.csv
101
+ - lib/storeListScraper/cli.rb
102
+ - lib/storeListScraper/csv_manage.rb
103
+ - lib/storeListScraper/location_scraper.rb
104
+ - lib/storeListScraper/store.rb
105
+ - lib/storeListScraper/update_scraper.rb
106
+ - lib/storeListScraper/version.rb
107
+ - sig/store_list_scraper.rbs
108
+ - store_list_scraper.gemspec
109
+ homepage: https://github.com/itsmattpaw/store_list_scraper
110
+ licenses:
111
+ - MIT
112
+ metadata: {}
113
+ post_install_message:
114
+ rdoc_options: []
115
+ require_paths:
116
+ - lib
117
+ required_ruby_version: !ruby/object:Gem::Requirement
118
+ requirements:
119
+ - - ">="
120
+ - !ruby/object:Gem::Version
121
+ version: 2.6.0
122
+ required_rubygems_version: !ruby/object:Gem::Requirement
123
+ requirements:
124
+ - - ">="
125
+ - !ruby/object:Gem::Version
126
+ version: '0'
127
+ requirements: []
128
+ rubygems_version: 3.0.9
129
+ signing_key:
130
+ specification_version: 4
131
+ summary: Scrape Store Address lists from StoreFound.org.
132
+ test_files: []