link_scraper 1.01 → 1.02

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 763fe1b9044fc08f804e6949a2d3f996876d4720554238315d190853e246f8e1
-  data.tar.gz: 8686914221781dac64b47a71a0269d220f6f64470c0495bc77c8916e2af4b913
+  metadata.gz: 1722daeff2d952711b2ed8247f5e1875c3b3d2739e96269ca483474405756cd2
+  data.tar.gz: bc901c65b11f0d123e5973bcb5b65b0b6add539f81b7102c625a47bb7a62994a
 SHA512:
-  metadata.gz: 9107b027d965d5fdc610313504299752cc33e0726ef8041c69b8d4c4a5d291c836e30f9ebfd4cb87dfac8ee6eca82d9b7b79134ae8fde8ffe58afa5fc704ece9
-  data.tar.gz: 63e57e717fed2a090d42c6ad2bdc8983d9db5dc3f106b59483f1c3a96d57cc8ffc1dba8019ecde92d35bd2611ec7dd5d4be97b169aa7b0f7b4037dd5c698b759
+  metadata.gz: aa4096762dc168b12d44909929a443d12dd901c40c4ad4d69baaafae4ffc2d97c53bb97f34a90175a613a843ccecd6caaeaf5d5fdc279e5f4062400d750057d6
+  data.tar.gz: 1acb651d9f98ec067551dc5f38fa10c2c98a597229016f76407a2d81a414805cb54fe23414ec698ed7b86caf70d45d2b3803ba318de6bab581f700b142c28e97
data/README.md CHANGED
@@ -3,7 +3,7 @@
 [![Gem Version](https://badge.fury.io/rb/link_scraper.svg)](https://badge.fury.io/rb/link_scraper)
 [![MIT License](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)

-#### Scrape website links' text and href with built-in scrubbing filter.
+#### Scrape website links' text and path with built-in scrubbing filter.

 Designed to rapidly visit and scrape links from a list of several URLs, then filter them based on your criteria. For example, to only grab the links for inventory, staff, or contact us, etc.

@@ -26,7 +26,53 @@ Or install it yourself as:
 ## Usage

-### COMING SOON - GEM IS CURRENTLY BEING DEVELOPED.
+This is an example of how to grab links from a URL. The `args` are optional; pass them if you want to scrub and filter the links based on your criteria, as below.
+
+```
+text_criteria = {
+  pos_criteria: ['coordinates', 'zip codes', 'area codes', 'climate', 'demographics'],
+  neg_criteria: %w[drought school]
+}
+
+path_criteria = {
+  pos_criteria: ['coordinates', 'zip codes', 'area codes', 'climate', 'demographics'],
+  neg_criteria: %w[drought school]
+}
+
+scraper = LinkScraper::Scrape.new(text_criteria: text_criteria, path_criteria: path_criteria)
+scraped_links = scraper.start('https://en.wikipedia.org/wiki/Austin%2C_Texas')
+```
+
+Example without criteria (returns all links):
+
+```
+scraper = LinkScraper::Scrape.new
+scraped_links = scraper.start('https://en.wikipedia.org/wiki/Austin%2C_Texas')
+```
+
+Returns an array of links based on the criteria in `args`:
+
+```
+[
+  {:text=>"coordinates", :path=>"/wiki/geographic_coordinate_system"},
+  {:text=>"2.2 climate", :path=>""},
+  {:text=>"3 demographics", :path=>""},
+  {:text=>"explanation", :path=>"/wiki/template:climate_chart/how_to_read_a_climate_chart"},
+  {:text=>"humid subtropical climate", :path=>"/wiki/humid_subtropical_climate"},
+  {:text=>"kppen climate classification", :path=>"/wiki/k%c3%b6ppen_climate_classification"},
+  {:text=>"climate", :path=>""},
+  {:text=>"austin climate summary", :path=>"/web/20110606123855/http://www.srh.noaa.gov/images/ewx/aus/ausclisum.pdf"},
+  {:text=>"u.s. climate data", :path=>""},
+  {:text=>"nowdata - noaa online weather data", :path=>"/climate/xmacis.php"},
+  {:text=>"austin weather & climate", :path=>"/web/20070118231257/http://austin.about.com/od/weatherenvironment/a/weather.htm"},
+  {:text=>"nowdata - noaa online weather data", :path=>"/climate/xmacis.php"},
+  {:text=>"wmo climate normals for austin/municipal ap tx 19611990", :path=>"pub/gcos/wmo-normals/tables/reg_iv/us/group3/72254.txt"},
+  {:text=>"climate", :path=>"/wiki/climate_of_texas"},
+  {:text=>"demographics", :path=>"/wiki/demographics_of_texas"},
+  {:text=>"coordinates on wikidata", :path=>"/wiki/category:coordinates_on_wikidata"}
+]
+```
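The exact matching rules live in the scrub_db dependency, but as a rough mental model of the criteria above (an illustrative assumption, not the gem's implementation): a link survives when its text or path contains at least one `pos_criteria` term and none of the `neg_criteria` terms.

```
# Illustrative sketch only -- the real filtering is delegated to the scrub_db gem.
def keep?(string, pos_criteria:, neg_criteria:)
  s = string.to_s.downcase
  pos_criteria.any? { |term| s.include?(term) } &&
    neg_criteria.none? { |term| s.include?(term) }
end

keep?('austin climate summary', pos_criteria: ['climate'], neg_criteria: ['school'])
# => true
keep?('school climate data', pos_criteria: ['climate'], neg_criteria: ['school'])
# => false (a negative term vetoes the match)
```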

 ## Development
 
data/Rakefile CHANGED
@@ -18,32 +18,28 @@ task :console do
   ARGV.clear

   scraped_links = run_link_scraper
-  # binding.pry
+  binding.pry

   IRB.start
 end


 def run_link_scraper
-  urls = %w[
-    austinchevrolet.not.real
-    smith_acura.com/staff
-    abcrepair.ca
-    hertzrentals.com/review
-    londonhyundai.uk/fleet
-    http://www.townbuick.net/staff
-    http://youtube.com/download
-    www.madridinfiniti.es/collision
-    www.mitsubishideals.sofake
-    www.dallassubaru.com.sofake
-    www.quickeats.net/contact_us
-    www.school.edu/teachers
-    www.www.nissancars/inventory
-    www.www.toyotatown.net/staff/management
-    www.www.yellowpages.com/business
-  ]
+
+  text_criteria = {
+    pos_criteria: ['coordinates', 'zip codes', 'area codes', 'climate', 'demographics'],
+    neg_criteria: %w[drought school]
+  }
+
+  path_criteria = {
+    pos_criteria: ['coordinates', 'zip codes', 'area codes', 'climate', 'demographics'],
+    neg_criteria: %w[drought school]
+  }
+
+  scraper = LinkScraper::Scrape.new(text_criteria: text_criteria, path_criteria: path_criteria)
+  scraped_links = scraper.start('https://en.wikipedia.org/wiki/Austin%2C_Texas')

   binding.pry
-  scraper_obj = LinkScraper::Scrape.new(WebsCriteria.all_scrub_web_criteria)
-  scraped_links = scraper_obj.scrub_urls(urls)
+
+  # scraper = LinkScraper::Scrape.new(WebsCriteria.all_scrub_web_criteria)
 end
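A quick way to poke at the gem while developing, per the `:console` task above: `rake console` runs `run_link_scraper` and drops into Pry with `scraped_links` in scope. An illustrative session (prompt and output are assumed, with the shape taken from the README sample):

```
# $ bundle exec rake console
# [1] pry(main)> scraped_links.first
# => {:text=>"coordinates", :path=>"/wiki/geographic_coordinate_system"}
# [2] pry(main)> scraped_links.count
# => 16
```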
data/lib/link_scraper.rb CHANGED
@@ -1,6 +1,10 @@
+
 require "link_scraper/version"
-# require 'crm_formatter'
-# require 'pry'
+require "link_scraper/scrape"
+
+require 'mechanizer'
+require 'scrub_db'
+require 'pry'

 module LinkScraper
   # Your code goes here...
data/lib/link_scraper/scrape.rb ADDED
@@ -0,0 +1,256 @@
+module LinkScraper
+  class Scrape
+    # attr_accessor :text_criteria, :path_criteria
+
+    def initialize(args={})
+      @text_scrub = ScrubDb::Strings.new(args.fetch(:text_criteria, {}))
+      @path_scrub = ScrubDb::Strings.new(args.fetch(:path_criteria, {}))
+      @noko = Mechanizer::Noko.new
+    end
+
+
+    def start(url)
+      noko_hash = @noko.scrape({url: url})
+      link_hashes = noko_hash[:texts_and_paths]
+
+      err_msg = noko_hash[:err_msg]
+      page = noko_hash[:page]
+      valid_links = scrub_link_hashes(link_hashes, url)
+      valid_links = extract_link_from_url(valid_links, url)
+    end
+
+
+    def extract_link_from_url(valid_links, url)
+      formatted = valid_links.map do |link|
+
+        begin
+          link[:path] = URI(link[:path])&.path
+        rescue StandardError => e
+          puts e.message
+        end
+
+        link
+      end
+      formatted
+    end
+
+
+    def scrub_link_hashes(link_hashes, url)
+      valid_hashes = link_hashes.map do |link_hsh|
+
+        if link_hsh[:text].present? || link_hsh[:path].present?
+          link_hsh = encode_link(link_hsh)
+          text_hsh = @text_scrub.scrub_string(link_hsh[:text])
+          path_hsh = @path_scrub.scrub_string(link_hsh[:path])
+
+          text = evaluate_scrub_hsh(text_hsh)
+          path = evaluate_scrub_hsh(path_hsh)
+        end
+
+        link_hsh = nil unless (text.present? || path.present?)
+        link_hsh
+      end
+
+      valid_hashes = valid_hashes.compact
+    end
+
+    def encode_link(link_hsh)
+      link_hsh[:text] = encoder(link_hsh[:text])
+      link_hsh[:path] = encoder(link_hsh[:path])
+      link_hsh
+    end
+
+
+    def encoder(text)
+      # if text.present? && !text.valid_encoding?
+      if text.present?
+        text = text.chars.select(&:valid_encoding?).join
+        text = text.delete("^\u{0000}-\u{007F}")
+        text = text&.gsub(/\s+/, ' ')&.strip
+        text = text.gsub("\"", ' ')&.strip
+      end
+      text
+    end
+
+
+    def evaluate_scrub_hsh(hsh)
+      string = nil
+      string = hsh[:string] if (hsh[:pos_criteria].any? && hsh[:neg_criteria].empty?)
+    end
+
+
+    ##############################################################################
+    ### Below has been replaced with dependency gems, but keep in case needed later. ####
+    # def old_but_important
+    #   temp_name = nil
+    #   stock_hsh = get_stocks(temp_name)
+    #   stock_texts = stock_hsh[:stock_texts]
+    #   stock_links = stock_hsh[:stock_links]
+    #
+    #   link_text_results = []
+    #   noko_page.links.each do |noko_text_link|
+    #     noko_text = noko_text_link.text&.downcase&.gsub(/\W/,'')
+    #     pre_noko_link = noko_text_link&.path&.downcase&.strip
+    #     noko_link = @formatter.format_link(url, pre_noko_link)
+    #
+    #     if (noko_text && noko_link) && (noko_text.length > 3 && noko_link.length > 3) && (check_text_link_ban(noko_link, noko_text, temp_name) != true)
+    #       ## Find any Texts or Links that include 'team' or 'staff'
+    #       if noko_text.include?('staff') || noko_link.include?('staff')
+    #         link_text_hsh = {staff_text: noko_text, staff_link: noko_link}
+    #         link_text_results << link_text_hsh
+    #       end
+    #
+    #       ## Find valid Links
+    #       stock_links.each do |stock_link|
+    #         stock_link = stock_link.downcase&.strip
+    #         if noko_link.include?(stock_link) || stock_link.include?(noko_link)
+    #           link_text_hsh = {staff_text: noko_text, staff_link: noko_link}
+    #           link_text_results << link_text_hsh
+    #         end
+    #       end
+    #
+    #       ## Find valid Texts
+    #       stock_texts.each do |stock_text|
+    #         stock_text = stock_text.downcase&.gsub(/\W/,'')
+    #         if noko_text.include?(stock_text) || stock_text.include?(noko_text)
+    #           link_text_hsh = {staff_text: noko_text, staff_link: noko_link}
+    #           link_text_results << link_text_hsh
+    #         end
+    #       end
+    #     end
+    #   end
+    #
+    #   link_text_results.uniq!
+    #   puts "\n\n===================="
+    #   puts "Valid Text and Links: #{link_text_results.count}"
+    #   puts link_text_results.inspect
+    #   # sleep(1)
+    #   return link_text_results
+    # end
+
+
+    # def get_stocks(temp_name)
+    #   special_templates = ["Cobalt", "Dealer Inspire", "DealerFire"]
+    #   temp_name = 'general' if !special_templates.include?(temp_name)
+    #
+    #   stock_texts = Term.where(sub_category: "staff_text").where(criteria_term: temp_name).map(&:response_term)
+    #   # stock_texts += @tally_staff_texts
+    #   # stock_texts.uniq!
+    #
+    #   stock_links = Term.where(sub_category: "staff_path").where(criteria_term: temp_name).map(&:response_term)
+    #   # stock_links += @tally_staff_links
+    #   # stock_links.uniq!
+    #
+    #   stock_hsh = {stock_texts: stock_texts, stock_links: stock_links}
+    #   # puts stock_hsh
+    #   # sleep(1)
+    #   return stock_hsh
+    # end
+
+
+
+    # def get_query
+    #   err_sts_arr = ['Error: Timeout', 'Error: Host', 'Error: TCP']
+    #
+    #   query = Web.select(:id)
+    #              .where(url_sts: 'Valid', page_sts: "Invalid")
+    #              .where('page_date < ? OR page_date IS NULL', @cut_off)
+    #              .or(Web.select(:id)
+    #                .where(url_sts: 'Valid', temp_sts: 'Valid', page_sts: ['Valid', nil])
+    #                .where('page_date < ? OR page_date IS NULL', @cut_off)
+    #              ).or(Web.select(:id)
+    #                .where(url_sts: 'Valid', temp_sts: 'Valid', page_sts: err_sts_arr)
+    #                .where('timeout < ?', @db_timeout_limit)
+    #              ).order("timeout ASC").pluck(:id)
+    # end
+    #
+    # def start_find_page
+    #   query = get_query[0..20]
+    #   while query.any?
+    #     setup_iterator(query)
+    #     query = get_query[0..20]
+    #     break if !query.any?
+    #   end
+    # end
+    #
+    # def setup_iterator(query)
+    #   @query_count = query.count
+    #   (@query_count & @query_count > @obj_in_grp) ? @group_count = (@query_count / @obj_in_grp) : @group_count = 2
+    #   @dj_on ? iterate_query(query) : query.each { |id| template_starter(id) }
+    # end
+    #
+    #
+    # def template_starter(id)
+    #   web = Web.find(id)
+    #   web.links.destroy_all
+    #   url = web.url
+    #   temp_name = web.temp_name
+    #   db_timeout = web.timeout
+    #   db_timeout == 0 ? timeout = @dj_refresh_interval : timeout = (db_timeout * 3)
+    #   puts "timeout: #{timeout}"
+    #   puts "temp_name: #{temp_name}"
+    #   puts url
+    #
+    #   noko_hsh = start_noko(url, timeout)
+    #   noko_page = noko_hsh[:noko_page]
+    #   err_msg = noko_hsh[:err_msg]
+    #
+    #   if err_msg.present?
+    #     puts err_msg
+    #     web.update(page_sts: err_msg, page_date: Time.now, timeout: timeout)
+    #   elsif noko_page.present?
+    #     link_text_results = scrub_link_hashes(noko_page, web)
+    #     if !link_text_results.any?
+    #       web.update(page_sts: 'Invalid', page_date: Time.now, timeout: timeout)
+    #     else
+    #       link_text_results.each do |link_text_hsh|
+    #         link_obj = Link.find_or_create_by(link_text_hsh)
+    #         web_link = web.links.where(id: link_obj).exists?
+    #         web.links << link_obj if !web_link.present?
+    #         web.update(page_sts: 'Valid', page_date: Time.now, timeout: 0)
+    #       end
+    #     end
+    #   end
+    # end
+
+
+
+    ############ HELPER METHODS BELOW ################
+
+    #
+    # def check_text_link_ban(staff_link, staff_text, temp_name)
+    #   return true if !staff_link.present? || !staff_text.present? || staff_link.length < 4
+    #   return true if (temp_name = "Cobalt" && staff_text == 'sales')
+    #   return true if check_link_ban(staff_link)
+    #   return true if check_text_ban(staff_text)
+    #
+    #   include_ban = %w(/#card-view/card/ 404 appl approve body career center click collision commercial contact customer demo direction discl drive employ espanol espaol finan get google guarantee habla history home hour inventory javascript job join lease legal location lube mail map match multilingual offers oil open opportunit parts phone place price quick rating review sales_tab schedule search service special start yourdeal survey tel test text trade value vehicle video virtual websiteby welcome why facebook commercial twit near dealernear educat faculty discount event year fleet build index amenit tire find award year blog)
+    #
+    #   banned_link_text = include_ban.find { |ban| staff_link.include?(ban) || staff_text.include?(ban) }
+    #   banned_link_text.present? ? true : false
+    # end
+    #
+    #
+    # def check_text_ban(staff_text)
+    #   if staff_text.present?
+    #     ## Make sure staff_text is downcase and compact like below for accurate comparisons.
+    #     banned_texts = %w(dealershipinfo porsche preowned aboutus ourdealership newcars cars about honda ford learnmoreaboutus news fleet aboutourdealership fordf150 fordtrucks fordtransitconnectwagon fordtransitconnectwagon fordecosport fordfusion fordedge fordfocus fordescape fordexpedition fordexpeditionmax fordcmaxhybrid fordexplorer fordcars fordflex fordtransitcargovan fordsuvs fordtransitconnect fordtransitwagon fordtransitconnectvan fordfusionenergi fordvans fordfusionhybrid fordmustang moreaboutus tourournewdealership tourourdealership)
+    #
+    #     banned_text = banned_texts.find { |ban| staff_text == ban }
+    #     banned_text.present? ? true : false
+    #   end
+    # end
+    #
+    #
+    # def check_link_ban(staff_link)
+    #   if staff_link.present?
+    #     link_strict_ban = %w(/about /about-us /about-us.htm /about.htm /about.html /#commercial /commercial.html /dealership/about.htm /dealeronlineretailing_d /dealeronlineretailing /dealership/department.htm /dealership/news.htm /departments.aspx /fleet /index.htm /meetourdepartments /sales.aspx /#tab-sales)
+    #
+    #     banned_link = link_strict_ban.find { |ban| staff_link == ban }
+    #     banned_link.present? ? true : false
+    #   end
+    # end
+
+
+  end
+end
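Two details of `Scrape` above are easy to sanity-check in plain Ruby: the `encoder` normalization chain, and the `URI(...).path` call in `extract_link_from_url`. A worked example on a made-up input (the input string is hypothetical; the steps mirror the method bodies shown in the diff, minus the `present?` guard):

```
require 'uri'

# The encoder chain from Scrape#encoder, applied to a hypothetical string.
text = "  Caf\u00E9  \"links\"  page  "
text = text.chars.select(&:valid_encoding?).join  # keep only validly encoded chars
text = text.delete("^\u{0000}-\u{007F}")          # strip non-ASCII ("é" is dropped)
text = text.gsub(/\s+/, ' ').strip                # collapse whitespace runs
text = text.gsub("\"", ' ').strip                 # turn double quotes into spaces
text  # => "Caf  links  page"

# The path step from extract_link_from_url: URI() reduces absolute URLs to
# their path component and leaves relative paths unchanged, which matches
# the README sample output above.
URI('https://example.com/wiki/climate?q=1').path  # => "/wiki/climate"
URI('/wiki/climate').path                         # => "/wiki/climate"
```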
data/lib/link_scraper/version.rb CHANGED
@@ -1,3 +1,3 @@
 module LinkScraper
-  VERSION = "1.01"
+  VERSION = "1.02"
 end
data/lib/webs_criteria.rb CHANGED
@@ -37,11 +37,11 @@ class WebsCriteria
     %w(com net)
   end

-  # def self.seed_neg_hrefs
+  # def self.seed_neg_paths
   #   %w(? .com .jpg @ * afri after anounc apply approved blog book business buy call care career cash charit cheap check click collis commerc cont contrib deal distrib download employ event face feature feed financ find fleet form gas generat golf here holiday hospi hour info insta inventory join later light login mail mobile movie museu music news none now oil part pay phone policy priva pump quick quote rate regist review saving schedul service shop sign site speci ticket tire today transla travel truck tv twitter watch youth)
   # end
   #
-  # def self.seed_pos_hrefs
+  # def self.seed_pos_paths
   #   %w(team staff management)
   # end

data/link_scraper.gemspec CHANGED
@@ -8,11 +8,11 @@ Gem::Specification.new do |spec|
   spec.version = LinkScraper::VERSION
   spec.authors = ["Adam Booth"]
   spec.email = ["4rlm@protonmail.ch"]
-  spec.homepage = 'https://github.com/4rlm/scrub_db'
+  spec.homepage = 'https://github.com/4rlm/link_scraper'
   spec.license = "MIT"

-  spec.summary = %q{Scrape website links' text and href with built-in scrubbing filter.}
-  spec.description = %q{Scrape website links' text and href with built-in scrubbing filter. Designed to rapidly visit and scrape links from list of several URLs, then filters them based on your criteria. For example, to only grab the links for inventory, staff, or contact us, etc.}
+  spec.summary = %q{Scrape website links' text and path with built-in scrubbing filter.}
+  spec.description = %q{Scrape website links' text and path with built-in scrubbing filter. Designed to rapidly visit and scrape links from a list of several URLs, then filter them based on your criteria. For example, to only grab the links for inventory, staff, or contact us, etc.}


   if spec.respond_to?(:metadata)
@@ -38,11 +38,11 @@ Gem::Specification.new do |spec|
   spec.required_ruby_version = '~> 2.5.1'
   spec.add_dependency 'activesupport', '~> 5.2'
-  spec.add_dependency 'utf8_sanitizer', '~> 2.16'
-  spec.add_dependency 'crm_formatter', '~> 2.61'
-  spec.add_dependency 'mechanizer', '~> 1.11'
-  spec.add_dependency 'scrub_db', '~> 2.22'
+  spec.add_dependency 'crm_formatter', '~> 2.64'
+  spec.add_dependency 'mechanizer', '~> 1.12'
+  spec.add_dependency 'scrub_db', '~> 2.23'
   spec.add_dependency 'url_verifier', '~> 2.12'
+  spec.add_dependency 'utf8_sanitizer', '~> 2.16'

   # spec.add_dependency "activesupport-inflector", ['~> 0.1.0']
   spec.add_development_dependency 'bundler', '~> 1.16', '>= 1.16.2'
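To pull this release into another project, the usual Gemfile pin would be (a standard Bundler line, not taken from the diff):

```
# Gemfile
gem 'link_scraper', '~> 1.02'
```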
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: link_scraper
 version: !ruby/object:Gem::Version
-  version: '1.01'
+  version: '1.02'
 platform: ruby
 authors:
 - Adam Booth
@@ -25,75 +25,75 @@ dependencies:
   - !ruby/object:Gem::Version
     version: '5.2'
 - !ruby/object:Gem::Dependency
-  name: utf8_sanitizer
+  name: crm_formatter
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '2.16'
+        version: '2.64'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '2.16'
+        version: '2.64'
 - !ruby/object:Gem::Dependency
-  name: crm_formatter
+  name: mechanizer
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '2.61'
+        version: '1.12'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '2.61'
+        version: '1.12'
 - !ruby/object:Gem::Dependency
-  name: mechanizer
+  name: scrub_db
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '1.11'
+        version: '2.23'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '1.11'
+        version: '2.23'
 - !ruby/object:Gem::Dependency
-  name: scrub_db
+  name: url_verifier
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '2.22'
+        version: '2.12'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '2.22'
+        version: '2.12'
 - !ruby/object:Gem::Dependency
-  name: url_verifier
+  name: utf8_sanitizer
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '2.12'
+        version: '2.16'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '2.12'
+        version: '2.16'
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
@@ -162,7 +162,7 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '3.7'
-description: Scrape website links' text and href with built-in scrubbing filter. Designed
+description: Scrape website links' text and path with built-in scrubbing filter. Designed
   to rapidly visit and scrape links from a list of several URLs, then filter them based
   on your criteria. For example, to only grab the links for inventory, staff, or
   contact us, etc.
@@ -183,10 +183,11 @@ files:
 - bin/console
 - bin/setup
 - lib/link_scraper.rb
+- lib/link_scraper/scrape.rb
 - lib/link_scraper/version.rb
 - lib/webs_criteria.rb
 - link_scraper.gemspec
-homepage: https://github.com/4rlm/scrub_db
+homepage: https://github.com/4rlm/link_scraper
 licenses:
 - MIT
 metadata:
@@ -210,5 +211,5 @@ rubyforge_project:
 rubygems_version: 2.7.6
 signing_key:
 specification_version: 4
-summary: Scrape website links' text and href with built-in scrubbing filter.
+summary: Scrape website links' text and path with built-in scrubbing filter.
 test_files: []