link_scraper 1.01 → 1.02
- checksums.yaml +4 -4
- data/README.md +48 -2
- data/Rakefile +16 -20
- data/lib/link_scraper.rb +6 -2
- data/lib/link_scraper/scrape.rb +256 -0
- data/lib/link_scraper/version.rb +1 -1
- data/lib/webs_criteria.rb +2 -2
- data/link_scraper.gemspec +7 -7
- metadata +20 -19
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 1722daeff2d952711b2ed8247f5e1875c3b3d2739e96269ca483474405756cd2
+  data.tar.gz: bc901c65b11f0d123e5973bcb5b65b0b6add539f81b7102c625a47bb7a62994a
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: aa4096762dc168b12d44909929a443d12dd901c40c4ad4d69baaafae4ffc2d97c53bb97f34a90175a613a843ccecd6caaeaf5d5fdc279e5f4062400d750057d6
+  data.tar.gz: 1acb651d9f98ec067551dc5f38fa10c2c98a597229016f76407a2d81a414805cb54fe23414ec698ed7b86caf70d45d2b3803ba318de6bab581f700b142c28e97
data/README.md
CHANGED
@@ -3,7 +3,7 @@
 [![Gem Version](https://badge.fury.io/rb/link_scraper.svg)](https://badge.fury.io/rb/link_scraper)
 [![MIT License](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
 
-#### Scrape website links' text and
+#### Scrape website links' text and path with built-in scrubbing filter.
 
 Designed to rapidly visit and scrape links from list of several URLs, then filters them based on your criteria. For example, to only grab the links for inventory, staff, or contact us, etc.
 
@@ -26,7 +26,53 @@ Or install it yourself as:
 
 ## Usage
 
-
+This is an example of how to grab links from a URL. `args` are optional if you want to scrub and filter the links based on your criteria, like below.
+
+```
+text_criteria = {
+  pos_criteria: ['coordinates', 'zip codes', 'area codes', 'climate', 'demographics'],
+  neg_criteria: %w[drought school]
+}
+
+path_criteria = {
+  pos_criteria: ['coordinates', 'zip codes', 'area codes', 'climate', 'demographics'],
+  neg_criteria: %w[drought school]
+}
+
+scraper = LinkScraper::Scrape.new(text_criteria: text_criteria, path_criteria: path_criteria)
+scraped_links = scraper.start('https://en.wikipedia.org/wiki/Austin%2C_Texas')
+```
+
+Example without Criteria (returns all links)
+
+```
+scraper = LinkScraper::Scrape.new
+scraped_links = scraper.start('https://en.wikipedia.org/wiki/Austin%2C_Texas')
+```
+
+Returns Array of Links Based on Criteria in `args`:
+
+```
+[
+  {:text=>"coordinates", :path=>"/wiki/geographic_coordinate_system"},
+  {:text=>"2.2 climate", :path=>""},
+  {:text=>"3 demographics", :path=>""},
+  {:text=>"explanation", :path=>"/wiki/template:climate_chart/how_to_read_a_climate_chart"},
+  {:text=>"humid subtropical climate", :path=>"/wiki/humid_subtropical_climate"},
+  {:text=>"kppen climate classification", :path=>"/wiki/k%c3%b6ppen_climate_classification"},
+  {:text=>"climate", :path=>""},
+  {:text=>"austin climate summary", :path=>"/web/20110606123855/http://www.srh.noaa.gov/images/ewx/aus/ausclisum.pdf"},
+  {:text=>"u.s. climate data", :path=>""},
+  {:text=>"nowdata - noaa online weather data", :path=>"/climate/xmacis.php"},
+  {:text=>"austin weather & climate", :path=>"/web/20070118231257/http://austin.about.com/od/weatherenvironment/a/weather.htm"},
+  {:text=>"nowdata - noaa online weather data", :path=>"/climate/xmacis.php"},
+  {:text=>"wmo climate normals for austin/municipal ap tx 19611990", :path=>"pub/gcos/wmo-normals/tables/reg_iv/us/group3/72254.txt"},
+  {:text=>"climate", :path=>"/wiki/climate_of_texas"},
+  {:text=>"demographics", :path=>"/wiki/demographics_of_texas"},
+  {:text=>"coordinates on wikidata", :path=>"/wiki/category:coordinates_on_wikidata"}
+]
+
+```
 
 ## Development
 
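Not part of the README, but for orientation: a minimal sketch of how the `scraped_links` array shown above might be consumed if you want absolute URLs back. `scraped_links` and the Wikipedia URL come from the example; the filtering and the `URI.join` resolution are illustrative assumptions, not something the gem does for you.

```ruby
require 'uri'

# `scraped_links` is the array returned by scraper.start(...) in the README example above;
# the base URL is the page that was scraped.
base_url = 'https://en.wikipedia.org/wiki/Austin%2C_Texas'

absolute_links = scraped_links
  .reject { |link| link[:path].to_s.empty? }   # drop entries whose :path was scrubbed away
  .map    { |link| { text: link[:text], url: URI.join(base_url, link[:path]).to_s } }

absolute_links.each { |link| puts "#{link[:text]} -> #{link[:url]}" }
```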
data/Rakefile
CHANGED
@@ -18,32 +18,28 @@ task :console do
   ARGV.clear
 
   scraped_links = run_link_scraper
-
+  binding.pry
 
   IRB.start
 end
 
 
 def run_link_scraper
-
-
-
-
-
-
-
-
-
-
-
-
-
-    www.www.nissancars/inventory
-    www.www.toyotatown.net/staff/management
-    www.www.yellowpages.com/business
-  ]
+
+  text_criteria = {
+    pos_criteria: ['coordinates', 'zip codes', 'area codes', 'climate', 'demographics'],
+    neg_criteria: %w[drought school]
+  }
+
+  path_criteria = {
+    pos_criteria: ['coordinates', 'zip codes', 'area codes', 'climate', 'demographics'],
+    neg_criteria: %w[drought school]
+  }
+
+  scraper = LinkScraper::Scrape.new(text_criteria: text_criteria, path_criteria: path_criteria)
+  scraped_links = scraper.start('https://en.wikipedia.org/wiki/Austin%2C_Texas')
 
   binding.pry
-
-
+
+  # scraper = LinkScraper::Scrape.new(WebsCriteria.all_scrub_web_criteria)
 end
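For context on the console workflow above: `rake console` runs the task, calls `run_link_scraper`, and stops at `binding.pry` with the results in scope. A few hypothetical things one might evaluate at that prompt (not part of the Rakefile):

```ruby
# Inside the pry session opened by the :console task (illustrative only):
scraped_links.size                                   # how many links survived the criteria
scraped_links.first(3)                               # peek at a few {:text=>..., :path=>...} hashes
scraped_links.count { |l| !l[:path].to_s.empty? }    # how many still carry a usable path
```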
data/lib/link_scraper.rb
CHANGED
data/lib/link_scraper/scrape.rb
ADDED
@@ -0,0 +1,256 @@
+module LinkScraper
+  class Scrape
+    # attr_accessor :text_criteria, :path_criteria
+
+    def initialize(args={})
+      @text_scrub = ScrubDb::Strings.new(args.fetch(:text_criteria, {}))
+      @path_scrub = ScrubDb::Strings.new(args.fetch(:path_criteria, {}))
+      @noko = Mechanizer::Noko.new
+    end
+
+
+    def start(url)
+      noko_hash = @noko.scrape({url: url})
+      link_hashes = noko_hash[:texts_and_paths]
+
+      err_msg = noko_hash[:err_msg]
+      page = noko_hash[:page]
+      valid_links = scrub_link_hashes(link_hashes, url)
+      valid_links = extract_link_from_url(valid_links, url)
+    end
+
+
+    def extract_link_from_url(valid_links, url)
+      formatted = valid_links.map do |link|
+
+        begin
+          link[:path] = URI(link[:path])&.path
+        rescue StandardError => e
+          puts e.message
+        end
+
+        link
+      end
+      formatted
+    end
+
+
+    def scrub_link_hashes(link_hashes, url)
+      valid_hashes = link_hashes.map do |link_hsh|
+
+        if link_hsh[:text].present? || link_hsh[:path].present?
+          link_hsh = encode_link(link_hsh)
+          text_hsh = @text_scrub.scrub_string(link_hsh[:text])
+          path_hsh = @path_scrub.scrub_string(link_hsh[:path])
+
+          text = evaluate_scrub_hsh(text_hsh)
+          path = evaluate_scrub_hsh(path_hsh)
+        end
+
+        link_hsh = nil unless (text.present? || path.present?)
+        link_hsh
+      end
+
+      valid_hashes = valid_hashes.compact
+    end
+
+    def encode_link(link_hsh)
+      link_hsh[:text] = encoder(link_hsh[:text])
+      link_hsh[:path] = encoder(link_hsh[:path])
+      link_hsh
+    end
+
+
+    def encoder(text)
+      # if text.present? && !text.valid_encoding?
+      if text.present?
+        text = text.chars.select(&:valid_encoding?).join
+        text = text.delete("^\u{0000}-\u{007F}")
+        text = text&.gsub(/\s+/, ' ')&.strip
+        text = text.gsub("\"", ' ')&.strip
+      end
+      text
+    end
+
+
+    def evaluate_scrub_hsh(hsh)
+      string = nil
+      string = hsh[:string] if (hsh[:pos_criteria].any? && hsh[:neg_criteria].empty?)
+    end
+
+
+    ##############################################################################
+    ### Below has been replaced with dependency gems, but keep incase needed later. ####
+    # def old_but_important
+    # temp_name = nil
+    # stock_hsh = get_stocks(temp_name)
+    # stock_texts = stock_hsh[:stock_texts]
+    # stock_links = stock_hsh[:stock_links]
+    #
+    # link_text_results = []
+    # noko_page.links.each do |noko_text_link|
+    # noko_text = noko_text_link.text&.downcase&.gsub(/\W/,'')
+    # pre_noko_link = noko_text_link&.path&.downcase&.strip
+    # noko_link = @formatter.format_link(url, pre_noko_link)
+    #
+    # if (noko_text && noko_link) && (noko_text.length > 3 && noko_link.length > 3) && (check_text_link_ban(noko_link, noko_text, temp_name) != true)
+    # ## Find any Texts or Links that include 'team' or 'staff'
+    # if noko_text.include?('staff') || noko_link.include?('staff')
+    # link_text_hsh = {staff_text: noko_text, staff_link: noko_link}
+    # link_text_results << link_text_hsh
+    # end
+    #
+    # ## Find valid Links
+    # stock_links.each do |stock_link|
+    # stock_link = stock_link.downcase&.strip
+    # if noko_link.include?(stock_link) || stock_link.include?(noko_link)
+    # link_text_hsh = {staff_text: noko_text, staff_link: noko_link}
+    # link_text_results << link_text_hsh
+    # end
+    # end
+    #
+    # ## Find valid Texts
+    # stock_texts.each do |stock_text|
+    # stock_text = stock_text.downcase&.gsub(/\W/,'')
+    # if noko_text.include?(stock_text) || stock_text.include?(noko_text)
+    # link_text_hsh = {staff_text: noko_text, staff_link: noko_link}
+    # link_text_results << link_text_hsh
+    # end
+    # end
+    # end
+    # end
+    #
+    # link_text_results.uniq!
+    # puts "\n\n===================="
+    # puts "Valid Text and Links: #{link_text_results.count}"
+    # puts link_text_results.inspect
+    # # sleep(1)
+    # return link_text_results
+    # end
+
+
+    # def get_stocks(temp_name)
+    # special_templates = ["Cobalt", "Dealer Inspire", "DealerFire"]
+    # temp_name = 'general' if !special_templates.include?(temp_name)
+    #
+    # stock_texts = Term.where(sub_category: "staff_text").where(criteria_term: temp_name).map(&:response_term)
+    # # stock_texts += @tally_staff_texts
+    # # stock_texts.uniq!
+    #
+    # stock_links = Term.where(sub_category: "staff_path").where(criteria_term: temp_name).map(&:response_term)
+    # # stock_links += @tally_staff_links
+    # # stock_links.uniq!
+    #
+    # stock_hsh = {stock_texts: stock_texts, stock_links: stock_links}
+    # # puts stock_hsh
+    # # sleep(1)
+    # return stock_hsh
+    # end
+
+
+
+    # def get_query
+    # err_sts_arr = ['Error: Timeout', 'Error: Host', 'Error: TCP']
+    #
+    # query = Web.select(:id)
+    # .where(url_sts: 'Valid', page_sts: "Invalid")
+    # .where('page_date < ? OR page_date IS NULL', @cut_off)
+    # .or(Web.select(:id)
+    # .where(url_sts: 'Valid', temp_sts: 'Valid', page_sts: ['Valid', nil])
+    # .where('page_date < ? OR page_date IS NULL', @cut_off)
+    # ).or(Web.select(:id)
+    # .where(url_sts: 'Valid', temp_sts: 'Valid', page_sts: err_sts_arr)
+    # .where('timeout < ?', @db_timeout_limit)
+    # ).order("timeout ASC").pluck(:id)
+    # end
+    #
+    # def start_find_page
+    # query = get_query[0..20]
+    # while query.any?
+    # setup_iterator(query)
+    # query = get_query[0..20]
+    # break if !query.any?
+    # end
+    # end
+    #
+    # def setup_iterator(query)
+    # @query_count = query.count
+    # (@query_count & @query_count > @obj_in_grp) ? @group_count = (@query_count / @obj_in_grp) : @group_count = 2
+    # @dj_on ? iterate_query(query) : query.each { |id| template_starter(id) }
+    # end
+    #
+    #
+    # def template_starter(id)
+    # web = Web.find(id)
+    # web.links.destroy_all
+    # url = web.url
+    # temp_name = web.temp_name
+    # db_timeout = web.timeout
+    # db_timeout == 0 ? timeout = @dj_refresh_interval : timeout = (db_timeout * 3)
+    # puts "timeout: #{timeout}"
+    # puts "temp_name: #{temp_name}"
+    # puts url
+    #
+    # noko_hsh = start_noko(url, timeout)
+    # noko_page = noko_hsh[:noko_page]
+    # err_msg = noko_hsh[:err_msg]
+    #
+    # if err_msg.present?
+    # puts err_msg
+    # web.update(page_sts: err_msg, page_date: Time.now, timeout: timeout)
+    # elsif noko_page.present?
+    # link_text_results = scrub_link_hashes(noko_page, web)
+    # if !link_text_results.any?
+    # web.update(page_sts: 'Invalid', page_date: Time.now, timeout: timeout)
+    # else
+    # link_text_results.each do |link_text_hsh|
+    # link_obj = Link.find_or_create_by(link_text_hsh)
+    # web_link = web.links.where(id: link_obj).exists?
+    # web.links << link_obj if !web_link.present?
+    # web.update(page_sts: 'Valid', page_date: Time.now, timeout: 0)
+    # end
+    # end
+    # end
+    # end
+
+
+
+    ############ HELPER METHODS BELOW ################
+
+    #
+    # def check_text_link_ban(staff_link, staff_text, temp_name)
+    # return true if !staff_link.present? || !staff_text.present? || staff_link.length < 4
+    # return true if (temp_name = "Cobalt" && staff_text == 'sales')
+    # return true if check_link_ban(staff_link)
+    # return true if check_text_ban(staff_text)
+    #
+    # include_ban = %w(/#card-view/card/ 404 appl approve body career center click collision commercial contact customer demo direction discl drive employ espanol espaol finan get google guarantee habla history home hour inventory javascript job join lease legal location lube mail map match multilingual offers oil open opportunit parts phone place price quick rating review sales_tab schedule search service special start yourdeal survey tel test text trade value vehicle video virtual websiteby welcome why facebook commercial twit near dealernear educat faculty discount event year fleet build index amenit tire find award year blog)
+    #
+    # banned_link_text = include_ban.find { |ban| staff_link.include?(ban) || staff_text.include?(ban) }
+    # banned_link_text.present? ? true : false
+    # end
+    #
+    #
+    # def check_text_ban(staff_text)
+    # if staff_text.present?
+    # ## Make sure staff_text is downcase and compact like below for accurate comparisons.
+    # banned_texts = %w(dealershipinfo porsche preowned aboutus ourdealership newcars cars about honda ford learnmoreaboutus news fleet aboutourdealership fordf150 fordtrucks fordtransitconnectwagon fordtransitconnectwagon fordecosport fordfusion fordedge fordfocus fordescape fordexpedition fordexpeditionmax fordcmaxhybrid fordexplorer fordcars fordflex fordtransitcargovan fordsuvs fordtransitconnect fordtransitwagon fordtransitconnectvan fordfusionenergi fordvans fordfusionhybrid fordmustang moreaboutus tourournewdealership tourourdealership)
+    #
+    # banned_text = banned_texts.find { |ban| staff_text == ban }
+    # banned_text.present? ? true : false
+    # end
+    # end
+    #
+    #
+    # def check_link_ban(staff_link)
+    # if staff_link.present?
+    # link_strict_ban = %w(/about /about-us /about-us.htm /about.htm /about.html /#commercial /commercial.html /dealership/about.htm /dealeronlineretailing_d /dealeronlineretailing /dealership/department.htm /dealership/news.htm /departments.aspx /fleet /index.htm /meetourdepartments /sales.aspx /#tab-sales)
+    #
+    # banned_link = link_strict_ban.find { |ban| staff_link == ban }
+    # banned_link.present? ? true : false
+    # end
+    # end
+
+
+  end
+end
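A standalone sketch of what the `encoder` step in the new `Scrape` class does to link text: keep only validly encoded characters, strip non-ASCII, collapse whitespace, and drop double quotes. It mirrors the method's logic with plain-Ruby guards in place of ActiveSupport's `present?`; it is an illustration, not the gem's own code.

```ruby
def encode(text)
  return text if text.to_s.empty?

  text = text.chars.select(&:valid_encoding?).join  # drop characters that are not validly encoded
  text = text.delete("^\u{0000}-\u{007F}")          # keep ASCII only
  text = text.gsub(/\s+/, ' ').strip                # collapse runs of whitespace
  text.gsub('"', ' ').strip                         # replace double quotes, trim again
end

puts encode("  K\u00F6ppen   climate    classification \n")
# => "Kppen climate classification"
```

This is why the README example above shows "kppen climate classification": the non-ASCII character is simply removed rather than transliterated.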
data/lib/link_scraper/version.rb
CHANGED
data/lib/webs_criteria.rb
CHANGED
@@ -37,11 +37,11 @@ class WebsCriteria
     %w(com net)
   end
 
-  # def self.
+  # def self.seed_neg_paths
   # %w(? .com .jpg @ * afri after anounc apply approved blog book business buy call care career cash charit cheap check click collis commerc cont contrib deal distrib download employ event face feature feed financ find fleet form gas generat golf here holiday hospi hour info insta inventory join later light login mail mobile movie museu music news none now oil part pay phone policy priva pump quick quote rate regist review saving schedul service shop sign site speci ticket tire today transla travel truck tv twitter watch youth)
   # end
   #
-  # def self.
+  # def self.seed_pos_paths
   # %w(team staff management)
   # end
 
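The renamed stubs above (`seed_neg_paths` / `seed_pos_paths`) are still commented out. Purely as a hypothetical sketch, if they were reinstated they could feed the criteria hash that `LinkScraper::Scrape.new` already accepts:

```ruby
# Hypothetical: assumes WebsCriteria.seed_pos_paths / seed_neg_paths are uncommented.
path_criteria = {
  pos_criteria: WebsCriteria.seed_pos_paths,   # e.g. %w(team staff management)
  neg_criteria: WebsCriteria.seed_neg_paths
}

scraper = LinkScraper::Scrape.new(path_criteria: path_criteria)
scraped_links = scraper.start('https://www.example.com')
```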
data/link_scraper.gemspec
CHANGED
@@ -8,11 +8,11 @@ Gem::Specification.new do |spec|
   spec.version = LinkScraper::VERSION
   spec.authors = ["Adam Booth"]
   spec.email = ["4rlm@protonmail.ch"]
-  spec.homepage = 'https://github.com/4rlm/
+  spec.homepage = 'https://github.com/4rlm/link_scraper'
   spec.license = "MIT"
 
-  spec.summary = %q{Scrape website links' text and
-  spec.description = %q{Scrape website links' text and
+  spec.summary = %q{Scrape website links' text and path with built-in scrubbing filter.}
+  spec.description = %q{Scrape website links' text and path with built-in scrubbing filter. Designed to rapidly visit and scrape links from list of several URLs, then filters them based on your criteria. For example, to only grab the links for inventory, staff, or contact us, etc.}
 
 
   if spec.respond_to?(:metadata)
@@ -38,11 +38,11 @@ Gem::Specification.new do |spec|
 
   spec.required_ruby_version = '~> 2.5.1'
   spec.add_dependency 'activesupport', '~> 5.2'
-  spec.add_dependency '
-  spec.add_dependency '
-  spec.add_dependency '
-  spec.add_dependency 'scrub_db', '~> 2.22'
+  spec.add_dependency 'crm_formatter', '~> 2.64'
+  spec.add_dependency 'mechanizer', '~> 1.12'
+  spec.add_dependency 'scrub_db', '~> 2.23'
   spec.add_dependency 'url_verifier', '~> 2.12'
+  spec.add_dependency 'utf8_sanitizer', '~> 2.16'
 
   # spec.add_dependency "activesupport-inflector", ['~> 0.1.0']
   spec.add_development_dependency 'bundler', '~> 1.16', '>= 1.16.2'
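For a consuming application, the new runtime dependencies listed above are resolved automatically by Bundler; all that is needed is a Gemfile entry pinned to this release (the gem name and version come from the gemspec, the rest is illustrative):

```ruby
# Gemfile
source 'https://rubygems.org'

gem 'link_scraper', '~> 1.02'
```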
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: link_scraper
 version: !ruby/object:Gem::Version
-  version: '1.
+  version: '1.02'
 platform: ruby
 authors:
 - Adam Booth
@@ -25,75 +25,75 @@ dependencies:
       - !ruby/object:Gem::Version
         version: '5.2'
 - !ruby/object:Gem::Dependency
-  name:
+  name: crm_formatter
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '2.
+        version: '2.64'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '2.
+        version: '2.64'
 - !ruby/object:Gem::Dependency
-  name:
+  name: mechanizer
   requirement: !ruby/object:Gem::Requirement
     requirements:
    - - "~>"
       - !ruby/object:Gem::Version
-        version: '
+        version: '1.12'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '
+        version: '1.12'
 - !ruby/object:Gem::Dependency
-  name:
+  name: scrub_db
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '
+        version: '2.23'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '
+        version: '2.23'
 - !ruby/object:Gem::Dependency
-  name:
+  name: url_verifier
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '2.
+        version: '2.12'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '2.
+        version: '2.12'
 - !ruby/object:Gem::Dependency
-  name:
+  name: utf8_sanitizer
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '2.
+        version: '2.16'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '2.
+        version: '2.16'
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
@@ -162,7 +162,7 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '3.7'
-description: Scrape website links' text and
+description: Scrape website links' text and path with built-in scrubbing filter. Designed
   to rapidly visit and scrape links from list of several URLs, then filters them based
   on your criteria. For example, to only grab the links for inventory, staff, or
   contact us, etc.
@@ -183,10 +183,11 @@ files:
 - bin/console
 - bin/setup
 - lib/link_scraper.rb
+- lib/link_scraper/scrape.rb
 - lib/link_scraper/version.rb
 - lib/webs_criteria.rb
 - link_scraper.gemspec
-homepage: https://github.com/4rlm/
+homepage: https://github.com/4rlm/link_scraper
 licenses:
 - MIT
 metadata:
@@ -210,5 +211,5 @@ rubyforge_project:
 rubygems_version: 2.7.6
 signing_key:
 specification_version: 4
-summary: Scrape website links' text and
+summary: Scrape website links' text and path with built-in scrubbing filter.
 test_files: []