link_scrapper 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/link_scrapper.rb +43 -17
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 98b6d54cc75862b1e063a04e355a6f638deff14b
|
4
|
+
data.tar.gz: fbe3a8ecfbf27d81458862ec96543700e994a049
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 34296971fcebf4d437aba6703c36ca93a8599ee51d5ee73032e2e58217e8c25029feabb9c83c51f298fdbe5a27155196faaec2bf860e5751a228da843dcd499b
|
7
|
+
data.tar.gz: f3d82c1973ec70e37815ad88f35523b5edab699d0c473e7c8ce61e9387e83b1ffbf3ec5b456eb05001cc7e653282b88954790bc4136c79e24b818c8cabf05b28
|
data/lib/link_scrapper.rb
CHANGED
@@ -7,9 +7,15 @@ SEARCH_DOMAIN = 'http://virginiabeachwebdevelopment.com/'
|
|
7
7
|
# class for grabbing and parsing domain links
|
8
8
|
class LinkScrapper
|
9
9
|
|
10
|
-
def initialize(
|
10
|
+
def initialize(settings)
|
11
|
+
|
12
|
+
# available default settings
|
13
|
+
# domain: domain to be searched
|
14
|
+
# verbose: prints output as the script goes along
|
15
|
+
# results: hash or csv
|
11
16
|
|
12
17
|
# init link store hashes
|
18
|
+
@settings = settings;
|
13
19
|
@search_index = 0
|
14
20
|
@search_iteration = 0
|
15
21
|
@links = Array.new
|
@@ -20,11 +26,11 @@ class LinkScrapper
|
|
20
26
|
# gather search domain
|
21
27
|
if ARGV[0]
|
22
28
|
@search_domain = ARGV[0].dup
|
23
|
-
elsif
|
29
|
+
elsif @settings[:domain] == 'ue'
|
24
30
|
puts "Please enter a domain to search: (Default: #{SEARCH_DOMAIN})"
|
25
31
|
@search_domain = gets.chomp
|
26
|
-
elsif
|
27
|
-
@search_domain =
|
32
|
+
elsif @settings[:domain]
|
33
|
+
@search_domain = @settings[:domain]
|
28
34
|
end
|
29
35
|
|
30
36
|
# override with default domain if entry is left empty
|
@@ -59,12 +65,17 @@ class LinkScrapper
|
|
59
65
|
@search_uri = @links[@search_index][0].chomp
|
60
66
|
else
|
61
67
|
# save results and exit
|
62
|
-
|
68
|
+
if @settings[:results] == 'csv'
|
69
|
+
save_results
|
70
|
+
else
|
71
|
+
return { checked_links: @checked_links, error_links: @error_links, external_links: @external_links}
|
72
|
+
end
|
63
73
|
exit
|
64
74
|
end
|
65
75
|
|
66
76
|
# check for direct link
|
67
77
|
if @search_uri =~ /^htt(p|ps):/
|
78
|
+
|
68
79
|
# if external link go to next link
|
69
80
|
if @search_uri.index(@local_domain[0]) == nil
|
70
81
|
if !@external_links[@search_uri.to_sym]
|
@@ -73,11 +84,11 @@ class LinkScrapper
|
|
73
84
|
response = Net::HTTP.get_response(URI.parse(URI.encode(@search_uri)))
|
74
85
|
t2 = Time.now
|
75
86
|
delta = t2 - t1
|
76
|
-
|
87
|
+
code = response.code
|
77
88
|
rescue => ex
|
78
|
-
|
89
|
+
code = 408
|
79
90
|
end
|
80
|
-
@external_links[@search_uri.to_sym] = {res:
|
91
|
+
@external_links[@search_uri.to_sym] = {res: code, time: delta}
|
81
92
|
end
|
82
93
|
@skip = 1
|
83
94
|
end
|
@@ -93,8 +104,17 @@ class LinkScrapper
|
|
93
104
|
@skip = 1
|
94
105
|
else
|
95
106
|
# check for protocol agnostic and indirect links
|
96
|
-
|
97
|
-
|
107
|
+
case @search_uri[0,1]
|
108
|
+
when '.'
|
109
|
+
@search_uri[0,1] = ''
|
110
|
+
end
|
111
|
+
case @search_uri[0,2]
|
112
|
+
when '//', './', '..'
|
113
|
+
@search_uri[0,2] = ''
|
114
|
+
end
|
115
|
+
case @search_uri[0,3]
|
116
|
+
when '../'
|
117
|
+
@search_uri[0,3] = ''
|
98
118
|
end
|
99
119
|
# check for relative link
|
100
120
|
if @search_uri[0] == '/'
|
@@ -104,7 +124,8 @@ class LinkScrapper
|
|
104
124
|
if @search_uri !~ /^([\w]|%|#|\?)/
|
105
125
|
@search_index += 1
|
106
126
|
@skip = 1
|
107
|
-
|
127
|
+
@error_links[@search_uri] = ''
|
128
|
+
puts "invalid uri #{@search_uri}" if @settings[:verbose]
|
108
129
|
return
|
109
130
|
end
|
110
131
|
# define uri string
|
@@ -143,7 +164,7 @@ class LinkScrapper
|
|
143
164
|
if @skip == 0
|
144
165
|
|
145
166
|
# let user know which uri is currently active
|
146
|
-
puts @search_uri
|
167
|
+
puts @search_uri if @settings[:verbose]
|
147
168
|
|
148
169
|
# gather page request response
|
149
170
|
begin
|
@@ -193,15 +214,20 @@ class LinkScrapper
|
|
193
214
|
def save_results
|
194
215
|
# save search results
|
195
216
|
CSV.open('results.csv', 'wb') {|csv|
|
196
|
-
@checked_links.each {|
|
197
|
-
csv << [
|
217
|
+
@checked_links.each {|link|
|
218
|
+
csv << [link[0], link[1][:res], link[1][:time]]
|
198
219
|
}
|
199
220
|
}
|
200
|
-
|
201
221
|
# save list of external links
|
202
222
|
CSV.open('external-links.csv', 'wb') {|csv|
|
203
|
-
@external_links.each do |
|
204
|
-
csv << [
|
223
|
+
@external_links.each do |link|
|
224
|
+
csv << [link[0], link[1][:res], link[1][:time]]
|
225
|
+
end
|
226
|
+
}
|
227
|
+
# save list of invalid links
|
228
|
+
CSV.open('invalid.csv', 'wb') {|csv|
|
229
|
+
@error_links.each do |link|
|
230
|
+
csv << link
|
205
231
|
end
|
206
232
|
}
|
207
233
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: link_scrapper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Robert McDowell
|
@@ -10,7 +10,9 @@ bindir: bin
|
|
10
10
|
cert_chain: []
|
11
11
|
date: 2016-04-02 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
|
-
description: A simple gem for scrapping links within an assigned website.
|
13
|
+
description: A simple gem for scrapping links within an assigned website. Results
|
14
|
+
for domain, external links, and invalid URLs can be saved as CSVs or returned as
|
15
|
+
a hash.
|
14
16
|
email: doodersrage@gmail.com
|
15
17
|
executables: []
|
16
18
|
extensions: []
|