link_scrapper 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/link_scrapper.rb +43 -17
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 98b6d54cc75862b1e063a04e355a6f638deff14b
|
4
|
+
data.tar.gz: fbe3a8ecfbf27d81458862ec96543700e994a049
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 34296971fcebf4d437aba6703c36ca93a8599ee51d5ee73032e2e58217e8c25029feabb9c83c51f298fdbe5a27155196faaec2bf860e5751a228da843dcd499b
|
7
|
+
data.tar.gz: f3d82c1973ec70e37815ad88f35523b5edab699d0c473e7c8ce61e9387e83b1ffbf3ec5b456eb05001cc7e653282b88954790bc4136c79e24b818c8cabf05b28
|
data/lib/link_scrapper.rb
CHANGED
@@ -7,9 +7,15 @@ SEARCH_DOMAIN = 'http://virginiabeachwebdevelopment.com/'
|
|
7
7
|
# class for grabbing and parsing domain links
|
8
8
|
class LinkScrapper
|
9
9
|
|
10
|
-
def initialize(
|
10
|
+
def initialize(settings)
|
11
|
+
|
12
|
+
# available default settings
|
13
|
+
# domain: domain to be searched
|
14
|
+
# verbose: prints output as the script goes along
|
15
|
+
# results: hash or csv
|
11
16
|
|
12
17
|
# init link store hashes
|
18
|
+
@settings = settings;
|
13
19
|
@search_index = 0
|
14
20
|
@search_iteration = 0
|
15
21
|
@links = Array.new
|
@@ -20,11 +26,11 @@ class LinkScrapper
|
|
20
26
|
# gather search domain
|
21
27
|
if ARGV[0]
|
22
28
|
@search_domain = ARGV[0].dup
|
23
|
-
elsif
|
29
|
+
elsif @settings[:domain] == 'ue'
|
24
30
|
puts "Please enter a domain to search: (Default: #{SEARCH_DOMAIN})"
|
25
31
|
@search_domain = gets.chomp
|
26
|
-
elsif
|
27
|
-
@search_domain =
|
32
|
+
elsif @settings[:domain]
|
33
|
+
@search_domain = @settings[:domain]
|
28
34
|
end
|
29
35
|
|
30
36
|
# override with default domain if entry is left empty
|
@@ -59,12 +65,17 @@ class LinkScrapper
|
|
59
65
|
@search_uri = @links[@search_index][0].chomp
|
60
66
|
else
|
61
67
|
# save results and exit
|
62
|
-
|
68
|
+
if @settings[:results] == 'csv'
|
69
|
+
save_results
|
70
|
+
else
|
71
|
+
return { checked_links: @checked_links, error_links: @error_links, external_links: @external_links}
|
72
|
+
end
|
63
73
|
exit
|
64
74
|
end
|
65
75
|
|
66
76
|
# check for direct link
|
67
77
|
if @search_uri =~ /^htt(p|ps):/
|
78
|
+
|
68
79
|
# if external link go to next link
|
69
80
|
if @search_uri.index(@local_domain[0]) == nil
|
70
81
|
if !@external_links[@search_uri.to_sym]
|
@@ -73,11 +84,11 @@ class LinkScrapper
|
|
73
84
|
response = Net::HTTP.get_response(URI.parse(URI.encode(@search_uri)))
|
74
85
|
t2 = Time.now
|
75
86
|
delta = t2 - t1
|
76
|
-
|
87
|
+
code = response.code
|
77
88
|
rescue => ex
|
78
|
-
|
89
|
+
code = 408
|
79
90
|
end
|
80
|
-
@external_links[@search_uri.to_sym] = {res:
|
91
|
+
@external_links[@search_uri.to_sym] = {res: code, time: delta}
|
81
92
|
end
|
82
93
|
@skip = 1
|
83
94
|
end
|
@@ -93,8 +104,17 @@ class LinkScrapper
|
|
93
104
|
@skip = 1
|
94
105
|
else
|
95
106
|
# check for protocol agnostic and indirect links
|
96
|
-
|
97
|
-
|
107
|
+
case @search_uri[0,1]
|
108
|
+
when '.'
|
109
|
+
@search_uri[0,1] = ''
|
110
|
+
end
|
111
|
+
case @search_uri[0,2]
|
112
|
+
when '//', './', '..'
|
113
|
+
@search_uri[0,2] = ''
|
114
|
+
end
|
115
|
+
case @search_uri[0,3]
|
116
|
+
when '../'
|
117
|
+
@search_uri[0,3] = ''
|
98
118
|
end
|
99
119
|
# check for relative link
|
100
120
|
if @search_uri[0] == '/'
|
@@ -104,7 +124,8 @@ class LinkScrapper
|
|
104
124
|
if @search_uri !~ /^([\w]|%|#|\?)/
|
105
125
|
@search_index += 1
|
106
126
|
@skip = 1
|
107
|
-
|
127
|
+
@error_links[@search_uri] = ''
|
128
|
+
puts "invalid uri #{@search_uri}" if @settings[:verbose]
|
108
129
|
return
|
109
130
|
end
|
110
131
|
# define uri string
|
@@ -143,7 +164,7 @@ class LinkScrapper
|
|
143
164
|
if @skip == 0
|
144
165
|
|
145
166
|
# let user know which uri is currently active
|
146
|
-
puts @search_uri
|
167
|
+
puts @search_uri if @settings[:verbose]
|
147
168
|
|
148
169
|
# gather page request response
|
149
170
|
begin
|
@@ -193,15 +214,20 @@ class LinkScrapper
|
|
193
214
|
def save_results
|
194
215
|
# save search results
|
195
216
|
CSV.open('results.csv', 'wb') {|csv|
|
196
|
-
@checked_links.each {|
|
197
|
-
csv << [
|
217
|
+
@checked_links.each {|link|
|
218
|
+
csv << [link[0], link[1][:res], link[1][:time]]
|
198
219
|
}
|
199
220
|
}
|
200
|
-
|
201
221
|
# save list of external links
|
202
222
|
CSV.open('external-links.csv', 'wb') {|csv|
|
203
|
-
@external_links.each do |
|
204
|
-
csv << [
|
223
|
+
@external_links.each do |link|
|
224
|
+
csv << [link[0], link[1][:res], link[1][:time]]
|
225
|
+
end
|
226
|
+
}
|
227
|
+
# save list of invalid links
|
228
|
+
CSV.open('invalid.csv', 'wb') {|csv|
|
229
|
+
@error_links.each do |link|
|
230
|
+
csv << link
|
205
231
|
end
|
206
232
|
}
|
207
233
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: link_scrapper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Robert McDowell
|
@@ -10,7 +10,9 @@ bindir: bin
|
|
10
10
|
cert_chain: []
|
11
11
|
date: 2016-04-02 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
|
-
description: A simple gem for scrapping links within an assigned website.
|
13
|
+
description: A simple gem for scrapping links within an assigned website. Results
|
14
|
+
for domain, external links, and invalid URLs can be saved as CSVs or returned as
|
15
|
+
a hash.
|
14
16
|
email: doodersrage@gmail.com
|
15
17
|
executables: []
|
16
18
|
extensions: []
|