link_scrapper 0.1.0
- checksums.yaml +7 -0
- data/lib/link_scrapper.rb +209 -0
- metadata +44 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
---
SHA1:
  metadata.gz: 85990ac2f989af0e103e598c16d09d72949e06e2
  data.tar.gz: 5f9a3400af2843329e1e27f0f481b6288d651efc
SHA512:
  metadata.gz: deb545e9fa82e03c2547063aa3007d2109b89cf9fada26ef81ac05176655f3828c0e7712a719cbdf9cc4a60fdb99133518569870721574f933a087fa141b6e28
  data.tar.gz: 835cd0415fc1480d4956106247278681fc69e100582e59c4c77da7e7c5c67f1d3228241dc5c9f8287be1ee0e04f16bef0f716d48a8f73395bc39748a90e739bc
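These digests cover the gem's two inner archives, metadata.gz and data.tar.gz (a .gem file is a plain tar containing both). As a minimal sketch, assuming metadata.gz has already been extracted from the downloaded .gem, one of the digests above could be recomputed like this:

# illustration only: recompute and compare one digest.
# assumes 'metadata.gz' was extracted from the .gem tar beforehand.
require 'digest'

expected = '85990ac2f989af0e103e598c16d09d72949e06e2'
actual   = Digest::SHA1.hexdigest(File.binread('metadata.gz'))

puts(actual == expected ? 'checksum OK' : 'checksum MISMATCH')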
data/lib/link_scrapper.rb
ADDED
@@ -0,0 +1,209 @@
require 'net/http'
require 'csv'

# default search domain
SEARCH_DOMAIN = 'http://virginiabeachwebdevelopment.com/'

# class for grabbing and parsing domain links
class LinkScrapper

  def initialize(search_domain = SEARCH_DOMAIN)

    # init link store hashes
    @search_index = 0
    @search_iteration = 0
    @links = Array.new
    @checked_links = Hash.new
    @error_links = Hash.new
    @external_links = Hash.new

    # gather search domain: a command-line argument wins, 'ue' prompts
    # for user entry, anything else is used directly
    if ARGV[0]
      @search_domain = ARGV[0].dup
    elsif search_domain == 'ue'
      puts "Please enter a domain to search: (Default: #{SEARCH_DOMAIN})"
      @search_domain = gets.chomp
    elsif search_domain
      @search_domain = search_domain
    end

    # override with default domain if entry is left empty
    @search_domain = SEARCH_DOMAIN if @search_domain == ''

    # get and store local domain string
    @local_domain = @search_domain.match(/\w+\.\w+(?=\/|\s|$)/)

    # configure initial search uri
    @search_uri = @search_domain

    # verify domain entry includes protocol
    if @search_uri !~ /^htt(p|ps):/
      @search_uri.insert(0, 'http://')
    end

    # verify trailing forward slash
    if @search_uri[@search_uri.length - 1] != '/'
      @search_uri << '/'
    end

    # start scan
    get_links
  end

  # gather search uri
  def get_search_uri
    # do not override initial domain setting
    if @search_iteration > 0
      # set search uri
      if !@links[@search_index].nil?
        @search_uri = @links[@search_index][0].chomp
      else
        # save results and exit
        save_results
        exit
      end

      # check for direct link
      if @search_uri =~ /^htt(p|ps):/
        # if external link, record its response once and go to next link
        if @search_uri.index(@local_domain[0]).nil?
          if !@external_links[@search_uri.to_sym]
            begin
              t1 = Time.now
              response = Net::HTTP.get_response(URI.parse(URI.encode(@search_uri)))
              delta = Time.now - t1
              rescode = response.code
            rescue
              # treat any request failure as a timeout
              rescode = 408
              delta = Time.now - t1
            end
            @external_links[@search_uri.to_sym] = { res: rescode, time: delta }
          end
          @skip = 1
        end
      else
        # skip various file types
        if @search_uri =~ /\.(flv|gif|jpg|png|mp3|mp4|m4v|pdf|zip|txt)$/i
          @skip = 1
        end

        # check for mailto and tel links
        if @search_uri[0, 7] == 'mailto:' || @search_uri[0, 4] == 'tel:'
          @skip = 1
        else
          # handle protocol agnostic link requests
          if @search_uri[0, 2] == '//'
            if @search_domain[0, 6] == 'https:'
              @search_uri = "https:#{@search_uri}"
            else
              @search_uri = "http:#{@search_uri}"
            end
          else
            # strip indirect link prefixes
            if @search_uri[0, 2] == './'
              @search_uri[0, 2] = ''
            elsif @search_uri[0, 3] == '../'
              @search_uri[0, 3] = ''
            end
            # check for relative link
            if @search_uri[0] == '/'
              @search_uri[0] = ''
            end
            # verify uri portion is valid
            if @search_uri !~ /^([\w]|%|#|\?)/
              @search_index += 1
              @skip = 1
              puts "invalid uri #{@search_uri}"
              return
            end
            # prepend the search domain to build a direct uri
            @search_uri = "#{@search_domain}#{@search_uri}"
          end
        end
      end
      # increment search index value
      @search_index += 1
    end
  end

  # gather link data
  def get_links

    # init skip bit
    @skip = 0

    # define search uri if undefined
    get_search_uri

    # skip uris that have already been checked
    if @checked_links[@search_uri.to_sym]
      @skip = 1
    end

    # run link scan if @skip bit is not set
    if @skip == 0

      # let user know which uri is currently active
      puts @search_uri

      # gather page request response
      begin
        t1 = Time.now
        response = Net::HTTP.get_response(URI.parse(URI.encode(@search_uri.strip)))
        delta = Time.now - t1

        # store response body and code
        body = response.body
        code = response.code

        # extract all links within page as [href, anchor text] pairs
        links_array = body.scan(/<a[^>]+href\s*=\s*["']([^"']+)["'][^>]*>(.*?)<\/a>/mi)

        # update relative hrefs to direct links
        links_array.each { |val|
          href = val[0]
          if href[0] != '/' && href !~ /^htt(p|ps):/ && href[0, 2] != '//'
            val[0] = "#{@search_uri}#{href}"
          end
        }

        # combine found links with links array and remove duplicates
        @links.concat(links_array)
        @links.uniq!
      rescue
        # treat any request failure as a timeout
        code = 408
        delta = Time.now - t1
      end

      # store results in checked hash
      @checked_links[@search_uri.to_sym] = { res: code, time: delta }

    end

    # iterate through found links
    @search_iteration += 1
    get_links

  end

  # save results to csvs
  def save_results
    # save search results
    CSV.open('results.csv', 'wb') { |csv|
      @checked_links.each { |uri, data|
        csv << [uri, data[:res], data[:time]]
      }
    }

    # save list of external links
    CSV.open('external-links.csv', 'wb') { |csv|
      @external_links.each do |uri, data|
        csv << [uri, data[:res], data[:time]]
      end
    }
  end

end
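A minimal usage sketch, assuming the gem is on the load path. Instantiating the class starts the crawl immediately, and results.csv plus external-links.csv are written to the working directory once the link queue is exhausted:

require 'link_scrapper'

# scan a site; pass a domain or rely on the SEARCH_DOMAIN default.
# note: a command-line argument (ARGV[0]) takes precedence over the parameter,
# and passing 'ue' prompts for a domain interactively.
LinkScrapper.new('http://example.com/')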
metadata
ADDED
@@ -0,0 +1,44 @@
--- !ruby/object:Gem::Specification
name: link_scrapper
version: !ruby/object:Gem::Version
  version: 0.1.0
platform: ruby
authors:
- Robert McDowell
autorequire:
bindir: bin
cert_chain: []
date: 2016-04-02 00:00:00.000000000 Z
dependencies: []
description: A simple gem for scrapping links within an assigned website.
email: doodersrage@gmail.com
executables: []
extensions: []
extra_rdoc_files: []
files:
- lib/link_scrapper.rb
homepage: http://virginiabeachwebdevelopment.com
licenses:
- MIT
metadata: {}
post_install_message:
rdoc_options: []
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
requirements: []
rubyforge_project:
rubygems_version: 2.5.1
signing_key:
specification_version: 4
summary: Simple website link scrapper.
test_files: []
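For reference, a gemspec sketch that would produce metadata like the above. This is a reconstruction from the YAML; the original .gemspec file is not part of this diff:

# link_scrapper.gemspec (reconstruction, not the original file)
Gem::Specification.new do |s|
  s.name        = 'link_scrapper'
  s.version     = '0.1.0'
  s.date        = '2016-04-02'
  s.summary     = 'Simple website link scrapper.'
  s.description = 'A simple gem for scrapping links within an assigned website.'
  s.authors     = ['Robert McDowell']
  s.email       = 'doodersrage@gmail.com'
  s.files       = ['lib/link_scrapper.rb']
  s.homepage    = 'http://virginiabeachwebdevelopment.com'
  s.license     = 'MIT'
end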