link_scrapper 0.1.0
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/link_scrapper.rb +209 -0
- metadata +44 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 85990ac2f989af0e103e598c16d09d72949e06e2
+  data.tar.gz: 5f9a3400af2843329e1e27f0f481b6288d651efc
+SHA512:
+  metadata.gz: deb545e9fa82e03c2547063aa3007d2109b89cf9fada26ef81ac05176655f3828c0e7712a719cbdf9cc4a60fdb99133518569870721574f933a087fa141b6e28
+  data.tar.gz: 835cd0415fc1480d4956106247278681fc69e100582e59c4c77da7e7c5c67f1d3228241dc5c9f8287be1ee0e04f16bef0f716d48a8f73395bc39748a90e739bc
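These digests can be recomputed from a downloaded copy of the gem. A minimal verification sketch in Ruby, assuming a link_scrapper-0.1.0.gem file in the working directory (a .gem is a plain tar archive, and its metadata.gz and data.tar.gz members are what the digests above cover):

# Sketch: recompute the SHA1/SHA512 digests listed in checksums.yaml
# from a locally downloaded gem file (the filename is an assumption).
require 'digest'
require 'rubygems/package'

File.open('link_scrapper-0.1.0.gem', 'rb') do |io|
  Gem::Package::TarReader.new(io).each do |entry|
    next unless ['metadata.gz', 'data.tar.gz'].include?(entry.full_name)
    data = entry.read
    puts "#{entry.full_name}:"
    puts "  SHA1:   #{Digest::SHA1.hexdigest(data)}"
    puts "  SHA512: #{Digest::SHA512.hexdigest(data)}"
  end
end

The output should match the four digests above byte for byte; a mismatch means the downloaded file does not correspond to the published release.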
data/lib/link_scrapper.rb
ADDED
@@ -0,0 +1,209 @@
+require 'net/http'
+require 'csv'
+
+# default search domain
+SEARCH_DOMAIN = 'http://virginiabeachwebdevelopment.com/'
+
+# class for grabbing and parsing domain links
+class LinkScrapper
+
+  def initialize(search_domain = SEARCH_DOMAIN)
+
+    # init link store hashes
+    @search_index = 0
+    @search_iteration = 0
+    @links = Array.new
+    @checked_links = Hash.new
+    @error_links = Hash.new
+    @external_links = Hash.new
+
+    # gather search domain
+    if ARGV[0]
+      @search_domain = ARGV[0].dup
+    elsif search_domain
+      @search_domain = search_domain
+    elsif search_domain == 'ue'
+      puts "Please enter a domain to search: (Default: #{SEARCH_DOMAIN})"
+      @search_domain = gets.chomp
+    end
+
+    # override with default domain if entry is left empty
+    @search_domain = SEARCH_DOMAIN if @search_domain == ''
+
+    # get and store local domain string
+    @local_domain = @search_domain.match(/\w+\.\w+(?=\/|\s|$)/)
+
+    # configure initial search uri
+    @search_uri = @search_domain
+
+    # verify domain entry includes protocol
+    if @search_uri !~ /^htt(p|ps):/
+      @search_uri.insert(0, 'http://')
+    end
+
+    # verify trailing forward slash
+    if @search_uri[@search_uri.length-1] != '/'
+      @search_uri << '/'
+    end
+
+    # start scan
+    get_links
+  end
+
+  # gather search uri
+  def get_search_uri
+    # do not override initial domain setting
+    if @search_iteration > 0
+      # set search uri
+      if !@links[@search_index].nil?
+        @search_uri = @links[@search_index][0].chomp
+      else
+        # save results and exit
+        save_results
+        exit
+      end
+
+      # check for direct link
+      if @search_uri =~ /^htt(p|ps):/
+        # if external link go to next link
+        if @search_uri.index(@local_domain[0]) == nil
+          if !@external_links[@search_uri.to_sym]
+            begin
+              t1 = Time.now
+              response = Net::HTTP.get_response(URI.parse(URI.encode(@search_uri)))
+              t2 = Time.now
+              delta = t2 - t1
+              rescode = response.code
+            rescue => ex
+              rescode = 408
+            end
+            @external_links[@search_uri.to_sym] = {res: rescode, time: delta}
+          end
+          @skip = 1
+        end
+      else
+
+        # skip various files
+        if @search_uri =~ /[^\s]+(\.(?i)flv|gif|jpg|png|mp3|mp4|m4v|pdf|zip|txt)$/
+          @skip = 1
+        end
+
+        # check for mailto or tel link
+        if @search_uri[0,7] == 'mailto:' || @search_uri[0,4] == 'tel:'
+          @skip = 1
+        else
+          # check for protocol agnostic and indirect links
+          if @search_uri[0,2] == '//' || @search_uri[0,2] == './' || @search_uri[0,3] == '../'
+            @search_uri[0,2] = ""
+          end
+          # check for relative link
+          if @search_uri[0] == '/'
+            @search_uri[0] = ''
+          end
+          # verify uri portion is valid
+          if @search_uri !~ /^([\w]|%|#|\?)/
+            @search_index += 1
+            @skip = 1
+            puts "invalid uri #{@search_uri}"
+            return
+          end
+          # define uri string
+          if @search_uri[0,2] != '//'
+            @search_uri = "#{@search_domain}#{@search_uri}"
+          else
+            # handle protocol agnostic link requests
+            if @search_domain[0,6] == 'https:'
+              @search_uri = "https:#{@search_uri}"
+            else
+              @search_uri = "http:#{@search_uri}"
+            end
+          end
+        end
+      end
+      # increment search index value
+      @search_index += 1
+    end
+  end
+
+  # gather link data
+  def get_links
+
+    # init skip bit
+    @skip = 0
+
+    # define search uri if undefined
+    get_search_uri
+
+    # check for existing uri hash index
+    if @checked_links[@search_uri.to_sym]
+      @skip = 1
+    end
+
+    # run link scan if @skip bit is not set
+    if @skip == 0
+
+      # let user know which uri is currently active
+      puts @search_uri
+
+      # gather page request response
+      begin
+        t1 = Time.now
+        response = Net::HTTP.get_response(URI.parse(URI.encode(@search_uri.strip)))
+        t2 = Time.now
+        delta = t2 - t1
+
+        # store response page body
+        body = response.body
+
+        # store response code
+        code = response.code
+
+        # extract all links within page
+        links_array = body.scan(/<a[^>]+href\s*=\s*["']([^"']+)["'][^>]*>(.*?)<\/a>/mi)
+
+        # update anchors and indirect links to use direct links
+        links_array.each { |val|
+          if val[0] != '/' || val !~ /^htt(p|ps):/ || val[0,2] != '//'
+            val = "#{@search_uri}#{val}"
+          end
+        }
+
+        # combine found links with links array
+        @links.concat(links_array)
+
+        # remove duplicates
+        @links.uniq!
+
+      rescue => ex
+        rescode = 408
+      end
+
+      # store results in checked hash
+      @checked_links[@search_uri.to_sym] = {res: code, time: delta}
+
+    end
+
+    # iterate through found links
+    @search_iteration += 1
+    get_links
+
+  end
+
+  # save results to csvs
+  def save_results
+    # save search results
+    CSV.open('results.csv', 'wb') {|csv|
+      @checked_links.each {|key|
+        csv << [key[0], key[1][:res], key[1][:time]]
+      }
+    }
+
+    # save list of external links
+    CSV.open('external-links.csv', 'wb') {|csv|
+      @external_links.each do |key|
+        csv << [key[0], key[1][:res], key[1][:time]]
+      end
+    }
+  end
+
+end
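For reference on how this file is driven: the crawl starts from the constructor, so instantiating the class immediately scans the given domain (ARGV[0] takes precedence when present), and save_results writes results.csv and external-links.csv before exiting once the link queue is exhausted. Note that URI.encode was removed in Ruby 3.0, so the code as published needs an older Ruby, and the 'ue' prompt branch is unreachable because any truthy search_domain is caught by the preceding elsif. A minimal usage sketch against a placeholder site:

# Sketch: run a scan (example.com is a placeholder). The constructor blocks
# until every discovered internal link has been visited, then the process
# writes results.csv and external-links.csv and exits.
require 'link_scrapper'

LinkScrapper.new('http://example.com/')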
metadata
ADDED
@@ -0,0 +1,44 @@
+--- !ruby/object:Gem::Specification
+name: link_scrapper
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+platform: ruby
+authors:
+- Robert McDowell
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2016-04-02 00:00:00.000000000 Z
+dependencies: []
+description: A simple gem for scrapping links within an assigned website.
+email: doodersrage@gmail.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- lib/link_scrapper.rb
+homepage: http://virginiabeachwebdevelopment.com
+licenses:
+- MIT
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.5.1
+signing_key:
+specification_version: 4
+summary: Simple website link scrapper.
+test_files: []
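For illustration, the specification above corresponds to a gemspec along these lines; the package does not ship its .gemspec source, so this is a hypothetical reconstruction from the published metadata:

# Hypothetical link_scrapper.gemspec reconstructed from the metadata above;
# the gem's actual gemspec source is not included in the package.
Gem::Specification.new do |s|
  s.name        = 'link_scrapper'
  s.version     = '0.1.0'
  s.authors     = ['Robert McDowell']
  s.email       = 'doodersrage@gmail.com'
  s.summary     = 'Simple website link scrapper.'
  s.description = 'A simple gem for scrapping links within an assigned website.'
  s.homepage    = 'http://virginiabeachwebdevelopment.com'
  s.license     = 'MIT'
  s.files       = ['lib/link_scrapper.rb']
end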