link_scrapper 0.1.0

Files changed (3)
  1. checksums.yaml +7 -0
  2. data/lib/link_scrapper.rb +209 -0
  3. metadata +44 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 85990ac2f989af0e103e598c16d09d72949e06e2
+   data.tar.gz: 5f9a3400af2843329e1e27f0f481b6288d651efc
+ SHA512:
+   metadata.gz: deb545e9fa82e03c2547063aa3007d2109b89cf9fada26ef81ac05176655f3828c0e7712a719cbdf9cc4a60fdb99133518569870721574f933a087fa141b6e28
+   data.tar.gz: 835cd0415fc1480d4956106247278681fc69e100582e59c4c77da7e7c5c67f1d3228241dc5c9f8287be1ee0e04f16bef0f716d48a8f73395bc39748a90e739bc
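These digests are what RubyGems uses to detect a corrupted or tampered package. As a minimal sketch, assuming the gem has been fetched with `gem fetch link_scrapper` and its inner metadata.gz and data.tar.gz archives have been extracted next to this checksums.yaml (those local file names are illustrative, not part of this diff), the SHA512 entries could be re-checked like this:

  require 'digest'
  require 'yaml'

  # published digests from checksums.yaml
  checksums = YAML.load_file('checksums.yaml')

  # recompute and compare the SHA512 digest of each archive
  %w[metadata.gz data.tar.gz].each do |name|
    expected = checksums['SHA512'][name]
    actual   = Digest::SHA512.file(name).hexdigest
    puts "#{name}: #{expected == actual ? 'OK' : 'MISMATCH'}"
  end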
data/lib/link_scrapper.rb ADDED
@@ -0,0 +1,209 @@
+ require 'net/http'
+ require 'csv'
+
+ # default search domain
+ SEARCH_DOMAIN = 'http://virginiabeachwebdevelopment.com/'
+
+ # class for grabbing and parsing domain links
+ class LinkScrapper
+
+   def initialize(search_domain = SEARCH_DOMAIN)
+
+     # init link store hashes
+     @search_index = 0
+     @search_iteration = 0
+     @links = Array.new
+     @checked_links = Hash.new
+     @error_links = Hash.new
+     @external_links = Hash.new
+
+     # gather search domain ('ue' is checked before the general case,
+     # otherwise the interactive prompt is unreachable)
+     if ARGV[0]
+       @search_domain = ARGV[0].dup
+     elsif search_domain == 'ue'
+       puts "Please enter a domain to search: (Default: #{SEARCH_DOMAIN})"
+       @search_domain = gets.chomp
+     elsif search_domain
+       @search_domain = search_domain
+     end
+
+     # override with default domain if entry is left empty
+     @search_domain = SEARCH_DOMAIN if @search_domain == ''
+
+     # get and store local domain string
+     @local_domain = @search_domain.match(/\w+\.\w+(?=\/|\s|$)/)
+
+     # configure initial search uri
+     @search_uri = @search_domain
+
+     # verify domain entry includes protocol
+     if @search_uri !~ /^htt(p|ps):/
+       @search_uri.insert(0, 'http://')
+     end
+
+     # verify trailing forward slash
+     if @search_uri[@search_uri.length-1] != '/'
+       @search_uri << '/'
+     end
+
+     # start scan
+     get_links
+   end
+
+   # gather search uri
+   def get_search_uri
+     # do not override initial domain setting
+     if @search_iteration > 0
+       # set search uri
+       if !@links[@search_index].nil?
+         @search_uri = @links[@search_index][0].chomp
+       else
+         # save results and exit
+         save_results
+         exit
+       end
+
+       # check for direct link
+       if @search_uri =~ /^htt(p|ps):/
+         # if external link go to next link
+         if @search_uri.index(@local_domain[0]) == nil
+           if !@external_links[@search_uri.to_sym]
+             begin
+               t1 = Time.now
+               response = Net::HTTP.get_response(URI.parse(URI.encode(@search_uri)))
+               t2 = Time.now
+               delta = t2 - t1
+               rescode = response.code
+             rescue
+               # request failed; record a 408 and the elapsed time
+               rescode = 408
+               delta = Time.now - t1
+             end
+             @external_links[@search_uri.to_sym] = {res: rescode, time: delta}
+           end
+           @skip = 1
+         end
+       else
+
+         # skip various static file types
+         if @search_uri =~ /\.(flv|gif|jpg|png|mp3|mp4|m4v|pdf|zip|txt)$/i
+           @skip = 1
+         end
+
+         # check for mailto and tel links
+         if @search_uri[0,7] == 'mailto:' || @search_uri[0,4] == 'tel:'
+           @skip = 1
+         else
+           # check for protocol agnostic and indirect links
+           if @search_uri[0,2] == '//' || @search_uri[0,2] == './'
+             @search_uri[0,2] = ""
+           elsif @search_uri[0,3] == '../'
+             @search_uri[0,3] = ""
+           end
+           # check for relative link
+           if @search_uri[0] == '/'
+             @search_uri[0] = ''
+           end
+           # verify uri portion is valid
+           if @search_uri !~ /^([\w]|%|#|\?)/
+             @search_index += 1
+             @skip = 1
+             puts "invalid uri #{@search_uri}"
+             return
+           end
+           # define uri string
+           if @search_uri[0,2] != '//'
+             @search_uri = "#{@search_domain}#{@search_uri}"
+           else
+             # handle protocol agnostic link requests
+             if @search_domain[0,6] == 'https:'
+               @search_uri = "https:#{@search_uri}"
+             else
+               @search_uri = "http:#{@search_uri}"
+             end
+           end
+         end
+       end
+       # increment search index value
+       @search_index += 1
+     end
+   end
+
+   # gather link data
+   def get_links
+
+     # init skip bit
+     @skip = 0
+
+     # define search uri if undefined
+     get_search_uri
+
+     # check for existing uri hash index
+     if @checked_links[@search_uri.to_sym]
+       @skip = 1
+     end
+
+     # run link scan if @skip bit is not set
+     if @skip == 0
+
+       # let user know which uri is currently active
+       puts @search_uri
+
+       # gather page request response
+       begin
+         t1 = Time.now
+         response = Net::HTTP.get_response(URI.parse(URI.encode(@search_uri.strip)))
+         t2 = Time.now
+         delta = t2 - t1
+
+         # store response page body
+         body = response.body
+
+         # store response code
+         code = response.code
+
+         # extract all links within page
+         links_array = body.scan(/<a[^>]+href\s*=\s*["']([^"']+)["'][^>]*>(.*?)<\/a>/mi)
+
+         # update anchors and indirect links to use direct links
+         links_array.each { |val|
+           href = val[0]
+           if href[0] != '/' && href !~ /^htt(p|ps):/ && href[0,2] != '//'
+             val[0] = "#{@search_uri}#{href}"
+           end
+         }
+
+         # combine found links with links array
+         @links.concat(links_array)
+
+         # remove duplicates
+         @links.uniq!
+
+       rescue
+         # request failed; record a 408 and the elapsed time
+         code = 408
+         delta = Time.now - t1
+       end
+
+       # store results in checked hash
+       @checked_links[@search_uri.to_sym] = {res: code, time: delta}
+
+     end
+
+     # iterate through found links
+     @search_iteration += 1
+     get_links
+
+   end
+
+   # save results to csvs
+   def save_results
+     # save search results
+     CSV.open('results.csv', 'wb') { |csv|
+       @checked_links.each { |uri, data|
+         csv << [uri, data[:res], data[:time]]
+       }
+     }
+
+     # save list of external links
+     CSV.open('external-links.csv', 'wb') { |csv|
+       @external_links.each do |uri, data|
+         csv << [uri, data[:res], data[:time]]
+       end
+     }
+   end
+
+ end
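Because the constructor both starts the crawl and, via save_results, writes results.csv and external-links.csv to the working directory, basic usage is just instantiating the class. A minimal sketch (the target URL is illustrative):

  require 'link_scrapper'

  # crawl the site; when the link queue is exhausted the gem writes
  # results.csv and external-links.csv and exits
  LinkScrapper.new('http://example.com/')

Note that initialize checks ARGV[0] first, so a command-line argument passed to the calling script takes precedence over the constructor parameter.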
metadata ADDED
@@ -0,0 +1,44 @@
+ --- !ruby/object:Gem::Specification
+ name: link_scrapper
+ version: !ruby/object:Gem::Version
+   version: 0.1.0
+ platform: ruby
+ authors:
+ - Robert McDowell
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2016-04-02 00:00:00.000000000 Z
+ dependencies: []
+ description: A simple gem for scrapping links within an assigned website.
+ email: doodersrage@gmail.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - lib/link_scrapper.rb
+ homepage: http://virginiabeachwebdevelopment.com
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.5.1
+ signing_key:
+ specification_version: 4
+ summary: Simple website link scrapper.
+ test_files: []
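This metadata block is generated by RubyGems at build time from the project's gemspec. A roughly equivalent link_scrapper.gemspec, reconstructed from the fields above rather than taken from the gem's source, would look like:

  Gem::Specification.new do |s|
    s.name        = 'link_scrapper'
    s.version     = '0.1.0'
    s.date        = '2016-04-02'
    s.summary     = 'Simple website link scrapper.'
    s.description = 'A simple gem for scrapping links within an assigned website.'
    s.authors     = ['Robert McDowell']
    s.email       = 'doodersrage@gmail.com'
    s.files       = ['lib/link_scrapper.rb']
    s.homepage    = 'http://virginiabeachwebdevelopment.com'
    s.license     = 'MIT'
  end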