link_scrapper 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. checksums.yaml +7 -0
  2. data/lib/link_scrapper.rb +209 -0
  3. metadata +44 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 85990ac2f989af0e103e598c16d09d72949e06e2
+   data.tar.gz: 5f9a3400af2843329e1e27f0f481b6288d651efc
+ SHA512:
+   metadata.gz: deb545e9fa82e03c2547063aa3007d2109b89cf9fada26ef81ac05176655f3828c0e7712a719cbdf9cc4a60fdb99133518569870721574f933a087fa141b6e28
+   data.tar.gz: 835cd0415fc1480d4956106247278681fc69e100582e59c4c77da7e7c5c67f1d3228241dc5c9f8287be1ee0e04f16bef0f716d48a8f73395bc39748a90e739bc
data/lib/link_scrapper.rb ADDED
@@ -0,0 +1,209 @@
+ require 'net/http'
+ require 'uri'
+ require 'csv'
+
+ # default search domain
+ SEARCH_DOMAIN = 'http://virginiabeachwebdevelopment.com/'
+
+ # class for grabbing and parsing domain links
+ class LinkScrapper
+
+   def initialize(search_domain = SEARCH_DOMAIN)
+
+     # init link store hashes
+     @search_index = 0
+     @search_iteration = 0
+     @links = Array.new
+     @checked_links = Hash.new
+     @error_links = Hash.new
+     @external_links = Hash.new
+
+     # gather search domain: a command-line argument wins, then the 'ue'
+     # sentinel prompts the user, otherwise use the constructor argument
+     if ARGV[0]
+       @search_domain = ARGV[0].dup
+     elsif search_domain == 'ue'
+       puts "Please enter a domain to search: (Default: #{SEARCH_DOMAIN})"
+       @search_domain = gets.chomp
+     else
+       @search_domain = (search_domain || SEARCH_DOMAIN).dup
+     end
+
+     # override with default domain if entry is left empty
+     @search_domain = SEARCH_DOMAIN.dup if @search_domain == ''
+
+     # get and store local domain string
+     @local_domain = @search_domain.match(/\w+\.\w+(?=\/|\s|$)/)
+
+     # verify domain entry includes protocol
+     if @search_domain !~ /^https?:/
+       @search_domain.insert(0, 'http://')
+     end
+
+     # verify trailing forward slash
+     if @search_domain[-1] != '/'
+       @search_domain << '/'
+     end
+
+     # configure initial search uri
+     @search_uri = @search_domain.dup
+
+     # start scan
+     get_links
+   end
+
+   # gather search uri
+   def get_search_uri
+     # do not override initial domain setting
+     if @search_iteration > 0
+       # set search uri
+       if !@links[@search_index].nil?
+         @search_uri = @links[@search_index][0].chomp
+       else
+         # save results and exit
+         save_results
+         exit
+       end
+
+       # check for direct link
+       if @search_uri =~ /^https?:/
+         # if external link, record the response and go to the next link
+         if @search_uri.index(@local_domain[0]).nil?
+           if !@external_links[@search_uri.to_sym]
+             begin
+               t1 = Time.now
+               response = Net::HTTP.get_response(URI.parse(URI::DEFAULT_PARSER.escape(@search_uri)))
+               delta = Time.now - t1
+               rescode = response.code
+             rescue StandardError
+               # treat any request failure as a timeout
+               rescode = 408
+               delta = 0
+             end
+             @external_links[@search_uri.to_sym] = { res: rescode, time: delta }
+           end
+           @skip = 1
+         end
+       else
+
+         # skip binary and non-html files
+         if @search_uri =~ /\.(flv|gif|jpg|png|mp3|mp4|m4v|pdf|zip|txt)$/i
+           @skip = 1
+         end
+
+         # check for mailto and tel links
+         if @search_uri.start_with?('mailto:', 'tel:')
+           @skip = 1
+         else
+           # strip indirect './' and '../' prefixes
+           # (protocol-agnostic '//' links are handled below)
+           @search_uri.sub!(/\A(\.\.?\/)+/, '')
+           # strip the leading slash from root-relative links
+           if @search_uri[0] == '/' && @search_uri[0, 2] != '//'
+             @search_uri[0] = ''
+           end
+           # verify uri portion is valid
+           if @search_uri !~ %r{^(//|[\w%#?])}
+             @search_index += 1
+             @skip = 1
+             puts "invalid uri #{@search_uri}"
+             return
+           end
+           # define uri string
+           if @search_uri[0, 2] != '//'
+             @search_uri = "#{@search_domain}#{@search_uri}"
+           else
+             # handle protocol agnostic link requests
+             if @search_domain[0, 6] == 'https:'
+               @search_uri = "https:#{@search_uri}"
+             else
+               @search_uri = "http:#{@search_uri}"
+             end
+           end
+         end
+       end
+       # increment search index value
+       @search_index += 1
+     end
+   end
+
+   # gather link data, looping until the queue of found links is exhausted
+   # (save_results and exit are triggered from get_search_uri)
+   def get_links
+     loop do
+       # init skip bit
+       @skip = 0
+
+       # define the current search uri
+       get_search_uri
+
+       # skip uris that have already been checked
+       if @checked_links[@search_uri.to_sym]
+         @skip = 1
+       end
+
+       # run link scan if @skip bit is not set
+       if @skip == 0
+
+         # let user know which uri is currently active
+         puts @search_uri
+
+         # gather page request response
+         begin
+           t1 = Time.now
+           response = Net::HTTP.get_response(URI.parse(URI::DEFAULT_PARSER.escape(@search_uri.strip)))
+           delta = Time.now - t1
+
+           # store response page body and code
+           body = response.body
+           code = response.code
+
+           # extract all anchor hrefs (with link text) within the page
+           links_array = body.scan(/<a[^>]+href\s*=\s*["']([^"']+)["'][^>]*>(.*?)<\/a>/mi)
+
+           # rewrite indirect hrefs as direct links
+           links_array.each { |val|
+             href = val[0]
+             unless href.start_with?('/', '//', 'http:', 'https:')
+               val[0] = "#{@search_uri}#{href}"
+             end
+           }
+
+           # combine found links with links array and remove duplicates
+           @links.concat(links_array)
+           @links.uniq!
+         rescue StandardError
+           # treat any request failure as a timeout
+           code = 408
+           delta = 0
+         end
+
+         # store results in checked hash
+         @checked_links[@search_uri.to_sym] = { res: code, time: delta }
+       end
+
+       # move on to the next found link
+       @search_iteration += 1
+     end
+   end
+
+   # save results to csvs
+   def save_results
+     # save search results
+     CSV.open('results.csv', 'wb') do |csv|
+       @checked_links.each do |uri, data|
+         csv << [uri, data[:res], data[:time]]
+       end
+     end
+
+     # save list of external links
+     CSV.open('external-links.csv', 'wb') do |csv|
+       @external_links.each do |uri, data|
+         csv << [uri, data[:res], data[:time]]
+       end
+     end
+   end
+
+ end
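
The file only defines the class, so a caller has to kick off the crawl. A minimal usage sketch (the URL is a placeholder; per the code above, the scan starts inside initialize, ARGV[0] overrides the constructor argument, and results land in the working directory):

    require 'link_scrapper'

    # crawl starts immediately; results.csv and external-links.csv are
    # written to the current directory once the link queue is exhausted
    LinkScrapper.new('https://example.com/')
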
metadata ADDED
@@ -0,0 +1,44 @@
+ --- !ruby/object:Gem::Specification
+ name: link_scrapper
+ version: !ruby/object:Gem::Version
+   version: 0.1.0
+ platform: ruby
+ authors:
+ - Robert McDowell
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2016-04-02 00:00:00.000000000 Z
+ dependencies: []
+ description: A simple gem for scrapping links within an assigned website.
+ email: doodersrage@gmail.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - lib/link_scrapper.rb
+ homepage: http://virginiabeachwebdevelopment.com
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.5.1
+ signing_key:
+ specification_version: 4
+ summary: Simple website link scrapper.
+ test_files: []