link_scrapper 0.1.1 → 0.1.2

Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/link_scrapper.rb +43 -17
  3. metadata +4 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 6117f7af82850c7d9d9072ab59837b266a4c82ae
-  data.tar.gz: c4e01ea6b7d55f4bc7e51a15f131b306bab0246d
+  metadata.gz: 98b6d54cc75862b1e063a04e355a6f638deff14b
+  data.tar.gz: fbe3a8ecfbf27d81458862ec96543700e994a049
 SHA512:
-  metadata.gz: 8de711206d0270bfb23ac104684822355069ce0609b402157454e32c4dad4bf5c833b93054cc6546c9b9d42d3295d08a37c54c9fd90d2a0283bdfdbd85e4c7da
-  data.tar.gz: 17bf223f967f0eb4573a36f5013c825181c82861382cf1f7f5caa28fdc129af9d93267d1928ffc4fd79e1a8f36f8e259f128393dabffeb381e2cc1e0661bd3d4
+  metadata.gz: 34296971fcebf4d437aba6703c36ca93a8599ee51d5ee73032e2e58217e8c25029feabb9c83c51f298fdbe5a27155196faaec2bf860e5751a228da843dcd499b
+  data.tar.gz: f3d82c1973ec70e37815ad88f35523b5edab699d0c473e7c8ce61e9387e83b1ffbf3ec5b456eb05001cc7e653282b88954790bc4136c79e24b818c8cabf05b28
data/lib/link_scrapper.rb CHANGED
@@ -7,9 +7,15 @@ SEARCH_DOMAIN = 'http://virginiabeachwebdevelopment.com/'
 # class for grabbing and parsing domain links
 class LinkScrapper
 
-  def initialize(search_domain = SEARCH_DOMAIN)
+  def initialize(settings)
+
+    # available default settings
+    # domain: domain to be searched
+    # verbose: prints output as the script goes along
+    # results: hash or csv
 
     # init link store hashes
+    @settings = settings;
     @search_index = 0
     @search_iteration = 0
     @links = Array.new
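
The initializer now takes a settings hash rather than a bare domain string. A minimal usage sketch, assuming the gem is loaded via require 'link_scrapper'; the option semantics follow the comment in this hunk:

require 'link_scrapper'

# option keys per the comment above (assumed semantics):
#   domain:  site to crawl; the literal string 'ue' prompts for one
#   verbose: print each URI as it is processed
#   results: 'csv' writes CSV files; anything else returns a hash
scrapper = LinkScrapper.new(
  domain:  'http://virginiabeachwebdevelopment.com/',
  verbose: true,
  results: 'csv'
)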
@@ -20,11 +26,11 @@ class LinkScrapper
     # gather search domain
     if ARGV[0]
       @search_domain = ARGV[0].dup
-    elsif search_domain == 'ue'
+    elsif @settings[:domain] == 'ue'
       puts "Please enter a domain to search: (Default: #{SEARCH_DOMAIN})"
       @search_domain = gets.chomp
-    elsif search_domain
-      @search_domain = search_domain
+    elsif @settings[:domain]
+      @search_domain = @settings[:domain]
     end
 
     # override with default domain if entry is left empty
@@ -59,12 +65,17 @@ class LinkScrapper
       @search_uri = @links[@search_index][0].chomp
     else
       # save results and exit
-      save_results
+      if @settings[:results] == 'csv'
+        save_results
+      else
+        return { checked_links: @checked_links, error_links: @error_links, external_links: @external_links }
+      end
       exit
     end
 
     # check for direct link
     if @search_uri =~ /^htt(p|ps):/
+
       # if external link go to next link
       if @search_uri.index(@local_domain[0]) == nil
         if !@external_links[@search_uri.to_sym]
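
With results set to anything other than 'csv', the crawl now hands back a hash of the three link stores instead of writing files. A hedged sketch of consuming it; the diff does not show which method contains this return, so get_links below is a hypothetical driver method:

# 'get_links' is hypothetical; substitute the gem's actual crawl method
results = scrapper.get_links
results[:error_links].each_key { |uri| puts "invalid: #{uri}" }
results[:external_links].each do |uri, info|
  puts "#{uri} -> #{info[:res]} in #{info[:time]}s"
end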
@@ -73,11 +84,11 @@ class LinkScrapper
           response = Net::HTTP.get_response(URI.parse(URI.encode(@search_uri)))
           t2 = Time.now
           delta = t2 - t1
-          rescode = response.code
+          code = response.code
         rescue => ex
-          rescode = 408
+          code = 408
         end
-        @external_links[@search_uri.to_sym] = {res: rescode, time: delta}
+        @external_links[@search_uri.to_sym] = {res: code, time: delta}
       end
       @skip = 1
     end
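
The rescode-to-code rename leaves the pattern intact: time the request and fall back to 408 when it raises. A standalone sketch of that pattern; note that delta is assigned inside the begin block, so a failed request stores time: nil. (URI.encode is kept in the hunk above to match the release, though it is deprecated in newer Rubies.)

require 'net/http'
require 'uri'

t1 = Time.now
begin
  response = Net::HTTP.get_response(URI.parse('http://example.com/'))
  delta = Time.now - t1
  code = response.code
rescue => ex
  code = 408   # request failed or timed out; delta stays nil
end
puts "#{code} (#{delta ? format('%.2fs', delta) : 'n/a'})"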
@@ -93,8 +104,17 @@ class LinkScrapper
       @skip = 1
     else
       # check for protocol agnostic and indirect links
-      if @search_uri[0,2] == '//' || @search_uri[0,2] == './' || @search_uri[0,3] == '../'
-        @search_uri[0,2] = ""
+      case @search_uri[0,1]
+      when '.'
+        @search_uri[0,1] = ''
+      end
+      case @search_uri[0,2]
+      when '//', './', '..'
+        @search_uri[0,2] = ''
+      end
+      case @search_uri[0,3]
+      when '../'
+        @search_uri[0,3] = ''
       end
       # check for relative link
       if @search_uri[0] == '/'
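
The single compound if is replaced by three case statements that peel leading '.', '//', './', and '..' markers off the URI before the relative-link check. A standalone trace with made-up sample links, rewritten with if modifiers that behave the same as the case statements above:

# made-up sample URIs, traced through the stripping order above
['//cdn.example.com/app.js', './about', '../contact'].each do |uri|
  uri = uri.dup
  uri[0, 1] = '' if uri[0, 1] == '.'                        # '../contact' -> './contact'
  uri[0, 2] = '' if ['//', './', '..'].include?(uri[0, 2])  # '//cdn...'   -> 'cdn...'
  uri[0, 3] = '' if uri[0, 3] == '../'                      # unreachable once a leading '.' is stripped
  puts uri  # => cdn.example.com/app.js, /about, contact
end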
@@ -104,7 +124,8 @@ class LinkScrapper
       if @search_uri !~ /^([\w]|%|#|\?)/
         @search_index += 1
         @skip = 1
-        puts "invalid uri #{@search_uri}"
+        @error_links[@search_uri] = ''
+        puts "invalid uri #{@search_uri}" if @settings[:verbose]
         return
       end
       # define uri string
@@ -143,7 +164,7 @@ class LinkScrapper
     if @skip == 0
 
       # let user know which uri is currently active
-      puts @search_uri
+      puts @search_uri if @settings[:verbose]
 
       # gather page request response
       begin
@@ -193,15 +214,20 @@ class LinkScrapper
   def save_results
     # save search results
     CSV.open('results.csv', 'wb') {|csv|
-      @checked_links.each {|key|
-        csv << [key[0], key[1][:res], key[1][:time]]
+      @checked_links.each {|link|
+        csv << [link[0], link[1][:res], link[1][:time]]
       }
     }
-
     # save list of external links
     CSV.open('external-links.csv', 'wb') {|csv|
-      @external_links.each do |key|
-        csv << [key[0], key[1][:res], key[1][:time]]
+      @external_links.each do |link|
+        csv << [link[0], link[1][:res], link[1][:time]]
+      end
+    }
+    # save list of invalid links
+    CSV.open('invalid.csv', 'wb') {|csv|
+      @error_links.each do |link|
+        csv << link
       end
     }
   end
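
save_results now writes a third file, invalid.csv, built from the new @error_links hash. A small sketch of reading the three outputs back, assuming they were written to the working directory; the column layout follows save_results above:

require 'csv'

%w[results.csv external-links.csv].each do |file|
  CSV.foreach(file) do |url, code, time|
    puts "#{file}: #{url} responded #{code} in #{time}s"
  end
end
# invalid.csv rows are [uri, ''] pairs from the @error_links hash
CSV.foreach('invalid.csv') { |row| puts "invalid: #{row[0]}" }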
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: link_scrapper
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: 0.1.2
 platform: ruby
 authors:
 - Robert McDowell
@@ -10,7 +10,9 @@ bindir: bin
 cert_chain: []
 date: 2016-04-02 00:00:00.000000000 Z
 dependencies: []
-description: A simple gem for scrapping links within an assigned website.
+description: A simple gem for scrapping links within an assigned website. Results
+  for domain, external links, and invalid URLs can be saved as CSVs or returned as
+  a hash.
 email: doodersrage@gmail.com
 executables: []
 extensions: []