link_scrapper 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3):
  1. checksums.yaml +4 -4
  2. data/lib/link_scrapper.rb +43 -17
  3. metadata +4 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6117f7af82850c7d9d9072ab59837b266a4c82ae
4
- data.tar.gz: c4e01ea6b7d55f4bc7e51a15f131b306bab0246d
3
+ metadata.gz: 98b6d54cc75862b1e063a04e355a6f638deff14b
4
+ data.tar.gz: fbe3a8ecfbf27d81458862ec96543700e994a049
5
5
  SHA512:
6
- metadata.gz: 8de711206d0270bfb23ac104684822355069ce0609b402157454e32c4dad4bf5c833b93054cc6546c9b9d42d3295d08a37c54c9fd90d2a0283bdfdbd85e4c7da
7
- data.tar.gz: 17bf223f967f0eb4573a36f5013c825181c82861382cf1f7f5caa28fdc129af9d93267d1928ffc4fd79e1a8f36f8e259f128393dabffeb381e2cc1e0661bd3d4
6
+ metadata.gz: 34296971fcebf4d437aba6703c36ca93a8599ee51d5ee73032e2e58217e8c25029feabb9c83c51f298fdbe5a27155196faaec2bf860e5751a228da843dcd499b
7
+ data.tar.gz: f3d82c1973ec70e37815ad88f35523b5edab699d0c473e7c8ce61e9387e83b1ffbf3ec5b456eb05001cc7e653282b88954790bc4136c79e24b818c8cabf05b28
data/lib/link_scrapper.rb CHANGED
@@ -7,9 +7,15 @@ SEARCH_DOMAIN = 'http://virginiabeachwebdevelopment.com/'
7
7
  # class for grabbing and parsing domain links
8
8
  class LinkScrapper
9
9
 
10
- def initialize(search_domain = SEARCH_DOMAIN)
10
+ def initialize(settings)
11
+
12
+ # available default settings
13
+ # domain: domain to be searched
14
+ # verbose: prints output as the script goes along
15
+ # results: hash or csv
11
16
 
12
17
  # init link store hashes
18
+ @settings = settings;
13
19
  @search_index = 0
14
20
  @search_iteration = 0
15
21
  @links = Array.new
@@ -20,11 +26,11 @@ class LinkScrapper
20
26
  # gather search domain
21
27
  if ARGV[0]
22
28
  @search_domain = ARGV[0].dup
23
- elsif search_domain == 'ue'
29
+ elsif @settings[:domain] == 'ue'
24
30
  puts "Please enter a domain to search: (Default: #{SEARCH_DOMAIN})"
25
31
  @search_domain = gets.chomp
26
- elsif search_domain
27
- @search_domain = search_domain
32
+ elsif @settings[:domain]
33
+ @search_domain = @settings[:domain]
28
34
  end
29
35
 
30
36
  # override with default domain if entry is left empty
@@ -59,12 +65,17 @@ class LinkScrapper
59
65
  @search_uri = @links[@search_index][0].chomp
60
66
  else
61
67
  # save results and exit
62
- save_results
68
+ if @settings[:results] == 'csv'
69
+ save_results
70
+ else
71
+ return { checked_links: @checked_links, error_links: @error_links, external_links: @external_links}
72
+ end
63
73
  exit
64
74
  end
65
75
 
66
76
  # check for direct link
67
77
  if @search_uri =~ /^htt(p|ps):/
78
+
68
79
  # if external link go to next link
69
80
  if @search_uri.index(@local_domain[0]) == nil
70
81
  if !@external_links[@search_uri.to_sym]
@@ -73,11 +84,11 @@ class LinkScrapper
73
84
  response = Net::HTTP.get_response(URI.parse(URI.encode(@search_uri)))
74
85
  t2 = Time.now
75
86
  delta = t2 - t1
76
- rescode = response.code
87
+ code = response.code
77
88
  rescue => ex
78
- rescode = 408
89
+ code = 408
79
90
  end
80
- @external_links[@search_uri.to_sym] = {res: rescode, time: delta}
91
+ @external_links[@search_uri.to_sym] = {res: code, time: delta}
81
92
  end
82
93
  @skip = 1
83
94
  end
@@ -93,8 +104,17 @@ class LinkScrapper
93
104
  @skip = 1
94
105
  else
95
106
  # check for protocol agnostic and indirect links
96
- if @search_uri[0,2] == '//' || @search_uri[0,2] == './' || @search_uri[0,3] == '../'
97
- @search_uri[0,2] = ""
107
+ case @search_uri[0,1]
108
+ when '.'
109
+ @search_uri[0,1] = ''
110
+ end
111
+ case @search_uri[0,2]
112
+ when '//', './', '..'
113
+ @search_uri[0,2] = ''
114
+ end
115
+ case @search_uri[0,3]
116
+ when '../'
117
+ @search_uri[0,3] = ''
98
118
  end
99
119
  # check for relative link
100
120
  if @search_uri[0] == '/'
@@ -104,7 +124,8 @@ class LinkScrapper
104
124
  if @search_uri !~ /^([\w]|%|#|\?)/
105
125
  @search_index += 1
106
126
  @skip = 1
107
- puts "invalid uri #{@search_uri}"
127
+ @error_links[@search_uri] = ''
128
+ puts "invalid uri #{@search_uri}" if @settings[:verbose]
108
129
  return
109
130
  end
110
131
  # define uri string
@@ -143,7 +164,7 @@ class LinkScrapper
143
164
  if @skip == 0
144
165
 
145
166
  # let user know which uri is currently active
146
- puts @search_uri
167
+ puts @search_uri if @settings[:verbose]
147
168
 
148
169
  # gather page request response
149
170
  begin
@@ -193,15 +214,20 @@ class LinkScrapper
193
214
  def save_results
194
215
  # save search results
195
216
  CSV.open('results.csv', 'wb') {|csv|
196
- @checked_links.each {|key|
197
- csv << [key[0], key[1][:res], key[1][:time]]
217
+ @checked_links.each {|link|
218
+ csv << [link[0], link[1][:res], link[1][:time]]
198
219
  }
199
220
  }
200
-
201
221
  # save list of external links
202
222
  CSV.open('external-links.csv', 'wb') {|csv|
203
- @external_links.each do |key|
204
- csv << [key[0], key[1][:res], key[1][:time]]
223
+ @external_links.each do |link|
224
+ csv << [link[0], link[1][:res], link[1][:time]]
225
+ end
226
+ }
227
+ # save list of invalid links
228
+ CSV.open('invalid.csv', 'wb') {|csv|
229
+ @error_links.each do |link|
230
+ csv << link
205
231
  end
206
232
  }
207
233
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: link_scrapper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Robert McDowell
@@ -10,7 +10,9 @@ bindir: bin
10
10
  cert_chain: []
11
11
  date: 2016-04-02 00:00:00.000000000 Z
12
12
  dependencies: []
13
- description: A simple gem for scrapping links within an assigned website.
13
+ description: A simple gem for scrapping links within an assigned website. Results
14
+ for domain, external links, and invalid URLs can be saved as CSVs or returned as
15
+ a hash.
14
16
  email: doodersrage@gmail.com
15
17
  executables: []
16
18
  extensions: []